From 91515492f27346ed5aba296031189fc6a3dc8c74 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 27 Aug 2022 23:16:37 +0300 Subject: [PATCH 1/4] gh-96346: Use double caching for re._compile() --- Lib/re/__init__.py | 55 ++++++++++++------- ...2-08-27-23-16-09.gh-issue-96346.jJX14I.rst | 1 + 2 files changed, 37 insertions(+), 19 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-08-27-23-16-09.gh-issue-96346.jJX14I.rst diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index d58c2117ef3e14..11533c58bd56bf 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -229,6 +229,7 @@ def compile(pattern, flags=0): def purge(): "Clear the regular expression caches" _cache.clear() + _cache2.clear() _compile_repl.cache_clear() def template(pattern, flags=0): @@ -267,39 +268,55 @@ def escape(pattern): # internals _cache = {} # ordered! +_cache2 = {} # ordered! _MAXCACHE = 512 +_MAXCACHE2 = 256 def _compile(pattern, flags): # internal: compile pattern if isinstance(flags, RegexFlag): flags = flags.value try: - return _cache[type(pattern), pattern, flags] + return _cache2[type(pattern), pattern, flags] except KeyError: pass - if isinstance(pattern, Pattern): - if flags: - raise ValueError( - "cannot process flags argument with a compiled pattern") - return pattern - if not _compiler.isstring(pattern): - raise TypeError("first argument must be string or compiled pattern") - if flags & T: - import warnings - warnings.warn("The re.TEMPLATE/re.T flag is deprecated " - "as it is an undocumented flag " - "without an obvious purpose. 
" - "Don't use it.", - DeprecationWarning) - p = _compiler.compile(pattern, flags) - if not (flags & DEBUG): + + key = (type(pattern), pattern, flags) + p = _cache.pop(key, None) + if p is None: + if isinstance(pattern, Pattern): + if flags: + raise ValueError( + "cannot process flags argument with a compiled pattern") + return pattern + if not _compiler.isstring(pattern): + raise TypeError("first argument must be string or compiled pattern") + if flags & T: + import warnings + warnings.warn("The re.TEMPLATE/re.T flag is deprecated " + "as it is an undocumented flag " + "without an obvious purpose. " + "Don't use it.", + DeprecationWarning) + p = _compiler.compile(pattern, flags) + if flags & DEBUG: + return p if len(_cache) >= _MAXCACHE: - # Drop the oldest item + # Drop the least used item try: del _cache[next(iter(_cache))] except (StopIteration, RuntimeError, KeyError): pass - _cache[type(pattern), pattern, flags] = p + # Append to the end + _cache[key] = p + + if len(_cache2) >= _MAXCACHE2: + # Drop the oldest item + try: + del _cache2[next(iter(_cache2))] + except (StopIteration, RuntimeError, KeyError): + pass + _cache2[key] = p return p @functools.lru_cache(_MAXCACHE) diff --git a/Misc/NEWS.d/next/Library/2022-08-27-23-16-09.gh-issue-96346.jJX14I.rst b/Misc/NEWS.d/next/Library/2022-08-27-23-16-09.gh-issue-96346.jJX14I.rst new file mode 100644 index 00000000000000..9883348b9c3e24 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-08-27-23-16-09.gh-issue-96346.jJX14I.rst @@ -0,0 +1 @@ +Use double caching for compiled RE patterns. From 5fe3232d745afba9e4859f522f965a9cf1969fe3 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 28 Aug 2022 10:20:47 +0300 Subject: [PATCH 2/4] Add some comments. 
--- Lib/re/__init__.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index 11533c58bd56bf..d57efc99d028be 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -267,11 +267,16 @@ def escape(pattern): # -------------------------------------------------------------------- # internals -_cache = {} # ordered! -_cache2 = {} # ordered! - +# Use the fact that dict keeps the insertion order. +# _cache2 uses the simple FIFO policy which has better latency. +# _cache uses the LRU policy which has better hit rate. +# OrderedDict is not used because it adds a new dependence, and +# performance difference is negligible. +_cache = {} # LRU +_cache2 = {} # FIFO _MAXCACHE = 512 -_MAXCACHE2 = 256 +_MAXCACHE2 = 256 # Must be less than _MAXCACHE. + def _compile(pattern, flags): # internal: compile pattern if isinstance(flags, RegexFlag): @@ -302,7 +307,7 @@ def _compile(pattern, flags): if flags & DEBUG: return p if len(_cache) >= _MAXCACHE: - # Drop the least used item + # Drop the least recently used item try: del _cache[next(iter(_cache))] except (StopIteration, RuntimeError, KeyError): From d619f6d4d5a3c0ef75c5857946f71c9791feec1a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 31 Aug 2022 09:16:27 +0300 Subject: [PATCH 3/4] Address review comments. --- Lib/re/__init__.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index d57efc99d028be..63f3dc82780684 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -270,12 +270,11 @@ def escape(pattern): # Use the fact that dict keeps the insertion order. # _cache2 uses the simple FIFO policy which has better latency. # _cache uses the LRU policy which has better hit rate. -# OrderedDict is not used because it adds a new dependence, and -# performance difference is negligible. 
_cache = {} # LRU _cache2 = {} # FIFO _MAXCACHE = 512 -_MAXCACHE2 = 256 # Must be less than _MAXCACHE. +_MAXCACHE2 = 256 +assert _MAXCACHE2 < _MAXCACHE def _compile(pattern, flags): # internal: compile pattern @@ -307,16 +306,19 @@ def _compile(pattern, flags): if flags & DEBUG: return p if len(_cache) >= _MAXCACHE: - # Drop the least recently used item + # Drop the least recently used item. + # next(iter(_cache)) is known to have linear amortized time, + # but it is used here to avoid a dependency on OrderedDict. + # For the small _MAXCACHE value it doesn't make much of a difference. try: del _cache[next(iter(_cache))] except (StopIteration, RuntimeError, KeyError): pass - # Append to the end + # Append to the end. _cache[key] = p if len(_cache2) >= _MAXCACHE2: - # Drop the oldest item + # Drop the oldest item. try: del _cache2[next(iter(_cache2))] except (StopIteration, RuntimeError, KeyError): pass From bd618bf65203e5bf77ff656863c62fada6bb56f4 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 5 Oct 2022 13:28:43 +0300 Subject: [PATCH 4/4] Add a comment. --- Lib/re/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index 63f3dc82780684..8d6a4ef3880f0c 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -286,6 +286,7 @@ def _compile(pattern, flags): pass key = (type(pattern), pattern, flags) + # Item in _cache should be moved to the end if found. p = _cache.pop(key, None) if p is None: if isinstance(pattern, Pattern):