From 608e91757ce80af34ae38784fbc17ad0ad3f33e0 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 28 Jan 2023 00:07:54 +0000 Subject: [PATCH 01/28] gh-73435: Implement recursive wildcards in pathlib.PurePath.match() Add a new *recursive* argument to `pathlib.PurePath.match()`, defaulting to `False`. If set to true, `match()` handles the `**` wildcard as in `Path.glob()`, i.e. it matches any number of path segments. We now compile a `re.Pattern` object for the entire pattern. This is made more difficult by `fnmatch` not treating directory separators as special when evaluating wildcards (`*`, `?`, etc), and so we arrange the path parts onto separate *lines* in a string, and ensure we don't set `re.DOTALL`. --- Doc/library/pathlib.rst | 8 +++++++- Lib/fnmatch.py | 7 ++++++- Lib/pathlib.py | 43 +++++++++++++++++++++------------------- Lib/test/test_pathlib.py | 26 ++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 22 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index f222745a2c56bc..0b8cb28544682c 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -544,11 +544,14 @@ Pure paths provide the following methods and properties: PureWindowsPath('c:/Program Files') -.. method:: PurePath.match(pattern) +.. method:: PurePath.match(pattern, recursive=False) Match this path against the provided glob-style pattern. Return ``True`` if matching is successful, ``False`` otherwise. + If *recursive* is true, the pattern "``**``" will match any number of file + or directory segments. + If *pattern* is relative, the path can be either relative or absolute, and matching is done from the right:: @@ -574,6 +577,9 @@ Pure paths provide the following methods and properties: >>> PureWindowsPath('b.py').match('*.PY') True + .. versionadded:: 3.12 + The *recursive* argument. + .. 
method:: PurePath.relative_to(other, walk_up=False) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index d5e296f7748c1c..88b2d973b14d0f 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -77,6 +77,11 @@ def translate(pat): There is no way to quote meta-characters. """ + res = _translate(pat) + return fr'(?s:{res})\Z' + + +def _translate(pat): STAR = object() res = [] add = res.append @@ -182,4 +187,4 @@ def translate(pat): add(f"(?>.*?{fixed})") assert i == n res = "".join(res) - return fr'(?s:{res})\Z' + return res diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 17659bcd3e2d7f..8ceb0f82aa75d6 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -64,6 +64,25 @@ def _is_wildcard_pattern(pat): # Globbing helpers # +@functools.lru_cache() +def _make_matcher(path_cls, pattern, recursive): + pattern = path_cls(pattern) + if not pattern._parts: + raise ValueError("empty pattern") + result = [r'\A' if pattern._drv or pattern._root else '^'] + for part in pattern._parts_normcase: + if recursive: + if part == '**': + result.append('(.+\n)*') + continue + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + part = fnmatch._translate(part) + result.append(f'{part}\n') + result.append(r'\Z') + return re.compile(''.join(result), flags=re.MULTILINE) + + @functools.lru_cache() def _make_selector(pattern_parts, flavour): pat = pattern_parts[0] @@ -639,29 +658,13 @@ def is_reserved(self): name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ') return name.upper() in _WIN_RESERVED_NAMES - def match(self, path_pattern): + def match(self, path_pattern, recursive=False): """ Return True if this path matches the given pattern. 
""" - path_pattern = self._flavour.normcase(path_pattern) - drv, root, pat_parts = self._parse_parts((path_pattern,)) - if not pat_parts: - raise ValueError("empty pattern") - elif drv and drv != self._flavour.normcase(self._drv): - return False - elif root and root != self._root: - return False - parts = self._parts_normcase - if drv or root: - if len(pat_parts) != len(parts): - return False - pat_parts = pat_parts[1:] - elif len(pat_parts) > len(parts): - return False - for part, pat in zip(reversed(parts), reversed(pat_parts)): - if not fnmatch.fnmatchcase(part, pat): - return False - return True + matcher = _make_matcher(type(self), path_pattern, recursive) + lines = ''.join(f'{part}\n' for part in self._parts_normcase) + return matcher.search(lines) is not None # Can't subclass os.PathLike from PurePath and keep the constructor # optimizations in PurePath._parse_args(). diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index a596795b44f0fa..7c3e169d3ce0e5 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -319,6 +319,32 @@ def test_match_common(self): # Multi-part glob-style pattern. self.assertFalse(P('/a/b/c.py').match('/**/*.py')) self.assertTrue(P('/a/b/c.py').match('/a/**/*.py')) + # Recursive patterns. 
+ self.assertTrue(P('a').match('**', recursive=True)) + self.assertTrue(P('c.py').match('**', recursive=True)) + self.assertTrue(P('a/b/c.py').match('**', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('**', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('/**', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('**/', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('/a/**', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('**/*.py', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('/**/*.py', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('/a/**/*.py', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('**/a/b/c.py/**', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py', recursive=True)) + self.assertFalse(P('c.py').match('**/a.py', recursive=True)) + self.assertFalse(P('c.py').match('c/**', recursive=True)) + self.assertFalse(P('a/b/c.py').match('**/a', recursive=True)) + self.assertFalse(P('a/b/c.py').match('**/a/b', recursive=True)) + self.assertFalse(P('a/b/c.py').match('**/a/b/c', recursive=True)) + self.assertFalse(P('a/b/c.py').match('**/a/b/c.', recursive=True)) + self.assertFalse(P('a/b/c.py').match('**/a/b/c./**', recursive=True)) + self.assertFalse(P('a/b/c.py').match('**/a/b/c./**', recursive=True)) + self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**', recursive=True)) + self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py', recursive=True)) + self.assertRaises(ValueError, P('a').match, '**a/b/c', recursive=True) + self.assertRaises(ValueError, P('a').match, 'a/b/c**', recursive=True) def test_ordering_common(self): # Ordering is tuple-alike. 
From 9a43c7ff656fdec6e0050471297c2bc034a18abe Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 29 Jan 2023 00:15:55 +0000 Subject: [PATCH 02/28] Simplify code slightly --- Lib/pathlib.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 8ceb0f82aa75d6..7b4a9805f2e8c3 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -70,15 +70,14 @@ def _make_matcher(path_cls, pattern, recursive): if not pattern._parts: raise ValueError("empty pattern") result = [r'\A' if pattern._drv or pattern._root else '^'] - for part in pattern._parts_normcase: + for line in pattern._lines_normcase: if recursive: - if part == '**': - result.append('(.+\n)*') + if line == '**\n': + result.append('(.*\n)*') continue - elif '**' in part: + elif '**' in line: raise ValueError("Invalid pattern: '**' can only be an entire path component") - part = fnmatch._translate(part) - result.append(f'{part}\n') + result.append(fnmatch._translate(line)) result.append(r'\Z') return re.compile(''.join(result), flags=re.MULTILINE) @@ -658,13 +657,16 @@ def is_reserved(self): name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ') return name.upper() in _WIN_RESERVED_NAMES + @property + def _lines_normcase(self): + return [f'{part}\n' for part in self._parts_normcase] + def match(self, path_pattern, recursive=False): """ Return True if this path matches the given pattern. """ matcher = _make_matcher(type(self), path_pattern, recursive) - lines = ''.join(f'{part}\n' for part in self._parts_normcase) - return matcher.search(lines) is not None + return matcher.search(''.join(self._lines_normcase)) is not None # Can't subclass os.PathLike from PurePath and keep the constructor # optimizations in PurePath._parse_args(). 
From a846279d4274449ee15a084b7db84984694526fe Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 15 Feb 2023 17:47:34 +0000 Subject: [PATCH 03/28] Fix support for newlines --- Lib/pathlib.py | 17 +++++++++++++---- Lib/test/test_pathlib.py | 1 - 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 7b4a9805f2e8c3..6fbf293d275393 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -64,16 +64,23 @@ def _is_wildcard_pattern(pat): # Globbing helpers # + +_SWAP_SLASH_AND_NEWLINE = str.maketrans({'/': '\n', '\n': '/'}) + + @functools.lru_cache() def _make_matcher(path_cls, pattern, recursive): pattern = path_cls(pattern) if not pattern._parts: raise ValueError("empty pattern") result = [r'\A' if pattern._drv or pattern._root else '^'] - for line in pattern._lines_normcase: + for line in pattern._lines_normcase.splitlines(keepends=True): if recursive: if line == '**\n': - result.append('(.*\n)*') + result.append(r'[\S\s]*^') + continue + elif line == '**': + result.append(r'[\S\s]*') continue elif '**' in line: raise ValueError("Invalid pattern: '**' can only be an entire path component") @@ -659,14 +666,16 @@ def is_reserved(self): @property def _lines_normcase(self): - return [f'{part}\n' for part in self._parts_normcase] + path = self._flavour.normcase(self.as_posix()) + return path.translate(_SWAP_SLASH_AND_NEWLINE) def match(self, path_pattern, recursive=False): """ Return True if this path matches the given pattern. """ matcher = _make_matcher(type(self), path_pattern, recursive) - return matcher.search(''.join(self._lines_normcase)) is not None + return matcher.search(self._lines_normcase) is not None + # Can't subclass os.PathLike from PurePath and keep the constructor # optimizations in PurePath._parse_args(). 
diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index 7c3e169d3ce0e5..fdacdd8f4afc92 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -331,7 +331,6 @@ def test_match_common(self): self.assertTrue(P('/a/b/c.py').match('/**/*.py', recursive=True)) self.assertTrue(P('/a/b/c.py').match('/a/**/*.py', recursive=True)) self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('**/a/b/c.py/**', recursive=True)) self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py', recursive=True)) self.assertFalse(P('c.py').match('**/a.py', recursive=True)) self.assertFalse(P('c.py').match('c/**', recursive=True)) From bbd8cd603c71f87c948e11a1528a29536ae21827 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 15 Feb 2023 18:37:18 +0000 Subject: [PATCH 04/28] Cache translation of individual components --- Lib/pathlib.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 6fbf293d275393..755036bb97c48b 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -68,6 +68,18 @@ def _is_wildcard_pattern(pat): _SWAP_SLASH_AND_NEWLINE = str.maketrans({'/': '\n', '\n': '/'}) +@functools.lru_cache() +def _translate(pattern, recursive): + if recursive: + if pattern == '**\n': + return r'[\S\s]*^' + elif pattern == '**': + return r'[\S\s]*' + elif '**' in pattern: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + return fnmatch._translate(pattern) + + @functools.lru_cache() def _make_matcher(path_cls, pattern, recursive): pattern = path_cls(pattern) @@ -75,16 +87,7 @@ def _make_matcher(path_cls, pattern, recursive): raise ValueError("empty pattern") result = [r'\A' if pattern._drv or pattern._root else '^'] for line in pattern._lines_normcase.splitlines(keepends=True): - if recursive: - if line == '**\n': - result.append(r'[\S\s]*^') - continue - elif line == '**': - result.append(r'[\S\s]*') - continue 
- elif '**' in line: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - result.append(fnmatch._translate(line)) + result.append(_translate(line, recursive)) result.append(r'\Z') return re.compile(''.join(result), flags=re.MULTILINE) From b5c002e36d7de58bb2a991b7fcfc4a716c4d8154 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 15 Feb 2023 20:12:47 +0000 Subject: [PATCH 05/28] Drop 'recursive' argument, make this the only behaviour. --- Doc/library/pathlib.rst | 10 ++++---- Lib/pathlib.py | 26 ++++++++++----------- Lib/test/test_pathlib.py | 49 +++++++++++++++++++--------------------- 3 files changed, 40 insertions(+), 45 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 0b8cb28544682c..00788ef327753b 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -544,14 +544,11 @@ Pure paths provide the following methods and properties: PureWindowsPath('c:/Program Files') -.. method:: PurePath.match(pattern, recursive=False) +.. method:: PurePath.match(pattern) Match this path against the provided glob-style pattern. Return ``True`` if matching is successful, ``False`` otherwise. - If *recursive* is true, the pattern "``**``" will match any number of file - or directory segments. - If *pattern* is relative, the path can be either relative or absolute, and matching is done from the right:: @@ -577,8 +574,9 @@ Pure paths provide the following methods and properties: >>> PureWindowsPath('b.py').match('*.PY') True - .. versionadded:: 3.12 - The *recursive* argument. + .. versionchanged:: 3.12 + Support for the recursive wildcard "``**``" was added. In previous + versions, it acted like the non-recursive wildcard "``*``". .. 
method:: PurePath.relative_to(other, walk_up=False) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 755036bb97c48b..484a5d874ba138 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -69,25 +69,25 @@ def _is_wildcard_pattern(pat): @functools.lru_cache() -def _translate(pattern, recursive): - if recursive: - if pattern == '**\n': - return r'[\S\s]*^' - elif pattern == '**': - return r'[\S\s]*' - elif '**' in pattern: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - return fnmatch._translate(pattern) +def _translate(pattern): + if pattern == '**\n': + return r'[\S\s]*^' + elif pattern == '**': + return r'[\S\s]*' + elif '**' in pattern: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: + return fnmatch._translate(pattern) @functools.lru_cache() -def _make_matcher(path_cls, pattern, recursive): +def _make_matcher(path_cls, pattern): pattern = path_cls(pattern) if not pattern._parts: raise ValueError("empty pattern") result = [r'\A' if pattern._drv or pattern._root else '^'] for line in pattern._lines_normcase.splitlines(keepends=True): - result.append(_translate(line, recursive)) + result.append(_translate(line)) result.append(r'\Z') return re.compile(''.join(result), flags=re.MULTILINE) @@ -672,11 +672,11 @@ def _lines_normcase(self): path = self._flavour.normcase(self.as_posix()) return path.translate(_SWAP_SLASH_AND_NEWLINE) - def match(self, path_pattern, recursive=False): + def match(self, path_pattern): """ Return True if this path matches the given pattern. 
""" - matcher = _make_matcher(type(self), path_pattern, recursive) + matcher = _make_matcher(type(self), path_pattern) return matcher.search(self._lines_normcase) is not None diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index fdacdd8f4afc92..1c486be55e5cfc 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -317,33 +317,30 @@ def test_match_common(self): self.assertFalse(P('/ab.py').match('/a/*.py')) self.assertFalse(P('/a/b/c.py').match('/a/*.py')) # Multi-part glob-style pattern. - self.assertFalse(P('/a/b/c.py').match('/**/*.py')) + self.assertTrue(P('a').match('**')) + self.assertTrue(P('c.py').match('**')) + self.assertTrue(P('a/b/c.py').match('**')) + self.assertTrue(P('/a/b/c.py').match('**')) + self.assertTrue(P('/a/b/c.py').match('/**')) + self.assertTrue(P('/a/b/c.py').match('**/')) + self.assertTrue(P('/a/b/c.py').match('/a/**')) + self.assertTrue(P('/a/b/c.py').match('**/*.py')) + self.assertTrue(P('/a/b/c.py').match('/**/*.py')) self.assertTrue(P('/a/b/c.py').match('/a/**/*.py')) - # Recursive patterns. 
- self.assertTrue(P('a').match('**', recursive=True)) - self.assertTrue(P('c.py').match('**', recursive=True)) - self.assertTrue(P('a/b/c.py').match('**', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('**', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('/**', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('**/', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('/a/**', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('**/*.py', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('/**/*.py', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('/a/**/*.py', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py', recursive=True)) - self.assertFalse(P('c.py').match('**/a.py', recursive=True)) - self.assertFalse(P('c.py').match('c/**', recursive=True)) - self.assertFalse(P('a/b/c.py').match('**/a', recursive=True)) - self.assertFalse(P('a/b/c.py').match('**/a/b', recursive=True)) - self.assertFalse(P('a/b/c.py').match('**/a/b/c', recursive=True)) - self.assertFalse(P('a/b/c.py').match('**/a/b/c.', recursive=True)) - self.assertFalse(P('a/b/c.py').match('**/a/b/c./**', recursive=True)) - self.assertFalse(P('a/b/c.py').match('**/a/b/c./**', recursive=True)) - self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**', recursive=True)) - self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py', recursive=True)) - self.assertRaises(ValueError, P('a').match, '**a/b/c', recursive=True) - self.assertRaises(ValueError, P('a').match, 'a/b/c**', recursive=True) + self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py')) + self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py')) + self.assertFalse(P('c.py').match('**/a.py')) + self.assertFalse(P('c.py').match('c/**')) + self.assertFalse(P('a/b/c.py').match('**/a')) + self.assertFalse(P('a/b/c.py').match('**/a/b')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c')) + 
self.assertFalse(P('a/b/c.py').match('**/a/b/c.')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c./**')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c./**')) + self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**')) + self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py')) + self.assertRaises(ValueError, P('a').match, '**a/b/c') + self.assertRaises(ValueError, P('a').match, 'a/b/c**') def test_ordering_common(self): # Ordering is tuple-alike. From 0afcd54884c315587469c49752ce858036a15b04 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 16 Feb 2023 21:05:32 +0000 Subject: [PATCH 06/28] Undo modifications to fnmatch.py --- Lib/fnmatch.py | 7 +------ Lib/pathlib.py | 4 +++- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 88b2d973b14d0f..d5e296f7748c1c 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -77,11 +77,6 @@ def translate(pat): There is no way to quote meta-characters. """ - res = _translate(pat) - return fr'(?s:{res})\Z' - - -def _translate(pat): STAR = object() res = [] add = res.append @@ -187,4 +182,4 @@ def _translate(pat): add(f"(?>.*?{fixed})") assert i == n res = "".join(res) - return res + return fr'(?s:{res})\Z' diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 484a5d874ba138..a298a73a9f467f 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -66,6 +66,8 @@ def _is_wildcard_pattern(pat): _SWAP_SLASH_AND_NEWLINE = str.maketrans({'/': '\n', '\n': '/'}) +_FNMATCH_PADDING = fnmatch.translate('_').split('_') +_FNMATCH_SLICE = slice(len(_FNMATCH_PADDING[0]), -len(_FNMATCH_PADDING[1])) @functools.lru_cache() @@ -77,7 +79,7 @@ def _translate(pattern): elif '**' in pattern: raise ValueError("Invalid pattern: '**' can only be an entire path component") else: - return fnmatch._translate(pattern) + return fnmatch.translate(pattern)[_FNMATCH_SLICE] @functools.lru_cache() From 7b6f850c99964c12682b275cf65f5b52d7fcfb89 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 17 Feb 2023 16:25:08 +0000 Subject: 
[PATCH 07/28] Fix Windows support --- Lib/pathlib.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 511e0f0577af0c..f937614611c07c 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -65,7 +65,6 @@ def _is_wildcard_pattern(pat): # -_SWAP_SLASH_AND_NEWLINE = str.maketrans({'/': '\n', '\n': '/'}) _FNMATCH_PADDING = fnmatch.translate('_').split('_') _FNMATCH_SLICE = slice(len(_FNMATCH_PADDING[0]), -len(_FNMATCH_PADDING[1])) @@ -82,6 +81,11 @@ def _translate(pattern): return fnmatch.translate(pattern)[_FNMATCH_SLICE] +@functools.lru_cache() +def _make_matcher_trans(flavour): + return str.maketrans({flavour.sep: '\n', '\n': flavour.sep}) + + @functools.lru_cache() def _make_matcher(path_cls, pattern): pattern = path_cls(pattern) @@ -671,8 +675,8 @@ def is_reserved(self): @property def _lines_normcase(self): - path = self._flavour.normcase(self.as_posix()) - return path.translate(_SWAP_SLASH_AND_NEWLINE) + trans = _make_matcher_trans(self._flavour) + return self._flavour.normcase(str(self)).translate(trans) def match(self, path_pattern): """ From 037488ac370971ce9c8dbfb966a10d4eb1b91656 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 17 Feb 2023 18:38:10 +0000 Subject: [PATCH 08/28] Tidy up code. 
--- Lib/pathlib.py | 63 ++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index f937614611c07c..ecb39d9a40d0a5 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -65,25 +65,12 @@ def _is_wildcard_pattern(pat): # -_FNMATCH_PADDING = fnmatch.translate('_').split('_') -_FNMATCH_SLICE = slice(len(_FNMATCH_PADDING[0]), -len(_FNMATCH_PADDING[1])) - - -@functools.lru_cache() -def _translate(pattern): - if pattern == '**\n': - return r'[\S\s]*^' - elif pattern == '**': - return r'[\S\s]*' - elif '**' in pattern: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - else: - return fnmatch.translate(pattern)[_FNMATCH_SLICE] - - -@functools.lru_cache() -def _make_matcher_trans(flavour): - return str.maketrans({flavour.sep: '\n', '\n': flavour.sep}) +_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') +_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) +_SWAP_SEP_AND_NEWLINE = { + '/': str.maketrans({'/': '\n', '\n': '/'}), + '\\': str.maketrans({'\\': '\n', '\n': '\\'}), +} @functools.lru_cache() @@ -91,11 +78,19 @@ def _make_matcher(path_cls, pattern): pattern = path_cls(pattern) if not pattern._parts: raise ValueError("empty pattern") - result = [r'\A' if pattern._drv or pattern._root else '^'] - for line in pattern._lines_normcase.splitlines(keepends=True): - result.append(_translate(line)) - result.append(r'\Z') - return re.compile(''.join(result), flags=re.MULTILINE) + parts = [r'\A' if pattern._drv or pattern._root else '^'] + for part in pattern._lines_normcase.splitlines(keepends=True): + if part == '**\n': + part = r'[\s\S]*^' + elif part == '**': + part = r'[\s\S]*' + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: + part = fnmatch.translate(part)[_FNMATCH_SLICE] + parts.append(part) + parts.append(r'\Z') + return re.compile(''.join(parts), 
flags=re.MULTILINE) @functools.lru_cache() @@ -286,7 +281,8 @@ class PurePath(object): """ __slots__ = ( '_drv', '_root', '_parts', - '_str', '_hash', '_parts_tuple', '_parts_normcase_cached', + '_str', '_hash', '_parts_tuple', + '_parts_normcase_cached', '_lines_normcase_cached', ) _flavour = os.path @@ -415,6 +411,18 @@ def as_uri(self): path = str(self) return prefix + urlquote_from_bytes(os.fsencode(path)) + @property + def _lines_normcase(self): + # Case-normalized path with separators and newlines swapped, for + # pattern matching. + try: + return self._lines_normcase_cached + except AttributeError: + path = self._flavour.normcase(str(self)) + trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep] + self._lines_normcase_cached = path.translate(trans) + return self._lines_normcase_cached + @property def _parts_normcase(self): # Cached parts with normalized case, for hashing and comparison. @@ -673,11 +681,6 @@ def is_reserved(self): name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ') return name.upper() in _WIN_RESERVED_NAMES - @property - def _lines_normcase(self): - trans = _make_matcher_trans(self._flavour) - return self._flavour.normcase(str(self)).translate(trans) - def match(self, path_pattern): """ Return True if this path matches the given pattern. From 07419501d2dda92ef23899dd542d5c4ec6ad03e3 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 17 Feb 2023 18:56:57 +0000 Subject: [PATCH 09/28] Add news blurb. 
--- .../next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst diff --git a/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst b/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst new file mode 100644 index 00000000000000..d5a2ae07700b34 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst @@ -0,0 +1 @@ +Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. From 314679ff7a1a2f2f189c1c3c8675f612104b5fd7 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 9 Apr 2023 20:12:59 +0100 Subject: [PATCH 10/28] Simplify patch; prepare for use in `glob()` --- Lib/pathlib.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index bd6a5869e11e36..847e0031cdd312 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -73,12 +73,9 @@ def _is_wildcard_pattern(pat): @functools.lru_cache() -def _make_matcher(path_cls, pattern): - pattern = path_cls(pattern) - if not pattern.parts: - raise ValueError("empty pattern") - parts = [r'\A' if pattern.drive or pattern.root else '^'] - for part in pattern._lines_normcase.splitlines(keepends=True): +def _make_matcher(lines): + parts = ['^'] + for part in lines.splitlines(keepends=True): if part == '**\n': part = r'[\s\S]*^' elif part == '**': @@ -717,8 +714,15 @@ def match(self, path_pattern): """ Return True if this path matches the given pattern. 
""" - matcher = _make_matcher(type(self), path_pattern) - return matcher.search(self._lines_normcase) is not None + pat = type(self)(path_pattern) + if not pat.parts: + raise ValueError("empty pattern") + matcher = _make_matcher(pat._lines_normcase) + if pat.drive or pat.root: + match = matcher.match(self._lines_normcase) + else: + match = matcher.search(self._lines_normcase) + return match is not None # Can't subclass os.PathLike from PurePath and keep the constructor From 90eebcc4ea83a7570e49dc9ad38606a1c57ec3a7 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 9 Apr 2023 20:38:30 +0100 Subject: [PATCH 11/28] Make better use of path object caching. --- Lib/pathlib.py | 72 ++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 847e0031cdd312..f8dd6b39f4df1f 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -72,23 +72,6 @@ def _is_wildcard_pattern(pat): } -@functools.lru_cache() -def _make_matcher(lines): - parts = ['^'] - for part in lines.splitlines(keepends=True): - if part == '**\n': - part = r'[\s\S]*^' - elif part == '**': - part = r'[\s\S]*' - elif '**' in part: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - else: - part = fnmatch.translate(part)[_FNMATCH_SLICE] - parts.append(part) - parts.append(r'\Z') - return re.compile(''.join(parts), flags=re.MULTILINE) - - @functools.lru_cache() def _make_selector(pattern_parts, flavour): pat = pattern_parts[0] @@ -298,17 +281,19 @@ class PurePath(object): # `__hash__()`, and `_parts_normcase` '_str_normcase_cached', - # The `_lines_normcase_cached` slot stores the string path with - # normalized case, and with path separators and newlines swapped. This - # is used to implement `match()`. - '_lines_normcase_cached', - # The `_parts_normcase_cached` slot stores the case-normalized # string path after splitting on path separators. 
It's set when the # `_parts_normcase` property is accessed for the first time. It's used # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', + # The `_lines_normcase_cached` and `_matcher_cached` slots store the + # string path with path separators and newlines swapped, and an + # `re.Pattern` object derived thereof. These are used to implement + # `match()`. + '_lines_normcase_cached', + '_matcher_cached', + # The `_hash` slot stores the hash of the case-normalized string # path. It's set when `__hash__()` is called for the first time. '_hash', @@ -439,6 +424,15 @@ def _str_normcase(self): self._str_normcase_cached = self._flavour.normcase(str(self)) return self._str_normcase_cached + @property + def _parts_normcase(self): + # Cached parts with normalized case, for comparisons. + try: + return self._parts_normcase_cached + except AttributeError: + self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) + return self._parts_normcase_cached + @property def _lines_normcase(self): # Case-normalized path with separators and newlines swapped, for @@ -451,13 +445,26 @@ def _lines_normcase(self): return self._lines_normcase_cached @property - def _parts_normcase(self): - # Cached parts with normalized case, for comparisons. 
+ def _matcher(self): try: - return self._parts_normcase_cached + return self._matcher_cached except AttributeError: - self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) - return self._parts_normcase_cached + if not self.parts: + raise ValueError("empty pattern") + parts = [r'\A' if self.drive or self.root else '^'] + for part in self._lines_normcase.splitlines(keepends=True): + if part == '**\n': + part = r'[\s\S]*^' + elif part == '**': + part = r'[\s\S]*' + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: + part = fnmatch.translate(part)[_FNMATCH_SLICE] + parts.append(part) + parts.append(r'\Z') + self._matcher_cached = re.compile(''.join(parts), flags=re.MULTILINE) + return self._matcher_cached def __eq__(self, other): if not isinstance(other, PurePath): @@ -714,14 +721,9 @@ def match(self, path_pattern): """ Return True if this path matches the given pattern. """ - pat = type(self)(path_pattern) - if not pat.parts: - raise ValueError("empty pattern") - matcher = _make_matcher(pat._lines_normcase) - if pat.drive or pat.root: - match = matcher.match(self._lines_normcase) - else: - match = matcher.search(self._lines_normcase) + if not isinstance(path_pattern, type(self)): + path_pattern = type(self)(path_pattern) + match = path_pattern._matcher.search(self._lines_normcase) return match is not None From 4b5fffdf961e7d187759ed67a68e97891a9b1d19 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 9 Apr 2023 21:09:43 +0100 Subject: [PATCH 12/28] Add performance tip to docs --- Doc/library/pathlib.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 701470b5ebab2b..47057d99272942 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -568,6 +568,13 @@ Pure paths provide the following methods and properties: >>> PurePath('a/b.py').match('/*.py') False + The *pattern* may be another path object; this speeds up 
matching the same + pattern against multiple files:: + + >>> pattern = PurePath('*.py') + >>> PurePath('a/b.py').match(pattern) + True + As with other methods, case-sensitivity follows platform defaults:: >>> PurePosixPath('b.py').match('*.PY') From 5e8bc280ebbc52921ab7f246f502ccd62fb2fc26 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 20 Apr 2023 19:51:13 +0100 Subject: [PATCH 13/28] Skip re-initialisation of PurePath patterns. --- Lib/pathlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index f8dd6b39f4df1f..e8300d22de6683 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -721,7 +721,7 @@ def match(self, path_pattern): """ Return True if this path matches the given pattern. """ - if not isinstance(path_pattern, type(self)): + if not isinstance(path_pattern, PurePath) or self._flavour is not path_pattern._flavour: path_pattern = type(self)(path_pattern) match = path_pattern._matcher.search(self._lines_normcase) return match is not None From 722a1ab0d9d52ee888da64a15cb65e94f7d34f06 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 2 May 2023 23:14:32 +0100 Subject: [PATCH 14/28] Use `re.IGNORECASE` rather than `os.path.normcase()` --- Lib/pathlib.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index c6dda1dc3989d2..38374479ae352d 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -290,11 +290,11 @@ class PurePath(object): # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', - # The `_lines_normcase_cached` and `_matcher_cached` slots store the + # The `_lines_cached` and `_matcher_cached` slots store the # string path with path separators and newlines swapped, and an # `re.Pattern` object derived thereof. These are used to implement # `match()`. 
- '_lines_normcase_cached', + '_lines_cached', '_matcher_cached', # The `_hash` slot stores the hash of the case-normalized string @@ -451,15 +451,14 @@ def _parts_normcase(self): return self._parts_normcase_cached @property - def _lines_normcase(self): - # Case-normalized path with separators and newlines swapped, for - # pattern matching. + def _lines(self): + # Path with separators and newlines swapped, for pattern matching. try: - return self._lines_normcase_cached + return self._lines_cached except AttributeError: trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep] - self._lines_normcase_cached = self._str_normcase.translate(trans) - return self._lines_normcase_cached + self._lines_cached = str(self).translate(trans) + return self._lines_cached @property def _matcher(self): @@ -469,7 +468,7 @@ def _matcher(self): if not self.parts: raise ValueError("empty pattern") parts = [r'\A' if self.drive or self.root else '^'] - for part in self._lines_normcase.splitlines(keepends=True): + for part in self._lines.splitlines(keepends=True): if part == '**\n': part = r'[\s\S]*^' elif part == '**': @@ -480,7 +479,10 @@ def _matcher(self): part = fnmatch.translate(part)[_FNMATCH_SLICE] parts.append(part) parts.append(r'\Z') - self._matcher_cached = re.compile(''.join(parts), flags=re.MULTILINE) + flags = re.MULTILINE + if not _is_case_sensitive(self._flavour): + flags |= re.IGNORECASE + self._matcher_cached = re.compile(''.join(parts), flags=flags) return self._matcher_cached def __eq__(self, other): @@ -740,7 +742,7 @@ def match(self, path_pattern): """ if not isinstance(path_pattern, PurePath) or self._flavour is not path_pattern._flavour: path_pattern = type(self)(path_pattern) - match = path_pattern._matcher.search(self._lines_normcase) + match = path_pattern._matcher.search(self._lines) return match is not None From ccea5e18df9980d8c78d3451fc98349df7826211 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 11 May 2023 19:47:59 +0100 Subject: [PATCH 15/28] Add whats new 
entry --- Doc/whatsnew/3.12.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index ec04178238b6b0..b9ff02619f47e8 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -365,6 +365,9 @@ pathlib * Add :meth:`pathlib.Path.is_junction` as a proxy to :func:`os.path.isjunction`. (Contributed by Charles Machalow in :gh:`99547`.) +* Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. + (Contributed by Barney Gale in :gh:`101398`.) + dis --- From dd04294e1300a364c8d1c0bf7c514ef5256424f4 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Thu, 11 May 2023 20:44:59 +0100 Subject: [PATCH 16/28] Update Doc/whatsnew/3.12.rst Co-authored-by: Hugo van Kemenade --- Doc/whatsnew/3.12.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index b9ff02619f47e8..e656c8296394dd 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -366,7 +366,7 @@ pathlib (Contributed by Charles Machalow in :gh:`99547`.) * Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. - (Contributed by Barney Gale in :gh:`101398`.) + (Contributed by Barney Gale in :gh:`73435`.) 
dis From b258641ce46dca84a8a01b83a4cc629cc6c85db9 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Sun, 14 May 2023 20:18:21 +0100 Subject: [PATCH 17/28] Apply suggestions from code review Co-authored-by: Alex Waygood --- Lib/pathlib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 3cc9ed6cb54379..ca48707c544f4a 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -462,7 +462,7 @@ def _matcher(self): return self._matcher_cached except AttributeError: if not self.parts: - raise ValueError("empty pattern") + raise ValueError("empty pattern") from None parts = [r'\A' if self.drive or self.root else '^'] for part in self._lines.splitlines(keepends=True): if part == '**\n': @@ -470,7 +470,7 @@ def _matcher(self): elif part == '**': part = r'[\s\S]*' elif '**' in part: - raise ValueError("Invalid pattern: '**' can only be an entire path component") + raise ValueError("Invalid pattern: '**' can only be an entire path component") from None else: part = fnmatch.translate(part)[_FNMATCH_SLICE] parts.append(part) From ced899853a5fa2bb0dfbaa80013bd3e1a9863971 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 21:18:46 +0100 Subject: [PATCH 18/28] Explain _FNMATCH_SLICE --- Lib/pathlib.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index ca48707c544f4a..7bcfe5f3d476f9 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -62,6 +62,12 @@ def _is_case_sensitive(flavour): # +# fnmatch.translate() returns a regular expression that includes a prefix and +# a suffix, which enable matching newlines and ensure the end of the string is +# matched, respectively. These features are undesirable for our implementation +# of PurePatch.match(), which represents path separators as newlines and joins +# pattern segments together. As a workaround, we define a slice object that +# remove the prefix and suffix from any translate() result. 
_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') _FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) _SWAP_SEP_AND_NEWLINE = { From a33c7b659ae6f554b4a1e44a411acf872e2d388d Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 21:20:26 +0100 Subject: [PATCH 19/28] Accidentally a word. --- Lib/pathlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 7bcfe5f3d476f9..536111dcd9a0e5 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -67,7 +67,7 @@ def _is_case_sensitive(flavour): # matched, respectively. These features are undesirable for our implementation # of PurePatch.match(), which represents path separators as newlines and joins # pattern segments together. As a workaround, we define a slice object that -# remove the prefix and suffix from any translate() result. +# can remove the prefix and suffix from any translate() result. _FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') _FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) _SWAP_SEP_AND_NEWLINE = { From 4b3bddb60e58afc5216323b15b7851c2cdce6702 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 21:49:51 +0100 Subject: [PATCH 20/28] Cache pattern compilation --- Lib/pathlib.py | 56 ++++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 536111dcd9a0e5..41e78f6e7a121d 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -76,6 +76,28 @@ def _is_case_sensitive(flavour): } +@functools.lru_cache() +def _make_matcher(pattern): + if not pattern.parts: + raise ValueError("empty pattern") from None + parts = [r'\A' if pattern.drive or pattern.root else '^'] + for part in pattern._lines.splitlines(keepends=True): + if part == '**\n': + part = r'[\s\S]*^' + elif part == '**': + part = r'[\s\S]*' + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an 
entire path component") from None + else: + part = fnmatch.translate(part)[_FNMATCH_SLICE] + parts.append(part) + parts.append(r'\Z') + flags = re.MULTILINE + if not _is_case_sensitive(pattern._flavour): + flags |= re.IGNORECASE + return re.compile(''.join(parts), flags=flags) + + @functools.lru_cache() def _make_selector(pattern_parts, flavour, case_sensitive): pat = pattern_parts[0] @@ -286,12 +308,9 @@ class PurePath(object): # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', - # The `_lines_cached` and `_matcher_cached` slots store the - # string path with path separators and newlines swapped, and an - # `re.Pattern` object derived thereof. These are used to implement - # `match()`. + # The `_lines_cached`slot stores the string path with path separators + # and newlines swapped. This is used to implement `match()`. '_lines_cached', - '_matcher_cached', # The `_hash` slot stores the hash of the case-normalized string # path. It's set when `__hash__()` is called for the first time. 
@@ -462,31 +481,6 @@ def _lines(self): self._lines_cached = str(self).translate(trans) return self._lines_cached - @property - def _matcher(self): - try: - return self._matcher_cached - except AttributeError: - if not self.parts: - raise ValueError("empty pattern") from None - parts = [r'\A' if self.drive or self.root else '^'] - for part in self._lines.splitlines(keepends=True): - if part == '**\n': - part = r'[\s\S]*^' - elif part == '**': - part = r'[\s\S]*' - elif '**' in part: - raise ValueError("Invalid pattern: '**' can only be an entire path component") from None - else: - part = fnmatch.translate(part)[_FNMATCH_SLICE] - parts.append(part) - parts.append(r'\Z') - flags = re.MULTILINE - if not _is_case_sensitive(self._flavour): - flags |= re.IGNORECASE - self._matcher_cached = re.compile(''.join(parts), flags=flags) - return self._matcher_cached - def __eq__(self, other): if not isinstance(other, PurePath): return NotImplemented @@ -745,7 +739,7 @@ def match(self, path_pattern): """ if not isinstance(path_pattern, PurePath) or self._flavour is not path_pattern._flavour: path_pattern = self.with_segments(path_pattern) - match = path_pattern._matcher.search(self._lines) + match = _make_matcher(path_pattern).search(self._lines) return match is not None From 6ad30dd5bd30309b3ae33734e7f9ec92e2e34366 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 21:51:59 +0100 Subject: [PATCH 21/28] Remove unneeded `from None` suffix, whoops. 
--- Lib/pathlib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 41e78f6e7a121d..f1aaf28f24e049 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -79,7 +79,7 @@ def _is_case_sensitive(flavour): @functools.lru_cache() def _make_matcher(pattern): if not pattern.parts: - raise ValueError("empty pattern") from None + raise ValueError("empty pattern") parts = [r'\A' if pattern.drive or pattern.root else '^'] for part in pattern._lines.splitlines(keepends=True): if part == '**\n': @@ -87,7 +87,7 @@ def _make_matcher(pattern): elif part == '**': part = r'[\s\S]*' elif '**' in part: - raise ValueError("Invalid pattern: '**' can only be an entire path component") from None + raise ValueError("Invalid pattern: '**' can only be an entire path component") else: part = fnmatch.translate(part)[_FNMATCH_SLICE] parts.append(part) From 052890f93c4c4ed309a22f8ecc3d4bfd899f6434 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 21:55:49 +0100 Subject: [PATCH 22/28] Tiny performance improvement: avoid accessing path.parts --- Lib/pathlib.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index f1aaf28f24e049..25fcf5a2f9e006 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -78,9 +78,12 @@ def _is_case_sensitive(flavour): @functools.lru_cache() def _make_matcher(pattern): - if not pattern.parts: + if pattern.drive or pattern.root: + parts = [r'\A'] + elif pattern._tail: + parts = ['^'] + else: raise ValueError("empty pattern") - parts = [r'\A' if pattern.drive or pattern.root else '^'] for part in pattern._lines.splitlines(keepends=True): if part == '**\n': part = r'[\s\S]*^' From d789b6db75f36069fe0705b4aadcb70c743036f5 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 22:45:09 +0100 Subject: [PATCH 23/28] Typo fix --- Lib/pathlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 
25fcf5a2f9e006..ec87d3c0da7bef 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -311,7 +311,7 @@ class PurePath(object): # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', - # The `_lines_cached`slot stores the string path with path separators + # The `_lines_cached` slot stores the string path with path separators # and newlines swapped. This is used to implement `match()`. '_lines_cached', From 4fe77c64ecd677318fb4d9d04d6c138515724593 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 22:58:29 +0100 Subject: [PATCH 24/28] Avoid hashing path object when compiling pattern. --- Lib/pathlib.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index ec87d3c0da7bef..c887af7e7a1b6e 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -77,14 +77,9 @@ def _is_case_sensitive(flavour): @functools.lru_cache() -def _make_matcher(pattern): - if pattern.drive or pattern.root: - parts = [r'\A'] - elif pattern._tail: - parts = ['^'] - else: - raise ValueError("empty pattern") - for part in pattern._lines.splitlines(keepends=True): +def _compile_pattern(pattern_lines, case_sensitive): + parts = ['^'] + for part in pattern_lines.splitlines(keepends=True): if part == '**\n': part = r'[\s\S]*^' elif part == '**': @@ -96,7 +91,7 @@ def _make_matcher(pattern): parts.append(part) parts.append(r'\Z') flags = re.MULTILINE - if not _is_case_sensitive(pattern._flavour): + if not case_sensitive: flags |= re.IGNORECASE return re.compile(''.join(parts), flags=flags) @@ -742,8 +737,14 @@ def match(self, path_pattern): """ if not isinstance(path_pattern, PurePath) or self._flavour is not path_pattern._flavour: path_pattern = self.with_segments(path_pattern) - match = _make_matcher(path_pattern).search(self._lines) - return match is not None + case_sensitive = _is_case_sensitive(self._flavour) + pattern = _compile_pattern(path_pattern._lines, case_sensitive) + if path_pattern.drive 
or path_pattern.root: + return pattern.match(self._lines) is not None + elif path_pattern._tail: + return pattern.search(self._lines) is not None + else: + raise ValueError("empty pattern") # Can't subclass os.PathLike from PurePath and keep the constructor From 4770c13c2fe7cc1e9157a82b5d5297cd477dbcd2 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 23:41:09 +0100 Subject: [PATCH 25/28] More performance tweaks --- Lib/pathlib.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index c887af7e7a1b6e..25922e0e7c8322 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -54,6 +54,7 @@ def _ignore_error(exception): getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) +@functools.lru_cache() def _is_case_sensitive(flavour): return flavour.normcase('Aa') == 'Aa' @@ -735,7 +736,7 @@ def match(self, path_pattern): """ Return True if this path matches the given pattern. """ - if not isinstance(path_pattern, PurePath) or self._flavour is not path_pattern._flavour: + if not isinstance(path_pattern, PurePath): path_pattern = self.with_segments(path_pattern) case_sensitive = _is_case_sensitive(self._flavour) pattern = _compile_pattern(path_pattern._lines, case_sensitive) From eb35dbc3e552c9a4f59bc6d661995b9a01f167bf Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 23 May 2023 23:33:36 +0100 Subject: [PATCH 26/28] Re-target to 3.13. --- Doc/library/pathlib.rst | 2 +- Doc/whatsnew/3.12.rst | 4 ---- Doc/whatsnew/3.13.rst | 6 ++++++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 04c1c61bd0f9fa..cca727233db532 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -588,7 +588,7 @@ Pure paths provide the following methods and properties: .. versionadded:: 3.12 The *case_sensitive* argument. - .. versionchanged:: 3.12 + .. versionchanged:: 3.13 Support for the recursive wildcard "``**``" was added. 
In previous versions, it acted like the non-recursive wildcard "``*``". diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index 1da452f598a612..5e07a4caeb9ebe 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -399,10 +399,6 @@ pathlib :meth:`pathlib.Path.rglob` and :meth:`pathlib.PurePath.match` for matching the path's case sensitivity, allowing for more precise control over the matching process. -* Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. - (Contributed by Barney Gale in :gh:`73435`.) - - dis --- diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index e0c3c2a3592ec7..ab5e4e509aa670 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -87,6 +87,12 @@ New Modules Improved Modules ================ +pathlib +------- + +* Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. + (Contributed by Barney Gale in :gh:`73435`.) + Optimizations ============= From 9211297b1da404d67a322d4af573ef46e0ce1b3a Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 30 May 2023 18:00:43 +0100 Subject: [PATCH 27/28] Add more comments! --- Lib/pathlib.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 401942287c7bc0..d98c0742c9c7d5 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -68,7 +68,8 @@ def _is_case_sensitive(flavour): # matched, respectively. These features are undesirable for our implementation # of PurePatch.match(), which represents path separators as newlines and joins # pattern segments together. As a workaround, we define a slice object that -# can remove the prefix and suffix from any translate() result. +# can remove the prefix and suffix from any translate() result. See the +# _compile_pattern_lines() function for more details. 
_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') _FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) _SWAP_SEP_AND_NEWLINE = { @@ -110,17 +111,42 @@ def _compile_pattern(pat, case_sensitive): @functools.lru_cache() def _compile_pattern_lines(pattern_lines, case_sensitive): + """Compile the given pattern lines to an `re.Pattern` object. + + The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with + its path separators and newlines swapped (e.g. '**\n*.py'). By using + newlines to separate path components, and not setting `re.DOTALL`, we + ensure that the `*` wildcard cannot match path separators. + + The returned `re.Pattern` object may have its `match()` method called to + match a complete pattern, or `search()` to match from the right. The + argument supplied to these methods must also have its path separators and + newlines swapped. + """ + + # Match the start of the path, or just after a path separator parts = ['^'] for part in pattern_lines.splitlines(keepends=True): if part == '**\n': + # '**/' component: we use '[\s\S]' rather than '.' so that path + # separators (i.e. newlines) are matched. The trailing '^' ensures + # we terminate after a path separator (i.e. on a new line). part = r'[\s\S]*^' elif part == '**': + # '**' component. part = r'[\s\S]*' elif '**' in part: raise ValueError("Invalid pattern: '**' can only be an entire path component") else: + # Any other component: pass to fnmatch.translate(). We slice off + # the common prefix and suffix added by translate() to ensure that + # re.DOTALL is not set, and the end of the string not matched, + # respectively. With DOTALL not set, '*' wildcards will not match + # path separators, because the '.' characters in the pattern will + # not match newlines. part = fnmatch.translate(part)[_FNMATCH_SLICE] parts.append(part) + # Match the end of the path, always.
parts.append(r'\Z') flags = re.MULTILINE if not case_sensitive: From 73bb3096844f5014abf15189e41249fc113670a8 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Tue, 30 May 2023 20:49:33 +0100 Subject: [PATCH 28/28] Update Lib/pathlib.py Co-authored-by: Alex Waygood --- Lib/pathlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index d98c0742c9c7d5..62406473b66e4f 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -54,7 +54,7 @@ def _ignore_error(exception): getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) -@functools.lru_cache() +@functools.cache def _is_case_sensitive(flavour): return flavour.normcase('Aa') == 'Aa'