From 7a582518ba3c9965622bc1b19199b0d3b857ae9e Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 15 May 2023 18:51:19 +0100 Subject: [PATCH 01/14] GH-102613: Simplify implementation of `pathlib.Path.glob()` This commit replaces selector classes with selector functions. These generators directly yield results rather calling through to their successor. A new internal `Path._glob()` takes care to chain these generators together, which simplifies the lazy algorithm and slightly improves performance. --- Lib/pathlib.py | 170 ++++++------------ ...-05-15-18-57-42.gh-issue-102613.YD9yx-.rst | 2 + 2 files changed, 57 insertions(+), 115 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst diff --git a/Lib/pathlib.py b/Lib/pathlib.py index ef7c47c9e775e4..9997d6c515b160 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -54,6 +54,7 @@ def _ignore_error(exception): getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) +@functools.lru_cache() def _is_case_sensitive(flavour): return flavour.normcase('Aa') == 'Aa' @@ -62,137 +63,51 @@ def _is_case_sensitive(flavour): # @functools.lru_cache() -def _make_selector(pattern_parts, flavour, case_sensitive): - pat = pattern_parts[0] - if not pat: - return _TerminatingSelector() - if pat == '**': - child_parts_idx = 1 - while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**': - child_parts_idx += 1 - child_parts = pattern_parts[child_parts_idx:] - if '**' in child_parts: - cls = _DoubleRecursiveWildcardSelector - else: - cls = _RecursiveWildcardSelector - else: - child_parts = pattern_parts[1:] - if pat == '..': - cls = _ParentSelector - elif '**' in pat: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - else: - cls = _WildcardSelector - return cls(pat, child_parts, flavour, case_sensitive) - - -class _Selector: - """A selector matches a specific glob pattern part against the children - of a given path.""" - - def __init__(self, child_parts, flavour, case_sensitive): - self.child_parts = child_parts - if child_parts: - self.successor = _make_selector(child_parts, flavour, case_sensitive) - self.dironly = True - else: - self.successor = _TerminatingSelector() - self.dironly = False - - def select_from(self, parent_path): - """Iterate over all child paths of `parent_path` matched by this - selector. This can contain parent_path itself.""" - path_cls = type(parent_path) - scandir = path_cls._scandir - if not parent_path.is_dir(): - return iter([]) - return self._select_from(parent_path, scandir) - - -class _TerminatingSelector: - - def _select_from(self, parent_path, scandir): - yield parent_path - - -class _ParentSelector(_Selector): - - def __init__(self, name, child_parts, flavour, case_sensitive): - _Selector.__init__(self, child_parts, flavour, case_sensitive) - - def _select_from(self, parent_path, scandir): - path = parent_path._make_child_relpath('..') - for p in self.successor._select_from(path, scandir): - yield p +def _compile_pattern_part(pattern_part, case_sensitive): + flags = re.NOFLAG if case_sensitive else re.IGNORECASE + return re.compile(fnmatch.translate(pattern_part), flags=flags).fullmatch -class _WildcardSelector(_Selector): - - def __init__(self, pat, child_parts, flavour, case_sensitive): - _Selector.__init__(self, child_parts, flavour, case_sensitive) - if case_sensitive is None: - # TODO: evaluate case-sensitivity of each directory in _select_from() - case_sensitive = _is_case_sensitive(flavour) - flags = re.NOFLAG if case_sensitive else re.IGNORECASE - self.match = re.compile(fnmatch.translate(pat), flags=flags).fullmatch - - def _select_from(self, parent_path, scandir): +def _select_children(paths, dir_only, match): + for path in paths: try: # We must close the scandir() object before proceeding to # avoid exhausting file descriptors when globbing deep trees. - with scandir(parent_path) as scandir_it: + with path._scandir() as scandir_it: entries = list(scandir_it) except OSError: pass else: for entry in entries: - if self.dironly: + if dir_only: try: if not entry.is_dir(): continue except OSError: continue name = entry.name - if self.match(name): - path = parent_path._make_child_relpath(name) - for p in self.successor._select_from(path, scandir): - yield p - + if match(name): + yield path._make_child_relpath(name) -class _RecursiveWildcardSelector(_Selector): - def __init__(self, pat, child_parts, flavour, case_sensitive): - _Selector.__init__(self, child_parts, flavour, case_sensitive) - - def _iterate_directories(self, parent_path): - yield parent_path - for dirpath, dirnames, _ in parent_path.walk(): +def _select_recursive(paths): + for path in paths: + yield path + for dirpath, dirnames, _ in path.walk(): for dirname in dirnames: yield dirpath._make_child_relpath(dirname) - def _select_from(self, parent_path, scandir): - successor_select = self.successor._select_from - for starting_point in self._iterate_directories(parent_path): - for p in successor_select(starting_point, scandir): - yield p - -class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector): - """ - Like _RecursiveWildcardSelector, but also de-duplicates results from - successive selectors. This is necessary if the pattern contains - multiple non-adjacent '**' segments. - """ - - def _select_from(self, parent_path, scandir): - yielded = set() - try: - for p in super()._select_from(parent_path, scandir): - if p not in yielded: - yield p - yielded.add(p) - finally: - yielded.clear() +def _select_unique(paths): + yielded = set() + try: + for path in paths: + raw_path = path._raw_path + if raw_path not in yielded: + yield path + yielded.add(raw_path) + finally: + yielded.clear() # @@ -996,9 +911,7 @@ def glob(self, pattern, *, case_sensitive=None): raise NotImplementedError("Non-relative patterns are unsupported") if pattern[-1] in (self._flavour.sep, self._flavour.altsep): pattern_parts.append('') - selector = _make_selector(tuple(pattern_parts), self._flavour, case_sensitive) - for p in selector.select_from(self): - yield p + return self._glob(pattern_parts, case_sensitive) def rglob(self, pattern, *, case_sensitive=None): """Recursively yield all existing files (of any kind, including @@ -1011,9 +924,36 @@ def rglob(self, pattern, *, case_sensitive=None): raise NotImplementedError("Non-relative patterns are unsupported") if pattern and pattern[-1] in (self._flavour.sep, self._flavour.altsep): pattern_parts.append('') - selector = _make_selector(("**",) + tuple(pattern_parts), self._flavour, case_sensitive) - for p in selector.select_from(self): - yield p + return self._glob(['**'] + pattern_parts, case_sensitive) + + def _glob(self, pattern_parts, case_sensitive): + if case_sensitive is None: + # TODO: evaluate case-sensitivity of each directory in _select_children(). + case_sensitive = _is_case_sensitive(self._flavour) + paths = iter([self] if self.is_dir() else []) + deduplicate = False + part_idx = 0 + while part_idx < len(pattern_parts): + part = pattern_parts[part_idx] + part_idx += 1 + if part == '': + pass + elif part == '..': + paths = (path._make_child_relpath('..') for path in paths) + elif part == '**': + while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**': + part_idx += 1 + paths = _select_recursive(paths) + if deduplicate: + paths = _select_unique(paths) + deduplicate = True + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: + dir_only = part_idx < len(pattern_parts) + match = _compile_pattern_part(part, case_sensitive) + paths = _select_children(paths, dir_only, match) + return paths def walk(self, top_down=True, on_error=None, follow_symlinks=False): """Walk the directory tree from this directory, similar to os.walk().""" diff --git a/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst b/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst new file mode 100644 index 00000000000000..f9de075112bac3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst @@ -0,0 +1,2 @@ +Improve performance of :meth:`pathlib.Path.glob` by simplifying its +implementation. From d5b1836733e8400068d441f02dfe10c5dcbcb3e4 Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 15 May 2023 19:37:14 +0100 Subject: [PATCH 02/14] Speed up matching `*` --- Lib/pathlib.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 9997d6c515b160..272ca2d7bf8f2c 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -64,6 +64,8 @@ def _is_case_sensitive(flavour): @functools.lru_cache() def _compile_pattern_part(pattern_part, case_sensitive): + if pattern_part == '*': + return None flags = re.NOFLAG if case_sensitive else re.IGNORECASE return re.compile(fnmatch.translate(pattern_part), flags=flags).fullmatch @@ -86,7 +88,7 @@ def _select_children(paths, dir_only, match): except OSError: continue name = entry.name - if match(name): + if match is None or match(name): yield path._make_child_relpath(name) From d5c86c6e003b529d9c367815615c7644d2f7d5f1 Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 15 May 2023 19:47:32 +0100 Subject: [PATCH 03/14] Add comments, docstrings. --- Lib/pathlib.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 272ca2d7bf8f2c..e13229081b163e 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -64,6 +64,8 @@ def _is_case_sensitive(flavour): @functools.lru_cache() def _compile_pattern_part(pattern_part, case_sensitive): + """Compile given glob pattern to a re.Pattern object (observing case + sensitivity), or None if the pattern should match everything.""" if pattern_part == '*': return None flags = re.NOFLAG if case_sensitive else re.IGNORECASE @@ -71,6 +73,7 @@ def _compile_pattern_part(pattern_part, case_sensitive): def _select_children(paths, dir_only, match): + """Yield direct children of given paths, filtering by name and type.""" for path in paths: try: # We must close the scandir() object before proceeding to @@ -93,6 +96,7 @@ def _select_children(paths, dir_only, match): def _select_recursive(paths): + """Yield given paths and all their subdirectories, recursively.""" for path in paths: yield path for dirpath, dirnames, _ in path.walk(): @@ -101,6 +105,7 @@ def _select_recursive(paths): def _select_unique(paths): + """Yields the given paths, filtering out duplicates.""" yielded = set() try: for path in paths: @@ -939,13 +944,16 @@ def _glob(self, pattern_parts, case_sensitive): part = pattern_parts[part_idx] part_idx += 1 if part == '': + # Trailing slash. pass elif part == '..': paths = (path._make_child_relpath('..') for path in paths) elif part == '**': + # Collapse adjacent '**' segments. while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**': part_idx += 1 paths = _select_recursive(paths) + # De-duplicate if we've already seen a '**' segment. if deduplicate: paths = _select_unique(paths) deduplicate = True From 40056194ebaac26f30c917bf32b34eacfee99d4a Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 31 May 2023 19:01:37 +0100 Subject: [PATCH 04/14] Add support for matching files recursively. --- Lib/pathlib.py | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index a72fdcde967d6c..6d624caf2cb6e7 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -158,15 +158,18 @@ def _select_children(paths, dir_only, follow_symlinks, match): yield path._make_child_relpath(name) -def _select_recursive(paths, follow_symlinks): +def _select_recursive(paths, dir_only, follow_symlinks): """Yield given paths and all their subdirectories, recursively.""" if follow_symlinks is None: follow_symlinks = False for path in paths: yield path - for dirpath, dirnames, _ in path.walk(follow_symlinks=follow_symlinks): + for dirpath, dirnames, filenames in path.walk(follow_symlinks=follow_symlinks): for dirname in dirnames: yield dirpath._make_child_relpath(dirname) + if not dir_only: + for filename in filenames: + yield dirpath._make_child_relpath(filename) def _select_unique(paths): @@ -994,14 +997,7 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): kind, including directories) matching the given relative pattern. """ sys.audit("pathlib.Path.glob", self, pattern) - if not pattern: - raise ValueError("Unacceptable pattern: {!r}".format(pattern)) - drv, root, pattern_parts = self._parse_path(pattern) - if drv or root: - raise NotImplementedError("Non-relative patterns are unsupported") - if pattern[-1] in (self._flavour.sep, self._flavour.altsep): - pattern_parts.append('') - return self._glob(pattern_parts, case_sensitive, follow_symlinks) + return self._glob(pattern, case_sensitive, follow_symlinks) def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None): """Recursively yield all existing files (of any kind, including @@ -1009,14 +1005,23 @@ def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None): this subtree. """ sys.audit("pathlib.Path.rglob", self, pattern) - drv, root, pattern_parts = self._parse_path(pattern) - if drv or root: + return self._glob(f'**/{pattern}', case_sensitive, follow_symlinks) + + def _glob(self, pattern, case_sensitive, follow_symlinks): + path_pattern = self.with_segments(pattern) + if path_pattern.drive or path_pattern.root: raise NotImplementedError("Non-relative patterns are unsupported") - if pattern and pattern[-1] in (self._flavour.sep, self._flavour.altsep): + elif not path_pattern._tail: + raise ValueError("Unacceptable pattern: {!r}".format(pattern)) + + pattern_parts = list(path_pattern._tail) + if pattern[-1] in (self._flavour.sep, self._flavour.altsep): + # GH-65238: pathlib doesn't preserve trailing slash. Add it back. + pattern_parts.append('') + if pattern_parts[-1] == '**': + # GH-102613: '**' only matches directories. Add trailing slash. pattern_parts.append('') - return self._glob(['**'] + pattern_parts, case_sensitive, follow_symlinks) - def _glob(self, pattern_parts, case_sensitive, follow_symlinks): if case_sensitive is None: # TODO: evaluate case-sensitivity of each directory in _select_children(). case_sensitive = _is_case_sensitive(self._flavour) @@ -1035,7 +1040,8 @@ def _glob(self, pattern_parts, case_sensitive, follow_symlinks): # Collapse adjacent '**' segments. while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**': part_idx += 1 - paths = _select_recursive(paths, follow_symlinks) + dir_only = part_idx < len(pattern_parts) + paths = _select_recursive(paths, dir_only, follow_symlinks) # De-duplicate if we've already seen a '**' segment. if deduplicate: paths = _select_unique(paths) From 9401d36e63ae1741fd3b03d0c40e8b92995d4d88 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 31 May 2023 19:28:05 +0100 Subject: [PATCH 05/14] Implement walk-and-match algorithm. --- Lib/pathlib.py | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 6d624caf2cb6e7..4b98133d0a54af 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -978,18 +978,23 @@ def _scandir(self): def _make_child_relpath(self, name): path_str = str(self) + lines_str = self._lines tail = self._tail if tail: path_str = f'{path_str}{self._flavour.sep}{name}' + lines_str = f'{lines_str}\n{name}' elif path_str != '.': path_str = f'{path_str}{name}' + lines_str = f'{lines_str}{name}' else: path_str = name + lines_str = name path = self.with_segments(path_str) path._str = path_str path._drv = self.drive path._root = self.root path._tail_cached = tail + [name] + path._lines_cached = lines_str return path def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): @@ -1025,8 +1030,17 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): if case_sensitive is None: # TODO: evaluate case-sensitivity of each directory in _select_children(). case_sensitive = _is_case_sensitive(self._flavour) + + # If symlinks are handled consistently, and the pattern does not + # contain '..' components, then we can use a 'walk-and-match' strategy + # when expanding '**' wildcards. When a '**' wildcard is encountered, + # all following pattern parts are immediately consumed and used to + # build a `re.Pattern` object. This pattern is used to filter the + # recursive walk. As a result, pattern parts following a '**' wildcard + # do not perform any filesystem access, which can be much faster! + filter_paths_supported = follow_symlinks is not None and '..' not in pattern_parts + deduplicate_paths = False paths = iter([self] if self.is_dir() else []) - deduplicate = False part_idx = 0 while part_idx < len(pattern_parts): part = pattern_parts[part_idx] @@ -1037,15 +1051,29 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): elif part == '..': paths = (path._make_child_relpath('..') for path in paths) elif part == '**': - # Collapse adjacent '**' segments. - while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**': - part_idx += 1 + filter_paths = False + if filter_paths_supported: + # Consume remaining path components, except trailing slash. + while part_idx < len(pattern_parts) and pattern_parts[part_idx] != '': + filter_paths = True + part_idx += 1 + else: + # Consume adjacent '**' components. + while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**': + part_idx += 1 + dir_only = part_idx < len(pattern_parts) paths = _select_recursive(paths, dir_only, follow_symlinks) - # De-duplicate if we've already seen a '**' segment. - if deduplicate: + + if filter_paths: + # Filter out paths that don't match pattern. + pattern_lines = self.joinpath(path_pattern)._lines + match = _compile_pattern_lines(pattern_lines, case_sensitive).match + paths = (path for path in paths if match(path._lines)) + elif deduplicate_paths: + # De-duplicate if we've already seen a '**' component. paths = _select_unique(paths) - deduplicate = True + deduplicate_paths = True elif '**' in part: raise ValueError("Invalid pattern: '**' can only be an entire path component") else: From b217587651fd53d2783ca918979f2165377ffb07 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 31 May 2023 20:56:03 +0100 Subject: [PATCH 06/14] Fix up docs, news blurb. --- Doc/library/pathlib.rst | 12 ++++++++---- .../2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst | 6 ++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 67ef36890d5739..ad801d5d7cdc4b 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -917,6 +917,14 @@ call fails (for example because the path doesn't exist). PosixPath('setup.py'), PosixPath('test_pathlib.py')] + .. note:: + Using the "``**``" pattern in large directory trees may consume + an inordinate amount of time. + + .. tip:: + Set *follow_symlinks* to ``True`` or ``False`` to improve performance + of recursive globbing. + By default, or when the *case_sensitive* keyword-only argument is set to ``None``, this method matches paths using platform-specific casing rules: typically, case-sensitive on POSIX, and case-insensitive on Windows. @@ -927,10 +935,6 @@ call fails (for example because the path doesn't exist). wildcards. Set *follow_symlinks* to ``True`` to always follow symlinks, or ``False`` to treat all symlinks as files. - .. note:: - Using the "``**``" pattern in large directory trees may consume - an inordinate amount of time. - .. audit-event:: pathlib.Path.glob self,pattern pathlib.Path.glob .. versionchanged:: 3.11 diff --git a/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst b/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst index f9de075112bac3..841a9e1e26d317 100644 --- a/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst +++ b/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst @@ -1,2 +1,4 @@ -Improve performance of :meth:`pathlib.Path.glob` by simplifying its -implementation. +Improve performance of :meth:`pathlib.Path.glob` when expanding a pattern with +a non-terminal "``**``" component by filtering walked paths through a regular +expression, rather than calling :func:`os.scandir` more than once on each +directory. From de7e8570ae501759d7dabe7b3c3b0e4dd6447759 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 31 May 2023 20:57:55 +0100 Subject: [PATCH 07/14] Fix comment --- Lib/pathlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 4b98133d0a54af..b7cb6385d370ce 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -1024,7 +1024,7 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): # GH-65238: pathlib doesn't preserve trailing slash. Add it back. pattern_parts.append('') if pattern_parts[-1] == '**': - # GH-102613: '**' only matches directories. Add trailing slash. + # GH-70303: '**' only matches directories. Add trailing slash. pattern_parts.append('') if case_sensitive is None: From d1023c7c324d43e3a655b7748f76d9bd23680a17 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 31 May 2023 21:04:53 +0100 Subject: [PATCH 08/14] Fix handling of newlines in filenames. --- Lib/pathlib.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index b7cb6385d370ce..eebf84960ebc27 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -977,18 +977,20 @@ def _scandir(self): return os.scandir(self) def _make_child_relpath(self, name): - path_str = str(self) + sep = self._flavour.sep + lines_name = name.replace('\n', sep) lines_str = self._lines + path_str = str(self) tail = self._tail if tail: - path_str = f'{path_str}{self._flavour.sep}{name}' - lines_str = f'{lines_str}\n{name}' + path_str = f'{path_str}{sep}{name}' + lines_str = f'{lines_str}\n{lines_name}' elif path_str != '.': path_str = f'{path_str}{name}' - lines_str = f'{lines_str}{name}' + lines_str = f'{lines_str}{lines_name}' else: path_str = name - lines_str = name + lines_str = lines_name path = self.with_segments(path_str) path._str = path_str path._drv = self.drive From ad33eece664095b3b990687d29da101a04499bfe Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 1 Jun 2023 18:12:26 +0100 Subject: [PATCH 09/14] Speed up recursive selection --- Lib/pathlib.py | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index eebf84960ebc27..01d4e28b48efe1 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -133,15 +133,15 @@ def _compile_pattern_lines(pattern_lines, case_sensitive): return re.compile(''.join(parts), flags=flags) -def _select_children(paths, dir_only, follow_symlinks, match): +def _select_children(parent_paths, dir_only, follow_symlinks, match): """Yield direct children of given paths, filtering by name and type.""" if follow_symlinks is None: follow_symlinks = True - for path in paths: + for parent_path in parent_paths: try: # We must close the scandir() object before proceeding to # avoid exhausting file descriptors when globbing deep trees. - with path._scandir() as scandir_it: + with parent_path._scandir() as scandir_it: entries = list(scandir_it) except OSError: pass @@ -155,21 +155,35 @@ def _select_children(paths, dir_only, follow_symlinks, match): continue name = entry.name if match is None or match(name): - yield path._make_child_relpath(name) + yield parent_path._make_child_relpath(name) -def _select_recursive(paths, dir_only, follow_symlinks): +def _select_recursive(parent_paths, dir_only, follow_symlinks): """Yield given paths and all their subdirectories, recursively.""" if follow_symlinks is None: follow_symlinks = False - for path in paths: - yield path - for dirpath, dirnames, filenames in path.walk(follow_symlinks=follow_symlinks): - for dirname in dirnames: - yield dirpath._make_child_relpath(dirname) - if not dir_only: - for filename in filenames: - yield dirpath._make_child_relpath(filename) + for parent_path in parent_paths: + paths = [parent_path] + while paths: + path = paths.pop() + yield path + try: + # We must close the scandir() object before proceeding to + # avoid exhausting file descriptors when globbing deep trees. + with path._scandir() as scandir_it: + entries = list(scandir_it) + except OSError: + pass + else: + for entry in entries: + try: + if entry.is_dir(follow_symlinks=follow_symlinks): + paths.append(path._make_child_relpath(entry.name)) + continue + except OSError: + pass + if not dir_only: + yield path._make_child_relpath(entry.name) def _select_unique(paths): From 064efdb452f1e0db040d1bda2b2faa9ef57df8c2 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 1 Jun 2023 18:14:21 +0100 Subject: [PATCH 10/14] Exclude `self` from walk-and-match matching. --- Lib/pathlib.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 01d4e28b48efe1..300c836ae2b0c1 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -106,7 +106,11 @@ def _compile_pattern_lines(pattern_lines, case_sensitive): # Match the start of the path, or just after a path separator parts = ['^'] for part in pattern_lines.splitlines(keepends=True): - if part == '**\n': + if part == '*\n': + part = r'.+\n' + elif part == '*': + part = r'.+' + elif part == '**\n': # '**/' component: we use '[\s\S]' rather than '.' so that path # separators (i.e. newlines) are matched. The trailing '^' ensures # we terminate after a path separator (i.e. on a new line). @@ -1083,9 +1087,9 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): if filter_paths: # Filter out paths that don't match pattern. - pattern_lines = self.joinpath(path_pattern)._lines - match = _compile_pattern_lines(pattern_lines, case_sensitive).match - paths = (path for path in paths if match(path._lines)) + prefix_len = len(self._make_child_relpath('_')._lines) - 1 + match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match + paths = (path for path in paths if match(path._lines[prefix_len:])) elif deduplicate_paths: # De-duplicate if we've already seen a '**' component. paths = _select_unique(paths) From 14c6a587a22b2492438d1b5dbe90189bd8ee0c24 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 1 Jun 2023 18:46:56 +0100 Subject: [PATCH 11/14] Optimize walk-and-match logic. --- Lib/pathlib.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 300c836ae2b0c1..dc23b7a65f89e6 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -1058,7 +1058,7 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): # build a `re.Pattern` object. This pattern is used to filter the # recursive walk. As a result, pattern parts following a '**' wildcard # do not perform any filesystem access, which can be much faster! - filter_paths_supported = follow_symlinks is not None and '..' not in pattern_parts + filter_paths = follow_symlinks is not None and '..' not in pattern_parts deduplicate_paths = False paths = iter([self] if self.is_dir() else []) part_idx = 0 @@ -1071,26 +1071,22 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): elif part == '..': paths = (path._make_child_relpath('..') for path in paths) elif part == '**': - filter_paths = False - if filter_paths_supported: - # Consume remaining path components, except trailing slash. - while part_idx < len(pattern_parts) and pattern_parts[part_idx] != '': - filter_paths = True - part_idx += 1 - else: - # Consume adjacent '**' components. - while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**': - part_idx += 1 + if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '': + dir_only = pattern_parts[-1] == '' + paths = _select_recursive(paths, dir_only, follow_symlinks) - dir_only = part_idx < len(pattern_parts) - paths = _select_recursive(paths, dir_only, follow_symlinks) - - if filter_paths: # Filter out paths that don't match pattern. prefix_len = len(self._make_child_relpath('_')._lines) - 1 match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match paths = (path for path in paths if match(path._lines[prefix_len:])) - elif deduplicate_paths: + return paths + + # Consume adjacent '**' components. + while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**': + part_idx += 1 + dir_only = part_idx < len(pattern_parts) + paths = _select_recursive(paths, dir_only, follow_symlinks) + if deduplicate_paths: # De-duplicate if we've already seen a '**' component. paths = _select_unique(paths) deduplicate_paths = True From 04720bdfb8e0df75991e0bfeb8da19bb8840eb62 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 1 Jun 2023 19:23:21 +0100 Subject: [PATCH 12/14] Consume adjacent '**' segments before considering use of matching. --- Lib/pathlib.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index dc23b7a65f89e6..89c7b1ebdf727f 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -1071,6 +1071,10 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): elif part == '..': paths = (path._make_child_relpath('..') for path in paths) elif part == '**': + # Consume adjacent '**' components. + while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**': + part_idx += 1 + if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '': dir_only = pattern_parts[-1] == '' paths = _select_recursive(paths, dir_only, follow_symlinks) @@ -1081,9 +1085,6 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): paths = (path for path in paths if match(path._lines[prefix_len:])) return paths - # Consume adjacent '**' components. - while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**': - part_idx += 1 dir_only = part_idx < len(pattern_parts) paths = _select_recursive(paths, dir_only, follow_symlinks) if deduplicate_paths: From 9c6b44f54e56aa07edd1b03c29cc5c4e1de0681a Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 1 Jun 2023 19:57:51 +0100 Subject: [PATCH 13/14] Add some more tests for complex patterns. --- Lib/test/test_pathlib.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index 076ace3d930857..75d104b7b79020 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -1899,6 +1899,17 @@ def _check(path, glob, expected): _check(p, "*B/*", ["dirB/fileB", "dirB/linkD", "linkB/fileB", "linkB/linkD"]) _check(p, "*/fileB", ["dirB/fileB", "linkB/fileB"]) _check(p, "*/", ["dirA", "dirB", "dirC", "dirE", "linkB"]) + _check(p, "dir*/*/..", ["dirC/dirD/..", "dirA/linkC/.."]) + _check(p, "dir*/**/", ["dirA", "dirA/linkC", "dirA/linkC/linkD", "dirB", "dirB/linkD", + "dirC", "dirC/dirD", "dirE"]) + _check(p, "dir*/**/..", ["dirA/..", "dirA/linkC/..", "dirB/..", + "dirC/..", "dirC/dirD/..", "dirE/.."]) + _check(p, "dir*/*/**/", ["dirA/linkC", "dirA/linkC/linkD", "dirB/linkD", "dirC/dirD"]) + _check(p, "dir*/*/**/..", ["dirA/linkC/..", "dirC/dirD/.."]) + _check(p, "dir*/**/fileC", ["dirC/fileC"]) + _check(p, "dir*/*/../**/fileC", ["dirA/linkC/../dirC/fileC", "dirC/dirD/../fileC"]) + _check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"]) + _check(p, "*/dirD/**/", ["dirC/dirD"]) @os_helper.skip_unless_symlink def test_glob_no_follow_symlinks_common(self): @@ -1913,6 +1924,15 @@ def _check(path, glob, expected): _check(p, "*B/*", ["dirB/fileB", "dirB/linkD"]) _check(p, "*/fileB", ["dirB/fileB"]) _check(p, "*/", ["dirA", "dirB", "dirC", "dirE"]) + _check(p, "dir*/*/..", ["dirC/dirD/.."]) + _check(p, "dir*/**/", ["dirA", "dirB", "dirC", "dirC/dirD", "dirE"]) + _check(p, "dir*/**/..", ["dirA/..", "dirB/..", "dirC/..", "dirC/dirD/..", "dirE/.."]) + _check(p, "dir*/*/**/", ["dirC/dirD"]) + _check(p, "dir*/*/**/..", ["dirC/dirD/.."]) + _check(p, "dir*/**/fileC", ["dirC/fileC"]) + _check(p, "dir*/*/../**/fileC", ["dirC/dirD/../fileC"]) + _check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"]) + _check(p, "*/dirD/**/", ["dirC/dirD"]) def test_rglob_common(self): def _check(glob, expected): From 4cfb83670d43ed62956c0c3b6c0473803a25993d Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 1 Jun 2023 20:35:45 +0100 Subject: [PATCH 14/14] Drop test case that doesn't work on Windows. `..` components are resolved lexically, rather than after symlinks. --- Lib/test/test_pathlib.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index 75d104b7b79020..dc94e337ee65e6 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -1907,7 +1907,6 @@ def _check(path, glob, expected): _check(p, "dir*/*/**/", ["dirA/linkC", "dirA/linkC/linkD", "dirB/linkD", "dirC/dirD"]) _check(p, "dir*/*/**/..", ["dirA/linkC/..", "dirC/dirD/.."]) _check(p, "dir*/**/fileC", ["dirC/fileC"]) - _check(p, "dir*/*/../**/fileC", ["dirA/linkC/../dirC/fileC", "dirC/dirD/../fileC"]) _check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"]) _check(p, "*/dirD/**/", ["dirC/dirD"]) @@ -1930,7 +1929,6 @@ def _check(path, glob, expected): _check(p, "dir*/*/**/", ["dirC/dirD"]) _check(p, "dir*/*/**/..", ["dirC/dirD/.."]) _check(p, "dir*/**/fileC", ["dirC/fileC"]) - _check(p, "dir*/*/../**/fileC", ["dirC/dirD/../fileC"]) _check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"]) _check(p, "*/dirD/**/", ["dirC/dirD"])