Skip to content

Commit 6932522

Browse files
committed
pythonGH-106747: Prepare pathlib globbing for dir_fd support.
The present implementation of `pathlib.Path.glob()` creates a series of 'selectors' that each handle a part of the pattern. The selectors are connected together in `glob()`, without the use of recursion. One very subtle property of this scheme is that each selector is exhausted *before* its successor selector - for example when globbing `*/*.py`, the selector for `*` is exhausted prior to the selector for `*.py`. This doesn't make any difference when globbing strings, but it does prevent us from adding `dir_fd` support, because there's no good moment to call `os.close(fd)` after opening a directory for scanning. This patch refactors globbing to work much as it did in 3.12, where each selector is responsible for creating and feeding its own successor. This inverts the order of selector exhaustion, and so will make it much easier to add `dir_fd` support. There's one behaviour change here: I've removed deduplication of results, and so in some very specific circumstances (multiple non-consecutive `**` segments in pattern, and either `follow_symlinks=None` or `..` segments separating them), `glob()` can yield the same path more than once. Note that `glob.glob()` can also yield duplicate results - see pythonGH-104269.
1 parent 1dce007 commit 6932522

File tree

2 files changed

+90
-87
lines changed

2 files changed

+90
-87
lines changed

Lib/pathlib/__init__.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -454,8 +454,8 @@ def as_uri(self):
454454
return prefix + quote_from_bytes(os.fsencode(path))
455455

456456
@property
457-
def _pattern_stack(self):
458-
"""Stack of path components, to be used with patterns in glob()."""
457+
def _pattern_parts(self):
458+
"""List of path components, to be used with patterns in glob()."""
459459
parts = self._tail.copy()
460460
pattern = self._raw_path
461461
if self.anchor:
@@ -465,8 +465,7 @@ def _pattern_stack(self):
465465
elif pattern[-1] in (self.pathmod.sep, self.pathmod.altsep):
466466
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
467467
parts.append('')
468-
parts.reverse()
469-
return parts
468+
return tuple(parts)
470469

471470
@property
472471
def _pattern_str(self):

Lib/pathlib/_abc.py

Lines changed: 87 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -60,17 +60,33 @@ def _compile_pattern(pat, sep, case_sensitive, recursive=True):
6060
return re.compile(regex, flags=flags).match
6161

6262

63-
def _select_special(paths, part):
64-
"""Yield special literal children of the given paths."""
65-
for path in paths:
66-
yield path._make_child_relpath(part)
63+
_special_segments = ('', '.', '..')
6764

6865

69-
def _select_children(parent_paths, dir_only, follow_symlinks, match):
70-
"""Yield direct children of given paths, filtering by name and type."""
71-
if follow_symlinks is None:
72-
follow_symlinks = True
73-
for parent_path in parent_paths:
66+
def _terminating_selector(path):
67+
yield path
68+
69+
70+
def _special_selector(part, parts, sep, case_sensitive, follow_symlinks, recurse_symlinks):
71+
"""Returns a function that yields a special literal child of a given path.
72+
"""
73+
select_next = _selector(parts, sep, case_sensitive, follow_symlinks, recurse_symlinks)
74+
75+
def select_special(path):
76+
yield from select_next(path._make_child_relpath(part))
77+
return select_special
78+
79+
80+
def _wildcard_selector(part, parts, sep, case_sensitive, follow_symlinks, recurse_symlinks):
81+
"""Returns a function that yields direct children of a given path,
82+
filtering by name and type."""
83+
84+
# If the pattern component isn't '*', compile an re.Pattern
85+
# object based on the component.
86+
match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None
87+
select_next = _selector(parts, sep, case_sensitive, follow_symlinks, recurse_symlinks)
88+
89+
def select_wildcard(parent_path):
7490
try:
7591
# We must close the scandir() object before proceeding to
7692
# avoid exhausting file descriptors when globbing deep trees.
@@ -80,7 +96,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
8096
pass
8197
else:
8298
for entry in entries:
83-
if dir_only:
99+
if parts:
84100
try:
85101
if not entry.is_dir(follow_symlinks=follow_symlinks):
86102
continue
@@ -89,16 +105,34 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
89105
# Avoid cost of making a path object for non-matching paths by
90106
# matching against the os.DirEntry.name string.
91107
if match is None or match(entry.name):
92-
yield parent_path._make_child_direntry(entry)
93-
94-
95-
def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
96-
"""Yield given paths and all their children, recursively, filtering by
97-
string and type.
98-
"""
99-
if follow_symlinks is None:
100-
follow_symlinks = False
101-
for parent_path in parent_paths:
108+
yield from select_next(parent_path._make_child_direntry(entry))
109+
return select_wildcard
110+
111+
112+
def _recursive_selector(part, parts, sep, case_sensitive, follow_symlinks, recurse_symlinks):
113+
"""Returns a function that yields a given path and all its children,
114+
recursively, filtering by pattern and type."""
115+
116+
# Consume following '**' components, which have no effect.
117+
part_idx = 0
118+
while part_idx < len(parts) and parts[part_idx] == '**':
119+
part_idx += 1
120+
121+
# Consume following non-special components, provided we're treating
122+
# symlinks consistently. Each component is joined onto 'part', which is
123+
# used to generate an re.Pattern object.
124+
if follow_symlinks == recurse_symlinks:
125+
while part_idx < len(parts) and parts[part_idx] not in _special_segments:
126+
part += sep + parts[part_idx]
127+
part_idx += 1
128+
parts = parts[part_idx:]
129+
130+
# If the previous loop consumed pattern components, compile an re.Pattern
131+
# object based on those components.
132+
match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None
133+
select_next = _selector(parts, sep, case_sensitive, follow_symlinks, recurse_symlinks)
134+
135+
def select_recursive(parent_path):
102136
if match is not None:
103137
# If we're filtering paths through a regex, record the length of
104138
# the parent path. We'll pass it to match(path, pos=...) later.
@@ -108,7 +142,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
108142
path = paths.pop()
109143
if match is None or match(str(path), parent_len):
110144
# Yield *directory* path that matches pattern (if any).
111-
yield path
145+
yield from select_next(path)
112146
try:
113147
# We must close the scandir() object before proceeding to
114148
# avoid exhausting file descriptors when globbing deep trees.
@@ -120,33 +154,37 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
120154
for entry in entries:
121155
# Handle directory entry.
122156
try:
123-
if entry.is_dir(follow_symlinks=follow_symlinks):
157+
if entry.is_dir(follow_symlinks=recurse_symlinks):
124158
# Recurse into this directory.
125159
paths.append(path._make_child_direntry(entry))
126160
continue
127161
except OSError:
128162
pass
129163

130164
# Handle file entry.
131-
if not dir_only:
165+
if not parts:
132166
# Avoid cost of making a path object for non-matching
133167
# files by matching against the os.DirEntry object.
134168
if match is None or match(path._direntry_str(entry), parent_len):
135169
# Yield *file* path that matches pattern (if any).
136-
yield path._make_child_direntry(entry)
170+
yield from select_next(path._make_child_direntry(entry))
171+
return select_recursive
137172

138173

139-
def _select_unique(paths):
140-
"""Yields the given paths, filtering out duplicates."""
141-
yielded = set()
142-
try:
143-
for path in paths:
144-
path_str = str(path)
145-
if path_str not in yielded:
146-
yield path
147-
yielded.add(path_str)
148-
finally:
149-
yielded.clear()
174+
def _selector(parts, sep, case_sensitive, follow_symlinks, recurse_symlinks):
175+
"""Returns a function that selects from a given path, walking and
176+
filtering according to the glob-style pattern parts in *parts*."""
177+
178+
if not parts:
179+
return _terminating_selector
180+
part = parts[0]
181+
if part == '**':
182+
selector = _recursive_selector
183+
elif part in _special_segments:
184+
selector = _special_selector
185+
else:
186+
selector = _wildcard_selector
187+
return selector(part, parts[1:], sep, case_sensitive, follow_symlinks, recurse_symlinks)
150188

151189

152190
class UnsupportedOperation(NotImplementedError):
@@ -459,12 +497,12 @@ def is_absolute(self):
459497
return self.pathmod.isabs(self._raw_path)
460498

461499
@property
462-
def _pattern_stack(self):
463-
"""Stack of path components, to be used with patterns in glob()."""
500+
def _pattern_parts(self):
501+
"""List of path components, to be used with patterns in glob()."""
464502
anchor, parts = self._stack
465503
if anchor:
466504
raise NotImplementedError("Non-relative patterns are unsupported")
467-
return parts
505+
return tuple(reversed(parts))
468506

469507
@property
470508
def _pattern_str(self):
@@ -798,53 +836,19 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=True):
798836
if case_sensitive is None:
799837
# TODO: evaluate case-sensitivity of each directory in _select_children().
800838
case_sensitive = _is_case_sensitive(self.pathmod)
839+
if follow_symlinks is None:
840+
# TODO: remove this legacy behaviour.
841+
follow_symlinks = True
842+
recurse_symlinks = False
843+
else:
844+
recurse_symlinks = follow_symlinks
801845

802-
stack = pattern._pattern_stack
803-
specials = ('', '.', '..')
804-
deduplicate_paths = False
805846
sep = self.pathmod.sep
806-
paths = iter([self] if self.is_dir() else [])
807-
while stack:
808-
part = stack.pop()
809-
if part in specials:
810-
# Join special component (e.g. '..') onto paths.
811-
paths = _select_special(paths, part)
812-
813-
elif part == '**':
814-
# Consume following '**' components, which have no effect.
815-
while stack and stack[-1] == '**':
816-
stack.pop()
817-
818-
# Consume following non-special components, provided we're
819-
# treating symlinks consistently. Each component is joined
820-
# onto 'part', which is used to generate an re.Pattern object.
821-
if follow_symlinks is not None:
822-
while stack and stack[-1] not in specials:
823-
part += sep + stack.pop()
824-
825-
# If the previous loop consumed pattern components, compile an
826-
# re.Pattern object based on those components.
827-
match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None
828-
829-
# Recursively walk directories, filtering by type and regex.
830-
paths = _select_recursive(paths, bool(stack), follow_symlinks, match)
831-
832-
# De-duplicate if we've already seen a '**' component.
833-
if deduplicate_paths:
834-
paths = _select_unique(paths)
835-
deduplicate_paths = True
836-
837-
elif '**' in part:
838-
raise ValueError("Invalid pattern: '**' can only be an entire path component")
839-
840-
else:
841-
# If the pattern component isn't '*', compile an re.Pattern
842-
# object based on the component.
843-
match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None
844-
845-
# Iterate over directories' children filtering by type and regex.
846-
paths = _select_children(paths, bool(stack), follow_symlinks, match)
847-
return paths
847+
parts = pattern._pattern_parts
848+
select = _selector(parts, sep, case_sensitive, follow_symlinks, recurse_symlinks)
849+
if not self.is_dir():
850+
return iter([])
851+
return select(self)
848852

849853
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=True):
850854
"""Recursively yield all existing files (of any kind, including

0 commit comments

Comments
 (0)