diff --git a/Cargo.lock b/Cargo.lock index 6c0f72c9d472e..537905f79384d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2678,6 +2678,18 @@ dependencies = [ "regex-syntax 0.8.5", ] +[[package]] +name = "globset" +version = "0.4.16" +source = "git+https://github.com/lukesandberg/ripgrep?branch=serialize_globset#7c1f24bda92874ec316a70bada5f024e4dfc4ea6" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", +] + [[package]] name = "gloo-timers" version = "0.2.6" @@ -3328,7 +3340,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b46810df39e66e925525d6e38ce1e7f6e1d208f72dc39757880fcb66e2c58af1" dependencies = [ "crossbeam-deque", - "globset", + "globset 0.4.16 (registry+https://github.com/rust-lang/crates.io-index)", "log", "memchr", "regex-automata 0.4.9", @@ -7526,7 +7538,7 @@ checksum = "a01bfcbbdea182bdda93713aeecd997749ae324686bf7944f54d128e56be4ea9" dependencies = [ "anyhow", "dashmap 5.5.3", - "globset", + "globset 0.4.16 (registry+https://github.com/rust-lang/crates.io-index)", "indexmap 2.7.1", "once_cell", "regex", @@ -9728,6 +9740,7 @@ dependencies = [ "dunce", "futures", "futures-retry", + "globset 0.4.16 (git+https://github.com/lukesandberg/ripgrep?branch=serialize_globset)", "include_dir", "indexmap 2.7.1", "jsonc-parser 0.21.0", diff --git a/turbopack/crates/turbo-tasks-fs/Cargo.toml b/turbopack/crates/turbo-tasks-fs/Cargo.toml index 6aeeb5d21b133..7f4a5efa2075c 100644 --- a/turbopack/crates/turbo-tasks-fs/Cargo.toml +++ b/turbopack/crates/turbo-tasks-fs/Cargo.toml @@ -55,6 +55,8 @@ turbo-tasks = { workspace = true } turbo-tasks-hash = { workspace = true } unicode-segmentation = { workspace = true } urlencoding = { workspace = true } +# Remove this once https://github.com/BurntSushi/ripgrep/pull/3048 is merged and released +globset = {git = "https://github.com/lukesandberg/ripgrep", branch="serialize_globset" } [dev-dependencies] criterion = { workspace = true, features = ["async_tokio"] } diff --git a/turbopack/crates/turbo-tasks-fs/src/glob.rs b/turbopack/crates/turbo-tasks-fs/src/glob.rs index 88208831be2b0..4753d8778516a 100644 --- a/turbopack/crates/turbo-tasks-fs/src/glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/glob.rs @@ -1,34 +1,8 @@ -use std::mem::take; - -use anyhow::{Context, Result, anyhow, bail}; +use anyhow::Result; +use globset::{Glob as GlobsetGlob, GlobBuilder, GlobMatcher, GlobSet}; use serde::{Deserialize, Serialize}; use turbo_rcstr::RcStr; -use turbo_tasks::{NonLocalValue, TryJoinIterExt, Vc, trace::TraceRawVcs}; -use unicode_segmentation::GraphemeCursor; - -#[derive(PartialEq, Eq, Debug, Clone, TraceRawVcs, Serialize, Deserialize, NonLocalValue)] -enum GlobPart { - /// `/**/`: Matches any path of directories - AnyDirectories, - - /// `*`: Matches any filename (no path separator) - AnyFile, - - /// `?`: Matches a single filename character (no path separator) - AnyFileChar, - - /// `/`: Matches the path separator - PathSeparator, - - /// `[abc]`: Matches any char of the list - FileChar(Vec), - - /// `abc`: Matches literal filename - File(String), - - /// `{a,b,c}`: Matches any of the globs in the list - Alternatives(Vec), -} +use turbo_tasks::Vc; // Examples: // - file.js = File(file.js) @@ -39,363 +13,89 @@ enum GlobPart { // - {a/**,*}/file = Alternatives([File(a), PathSeparator, AnyDirectories], [AnyFile]), // PathSeparator, File(file) -// Note: a/**/b does match a/b, so we need some special logic about path -// separators - -#[turbo_tasks::value] -#[derive(Debug, Clone)] +#[turbo_tasks::value(eq = "manual")] +#[serde(into = "GlobForm", try_from = "GlobForm")] +#[derive(Clone, Debug)] pub struct Glob { - expression: Vec, + // Store the raw glob strings to support equality and serialization + raw: Vec, + #[turbo_tasks(trace_ignore)] + implementation: GlobImpl, +} +#[derive(Clone, Debug)] +enum GlobImpl { + Set(GlobSet), + Single(GlobMatcher), } -impl Glob { - pub fn execute(&self, path: &str) -> bool { - // TODO(lukesandberg): deprecate this implicit behavior - let match_partial = path.ends_with('/'); - self.iter_matches(path, true, match_partial) - .any(|result| matches!(result, ("", _))) - } - - // Returns true if the glob could match a filename underneath this `path` where the path - // represents a directory. - pub fn match_in_directory(&self, path: &str) -> bool { - debug_assert!(!path.ends_with('/')); - // TODO(lukesandberg): see if we can avoid this allocation by changing the matching - // algorithm - let path = format!("{path}/"); - self.iter_matches(&path, true, true) - .any(|result| matches!(result, ("", _))) - } - - fn iter_matches<'a>( - &'a self, - path: &'a str, - previous_part_is_path_separator_equivalent: bool, - match_in_directory: bool, - ) -> GlobMatchesIterator<'a> { - GlobMatchesIterator { - current: path, - glob: self, - match_in_directory, - is_path_separator_equivalent: previous_part_is_path_separator_equivalent, - stack: Vec::new(), - index: 0, - } - } - - pub fn parse(input: &str) -> Result { - let mut current = input; - let mut expression = Vec::new(); - - while !current.is_empty() { - let (part, remainder) = GlobPart::parse(current, false) - .with_context(|| anyhow!("Failed to parse glob {input}"))?; - expression.push(part); - current = remainder; +impl GlobImpl { + fn is_match(&self, path: &str) -> bool { + match self { + GlobImpl::Set(glob_set) => glob_set.is_match(path), + GlobImpl::Single(glob_matcher) => glob_matcher.is_match(path), } - - Ok(Glob { expression }) } } -struct GlobMatchesIterator<'a> { - current: &'a str, - glob: &'a Glob, - // In this mode we are checking if the glob might match something in the directory represented - // by this path. - match_in_directory: bool, - is_path_separator_equivalent: bool, - stack: Vec>, - index: usize, -} - -impl<'a> Iterator for GlobMatchesIterator<'a> { - type Item = (&'a str, bool); - - fn next(&mut self) -> Option { - loop { - if let Some(part) = self.glob.expression.get(self.index) { - let iter = if let Some(iter) = self.stack.get_mut(self.index) { - iter - } else { - let iter = part.iter_matches( - self.current, - self.is_path_separator_equivalent, - self.match_in_directory, - ); - self.stack.push(iter); - self.stack.last_mut().unwrap() - }; - if let Some((new_path, new_is_path_separator_equivalent)) = iter.next() { - self.current = new_path; - self.is_path_separator_equivalent = new_is_path_separator_equivalent; - - self.index += 1; - - if self.match_in_directory && self.current.is_empty() { - return Some(("", self.is_path_separator_equivalent)); - } - } else { - if self.index == 0 { - // failed to match - return None; - } - // backtrack - self.stack.pop(); - self.index -= 1; - } - } else { - // end of expression, matched successfully - - // backtrack for the next iteration - self.index -= 1; - - return Some((self.current, self.is_path_separator_equivalent)); - } - } +impl PartialEq for Glob { + fn eq(&self, other: &Self) -> bool { + self.raw == other.raw } } +impl Eq for Glob {} -impl GlobPart { - /// Iterates over all possible matches of this part with the provided path. - /// The least greedy match is returned first. This is usually used for - /// backtracking. The string slice returned is the remaining part or the - /// path. The boolean flag returned specifies if the matched part should - /// be considered as path-separator equivalent. - fn iter_matches<'a>( - &'a self, - path: &'a str, - previous_part_is_path_separator_equivalent: bool, - match_in_directory: bool, - ) -> GlobPartMatchesIterator<'a> { - GlobPartMatchesIterator { - path, - part: self, - match_in_directory, - previous_part_is_path_separator_equivalent, - cursor: GraphemeCursor::new(0, path.len(), true), - index: 0, - glob_iterator: None, - } +impl Glob { + pub fn execute(&self, path: &str) -> bool { + self.implementation.is_match(path) } - - fn parse(input: &str, inside_of_braces: bool) -> Result<(GlobPart, &str)> { - debug_assert!(!input.is_empty()); - let two_chars = { - let mut chars = input.chars(); - (chars.next().unwrap(), chars.next()) - }; - match two_chars { - ('/', _) => Ok((GlobPart::PathSeparator, &input[1..])), - ('*', Some('*')) => Ok((GlobPart::AnyDirectories, &input[2..])), - ('*', _) => Ok((GlobPart::AnyFile, &input[1..])), - ('?', _) => Ok((GlobPart::AnyFileChar, &input[1..])), - ('[', Some('[')) => todo!("glob char classes are not implemented yet"), - ('[', _) => todo!("glob char sequences are not implemented yet"), - ('{', Some(_)) => { - let mut current = &input[1..]; - let mut alternatives = Vec::new(); - let mut expression = Vec::new(); - - loop { - let (part, remainder) = GlobPart::parse(current, true)?; - expression.push(part); - current = remainder; - match current.chars().next() { - Some(',') => { - alternatives.push(Glob { - expression: take(&mut expression), - }); - current = ¤t[1..]; - } - Some('}') => { - alternatives.push(Glob { - expression: take(&mut expression), - }); - current = ¤t[1..]; - break; - } - None => bail!("Unterminated glob braces"), - _ => { - // next part of the glob - } - } - } - - Ok((GlobPart::Alternatives(alternatives), current)) - } - ('{', None) => { - bail!("Unterminated glob braces") - } - _ => { - let mut is_escaped = false; - let mut literal = String::new(); - - let mut cursor = GraphemeCursor::new(0, input.len(), true); - - let mut start = cursor.cur_cursor(); - let mut end_cursor = cursor - .next_boundary(input, 0) - .map_err(|e| anyhow!("{:?}", e))?; - - while let Some(end) = end_cursor { - let c = &input[start..end]; - if is_escaped { - is_escaped = false; - } else if c == "\\" { - is_escaped = true; - } else if c == "/" - || c == "*" - || c == "?" - || c == "[" - || c == "{" - || (inside_of_braces && (c == "," || c == "}")) - { - break; - } - literal.push_str(c); - - start = cursor.cur_cursor(); - end_cursor = cursor - .next_boundary(input, end) - .map_err(|e| anyhow!("{:?}", e))?; - } - - Ok((GlobPart::File(literal), &input[start..])) - } - } + fn parse(input: RcStr) -> Result { + let parsed = parse_as_globset_glob(input.as_str())?.compile_matcher(); + Ok(Self { + raw: vec![input], + implementation: GlobImpl::Single(parsed), + }) } } -struct GlobPartMatchesIterator<'a> { - path: &'a str, - part: &'a GlobPart, - match_in_directory: bool, - previous_part_is_path_separator_equivalent: bool, - cursor: GraphemeCursor, - index: usize, - glob_iterator: Option>>, +// Small helper to apply our configuration consistently +fn parse_as_globset_glob(input: &str) -> Result { + Ok(GlobBuilder::new(input) + // allow '\' to escape meta characters + .backslash_escape(true) + // allow empty alternates '{}', not really desired but this isn't ambiguous and is backwards + // compatible. + .empty_alternates(true) + // Don't allow `* ` or `?` to match `/` + .literal_separator(true) + .case_insensitive(false) + .build()?) } -impl<'a> Iterator for GlobPartMatchesIterator<'a> { - type Item = (&'a str, bool); - - fn next(&mut self) -> Option { - match self.part { - GlobPart::AnyDirectories => { - if self.cursor.cur_cursor() == 0 { - let Ok(Some(_)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - return Some((self.path, true)); - } - - if self.cursor.cur_cursor() == self.path.len() { - return None; - } - - loop { - let start = self.cursor.cur_cursor(); - // next_boundary does not set cursor offset to the end of the string - // if there is no next boundary - manually set cursor to the end - let end = match self.cursor.next_boundary(self.path, 0) { - Ok(end) => { - if let Some(end) = end { - end - } else { - self.cursor.set_cursor(self.path.len()); - self.cursor.cur_cursor() - } - } - _ => return None, - }; - - if &self.path[start..end] == "/" { - return Some((&self.path[end..], true)); - } else if start == end { - return Some((&self.path[start..], false)); - } - } - } - GlobPart::AnyFile => { - let Ok(Some(c)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - - let idx = self.path[0..c].len(); +// Serialized form of `Glob` +#[derive(Serialize, Deserialize)] +struct GlobForm { + globs: Vec, +} +impl From for GlobForm { + fn from(value: Glob) -> Self { + Self { globs: value.raw } + } +} +impl TryFrom for Glob { + type Error = anyhow::Error; - // TODO verify if `*` does match zero chars? - if let Some(slice) = self.path.get(0..c) { - if slice.ends_with('/') { - None - } else { - Some(( - &self.path[c..], - self.previous_part_is_path_separator_equivalent && idx == 1, - )) - } - } else { - None - } - } - GlobPart::AnyFileChar => todo!(), - GlobPart::PathSeparator => { - if self.cursor.cur_cursor() == 0 { - let Ok(Some(b)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - if self.path.starts_with('/') { - Some((&self.path[b..], true)) - } else if self.previous_part_is_path_separator_equivalent { - Some((self.path, true)) - } else { - None - } - } else { - None - } - } - GlobPart::FileChar(chars) => { - let start = self.cursor.cur_cursor(); - let Ok(Some(end)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - let mut chars_in_path = self.path[start..end].chars(); - let c = chars_in_path.next()?; - if chars_in_path.next().is_some() { - return None; - } - chars.contains(&c).then(|| (&self.path[end..], false)) - } - GlobPart::File(name) => { - if self.cursor.cur_cursor() == 0 && self.path.starts_with(name) { - let Ok(Some(_)) = self.cursor.next_boundary(self.path, 0) else { - return None; - }; - Some((&self.path[name.len()..], false)) - } else { - None - } - } - GlobPart::Alternatives(alternatives) => loop { - if let Some(glob_iterator) = &mut self.glob_iterator { - if let Some((path, is_path_separator_equivalent)) = glob_iterator.next() { - return Some((path, is_path_separator_equivalent)); - } else { - self.index += 1; - self.glob_iterator = None; - } - } else if let Some(alternative) = alternatives.get(self.index) { - self.glob_iterator = Some(Box::new(alternative.iter_matches( - self.path, - self.previous_part_is_path_separator_equivalent, - self.match_in_directory, - ))); - } else { - return None; - } - }, + fn try_from(value: GlobForm) -> Result { + if value.globs.len() == 1 { + return Glob::parse(value.globs[0].clone()); + } + let mut set = GlobSet::builder(); + for raw in &value.globs { + set.add(parse_as_globset_glob(raw)?); } + Ok(Glob { + raw: value.globs, + implementation: GlobImpl::Set(set.build()?), + }) } } @@ -403,7 +103,7 @@ impl TryFrom<&str> for Glob { type Error = anyhow::Error; fn try_from(value: &str) -> Result { - Glob::parse(value) + Glob::parse(value.into()) } } @@ -411,18 +111,34 @@ impl TryFrom<&str> for Glob { impl Glob { #[turbo_tasks::function] pub fn new(glob: RcStr) -> Result> { - Ok(Self::cell(Glob::try_from(glob.as_str())?)) + Ok(Self::cell(Glob::parse(glob)?)) } #[turbo_tasks::function] pub async fn alternatives(globs: Vec>) -> Result> { + if globs.is_empty() { + return Ok(Self::cell(Glob { + raw: Vec::new(), + implementation: GlobImpl::Set(GlobSet::empty()), + })); + } if globs.len() == 1 { return Ok(globs.into_iter().next().unwrap()); } + let mut set = GlobSet::builder(); + let mut raw = Vec::new(); + for glob in globs { + let glob = &*glob.await?; + for item in &glob.raw { + raw.push(item.clone()); + } + } + for raw in &raw { + set.add(parse_as_globset_glob(raw)?); + } Ok(Self::cell(Glob { - expression: vec![GlobPart::Alternatives( - globs.into_iter().map(|g| g.owned()).try_join().await?, - )], + raw, + implementation: GlobImpl::Set(set.build()?), })) } } @@ -437,20 +153,16 @@ mod tests { #[case::file("file.js", "file.js")] #[case::dir_and_file("../public/äöüščří.png", "../public/äöüščří.png")] #[case::dir_and_file("dir/file.js", "dir/file.js")] - #[case::dir_and_file_partial("dir/file.js", "dir/")] #[case::file_braces("file.{ts,js}", "file.js")] #[case::dir_and_file_braces("dir/file.{ts,js}", "dir/file.js")] #[case::dir_and_file_dir_braces("{dir,other}/file.{ts,js}", "dir/file.js")] #[case::star("*.js", "file.js")] #[case::dir_star("dir/*.js", "dir/file.js")] - #[case::dir_star_partial("dir/*.js", "dir/")] #[case::globstar("**/*.js", "file.js")] #[case::globstar("**/*.js", "dir/file.js")] #[case::globstar("**/*.js", "dir/sub/file.js")] #[case::globstar("**/**/*.js", "file.js")] #[case::globstar("**/**/*.js", "dir/sub/file.js")] - #[case::globstar_partial("**/**/*.js", "dir/sub/")] - #[case::globstar_partial("**/**/*.js", "dir/")] #[case::globstar_in_dir("dir/**/sub/file.js", "dir/sub/file.js")] #[case::globstar_in_dir("dir/**/sub/file.js", "dir/a/sub/file.js")] #[case::globstar_in_dir("dir/**/sub/file.js", "dir/a/b/sub/file.js")] @@ -458,10 +170,6 @@ mod tests { "**/next/dist/**/*.shared-runtime.js", "next/dist/shared/lib/app-router-context.shared-runtime.js" )] - #[case::globstar_in_dir_partial("dir/**/sub/file.js", "dir/a/b/sub/")] - #[case::globstar_in_dir_partial("dir/**/sub/file.js", "dir/a/b/")] - #[case::globstar_in_dir_partial("dir/**/sub/file.js", "dir/a/")] - #[case::globstar_in_dir_partial("dir/**/sub/file.js", "dir/")] #[case::star_dir( "**/*/next/dist/server/next.js", "node_modules/next/dist/server/next.js" @@ -503,9 +211,9 @@ mod tests { #[case::alternatives_nested2("{a,b/c,d/e/{f,g/h}}", "b/c")] #[case::alternatives_nested3("{a,b/c,d/e/{f,g/h}}", "d/e/f")] #[case::alternatives_nested4("{a,b/c,d/e/{f,g/h}}", "d/e/g/h")] - // #[case::alternatives_chars("[abc]", "b")] + #[case::alternatives_chars("[abc]", "b")] fn glob_match(#[case] glob: &str, #[case] path: &str) { - let glob = Glob::parse(glob).unwrap(); + let glob = Glob::parse(glob.into()).unwrap(); println!("{glob:?} {path}"); @@ -519,7 +227,7 @@ mod tests { "next/dist/shared/lib/app-router-context.shared-runtime.js" )] fn glob_not_matching(#[case] glob: &str, #[case] path: &str) { - let glob = Glob::parse(glob).unwrap(); + let glob = Glob::parse(glob.into()).unwrap(); println!("{glob:?} {path}"); diff --git a/turbopack/crates/turbo-tasks-fs/src/read_glob.rs b/turbopack/crates/turbo-tasks-fs/src/read_glob.rs index a9079c72b45bd..73ac1e20ee2d2 100644 --- a/turbopack/crates/turbo-tasks-fs/src/read_glob.rs +++ b/turbopack/crates/turbo-tasks-fs/src/read_glob.rs @@ -13,6 +13,12 @@ pub struct ReadGlobResult { pub inner: FxHashMap>, } +impl ReadGlobResult { + fn is_empty(&self) -> bool { + self.results.is_empty() && self.inner.is_empty() + } +} + /// Reads matches of a glob pattern. /// /// DETERMINISM: Result is in random order. Either sort result or do not depend @@ -64,13 +70,12 @@ async fn read_glob_internal( result.results.insert(entry_path.to_string(), entry); } if let DirectoryEntry::Directory(path) = entry { - if glob_value.match_in_directory(&entry_path) { - result.inner.insert( - entry_path.to_string(), - read_glob_inner(entry_path, *path, glob, include_dot_files) - .to_resolved() - .await?, - ); + let directory = + read_glob_inner(entry_path.clone(), *path, glob, include_dot_files) + .to_resolved() + .await?; + if !directory.await?.is_empty() { + result.inner.insert(entry_path.to_string(), directory); } } } @@ -158,14 +163,12 @@ async fn track_glob_internal( match resolve_symlink_safely(entry).await? { DirectoryEntry::Directory(path) => { - if glob_value.match_in_directory(&entry_path) { - completions.push(track_glob_inner( - entry_path, - *path, - glob, - include_dot_files, - )); - } + completions.push(track_glob_inner( + entry_path, + *path, + glob, + include_dot_files, + )); } DirectoryEntry::File(path) => { if glob_value.execute(&entry_path) {