Skip to content

[CLN] Rename $matches to $regex #4506

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions chromadb/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -827,13 +827,13 @@ def validate_where_document(where_document: WhereDocument) -> None:
if operator not in [
"$contains",
"$not_contains",
"$matches",
"$not_matches",
"$regex",
"$not_regex",
"$and",
"$or",
]:
raise ValueError(
f"Expected where document operator to be one of $contains, $and, $or, got {operator}"
f"Expected where document operator to be one of $contains, $not_contains, $regex, $not_regex, $and, $or, got {operator}"
)
if operator == "$and" or operator == "$or":
if not isinstance(operand, list):
Expand All @@ -846,14 +846,14 @@ def validate_where_document(where_document: WhereDocument) -> None:
)
for where_document_expression in operand:
validate_where_document(where_document_expression)
# Value is a $contains operator
# Value is $contains/$not_contains/$regex/$not_regex operator
elif not isinstance(operand, str):
raise ValueError(
f"Expected where document operand value for operator $contains to be a str, got {operand}"
f"Expected where document operand value for operator {operator} to be a str, got {operand}"
)
elif len(operand) == 0:
raise ValueError(
"Expected where document operand value for operator $contains to be a non-empty str"
f"Expected where document operand value for operator {operator} to be a non-empty str"
)


Expand Down
4 changes: 2 additions & 2 deletions chromadb/base_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@
WhereDocumentOperator = Union[
Literal["$contains"],
Literal["$not_contains"],
Literal["$matches"],
Literal["$not_matches"],
Literal["$regex"],
Literal["$not_regex"],
LogicalOperator,
]
WhereDocument = Dict[WhereDocumentOperator, Union[str, List["WhereDocument"]]]
4 changes: 2 additions & 2 deletions idl/chromadb/proto/chroma.proto
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,8 @@ message DirectWhereDocument {
enum WhereDocumentOperator {
CONTAINS = 0;
NOT_CONTAINS = 1;
MATCHES = 2;
NOT_MATCHES = 3;
REGEX = 2;
NOT_REGEX = 3;
}

// A branch-node `WhereDocument` node has a list of children.
Expand Down
15 changes: 6 additions & 9 deletions rust/index/src/fulltext/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -428,25 +428,22 @@ impl<'me> FullTextIndexReader<'me> {
}

#[async_trait::async_trait]
impl<'me> NgramLiteralProvider<FullTextIndexError> for FullTextIndexReader<'me> {
impl<'reader> NgramLiteralProvider<FullTextIndexError> for FullTextIndexReader<'reader> {
fn maximum_branching_factor(&self) -> usize {
6
}

async fn lookup_ngram_range<'fts, NgramRange>(
&'fts self,
async fn lookup_ngram_range<'me, NgramRange>(
&'me self,
ngram_range: NgramRange,
) -> Result<Vec<(&'fts str, u32, RoaringBitmap)>, FullTextIndexError>
) -> Result<Vec<(&'me str, u32, &'me [u32])>, FullTextIndexError>
where
NgramRange: Clone + RangeBounds<&'fts str> + Send + Sync,
NgramRange: Clone + RangeBounds<&'me str> + Send + Sync,
{
Ok(self
.posting_lists_blockfile_reader
.get_range(ngram_range, ..)
.await?
.into_iter()
.map(|(ngram, doc, pos)| (ngram, doc, pos.iter().collect()))
.collect())
.await?)
}
}

Expand Down
4 changes: 2 additions & 2 deletions rust/segment/src/sqlite_metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -487,8 +487,8 @@ impl IntoSqliteExpr for DocumentExpression {
match self.operator {
DocumentOperator::Contains => doc_contains,
DocumentOperator::NotContains => doc_contains.not(),
DocumentOperator::Matches => todo!("Implement Regex matching. The result must be a not-nullable boolean (use `<expr>.is(true)`)"),
DocumentOperator::NotMatches => todo!("Implement negated Regex matching. This must be exact opposite of Regex matching (use `<expr>.not()`)"),
DocumentOperator::Regex => todo!("Implement Regex matching. The result must be a not-nullable boolean (use `<expr>.is(true)`)"),
DocumentOperator::NotRegex => todo!("Implement negated Regex matching. This must be exact opposite of Regex matching (use `<expr>.not()`)"),
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions rust/segment/src/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -477,10 +477,10 @@ impl CheckRecord for DocumentExpression {
DocumentOperator::NotContains => {
!document.is_some_and(|doc| doc.contains(&self.pattern.replace("%", "")))
}
DocumentOperator::Matches => {
DocumentOperator::Regex => {
document.is_some_and(|doc| Regex::new(&self.pattern).unwrap().is_match(doc))
}
DocumentOperator::NotMatches => {
DocumentOperator::NotRegex => {
!document.is_some_and(|doc| Regex::new(&self.pattern).unwrap().is_match(doc))
}
}
Expand Down
12 changes: 6 additions & 6 deletions rust/types/src/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -693,16 +693,16 @@ impl From<DocumentExpression> for chroma_proto::DirectWhereDocument {
pub enum DocumentOperator {
Contains,
NotContains,
Matches,
NotMatches,
Regex,
NotRegex,
}
impl From<chroma_proto::WhereDocumentOperator> for DocumentOperator {
fn from(value: chroma_proto::WhereDocumentOperator) -> Self {
match value {
chroma_proto::WhereDocumentOperator::Contains => Self::Contains,
chroma_proto::WhereDocumentOperator::NotContains => Self::NotContains,
chroma_proto::WhereDocumentOperator::Matches => Self::Matches,
chroma_proto::WhereDocumentOperator::NotMatches => Self::NotMatches,
chroma_proto::WhereDocumentOperator::Regex => Self::Regex,
chroma_proto::WhereDocumentOperator::NotRegex => Self::NotRegex,
}
}
}
Expand All @@ -712,8 +712,8 @@ impl From<DocumentOperator> for chroma_proto::WhereDocumentOperator {
match value {
DocumentOperator::Contains => Self::Contains,
DocumentOperator::NotContains => Self::NotContains,
DocumentOperator::Matches => Self::Matches,
DocumentOperator::NotMatches => Self::NotMatches,
DocumentOperator::Regex => Self::Regex,
DocumentOperator::NotRegex => Self::NotRegex,
}
}
}
Expand Down
12 changes: 0 additions & 12 deletions rust/types/src/regex/hir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,6 @@ pub enum ChromaHir {
Alternation(Vec<ChromaHir>),
}

impl ChromaHir {
pub fn contains_ngram_literal(&self, n: usize) -> bool {
match self {
ChromaHir::Empty | ChromaHir::Class(_) => false,
ChromaHir::Literal(s) => s.len() >= n,
ChromaHir::Repetition { min, max: _, sub } => *min > 0 && sub.contains_ngram_literal(n),
ChromaHir::Concat(hirs) => hirs.iter().any(|hir| hir.contains_ngram_literal(n)),
ChromaHir::Alternation(hirs) => hirs.iter().all(|hir| hir.contains_ngram_literal(n)),
}
}
}

impl TryFrom<hir::Hir> for ChromaHir {
type Error = ChromaRegexError;

Expand Down
97 changes: 56 additions & 41 deletions rust/types/src/regex/literal_expr.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use std::{collections::HashMap, ops::RangeBounds};
use std::{
collections::{HashMap, HashSet},
ops::RangeBounds,
};

use regex_syntax::hir::ClassUnicode;
use roaring::RoaringBitmap;
Expand Down Expand Up @@ -27,6 +30,22 @@ pub enum LiteralExpr {
Alternation(Vec<LiteralExpr>),
}

impl LiteralExpr {
pub fn contains_ngram_literal(&self, n: usize, max_literal_width: usize) -> bool {
match self {
LiteralExpr::Literal(literals) => literals
.split(|lit| lit.width() > max_literal_width)
.any(|chunk| chunk.len() >= n),
LiteralExpr::Concat(literal_exprs) => literal_exprs
.iter()
.any(|expr| expr.contains_ngram_literal(n, max_literal_width)),
LiteralExpr::Alternation(literal_exprs) => literal_exprs
.iter()
.all(|expr| expr.contains_ngram_literal(n, max_literal_width)),
}
}
}

impl From<ChromaHir> for LiteralExpr {
fn from(value: ChromaHir) -> Self {
match value {
Expand All @@ -35,10 +54,12 @@ impl From<ChromaHir> for LiteralExpr {
Self::Literal(literal.chars().map(Literal::Char).collect())
}
ChromaHir::Class(class_unicode) => Self::Literal(vec![Literal::Class(class_unicode)]),
ChromaHir::Repetition { min, max: _, sub } => {
ChromaHir::Repetition { min, max, sub } => {
let mut repeat = vec![*sub; min as usize];
// Append a breakpoint Hir to prevent merge with literal on the right
repeat.push(ChromaHir::Alternation(vec![ChromaHir::Empty]));
if max.is_none() || max.is_some_and(|m| m > min) {
// Append a breakpoint Hir to prevent merge with literal on the right
repeat.push(ChromaHir::Alternation(vec![ChromaHir::Empty]));
}
ChromaHir::Concat(repeat).into()
}
ChromaHir::Concat(hirs) => {
Expand Down Expand Up @@ -75,7 +96,7 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
async fn lookup_ngram_range<'me, NgramRange>(
&'me self,
ngram_range: NgramRange,
) -> Result<Vec<(&'me str, u32, RoaringBitmap)>, E>
) -> Result<Vec<(&'me str, u32, &'me [u32])>, E>
where
NgramRange: Clone + RangeBounds<&'me str> + Send + Sync;

Expand All @@ -84,10 +105,10 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
async fn match_literal_with_mask(
&self,
literals: &[Literal],
mask: Option<&RoaringBitmap>,
) -> Result<Option<RoaringBitmap>, E> {
mask: Option<&HashSet<u32>>,
) -> Result<HashSet<u32>, E> {
if mask.is_some_and(|m| m.is_empty()) {
return Ok(mask.cloned());
return Ok(HashSet::new());
}

let (initial_literals, remaining_literals) = literals.split_at(N);
Expand Down Expand Up @@ -115,7 +136,7 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
});

// ngram suffix -> doc_id -> position
let mut suffix_doc_pos: HashMap<Vec<char>, HashMap<u32, RoaringBitmap>> = HashMap::new();
let mut suffix_doc_pos: HashMap<Vec<char>, HashMap<u32, HashSet<u32>>> = HashMap::new();
for ngram in initial_ngrams {
let ngram_string = ngram.iter().collect::<String>();
let ngram_doc_pos = self
Expand All @@ -128,12 +149,13 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {

let suffix = ngram[1..].to_vec();
for (_, doc_id, pos) in ngram_doc_pos {
if mask.map(|m| m.contains(doc_id)).unwrap_or(mask.is_none()) {
*suffix_doc_pos
if mask.is_none() || mask.is_some_and(|m| m.contains(&doc_id)) {
suffix_doc_pos
.entry(suffix.clone())
.or_default()
.entry(doc_id)
.or_default() |= pos;
.or_default()
.extend(pos);
}
}
}
Expand All @@ -142,7 +164,7 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
if suffix_doc_pos.is_empty() {
break;
}
let mut new_suffix_doc_pos: HashMap<Vec<char>, HashMap<u32, RoaringBitmap>> =
let mut new_suffix_doc_pos: HashMap<Vec<char>, HashMap<u32, HashSet<u32>>> =
HashMap::new();
for (mut suffix, doc_pos) in suffix_doc_pos {
let ngram_ranges = match literal {
Expand Down Expand Up @@ -170,27 +192,19 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
.await?;
for (ngram, doc_id, next_pos) in ngram_doc_pos {
if let Some(pos) = doc_pos.get(&doc_id) {
if ngram.chars().last().is_some_and(|c| match literal {
Literal::Char(literal_char) => c == *literal_char,
Literal::Class(class_unicode) => {
class_unicode.iter().any(|r| r.start() <= c && c <= r.end())
}
}) {
// SAFETY(Sicheng): The RoaringBitmap iterator should be sorted
let valid_next_pos = RoaringBitmap::from_sorted_iter(
pos.iter()
.filter_map(|p| next_pos.contains(p + 1).then_some(p + 1)),
)
.expect("RoaringBitmap iterator should be sorted");

if !valid_next_pos.is_empty() {
let new_suffix = ngram.chars().skip(1).collect();
*new_suffix_doc_pos
.entry(new_suffix)
.or_default()
.entry(doc_id)
.or_default() |= valid_next_pos;
}
let next_pos_set: HashSet<&u32> = HashSet::from_iter(next_pos);
let mut valid_next_pos = pos
.iter()
.filter_map(|p| next_pos_set.contains(&(p + 1)).then_some(p + 1))
.peekable();
if valid_next_pos.peek().is_some() {
let new_suffix = ngram.chars().skip(1).collect();
new_suffix_doc_pos
.entry(new_suffix)
.or_default()
.entry(doc_id)
.or_default()
.extend(valid_next_pos);
}
}
}
Expand All @@ -203,16 +217,16 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
.into_values()
.flat_map(|doc_pos| doc_pos.into_keys())
.collect();
Ok(Some(result))
Ok(result)
}

// Return the documents matching the literal expression. The search space is restricted to the documents in the mask if specified
// If all documents could match the literal expression, Ok(None) is returned
async fn match_literal_expression_with_mask(
&self,
literal_expression: &LiteralExpr,
mask: Option<&RoaringBitmap>,
) -> Result<Option<RoaringBitmap>, E> {
mask: Option<&HashSet<u32>>,
) -> Result<Option<HashSet<u32>>, E> {
match literal_expression {
LiteralExpr::Literal(literals) => {
let mut result = mask.cloned();
Expand All @@ -221,7 +235,7 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
break;
}
if query.len() >= N {
result = self.match_literal_with_mask(query, result.as_ref()).await?;
result = Some(self.match_literal_with_mask(query, result.as_ref()).await?);
}
}
Ok(result)
Expand All @@ -239,17 +253,17 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
Ok(result)
}
LiteralExpr::Alternation(literal_exprs) => {
let mut result = RoaringBitmap::new();
let mut result = Vec::new();
for expr in literal_exprs {
if let Some(matching_docs) =
self.match_literal_expression_with_mask(expr, mask).await?
{
result |= matching_docs;
result.extend(matching_docs);
} else {
return Ok(mask.cloned());
}
}
Ok(Some(result))
Ok(Some(HashSet::from_iter(result)))
}
}
}
Expand All @@ -262,6 +276,7 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
) -> Result<Option<RoaringBitmap>, E> {
self.match_literal_expression_with_mask(literal_expression, None)
.await
.map(|res| res.map(RoaringBitmap::from_iter))
}

fn can_match_exactly(&self, literal_expression: &LiteralExpr) -> bool {
Expand Down
9 changes: 5 additions & 4 deletions rust/types/src/where_parsing.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::regex::literal_expr::LiteralExpr;
use crate::regex::{ChromaRegex, ChromaRegexError};
use crate::{CompositeExpression, DocumentOperator, MetadataExpression, PrimitiveOperator, Where};
use chroma_error::{ChromaError, ErrorCodes};
Expand Down Expand Up @@ -140,16 +141,16 @@ pub fn parse_where_document(json_payload: &Value) -> Result<Where, WhereValidati
let operator_type = match key.as_str() {
"$contains" => DocumentOperator::Contains,
"$not_contains" => DocumentOperator::NotContains,
"$matches" => DocumentOperator::Matches,
"$not_matches" => DocumentOperator::NotMatches,
"$regex" => DocumentOperator::Regex,
"$not_regex" => DocumentOperator::NotRegex,
_ => return Err(WhereValidationError::WhereDocumentClause),
};
if matches!(
operator_type,
DocumentOperator::Matches | DocumentOperator::NotMatches
DocumentOperator::Regex | DocumentOperator::NotRegex
) {
let regex = ChromaRegex::try_from(value_str.to_string())?;
if !regex.hir().contains_ngram_literal(3) {
if !LiteralExpr::from(regex.hir().clone()).contains_ngram_literal(3, 6) {
return Err(ChromaRegexError::PermissivePattern.into());
}
}
Expand Down
Loading
Loading