Skip to content

Commit e0562e1

Browse files
Author: sicheng (committed)
Commit message: Implement simple benchmark
1 parent 711929e commit e0562e1

File tree

2 files changed: +147 additions, -0 deletions

rust/worker/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ harness = false
9696
name = "query"
9797
harness = false
9898

99+
[[bench]]
100+
name = "regex"
101+
harness = false
102+
99103
[[bench]]
100104
name = "spann"
101105
harness = false

rust/worker/benches/regex.rs

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
use std::collections::HashMap;
2+
3+
use chroma_benchmark::benchmark::{bench_run, tokio_multi_thread};
4+
use chroma_benchmark::datasets::types::RecordDataset;
5+
use chroma_benchmark::datasets::wikipedia::WikipediaDataset;
6+
use chroma_log::test::{int_as_id, random_embedding};
7+
use chroma_segment::test::TestDistributedSegment;
8+
use chroma_system::Operator;
9+
use chroma_types::{
10+
Chunk, DocumentExpression, DocumentOperator, LogRecord, Operation, OperationRecord,
11+
ScalarEncoding, SignedRoaringBitmap, Where,
12+
};
13+
use criterion::Criterion;
14+
use criterion::{criterion_group, criterion_main};
15+
use futures::{StreamExt, TryStreamExt};
16+
use indicatif::ProgressIterator;
17+
use regex::Regex;
18+
use roaring::RoaringBitmap;
19+
use worker::execution::operators::filter::{FilterInput, FilterOperator};
20+
21+
/// Number of log records applied to the test segment per compaction batch.
const LOG_CHUNK_SIZE: usize = 1000;
/// Number of Wikipedia documents ingested as the benchmark corpus.
const DOCUMENT_SIZE: usize = 10000;
/// Regex patterns exercised by the benchmark; each pattern gets its own
/// Criterion benchmark entry.
const REGEX_PATTERNS: &[&str] = &["Hello"];
24+
25+
fn bench_regex(criterion: &mut Criterion) {
26+
let runtime = tokio_multi_thread();
27+
28+
let (test_segment, expected_results, doc_count) = runtime.block_on(async {
29+
let wikipedia = WikipediaDataset::init()
30+
.await
31+
.expect("Wikipedia dataset should exist");
32+
let records = wikipedia
33+
.create_records_stream()
34+
.await
35+
.expect("Wikipedia dataset should have content")
36+
.take(DOCUMENT_SIZE)
37+
.try_collect::<Vec<_>>()
38+
.await
39+
.expect("Wikipedia dataset should have valid records");
40+
41+
let mut expected_results = HashMap::new();
42+
let regexes = REGEX_PATTERNS
43+
.iter()
44+
.map(|pattern_str| {
45+
(
46+
*pattern_str,
47+
Regex::new(*pattern_str).expect("Regex pattern should be valid"),
48+
)
49+
})
50+
.collect::<Vec<_>>();
51+
52+
let logs = records
53+
.into_iter()
54+
.progress()
55+
.with_message("Bruteforcing regex for reference")
56+
.enumerate()
57+
.map(|(offset, record)| {
58+
for (pattern_str, pattern) in &regexes {
59+
if pattern.is_match(&record.document) {
60+
expected_results
61+
.entry(pattern_str.to_string())
62+
.or_insert(RoaringBitmap::new())
63+
.insert(offset as u32);
64+
}
65+
}
66+
LogRecord {
67+
log_offset: offset as i64 + 1,
68+
record: OperationRecord {
69+
id: int_as_id(offset),
70+
embedding: Some(random_embedding(3)),
71+
encoding: Some(ScalarEncoding::FLOAT32),
72+
metadata: None,
73+
document: Some(record.document),
74+
operation: Operation::Upsert,
75+
},
76+
}
77+
})
78+
.collect::<Vec<_>>();
79+
let log_count = logs.len();
80+
let mut segment = TestDistributedSegment::default();
81+
for (idx, batch) in logs
82+
.chunks(LOG_CHUNK_SIZE)
83+
.enumerate()
84+
.progress()
85+
.with_message("Applying log chunk")
86+
{
87+
segment
88+
.compact_log(Chunk::new(batch.into()), idx * LOG_CHUNK_SIZE)
89+
.await;
90+
}
91+
(segment, expected_results, log_count)
92+
});
93+
94+
let filter_input = FilterInput {
95+
logs: Chunk::new(Vec::new().into()),
96+
blockfile_provider: test_segment.blockfile_provider,
97+
metadata_segment: test_segment.metadata_segment,
98+
record_segment: test_segment.record_segment,
99+
};
100+
101+
for pattern in REGEX_PATTERNS {
102+
let filter_operator = FilterOperator {
103+
query_ids: None,
104+
where_clause: Some(Where::Document(DocumentExpression {
105+
operator: DocumentOperator::Matches,
106+
pattern: pattern.to_string(),
107+
})),
108+
};
109+
110+
let routine = |(op, input, expected): (
111+
FilterOperator,
112+
FilterInput,
113+
HashMap<String, RoaringBitmap>,
114+
)| async move {
115+
let results = op
116+
.run(&input)
117+
.await
118+
.expect("FilterOperator should not fail");
119+
assert_eq!(
120+
results.compact_offset_ids,
121+
SignedRoaringBitmap::Include(
122+
expected
123+
.get(&(pattern.to_string()))
124+
.cloned()
125+
.unwrap_or((0..doc_count as u32).collect())
126+
)
127+
)
128+
};
129+
130+
let setup = || {
131+
(
132+
filter_operator.clone(),
133+
filter_input.clone(),
134+
expected_results.clone(),
135+
)
136+
};
137+
138+
bench_run(pattern, criterion, &runtime, setup, routine);
139+
}
140+
}
141+
142+
criterion_group!(benches, bench_regex);
143+
criterion_main!(benches);

0 commit comments

Comments
 (0)