Skip to content

Commit f37ecae

Browse files
author
sicheng
committed
Update bench
1 parent a322757 commit f37ecae

File tree

1 file changed

+31
-10
lines changed

1 file changed

+31
-10
lines changed

rust/worker/benches/regex.rs

Lines changed: 31 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
use std::collections::HashMap;
2+
use std::time::Duration;
23

34
use chroma_benchmark::benchmark::{bench_run, tokio_multi_thread};
45
use chroma_benchmark::datasets::types::RecordDataset;
@@ -16,16 +17,24 @@ use futures::{StreamExt, TryStreamExt};
1617
use indicatif::ProgressIterator;
1718
use regex::Regex;
1819
use roaring::RoaringBitmap;
20+
use tokio::time::Instant;
1921
use worker::execution::operators::filter::{FilterInput, FilterOperator};
2022

2123
const LOG_CHUNK_SIZE: usize = 1000;
2224
const DOCUMENT_SIZE: usize = 10000;
23-
const REGEX_PATTERNS: &[&str] = &["Hello"];
25+
const REGEX_PATTERNS: &[&str] = &[
26+
r"wikipedia",
27+
r"(?i)wikipedia",
28+
r"20\d\d",
29+
r".*wiki.*",
30+
r"May|June",
31+
r"(March|April) 19\d\d",
32+
];
2433

2534
fn bench_regex(criterion: &mut Criterion) {
2635
let runtime = tokio_multi_thread();
2736

28-
let (test_segment, expected_results, doc_count) = runtime.block_on(async {
37+
let (test_segment, expected_results, bruteforce_time, doc_count) = runtime.block_on(async {
2938
let wikipedia = WikipediaDataset::init()
3039
.await
3140
.expect("Wikipedia dataset should exist");
@@ -39,6 +48,7 @@ fn bench_regex(criterion: &mut Criterion) {
3948
.expect("Wikipedia dataset should have valid records");
4049

4150
let mut expected_results = HashMap::new();
51+
let mut bruteforce_time = HashMap::<_, Duration>::new();
4252
let regexes = REGEX_PATTERNS
4353
.iter()
4454
.map(|pattern_str| {
@@ -56,11 +66,14 @@ fn bench_regex(criterion: &mut Criterion) {
5666
.enumerate()
5767
.map(|(offset, record)| {
5868
for (pattern_str, pattern) in &regexes {
69+
let now = Instant::now();
5970
if pattern.is_match(&record.document) {
71+
let elapsed = now.elapsed();
6072
expected_results
6173
.entry(pattern_str.to_string())
6274
.or_insert(RoaringBitmap::new())
6375
.insert(offset as u32);
76+
*bruteforce_time.entry(pattern_str.to_string()).or_default() += elapsed;
6477
}
6578
}
6679
LogRecord {
@@ -78,17 +91,12 @@ fn bench_regex(criterion: &mut Criterion) {
7891
.collect::<Vec<_>>();
7992
let log_count = logs.len();
8093
let mut segment = TestDistributedSegment::default();
81-
for (idx, batch) in logs
82-
.chunks(LOG_CHUNK_SIZE)
83-
.enumerate()
84-
.progress()
85-
.with_message("Applying log chunk")
86-
{
94+
for (idx, batch) in logs.chunks(LOG_CHUNK_SIZE).enumerate() {
8795
segment
8896
.compact_log(Chunk::new(batch.into()), idx * LOG_CHUNK_SIZE)
8997
.await;
9098
}
91-
(segment, expected_results, log_count)
99+
(segment, expected_results, bruteforce_time, log_count)
92100
});
93101

94102
let filter_input = FilterInput {
@@ -135,7 +143,20 @@ fn bench_regex(criterion: &mut Criterion) {
135143
)
136144
};
137145

138-
bench_run(pattern, criterion, &runtime, setup, routine);
146+
bench_run(
147+
format!(
148+
"Pattern: [{pattern}], Reference duration: [{}µs]",
149+
bruteforce_time
150+
.get(*pattern)
151+
.expect("Reference bruteforce time should be present")
152+
.as_micros()
153+
)
154+
.as_str(),
155+
criterion,
156+
&runtime,
157+
setup,
158+
routine,
159+
);
139160
}
140161
}
141162

0 commit comments

Comments
 (0)