1
1
use std:: collections:: HashMap ;
2
+ use std:: time:: Duration ;
2
3
3
4
use chroma_benchmark:: benchmark:: { bench_run, tokio_multi_thread} ;
4
5
use chroma_benchmark:: datasets:: types:: RecordDataset ;
@@ -16,16 +17,24 @@ use futures::{StreamExt, TryStreamExt};
16
17
use indicatif:: ProgressIterator ;
17
18
use regex:: Regex ;
18
19
use roaring:: RoaringBitmap ;
20
+ use tokio:: time:: Instant ;
19
21
use worker:: execution:: operators:: filter:: { FilterInput , FilterOperator } ;
20
22
21
23
/// How many log records are fed into each `compact_log` call when building
/// the test segment.
const LOG_CHUNK_SIZE: usize = 1000;

/// Size of the benchmark corpus — presumably the number of Wikipedia
/// documents sampled from the dataset (usage not visible in this diff view;
/// confirm against the full file).
const DOCUMENT_SIZE: usize = 10000;

/// Patterns exercised by the regex benchmark. The set spans the interesting
/// regex features: a plain literal, a case-insensitive literal, a digit
/// class, a leading/trailing wildcard, a bare alternation, and an
/// alternation combined with a literal suffix.
const REGEX_PATTERNS: &[&str] = &[
    r"wikipedia",
    r"(?i)wikipedia",
    r"20\d\d",
    r".*wiki.*",
    r"May|June",
    r"(March|April) 19\d\d",
];
24
33
25
34
// NOTE(review): this span is a rendered commit diff (interleaved old/new line
// numbers, `+`/`-` change markers, `@@` hunk headers), not compilable Rust.
// Several hunks omit code, so the comments below describe only the visible,
// post-diff behavior of the benchmark.
//
// Criterion benchmark for the regex FilterOperator: builds a distributed test
// segment from the Wikipedia dataset, precomputes the expected matching
// offsets per pattern with a brute-force `Regex::is_match` scan, and — per
// this diff — also accumulates the wall-clock cost of that brute-force scan
// so each bench id can display it as a reference duration.
fn bench_regex ( criterion : & mut Criterion ) {
26
35
let runtime = tokio_multi_thread ( ) ;
27
36
28
// Diff: the async setup block now additionally returns `bruteforce_time`,
// a per-pattern map of accumulated brute-force matching durations.
- let ( test_segment, expected_results, doc_count) = runtime. block_on ( async {
37
+ let ( test_segment, expected_results, bruteforce_time , doc_count) = runtime. block_on ( async {
29
38
let wikipedia = WikipediaDataset :: init ( )
30
39
. await
31
40
. expect ( "Wikipedia dataset should exist" ) ;
@@ -39,6 +48,7 @@ fn bench_regex(criterion: &mut Criterion) {
39
48
. expect ( "Wikipedia dataset should have valid records" ) ;
40
49
41
50
// expected_results: pattern string -> RoaringBitmap of matching record offsets.
let mut expected_results = HashMap :: new ( ) ;
51
// bruteforce_time: pattern string -> total Duration spent in `is_match` for
// that pattern (accumulated below; see the note on the timing scope there).
+ let mut bruteforce_time = HashMap :: < _ , Duration > :: new ( ) ;
42
52
let regexes = REGEX_PATTERNS
43
53
. iter ( )
44
54
. map ( |pattern_str| {
@@ -56,11 +66,14 @@ fn bench_regex(criterion: &mut Criterion) {
56
66
. enumerate ( )
57
67
. map ( |( offset, record) | {
58
68
for ( pattern_str, pattern) in & regexes {
69
// Start the reference timer before each brute-force match check.
+ let now = Instant :: now ( ) ;
59
70
if pattern. is_match ( & record. document ) {
71
// NOTE(review): `elapsed` is read (and accumulated) only inside the
// match branch, so documents that do NOT match contribute nothing to
// the reference duration — confirm this is the intended metric.
+ let elapsed = now. elapsed ( ) ;
60
72
expected_results
61
73
. entry ( pattern_str. to_string ( ) )
62
74
. or_insert ( RoaringBitmap :: new ( ) )
63
75
. insert ( offset as u32 ) ;
76
+ * bruteforce_time. entry ( pattern_str. to_string ( ) ) . or_default ( ) += elapsed;
64
77
}
65
78
}
66
79
LogRecord {
@@ -78,17 +91,12 @@ fn bench_regex(criterion: &mut Criterion) {
78
91
. collect :: < Vec < _ > > ( ) ;
79
92
let log_count = logs. len ( ) ;
80
93
let mut segment = TestDistributedSegment :: default ( ) ;
81
// Diff: indicatif progress-bar reporting was dropped from the
// log-compaction loop; chunks are applied silently now.
- for ( idx, batch) in logs
82
- . chunks ( LOG_CHUNK_SIZE )
83
- . enumerate ( )
84
- . progress ( )
85
- . with_message ( "Applying log chunk" )
86
- {
94
+ for ( idx, batch) in logs. chunks ( LOG_CHUNK_SIZE ) . enumerate ( ) {
87
95
segment
88
96
. compact_log ( Chunk :: new ( batch. into ( ) ) , idx * LOG_CHUNK_SIZE )
89
97
. await ;
90
98
}
91
- ( segment, expected_results, log_count)
99
+ ( segment, expected_results, bruteforce_time , log_count)
92
100
} ) ;
93
101
94
102
let filter_input = FilterInput {
@@ -135,7 +143,20 @@ fn bench_regex(criterion: &mut Criterion) {
135
143
)
136
144
} ;
137
145
138
// Diff: the bench id changes from the bare pattern to a formatted string
// that embeds the reference brute-force duration in microseconds. The
// `setup`/`routine` closures are defined in a hunk not visible here.
- bench_run ( pattern, criterion, & runtime, setup, routine) ;
146
+ bench_run (
147
+ format ! (
148
+ "Pattern: [{pattern}], Reference duration: [{}µs]" ,
149
// Panics if a pattern never accrued a timing entry — which happens when a
// pattern matched no document at all (see the timing note above).
+ bruteforce_time
150
+ . get( * pattern)
151
+ . expect( "Reference bruteforce time should be present" )
152
+ . as_micros( )
153
+ )
154
+ . as_str ( ) ,
155
+ criterion,
156
+ & runtime,
157
+ setup,
158
+ routine,
159
+ ) ;
139
160
}
140
161
}
141
162
0 commit comments