@@ -109,71 +109,52 @@ const fn uniform_block(b: u8) -> usize {
109
109
( b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */ ) as usize
110
110
}
111
111
112
- // A byte-wise range-check on an enire word/block,
113
- // ensuring all bytes in the word satisfy
114
- // `33 <= x <= 126 && x != '>' && x != '<'`
115
- // IMPORTANT: it false negatives if the block contains '?'
112
+ // A byte-wise range-check on an entire word/block,
113
+ // ensuring all bytes in the word satisfy `33 <= x <= 255 && x != 127`
116
114
#[ inline]
117
115
fn match_uri_char_8_swar ( block : ByteBlock ) -> usize {
118
- // 33 <= x <= 126
116
+ // Accepted bytes: 33 <= x <= 255 && x != 127 (everything below '!' and DEL itself are rejected)
119
117
const M : u8 = 0x21 ;
120
- const N : u8 = 0x7E ;
118
+ // uniform block full of exclamation mark (!) (33), the lowest accepted byte.
121
119
const BM : usize = uniform_block ( M ) ;
122
- const BN : usize = uniform_block ( 127 - N ) ;
120
+ // uniform block full of 1, subtracted below to detect zero bytes in `xor_del`.
121
+ const ONE : usize = uniform_block ( 0x01 ) ;
122
+ // uniform block full of DEL (127).
123
+ const DEL : usize = uniform_block ( 0x7f ) ;
124
+ // uniform block full of 128: mask selecting the high bit of every byte.
123
125
const M128 : usize = uniform_block ( 128 ) ;
124
126
125
127
let x = usize:: from_ne_bytes ( block) ; // Really just a transmute (native-endian reinterpretation)
126
128
let lt = x. wrapping_sub ( BM ) & !x; // per byte: high bit set when x < M; a borrow can only reach bytes more significant than an already-flagged one
127
- let gt = x. wrapping_add ( BN ) | x; // >= n
128
-
129
- // XOR checks to catch '<' & '>' for correctness
130
- //
131
- // XOR can be thought of as a "distance function"
132
- // (somewhat extrapolating from the `xor(x, x) = 0` identity and ∀ x != y: xor(x, y) != 0`
133
- // (each u8 "xor key" providing a unique total ordering of u8)
134
- // '<' and '>' have a "xor distance" of 2 (`xor('<', '>') = 2`)
135
- // xor(x, '>') <= 2 => {'>', '?', '<'}
136
- // xor(x, '<') <= 2 => {'<', '=', '>'}
137
- //
138
- // We assume P('=') > P('?'),
139
- // given well/commonly-formatted URLs with querystrings contain
140
- // a single '?' but possibly many '='
141
- //
142
- // Thus it's preferable/near-optimal to "xor distance" on '>',
143
- // since we'll slowpath at most one block per URL
144
- //
145
- // Some rust code to sanity check this yourself:
146
- // ```rs
147
- // fn xordist(x: u8, n: u8) -> Vec<(char, u8)> {
148
- // (0..=255).into_iter().map(|c| (c as char, c ^ x)).filter(|(_c, y)| *y <= n).collect()
149
- // }
150
- // (xordist(b'<', 2), xordist(b'>', 2))
151
- // ```
152
- const B3 : usize = uniform_block ( 3 ) ; // (dist <= 2) + 1 to wrap
153
- const BGT : usize = uniform_block ( b'>' ) ;
154
-
155
- let xgt = x ^ BGT ;
156
- let ltgtq = xgt. wrapping_sub ( B3 ) & !xgt;
157
-
158
- offsetnz ( ( ltgtq | lt | gt) & M128 )
129
+
130
+ let xor_del = x ^ DEL ;
131
+ let eq_del = xor_del. wrapping_sub ( ONE ) & !xor_del; // per byte: high bit set when x == DEL (zero-byte test on the XOR)
132
+
133
+ offsetnz ( ( lt | eq_del) & M128 ) // keep only the per-byte high bits; `offsetnz` (defined elsewhere) is documented as finding the first non-zero byte
159
134
}
160
135
161
136
// A byte-wise range-check on an entire word/block,
162
- // ensuring all bytes in the word satisfy `32 <= x <= 126`
163
- // IMPORTANT: false negatives if obs-text is present (0x80..=0xFF)
137
+ // ensuring all bytes in the word satisfy `32 <= x <= 255 && x != 127`
164
138
#[ inline]
165
139
fn match_header_value_char_8_swar ( block : ByteBlock ) -> usize {
166
- // 32 <= x <= 126
140
+ // Accepted bytes: 32 <= x <= 255 && x != 127 (everything below space and DEL itself are rejected)
167
141
const M : u8 = 0x20 ;
168
- const N : u8 = 0x7E ;
142
+ // uniform block full of space (SP) (32), the lowest accepted byte.
169
143
const BM : usize = uniform_block ( M ) ;
170
- const BN : usize = uniform_block ( 127 - N ) ;
144
+ // uniform block full of 1, subtracted below to detect zero bytes in `xor_del`.
145
+ const ONE : usize = uniform_block ( 0x01 ) ;
146
+ // uniform block full of DEL (127).
147
+ const DEL : usize = uniform_block ( 0x7f ) ;
148
+ // uniform block full of 128: mask selecting the high bit of every byte.
171
149
const M128 : usize = uniform_block ( 128 ) ;
172
150
173
151
let x = usize:: from_ne_bytes ( block) ; // Really just a transmute (native-endian reinterpretation)
174
152
let lt = x. wrapping_sub ( BM ) & !x; // per byte: high bit set when x < M; a borrow can only reach bytes more significant than an already-flagged one
175
- let gt = x. wrapping_add ( BN ) | x; // >= n
176
- offsetnz ( ( lt | gt) & M128 )
153
+
154
+ let xor_del = x ^ DEL ;
155
+ let eq_del = xor_del. wrapping_sub ( ONE ) & !xor_del; // per byte: high bit set when x == DEL (zero-byte test on the XOR)
156
+
157
+ offsetnz ( ( lt | eq_del) & M128 ) // keep only the per-byte high bits; `offsetnz` (defined elsewhere) is documented as finding the first non-zero byte
177
158
}
178
159
179
160
/// Check block to find offset of first non-zero byte
@@ -202,13 +183,15 @@ fn test_is_header_value_block() {
202
183
for b in 0 ..32_u8 {
203
184
assert ! ( !is_header_value_block( [ b; BLOCK_SIZE ] ) , "b={}" , b) ;
204
185
}
205
- // 32..127 => true
206
- for b in 32 ..127_u8 {
186
+ // 32..=126 => true
187
+ for b in 32 ..= 126_u8 {
207
188
assert ! ( is_header_value_block( [ b; BLOCK_SIZE ] ) , "b={}" , b) ;
208
189
}
209
- // 127..=255 => false
210
- for b in 127 ..=255_u8 {
211
- assert ! ( !is_header_value_block( [ b; BLOCK_SIZE ] ) , "b={}" , b) ;
190
+ // 127 => false
191
+ assert ! ( !is_header_value_block( [ b'\x7F' ; BLOCK_SIZE ] ) , "b={}" , b'\x7F' ) ;
192
+ // 128..=255 => true
193
+ for b in 128 ..=255_u8 {
194
+ assert ! ( is_header_value_block( [ b; BLOCK_SIZE ] ) , "b={}" , b) ;
212
195
}
213
196
214
197
@@ -228,14 +211,15 @@ fn test_is_uri_block() {
228
211
for b in 0 ..33_u8 {
229
212
assert ! ( !is_uri_block( [ b; BLOCK_SIZE ] ) , "b={}" , b) ;
230
213
}
231
- // 33..127 => true if b not in { '<', '?', '>' }
232
- let falsy = |b| b"<?>" . contains ( & b) ;
233
- for b in 33 ..127_u8 {
234
- assert_eq ! ( is_uri_block( [ b; BLOCK_SIZE ] ) , !falsy( b) , "b={}" , b) ;
214
+ // 33..=126 => true
215
+ for b in 33 ..=126_u8 {
216
+ assert ! ( is_uri_block( [ b; BLOCK_SIZE ] ) , "b={}" , b) ;
235
217
}
236
- // 127..=255 => false
237
- for b in 127 ..=255_u8 {
238
- assert ! ( !is_uri_block( [ b; BLOCK_SIZE ] ) , "b={}" , b) ;
218
+ // 127 => false
219
+ assert ! ( !is_uri_block( [ b'\x7F' ; BLOCK_SIZE ] ) , "b={}" , b'\x7F' ) ;
220
+ // 128..=255 => true
221
+ for b in 128 ..=255_u8 {
222
+ assert ! ( is_uri_block( [ b; BLOCK_SIZE ] ) , "b={}" , b) ;
239
223
}
240
224
}
241
225
0 commit comments