Skip to content

Commit 11a86d1

Browse files
authored
fix(swar): utf8 support (#205)
This supersedes #202. The SWAR validator now allows UTF-8 characters, as the other SIMD backends do. Tests are also updated. Closes #201
1 parent 36c38e5 commit 11a86d1

File tree

2 files changed

+44
-61
lines changed

2 files changed

+44
-61
lines changed

src/simd/avx2.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
5151

5252
// unsigned comparison dat >= LOW
5353
//
54-
// We create a new via `_mm256_max_epu8` which compares vectors `dat` and `LOW`
54+
// `_mm256_max_epu8` creates a new vector by comparing vectors `dat` and `LOW`
5555
// and picks the max. values from each for all indices.
5656
// So if a byte in `dat` is <= 32, it'll be represented as 33
5757
// which is the smallest valid character.
@@ -67,8 +67,7 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
6767

6868
// We glue both comparisons together via `_mm256_andnot_si256`.
6969
//
70-
// Since the representation of truthy/falsy differ in these comparisons,
71-
// we cannot use
70+
// Since the representation of truthiness differs in these comparisons,
7271
// we are in need of bitwise NOT to convert valid characters of `del`.
7372
let bit = _mm256_andnot_si256(del, low);
7473
// This creates a bitmask from the most significant bit of each byte.

src/simd/swar.rs

Lines changed: 42 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -109,71 +109,52 @@ const fn uniform_block(b: u8) -> usize {
109109
(b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize
110110
}
111111

112-
// A byte-wise range-check on an enire word/block,
113-
// ensuring all bytes in the word satisfy
114-
// `33 <= x <= 126 && x != '>' && x != '<'`
115-
// IMPORTANT: it false negatives if the block contains '?'
112+
// A byte-wise range-check on an entire word/block,
113+
// ensuring all bytes in the word satisfy `33 <= (x != 127) <= 255`
116114
#[inline]
117115
fn match_uri_char_8_swar(block: ByteBlock) -> usize {
118-
// 33 <= x <= 126
116+
// 33 <= (x != 127) <= 255
119117
const M: u8 = 0x21;
120-
const N: u8 = 0x7E;
118+
// uniform block full of exclamation mark (!) (33).
121119
const BM: usize = uniform_block(M);
122-
const BN: usize = uniform_block(127 - N);
120+
// uniform block full of 1.
121+
const ONE: usize = uniform_block(0x01);
122+
// uniform block full of DEL (127).
123+
const DEL: usize = uniform_block(0x7f);
124+
// uniform block full of 128.
123125
const M128: usize = uniform_block(128);
124126

125127
let x = usize::from_ne_bytes(block); // Really just a transmute
126128
let lt = x.wrapping_sub(BM) & !x; // <= m
127-
let gt = x.wrapping_add(BN) | x; // >= n
128-
129-
// XOR checks to catch '<' & '>' for correctness
130-
//
131-
// XOR can be thought of as a "distance function"
132-
// (somewhat extrapolating from the `xor(x, x) = 0` identity and ∀ x != y: xor(x, y) != 0`
133-
// (each u8 "xor key" providing a unique total ordering of u8)
134-
// '<' and '>' have a "xor distance" of 2 (`xor('<', '>') = 2`)
135-
// xor(x, '>') <= 2 => {'>', '?', '<'}
136-
// xor(x, '<') <= 2 => {'<', '=', '>'}
137-
//
138-
// We assume P('=') > P('?'),
139-
// given well/commonly-formatted URLs with querystrings contain
140-
// a single '?' but possibly many '='
141-
//
142-
// Thus it's preferable/near-optimal to "xor distance" on '>',
143-
// since we'll slowpath at most one block per URL
144-
//
145-
// Some rust code to sanity check this yourself:
146-
// ```rs
147-
// fn xordist(x: u8, n: u8) -> Vec<(char, u8)> {
148-
// (0..=255).into_iter().map(|c| (c as char, c ^ x)).filter(|(_c, y)| *y <= n).collect()
149-
// }
150-
// (xordist(b'<', 2), xordist(b'>', 2))
151-
// ```
152-
const B3: usize = uniform_block(3); // (dist <= 2) + 1 to wrap
153-
const BGT: usize = uniform_block(b'>');
154-
155-
let xgt = x ^ BGT;
156-
let ltgtq = xgt.wrapping_sub(B3) & !xgt;
157-
158-
offsetnz((ltgtq | lt | gt) & M128)
129+
130+
let xor_del = x ^ DEL;
131+
let eq_del = xor_del.wrapping_sub(ONE) & !xor_del; // == DEL
132+
133+
offsetnz((lt | eq_del) & M128)
159134
}
160135

161136
// A byte-wise range-check on an entire word/block,
162-
// ensuring all bytes in the word satisfy `32 <= x <= 126`
163-
// IMPORTANT: false negatives if obs-text is present (0x80..=0xFF)
137+
// ensuring all bytes in the word satisfy `32 <= (x != 127) <= 255`
164138
#[inline]
165139
fn match_header_value_char_8_swar(block: ByteBlock) -> usize {
166-
// 32 <= x <= 126
140+
// 32 <= (x != 127) <= 255
167141
const M: u8 = 0x20;
168-
const N: u8 = 0x7E;
142+
// uniform block full of space ( ) (32).
169143
const BM: usize = uniform_block(M);
170-
const BN: usize = uniform_block(127 - N);
144+
// uniform block full of 1.
145+
const ONE: usize = uniform_block(0x01);
146+
// uniform block full of DEL (127).
147+
const DEL: usize = uniform_block(0x7f);
148+
// uniform block full of 128.
171149
const M128: usize = uniform_block(128);
172150

173151
let x = usize::from_ne_bytes(block); // Really just a transmute
174152
let lt = x.wrapping_sub(BM) & !x; // <= m
175-
let gt = x.wrapping_add(BN) | x; // >= n
176-
offsetnz((lt | gt) & M128)
153+
154+
let xor_del = x ^ DEL;
155+
let eq_del = xor_del.wrapping_sub(ONE) & !xor_del; // == DEL
156+
157+
offsetnz((lt | eq_del) & M128)
177158
}
178159

179160
/// Check block to find offset of first non-zero byte
@@ -202,13 +183,15 @@ fn test_is_header_value_block() {
202183
for b in 0..32_u8 {
203184
assert!(!is_header_value_block([b; BLOCK_SIZE]), "b={}", b);
204185
}
205-
// 32..127 => true
206-
for b in 32..127_u8 {
186+
// 32..=126 => true
187+
for b in 32..=126_u8 {
207188
assert!(is_header_value_block([b; BLOCK_SIZE]), "b={}", b);
208189
}
209-
// 127..=255 => false
210-
for b in 127..=255_u8 {
211-
assert!(!is_header_value_block([b; BLOCK_SIZE]), "b={}", b);
190+
// 127 => false
191+
assert!(!is_header_value_block([b'\x7F'; BLOCK_SIZE]), "b={}", b'\x7F');
192+
// 128..=255 => true
193+
for b in 128..=255_u8 {
194+
assert!(is_header_value_block([b; BLOCK_SIZE]), "b={}", b);
212195
}
213196

214197

@@ -228,14 +211,15 @@ fn test_is_uri_block() {
228211
for b in 0..33_u8 {
229212
assert!(!is_uri_block([b; BLOCK_SIZE]), "b={}", b);
230213
}
231-
// 33..127 => true if b not in { '<', '?', '>' }
232-
let falsy = |b| b"<?>".contains(&b);
233-
for b in 33..127_u8 {
234-
assert_eq!(is_uri_block([b; BLOCK_SIZE]), !falsy(b), "b={}", b);
214+
// 33..=126 => true
215+
for b in 33..=126_u8 {
216+
assert!(is_uri_block([b; BLOCK_SIZE]), "b={}", b);
235217
}
236-
// 127..=255 => false
237-
for b in 127..=255_u8 {
238-
assert!(!is_uri_block([b; BLOCK_SIZE]), "b={}", b);
218+
// 127 => false
219+
assert!(!is_uri_block([b'\x7F'; BLOCK_SIZE]), "b={}", b'\x7F');
220+
// 128..=255 => true
221+
for b in 128..=255_u8 {
222+
assert!(is_uri_block([b; BLOCK_SIZE]), "b={}", b);
239223
}
240224
}
241225

0 commit comments

Comments
 (0)