Skip to content

Commit 1c5faf8

Browse files
authored
cleanup: drop SWAR's 64-bit assumptions (#140)
The SWAR code now operates on one host-CPU register at a time, as intended. Note that this might not actually be faster on 32-bit targets; I would have to benchmark it, but in some cases 4 memory reads / lookup-table reads might be faster than block-wide operations.
1 parent f34faf2 commit 1c5faf8

File tree

1 file changed

+41
-38
lines changed

1 file changed

+41
-38
lines changed

src/simd/swar.rs

Lines changed: 41 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,20 @@
11
/// SWAR: SIMD Within A Register
22
/// SIMD validator backend that validates register-sized chunks of data at a time.
3-
// TODO: current impl assumes 64-bit registers, optimize for 32-bit
43
use crate::{is_header_name_token, is_header_value_token, is_uri_token, Bytes};
54

5+
// Adapt block size to match the native register size, e.g. 32-bit => 4, 64-bit => 8
6+
const BLOCK_SIZE: usize = core::mem::size_of::<usize>();
7+
type ByteBlock = [u8; BLOCK_SIZE];
8+
69
#[inline]
710
pub fn match_uri_vectored(bytes: &mut Bytes) {
811
loop {
9-
if let Some(bytes8) = bytes.peek_n::<[u8; 8]>(8) {
12+
if let Some(bytes8) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) {
1013
let n = match_uri_char_8_swar(bytes8);
1114
unsafe {
1215
bytes.advance(n);
1316
}
14-
if n == 8 {
17+
if n == BLOCK_SIZE {
1518
continue;
1619
}
1720
}
@@ -28,12 +31,12 @@ pub fn match_uri_vectored(bytes: &mut Bytes) {
2831
#[inline]
2932
pub fn match_header_value_vectored(bytes: &mut Bytes) {
3033
loop {
31-
if let Some(bytes8) = bytes.peek_n::<[u8; 8]>(8) {
34+
if let Some(bytes8) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) {
3235
let n = match_header_value_char_8_swar(bytes8);
3336
unsafe {
3437
bytes.advance(n);
3538
}
36-
if n == 8 {
39+
if n == BLOCK_SIZE {
3740
continue;
3841
}
3942
}
@@ -49,19 +52,19 @@ pub fn match_header_value_vectored(bytes: &mut Bytes) {
4952

5053
#[inline]
5154
pub fn match_header_name_vectored(bytes: &mut Bytes) {
52-
while let Some(block) = bytes.peek_n::<[u8; 8]>(8) {
55+
while let Some(block) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) {
5356
let n = match_block(is_header_name_token, block);
5457
unsafe {
5558
bytes.advance(n);
5659
}
57-
if n != 8 {
60+
if n != BLOCK_SIZE {
5861
return;
5962
}
6063
}
6164
unsafe { bytes.advance(match_tail(is_header_name_token, bytes.as_ref())) };
6265
}
6366

64-
// Matches "tail", i.e: when we have <8 bytes in the buffer, should be uncommon
67+
// Matches "tail", i.e: when we have <BLOCK_SIZE bytes in the buffer, should be uncommon
6568
#[cold]
6669
#[inline]
6770
fn match_tail(f: impl Fn(u8) -> bool, bytes: &[u8]) -> usize {
@@ -75,35 +78,35 @@ fn match_tail(f: impl Fn(u8) -> bool, bytes: &[u8]) -> usize {
7578

7679
// Naive fallback block matcher
7780
#[inline(always)]
78-
fn match_block(f: impl Fn(u8) -> bool, block: [u8; 8]) -> usize {
81+
fn match_block(f: impl Fn(u8) -> bool, block: ByteBlock) -> usize {
7982
for (i, &b) in block.iter().enumerate() {
8083
if !f(b) {
8184
return i;
8285
}
8386
}
84-
8
87+
BLOCK_SIZE
8588
}
8689

87-
/// // A const alternative to u64::from_ne_bytes to avoid bumping MSRV (1.36 => 1.44)
90+
// A const alternative to usize::from_ne_bytes to avoid bumping MSRV (1.36 => 1.44)
8891
// creates a usize whose bytes are each equal to b
89-
const fn uniform_block(b: u8) -> u64 {
90-
b as u64 * 0x01_01_01_01_01_01_01_01 // [1_u8; 8]
92+
const fn uniform_block(b: u8) -> usize {
93+
(b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize
9194
}
9295

9396
// A byte-wise range-check on an entire word/block,
9497
// ensuring all bytes in the word satisfy
9598
// `33 <= x <= 126 && x != '>' && x != '<'`
9699
// IMPORTANT: it yields false negatives if the block contains '?'
97100
#[inline]
98-
fn match_uri_char_8_swar(block: [u8; 8]) -> usize {
101+
fn match_uri_char_8_swar(block: ByteBlock) -> usize {
99102
// 33 <= x <= 126
100103
const M: u8 = 0x21;
101104
const N: u8 = 0x7E;
102-
const BM: u64 = uniform_block(M);
103-
const BN: u64 = uniform_block(127 - N);
104-
const M128: u64 = uniform_block(128);
105+
const BM: usize = uniform_block(M);
106+
const BN: usize = uniform_block(127 - N);
107+
const M128: usize = uniform_block(128);
105108

106-
let x = u64::from_ne_bytes(block); // Really just a transmute
109+
let x = usize::from_ne_bytes(block); // Really just a transmute
107110
let lt = x.wrapping_sub(BM) & !x; // <= m
108111
let gt = x.wrapping_add(BN) | x; // >= n
109112

@@ -130,8 +133,8 @@ fn match_uri_char_8_swar(block: [u8; 8]) -> usize {
130133
// }
131134
// (xordist(b'<', 2), xordist(b'>', 2))
132135
// ```
133-
const B3: u64 = uniform_block(3); // (dist <= 2) + 1 to wrap
134-
const BGT: u64 = uniform_block(b'>');
136+
const B3: usize = uniform_block(3); // (dist <= 2) + 1 to wrap
137+
const BGT: usize = uniform_block(b'>');
135138

136139
let xgt = x ^ BGT;
137140
let ltgtq = xgt.wrapping_sub(B3) & !xgt;
@@ -143,15 +146,15 @@ fn match_uri_char_8_swar(block: [u8; 8]) -> usize {
143146
// ensuring all bytes in the word satisfy `32 <= x <= 126`
144147
// IMPORTANT: false negatives if obs-text is present (0x80..=0xFF)
145148
#[inline]
146-
fn match_header_value_char_8_swar(block: [u8; 8]) -> usize {
149+
fn match_header_value_char_8_swar(block: ByteBlock) -> usize {
147150
// 32 <= x <= 126
148151
const M: u8 = 0x20;
149152
const N: u8 = 0x7E;
150-
const BM: u64 = uniform_block(M);
151-
const BN: u64 = uniform_block(127 - N);
152-
const M128: u64 = uniform_block(128);
153+
const BM: usize = uniform_block(M);
154+
const BN: usize = uniform_block(127 - N);
155+
const M128: usize = uniform_block(128);
153156

154-
let x = u64::from_ne_bytes(block); // Really just a transmute
157+
let x = usize::from_ne_bytes(block); // Really just a transmute
155158
let lt = x.wrapping_sub(BM) & !x; // <= m
156159
let gt = x.wrapping_add(BN) | x; // >= n
157160
offsetnz((lt | gt) & M128)
@@ -160,10 +163,10 @@ fn match_header_value_char_8_swar(block: [u8; 8]) -> usize {
160163
/// Check block to find offset of first non-zero byte
161164
// NOTE: Curiously `block.trailing_zeros() >> 3` appears to be slower, maybe revisit
162165
#[inline]
163-
fn offsetnz(block: u64) -> usize {
166+
fn offsetnz(block: usize) -> usize {
164167
// fast path optimistic case (common for long valid sequences)
165168
if block == 0 {
166-
return 8;
169+
return BLOCK_SIZE;
167170
}
168171

169172
// perf: rust will unroll this loop
@@ -177,19 +180,19 @@ fn offsetnz(block: u64) -> usize {
177180

178181
#[test]
179182
fn test_is_header_value_block() {
180-
let is_header_value_block = |b| match_header_value_char_8_swar(b) == 8;
183+
let is_header_value_block = |b| match_header_value_char_8_swar(b) == BLOCK_SIZE;
181184

182185
// 0..32 => false
183186
for b in 0..32_u8 {
184-
assert_eq!(is_header_value_block([b; 8]), false, "b={}", b);
187+
assert_eq!(is_header_value_block([b; BLOCK_SIZE]), false, "b={}", b);
185188
}
186189
// 32..127 => true
187190
for b in 32..127_u8 {
188-
assert_eq!(is_header_value_block([b; 8]), true, "b={}", b);
191+
assert_eq!(is_header_value_block([b; BLOCK_SIZE]), true, "b={}", b);
189192
}
190193
// 127..=255 => false
191194
for b in 127..=255_u8 {
192-
assert_eq!(is_header_value_block([b; 8]), false, "b={}", b);
195+
assert_eq!(is_header_value_block([b; BLOCK_SIZE]), false, "b={}", b);
193196
}
194197

195198
// A few sanity checks on non-uniform bytes for good measure
@@ -199,30 +202,30 @@ fn test_is_header_value_block() {
199202

200203
#[test]
201204
fn test_is_uri_block() {
202-
let is_uri_block = |b| match_uri_char_8_swar(b) == 8;
205+
let is_uri_block = |b| match_uri_char_8_swar(b) == BLOCK_SIZE;
203206

204207
// 0..33 => false
205208
for b in 0..33_u8 {
206-
assert_eq!(is_uri_block([b; 8]), false, "b={}", b);
209+
assert_eq!(is_uri_block([b; BLOCK_SIZE]), false, "b={}", b);
207210
}
208211
// 33..127 => true if b not in { '<', '?', '>' }
209212
let falsy = |b| b"<?>".contains(&b);
210213
for b in 33..127_u8 {
211-
assert_eq!(is_uri_block([b; 8]), !falsy(b), "b={}", b);
214+
assert_eq!(is_uri_block([b; BLOCK_SIZE]), !falsy(b), "b={}", b);
212215
}
213216
// 127..=255 => false
214217
for b in 127..=255_u8 {
215-
assert_eq!(is_uri_block([b; 8]), false, "b={}", b);
218+
assert_eq!(is_uri_block([b; BLOCK_SIZE]), false, "b={}", b);
216219
}
217220
}
218221

219222
#[test]
220223
fn test_offsetnz() {
221-
let seq = [0_u8; 8];
222-
for i in 0..8 {
224+
let seq = [0_u8; BLOCK_SIZE];
225+
for i in 0..BLOCK_SIZE {
223226
let mut seq = seq.clone();
224227
seq[i] = 1;
225-
let x = u64::from_ne_bytes(seq);
228+
let x = usize::from_ne_bytes(seq);
226229
assert_eq!(offsetnz(x), i);
227230
}
228231
}

0 commit comments

Comments
 (0)