Skip to content

Commit 36c38e5

Browse files
authored
docs: document the AVX2 instructions (#203)
1 parent 876d08c commit 36c38e5

File tree

1 file changed

+58
-0
lines changed

1 file changed

+58
-0
lines changed

src/simd/avx2.rs

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,25 +21,62 @@ pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
2121
#[allow(non_snake_case, overflowing_literals)]
2222
#[allow(unused)]
2323
unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
24+
// NOTE: This check might be not necessary since this function is only used in
25+
// `match_uri_vectored` where buffer overflow is taken care of.
2426
debug_assert!(buf.len() >= 32);
2527

2628
#[cfg(target_arch = "x86")]
2729
use core::arch::x86::*;
2830
#[cfg(target_arch = "x86_64")]
2931
use core::arch::x86_64::*;
3032

33+
// pointer to buffer
3134
let ptr = buf.as_ptr();
3235

3336
// %x21-%x7e %x80-%xff
37+
//
38+
// Character ranges allowed by this function, can also be interpreted as:
39+
// 33 =< (x != 127) =< 255
40+
//
41+
// Create a vector full of DEL (0x7f) characters.
3442
let DEL: __m256i = _mm256_set1_epi8(0x7f);
43+
// Create a vector full of exclamation mark (!) (0x21) characters.
44+
// Used as lower threshold, characters in URLs cannot be smaller than this.
3545
let LOW: __m256i = _mm256_set1_epi8(0x21);
3646

47+
// Load a chunk of 32 bytes from `ptr` as a vector.
48+
// We can check 32 bytes in parallel at most with AVX2 since
49+
// YMM registers can only have 256 bits most.
3750
let dat = _mm256_lddqu_si256(ptr as *const _);
51+
3852
// unsigned comparison dat >= LOW
53+
//
54+
// We create a new via `_mm256_max_epu8` which compares vectors `dat` and `LOW`
55+
// and picks the max. values from each for all indices.
56+
// So if a byte in `dat` is <= 32, it'll be represented as 33
57+
// which is the smallest valid character.
58+
//
59+
// Then, we compare the new vector with `dat` for equality.
60+
//
61+
// `_mm256_cmpeq_epi8` returns a new vector where;
62+
// * matching bytes are set to 0xFF (all bits set),
63+
// * nonmatching bytes are set to 0 (no bits set).
3964
let low = _mm256_cmpeq_epi8(_mm256_max_epu8(dat, LOW), dat);
65+
// Similar to what we did before, but now invalid characters are set to 0xFF.
4066
let del = _mm256_cmpeq_epi8(dat, DEL);
67+
68+
// We glue the both comparisons via `_mm256_andnot_si256`.
69+
//
70+
// Since the representation of truthy/falsy differ in these comparisons,
71+
// we cannot use
72+
// we are in need of bitwise NOT to convert valid characters of `del`.
4173
let bit = _mm256_andnot_si256(del, low);
74+
// This creates a bitmask from the most significant bit of each byte.
75+
// Simply, we're converting a vector value to scalar value here.
4276
let res = _mm256_movemask_epi8(bit) as u32;
77+
78+
// Count trailing zeros to find the first encountered invalid character.
79+
// Bitwise NOT is required once again to flip truthiness.
4380
// TODO: use .trailing_ones() once MSRV >= 1.46
4481
(!res).trailing_zeros() as usize
4582
}
@@ -72,17 +109,38 @@ unsafe fn match_header_value_char_32_avx(buf: &[u8]) -> usize {
72109
let ptr = buf.as_ptr();
73110

74111
// %x09 %x20-%x7e %x80-%xff
112+
// Create a vector full of horizontal tab (\t) (0x09) characters.
75113
let TAB: __m256i = _mm256_set1_epi8(0x09);
114+
// Create a vector full of DEL (0x7f) characters.
76115
let DEL: __m256i = _mm256_set1_epi8(0x7f);
116+
// Create a vector full of space (0x20) characters.
77117
let LOW: __m256i = _mm256_set1_epi8(0x20);
78118

119+
// Load a chunk of 32 bytes from `ptr` as a vector.
79120
let dat = _mm256_lddqu_si256(ptr as *const _);
121+
80122
// unsigned comparison dat >= LOW
123+
//
124+
// Same as what we do in `match_url_char_32_avx`.
125+
// This time the lower threshold is set to space character though.
81126
let low = _mm256_cmpeq_epi8(_mm256_max_epu8(dat, LOW), dat);
127+
// Check if `dat` includes `TAB` characters.
82128
let tab = _mm256_cmpeq_epi8(dat, TAB);
129+
// Check if `dat` includes `DEL` characters.
83130
let del = _mm256_cmpeq_epi8(dat, DEL);
131+
132+
// Combine all comparisons together, notice that we're also using OR
133+
// to connect `low` and `tab` but flip bits of `del`.
134+
//
135+
// In the end, this is simply:
136+
// ~del & (low | tab)
84137
let bit = _mm256_andnot_si256(del, _mm256_or_si256(low, tab));
138+
// This creates a bitmask from the most significant bit of each byte.
139+
// Creates a scalar value from vector value.
85140
let res = _mm256_movemask_epi8(bit) as u32;
141+
142+
// Count trailing zeros to find the first encountered invalid character.
143+
// Bitwise NOT is required once again to flip truthiness.
86144
// TODO: use .trailing_ones() once MSRV >= 1.46
87145
(!res).trailing_zeros() as usize
88146
}

0 commit comments

Comments
 (0)