Skip to content

Commit ab65f6a

Browse files
committed
Fix method parsing
The `is_token` function, used exclusively for parsing the method in a request line, allows more values than it should. In particular, it allows a leading space to be parsed. This problem is not exposed in hyper, which revalidates any method extracted by httparse, otherwise I'm sure this would have been noticed sooner! Checking for a single range of valid bytes is very fast, so I've taken care to make sure that making `is_token` more complicated doesn't slow down the most common case. While exploring a variety of options, I found the existing benchmark scheme to be a bit misleading because it would test only a single method at a time, so I've made a new benchmark that roughly simulates a mix of requests. Ultimately, what I found to be a reasonable fix without any slowdown for the 99.9999% case is to check `b'A'..=b'Z'` and then fall back to a "byte map". Both methods and header names have the same set of allowed bytes, a "token", but their uses are slightly different. I thought it would make sense to rename `is_token` to `is_method_token`, to mimic `is_header_name_token`.
1 parent 97c7e6e commit ab65f6a

File tree

4 files changed

+70
-14
lines changed

4 files changed

+70
-14
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ std = []
1818

1919
[dev-dependencies]
2020
criterion = "0.3.5"
21+
rand = "0.8.5"
2122

2223
[lib]
2324
bench = false

benches/parse.rs

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ fn version(c: &mut Criterion) {
181181
}
182182

183183
fn method(c: &mut Criterion) {
184-
fn _method(c: &mut Criterion, name: &str, input: &'static [u8]) {
184+
fn _method(c: &mut Criterion, name: &str, input: &[u8]) {
185185
c.benchmark_group("method")
186186
.throughput(Throughput::Bytes(input.len() as u64))
187187
.bench_function(name, |b| b.iter(|| {
@@ -193,10 +193,42 @@ fn method(c: &mut Criterion) {
193193
// Common methods should be fast-pathed
194194
const COMMON_METHODS: &[&str] = &["GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE", "PATCH"];
195195
for method in COMMON_METHODS {
196-
_method(c, &method.to_lowercase(), format!("{} / HTTP/1.1\r\n", method).into_bytes().leak());
196+
_method(c, &method.to_lowercase(), format!("{} / HTTP/1.1\r\n", method).as_bytes());
197197
}
198198
// Custom methods should be infrequent and thus not worth optimizing
199199
_method(c, "custom", b"CUSTOM / HTTP/1.1\r\n");
200+
_method(c, "w3!rd", b"w3!rd / HTTP/1.1\r\n");
201+
}
202+
203+
fn many_requests(c: &mut Criterion) {
204+
use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};
205+
let mut requests = [
206+
("GET", 500),
207+
("POST", 300),
208+
("OPTIONS", 100),
209+
("HEAD", 50),
210+
("w3!r`d", 20),
211+
]
212+
.iter()
213+
.flat_map(|&(method, count)| std::iter::repeat(method).take(count))
214+
.map(|method| format!("{method} / HTTP/1.1\r\n\r\n"))
215+
.collect::<Vec<_>>();
216+
SliceRandom::shuffle(&mut *requests, &mut StdRng::seed_from_u64(0));
217+
218+
let total_bytes: usize = requests.iter().map(String::len).sum();
219+
220+
c.benchmark_group("many_requests")
221+
.throughput(Throughput::Bytes(total_bytes as u64))
222+
.measurement_time(Duration::from_secs(1))
223+
.sample_size(1000)
224+
.bench_function("_", |b| {
225+
b.iter(|| {
226+
requests.iter().for_each(|req| {
227+
let mut b = httparse::_benchable::Bytes::new(black_box(req.as_bytes()));
228+
httparse::_benchable::parse_method(&mut b).unwrap();
229+
});
230+
})
231+
});
200232
}
201233

202234
const WARMUP: Duration = Duration::from_millis(100);
@@ -205,6 +237,6 @@ const SAMPLES: usize = 200;
205237
criterion_group!{
206238
name = benches;
207239
config = Criterion::default().sample_size(SAMPLES).warm_up_time(WARMUP).measurement_time(MTIME);
208-
targets = req, req_short, resp, resp_short, uri, header, version, method
240+
targets = req, req_short, resp, resp_short, uri, header, version, method, many_requests
209241
}
210242
criterion_main!(benches);

src/lib.rs

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ pub mod _benchable {
4444
pub use super::iter::Bytes;
4545
}
4646

47-
/// Determines if byte is a token char.
47+
/// Determines if byte is a method token char.
4848
///
4949
/// > ```notrust
5050
/// > token = 1*tchar
@@ -55,8 +55,11 @@ pub mod _benchable {
5555
/// > ; any VCHAR, except delimiters
5656
/// > ```
5757
#[inline]
58-
fn is_token(b: u8) -> bool {
59-
b > 0x1F && b < 0x7F
58+
fn is_method_token(b: u8) -> bool {
59+
match b {
60+
b'A'..=b'Z' => true,
61+
_ => TOKEN_MAP[b as usize],
62+
}
6063
}
6164

6265
// ASCII codes to accept URI string.
@@ -95,7 +98,7 @@ pub(crate) fn is_uri_token(b: u8) -> bool {
9598
URI_MAP[b as usize]
9699
}
97100

98-
static HEADER_NAME_MAP: [bool; 256] = byte_map![
101+
static TOKEN_MAP: [bool; 256] = byte_map![
99102
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100103
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101104
0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
@@ -116,7 +119,7 @@ static HEADER_NAME_MAP: [bool; 256] = byte_map![
116119

117120
#[inline]
118121
pub(crate) fn is_header_name_token(b: u8) -> bool {
119-
HEADER_NAME_MAP[b as usize]
122+
TOKEN_MAP[b as usize]
120123
}
121124

122125
static HEADER_VALUE_MAP: [bool; 256] = byte_map![
@@ -930,7 +933,7 @@ fn parse_reason<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
930933
#[inline]
931934
fn parse_token<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
932935
let b = next!(bytes);
933-
if !is_token(b) {
936+
if !is_method_token(b) {
934937
// First char must be a token char, it can't be a space which would indicate an empty token.
935938
return Err(Error::Token);
936939
}
@@ -939,10 +942,10 @@ fn parse_token<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
939942
let b = next!(bytes);
940943
if b == b' ' {
941944
return Ok(Status::Complete(
942-
// SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
945+
// SAFETY: all bytes up till `i` must have been `is_method_token` and therefore also utf-8.
943946
unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
944947
));
945-
} else if !is_token(b) {
948+
} else if !is_method_token(b) {
946949
return Err(Error::Token);
947950
}
948951
}
@@ -964,7 +967,7 @@ pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
964967
}
965968

966969
return Ok(Status::Complete(
967-
// SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
970+
// SAFETY: all bytes up till `i` must have been `is_method_token` and therefore also utf-8.
968971
unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
969972
));
970973
} else {
@@ -1383,7 +1386,7 @@ pub fn parse_chunk_size(buf: &[u8])
13831386

13841387
#[cfg(test)]
13851388
mod tests {
1386-
use super::{Request, Response, Status, EMPTY_HEADER, parse_chunk_size};
1389+
use super::{Error, Request, Response, Status, EMPTY_HEADER, parse_chunk_size};
13871390

13881391
const NUM_OF_HEADERS: usize = 4;
13891392

@@ -2676,4 +2679,24 @@ mod tests {
26762679
assert_eq!(response.headers[0].name, "foo");
26772680
assert_eq!(response.headers[0].value, &b"bar"[..]);
26782681
}
2682+
2683+
#[test]
2684+
fn test_request_with_leading_space() {
2685+
let mut headers = [EMPTY_HEADER; 1];
2686+
let mut request = Request::new(&mut headers[..]);
2687+
let result = crate::ParserConfig::default()
2688+
.parse_request(&mut request, b" GET / HTTP/1.1\r\nfoo:bar\r\n\r\n");
2689+
2690+
assert_eq!(result, Err(Error::Token));
2691+
}
2692+
2693+
#[test]
2694+
fn test_request_with_invalid_method() {
2695+
let mut headers = [EMPTY_HEADER; 1];
2696+
let mut request = Request::new(&mut headers[..]);
2697+
let result = crate::ParserConfig::default()
2698+
.parse_request(&mut request, b"P()ST / HTTP/1.1\r\nfoo:bar\r\n\r\n");
2699+
2700+
assert_eq!(result, Err(Error::Token));
2701+
}
26792702
}

src/simd/neon.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ fn neon_code_matches_header_name_chars_table() {
235235
unsafe {
236236
assert!(byte_is_allowed(b'_', match_header_name_vectored));
237237

238-
for (b, allowed) in crate::HEADER_NAME_MAP.iter().cloned().enumerate() {
238+
for (b, allowed) in crate::TOKEN_MAP.iter().cloned().enumerate() {
239239
assert_eq!(
240240
byte_is_allowed(b as u8, match_header_name_vectored),
241241
allowed,

0 commit comments

Comments
 (0)