httparse/simd/
neon.rs

1use crate::iter::Bytes;
2use core::arch::aarch64::*;
3
4#[inline]
5pub fn match_header_name_vectored(bytes: &mut Bytes) {
6    while bytes.as_ref().len() >= 16 {
7        // SAFETY: ensured that there are at least 16 bytes remaining 
8        unsafe {
9            let advance = match_header_name_char_16_neon(bytes.as_ref().as_ptr());
10            bytes.advance(advance);
11
12            if advance != 16 {
13                return;
14            }
15        }
16    }
17    super::swar::match_header_name_vectored(bytes);
18}
19
20#[inline]
21pub fn match_header_value_vectored(bytes: &mut Bytes) {
22    while bytes.as_ref().len() >= 16 {
23        // SAFETY: ensured that there are at least 16 bytes remaining 
24        unsafe {
25            let advance = match_header_value_char_16_neon(bytes.as_ref().as_ptr());
26            bytes.advance(advance);
27
28            if advance != 16 {
29                return;
30            }
31        }
32    }
33    super::swar::match_header_value_vectored(bytes);
34}
35
36#[inline]
37pub fn match_uri_vectored(bytes: &mut Bytes) {
38    while bytes.as_ref().len() >= 16 {
39        // SAFETY: ensured that there are at least 16 bytes remaining 
40        unsafe {
41            let advance = match_url_char_16_neon(bytes.as_ref().as_ptr());
42            bytes.advance(advance);
43
44            if advance != 16 {
45                return;
46            }
47        }
48    }
49    super::swar::match_uri_vectored(bytes);
50}
51
/// Returns `true` when `x` is a valid header-name (token) character.
///
/// The accepted set is ASCII letters and digits plus the punctuation
/// ``! # $ % & ' * + - . ^ _ ` | ~``.
/// See https://tools.ietf.org/html/rfc7230#section-3.2.6
const fn bit_set(x: u8) -> bool {
    match x {
        // Alphanumerics.
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' => true,
        // Punctuation permitted inside a token.
        b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' => true,
        b'-' | b'.' | b'^' | b'_' | b'`' | b'|' | b'~' => true,
        _ => false,
    }
}
57
58// A 256-bit bitmap, split into two halves
59// lower half contains bits whose higher nibble is <= 7
60// higher half contains bits whose higher nibble is >= 8
61const fn build_bitmap() -> ([u8; 16], [u8; 16]) {
62    let mut bitmap_0_7 = [0u8; 16]; // 0x00..0x7F
63    let mut bitmap_8_15 = [0u8; 16]; // 0x80..0xFF
64    let mut i = 0;
65    while i < 256 {
66        if bit_set(i as u8) {
67            // Nibbles
68            let (lo, hi) = (i & 0x0F, i >> 4);
69            if i < 128 {
70                bitmap_0_7[lo] |= 1 << hi;
71            } else {
72                bitmap_8_15[lo] |= 1 << hi;
73            }
74        }
75        i += 1;
76    }
77    (bitmap_0_7, bitmap_8_15)
78}
79
80const BITMAPS: ([u8; 16], [u8; 16]) = build_bitmap();
81
82// NOTE: adapted from 256-bit version, with upper 128-bit ops commented out
83#[inline]
84unsafe fn match_header_name_char_16_neon(ptr: *const u8) -> usize {
85    let bitmaps = BITMAPS;
86    // NOTE: ideally compile-time constants
87    let (bitmap_0_7, _bitmap_8_15) = bitmaps;
88    let bitmap_0_7 = vld1q_u8(bitmap_0_7.as_ptr());
89    // let bitmap_8_15 = vld1q_u8(bitmap_8_15.as_ptr());
90
91    // Initialize the bitmask_lookup.
92    const BITMASK_LOOKUP_DATA: [u8; 16] =
93        [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
94    let bitmask_lookup = vld1q_u8(BITMASK_LOOKUP_DATA.as_ptr());
95
96    // Load 16 input bytes.
97    let input = vld1q_u8(ptr);
98
99    // Extract indices for row_0_7.
100    let indices_0_7 = vandq_u8(input, vdupq_n_u8(0x8F)); // 0b1000_1111;
101
102    // Extract indices for row_8_15.
103    // let msb = vandq_u8(input, vdupq_n_u8(0x80));
104    // let indices_8_15 = veorq_u8(indices_0_7, msb);
105
106    // Fetch row_0_7 and row_8_15.
107    let row_0_7 = vqtbl1q_u8(bitmap_0_7, indices_0_7);
108    // let row_8_15 = vqtbl1q_u8(bitmap_8_15, indices_8_15);
109
110    // Calculate a bitmask, i.e. (1 << hi_nibble % 8).
111    let bitmask = vqtbl1q_u8(bitmask_lookup, vshrq_n_u8(input, 4));
112
113    // Choose rows halves depending on higher nibbles.
114    // let bitsets = vorrq_u8(row_0_7, row_8_15);
115    let bitsets = row_0_7;
116
117    // Finally check which bytes belong to the set.
118    let tmp = vandq_u8(bitsets, bitmask);
119    let result = vceqq_u8(tmp, bitmask);
120
121    offsetz(result) as usize
122}
123
124#[inline]
125unsafe fn match_url_char_16_neon(ptr: *const u8) -> usize {
126    let input = vld1q_u8(ptr);
127
128    // Check that b'!' <= and b != 127
129    let result = vcleq_u8(vdupq_n_u8(b'!'), input);
130
131    // Disallow del
132    let del = vceqq_u8(input, vdupq_n_u8(0x7F));
133    let result = vbicq_u8(result, del);
134
135    offsetz(result) as usize
136}
137
138#[inline]
139unsafe fn match_header_value_char_16_neon(ptr: *const u8) -> usize {
140    let input = vld1q_u8(ptr);
141
142    // Check that b' ' <= and b != 127 or b == 9
143    let result = vcleq_u8(vdupq_n_u8(b' '), input);
144
145    // Allow tab
146    let tab = vceqq_u8(input, vdupq_n_u8(0x09));
147    let result = vorrq_u8(result, tab);
148
149    // Disallow del
150    let del = vceqq_u8(input, vdupq_n_u8(0x7F));
151    let result = vbicq_u8(result, del);
152
153    offsetz(result) as usize
154}
155
156#[inline]
157unsafe fn offsetz(x: uint8x16_t) -> u32 {
158    // NOT the vector since it's faster to operate with zeros instead
159    offsetnz(vmvnq_u8(x))
160}
161
162#[inline]
163unsafe fn offsetnz(x: uint8x16_t) -> u32 {
164    // Extract two u64
165    let x = vreinterpretq_u64_u8(x);
166    // Extract to general purpose registers to perform clz
167    let low: u64 = vgetq_lane_u64::<0>(x);
168    let high: u64 = vgetq_lane_u64::<1>(x);
169
170    #[inline]
171    fn clz(x: u64) -> u32 {
172        // perf: rust will unroll this loop
173        // and it's much faster than rbit + clz so voila
174        for (i, b) in x.to_ne_bytes().iter().copied().enumerate() {
175            if b != 0 {
176                return i as u32;
177            }
178        }
179        8 // Technically not reachable since zero-guarded
180    }
181
182    if low != 0 {
183        clz(low)
184    } else if high != 0 {
185        return 8 + clz(high);
186    } else {
187        return 16;
188    }
189}
190
/// Checks `match_uri_vectored` against every entry of `crate::URI_MAP`.
#[test]
fn neon_code_matches_uri_chars_table() {
    #[allow(clippy::undocumented_unsafe_blocks)]
    unsafe {
        // The probe buffer is filled with b'_', so it must itself be accepted.
        assert!(byte_is_allowed(b'_', match_uri_vectored));

        for (b, &allowed) in crate::URI_MAP.iter().enumerate() {
            let actual = byte_is_allowed(b as u8, match_uri_vectored);
            assert_eq!(
                actual, allowed,
                "byte_is_allowed({:?}) should be {:?}",
                b, allowed,
            );
        }
    }
}
208
/// Checks `match_header_value_vectored` against every entry of
/// `crate::HEADER_VALUE_MAP`.
#[test]
fn neon_code_matches_header_value_chars_table() {
    #[allow(clippy::undocumented_unsafe_blocks)]
    unsafe {
        // The probe buffer is filled with b'_', so it must itself be accepted.
        assert!(byte_is_allowed(b'_', match_header_value_vectored));

        for (b, &allowed) in crate::HEADER_VALUE_MAP.iter().enumerate() {
            let actual = byte_is_allowed(b as u8, match_header_value_vectored);
            assert_eq!(
                actual, allowed,
                "byte_is_allowed({:?}) should be {:?}",
                b, allowed,
            );
        }
    }
}
226
/// Checks `match_header_name_vectored` against every entry of
/// `crate::TOKEN_MAP`.
#[test]
fn neon_code_matches_header_name_chars_table() {
    #[allow(clippy::undocumented_unsafe_blocks)]
    unsafe {
        // The probe buffer is filled with b'_', so it must itself be accepted.
        assert!(byte_is_allowed(b'_', match_header_name_vectored));

        for (b, &allowed) in crate::TOKEN_MAP.iter().enumerate() {
            let actual = byte_is_allowed(b as u8, match_header_name_vectored);
            assert_eq!(
                actual, allowed,
                "byte_is_allowed({:?}) should be {:?}",
                b, allowed,
            );
        }
    }
}
244
/// Test helper: reports whether matcher `f` accepts `byte`.
///
/// `byte` is placed at index 10 of a 16-byte buffer otherwise filled with
/// b'_', and `f` is run over it. A final position of 16 means every byte was
/// consumed (accepted); a final position of 10 means the matcher stopped on
/// the probe (rejected). Any other position panics.
#[cfg(test)]
unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool {
    let mut probe = [b'_'; 16];
    probe[10] = byte;

    let mut cursor = Bytes::new(&probe);
    f(&mut cursor);

    let stopped_at = cursor.pos();
    if stopped_at == 16 {
        true
    } else if stopped_at == 10 {
        false
    } else {
        panic!("unexpected pos: {}", stopped_at)
    }
}
258}