bitcode/
str.rs

1use crate::coder::{Buffer, Decoder, Encoder, Result, View};
2use crate::consume::consume_bytes;
3use crate::derive::vec::VecEncoder;
4use crate::error::err;
5use crate::fast::{NextUnchecked, SliceImpl};
6use crate::length::LengthDecoder;
7use crate::u8_char::U8Char;
8use alloc::borrow::ToOwned;
9use alloc::string::String;
10use alloc::vec::Vec;
11use core::num::NonZeroUsize;
12use core::str::{from_utf8, from_utf8_unchecked};
13
14#[derive(Default)]
15pub struct StrEncoder(pub(crate) VecEncoder<U8Char>); // pub(crate) for arrayvec.rs
16
17#[inline(always)]
18fn str_as_u8_chars(s: &str) -> &[U8Char] {
19    bytemuck::must_cast_slice(s.as_bytes())
20}
21
22impl Buffer for StrEncoder {
23    fn collect_into(&mut self, out: &mut Vec<u8>) {
24        self.0.collect_into(out);
25    }
26
27    fn reserve(&mut self, additional: NonZeroUsize) {
28        self.0.reserve(additional);
29    }
30}
31
32impl Encoder<str> for StrEncoder {
33    #[inline(always)]
34    fn encode(&mut self, t: &str) {
35        self.0.encode(str_as_u8_chars(t));
36    }
37
38    #[inline(always)]
39    fn encode_vectored<'a>(&mut self, i: impl Iterator<Item = &'a str> + Clone) {
40        self.0.encode_vectored(i.map(str_as_u8_chars));
41    }
42}
43
44// TODO find a way to remove this shim.
45impl<'b> Encoder<&'b str> for StrEncoder {
46    #[inline(always)]
47    fn encode(&mut self, t: &&str) {
48        self.encode(*t);
49    }
50
51    #[inline(always)]
52    fn encode_vectored<'a>(&mut self, i: impl Iterator<Item = &'a &'b str> + Clone)
53    where
54        &'b str: 'a,
55    {
56        self.encode_vectored(i.copied());
57    }
58}
59
60impl Encoder<String> for StrEncoder {
61    #[inline(always)]
62    fn encode(&mut self, t: &String) {
63        self.encode(t.as_str());
64    }
65
66    #[inline(always)]
67    fn encode_vectored<'a>(&mut self, i: impl Iterator<Item = &'a String> + Clone)
68    where
69        String: 'a,
70    {
71        self.encode_vectored(i.map(String::as_str));
72    }
73}
74
75// Doesn't use VecDecoder because can't decode &[u8].
76#[derive(Default)]
77pub struct StrDecoder<'a> {
78    // pub(crate) for arrayvec::ArrayString.
79    pub(crate) lengths: LengthDecoder<'a>,
80    strings: SliceImpl<'a, u8>,
81}
82
83impl<'a> View<'a> for StrDecoder<'a> {
84    fn populate(&mut self, input: &mut &'a [u8], length: usize) -> Result<()> {
85        self.lengths.populate(input, length)?;
86        let bytes = consume_bytes(input, self.lengths.length())?;
87
88        // Fast path: If bytes are ASCII then they're valid UTF-8 and no char boundary can be invalid.
89        // TODO(optimization):
90        // - Worst case when bytes doesn't fit in CPU cache, this will load bytes 3 times from RAM.
91        // - We should subdivide it into chunks in that case.
92        if is_ascii_simd(bytes)
93            || from_utf8(bytes).is_ok_and(|s| {
94                // length == 0 implies bytes.is_empty() so no char boundaries can be broken. This
95                // early exit allows us to do length.get() - 1 without possibility of overflow.
96                let Some(length) = NonZeroUsize::new(length) else {
97                    debug_assert_eq!(bytes.len(), 0);
98                    return true;
99                };
100                // Check that gaps between individual strings are on char boundaries in larger string.
101                // Boundaries at start and end of `s` aren't checked since s: &str guarantees them.
102                let mut length_decoder = self.lengths.borrowed_clone();
103                let mut end = 0;
104                for _ in 0..length.get() - 1 {
105                    end += length_decoder.decode();
106                    // TODO(optimization) is_char_boundary has unnecessary checks.
107                    if !s.is_char_boundary(end) {
108                        return false;
109                    }
110                }
111                true
112            })
113        {
114            self.strings = bytes.into();
115            Ok(())
116        } else {
117            err("invalid utf8")
118        }
119    }
120}
121
122impl<'a> Decoder<'a, &'a str> for StrDecoder<'a> {
123    #[inline(always)]
124    fn decode(&mut self) -> &'a str {
125        let bytes = unsafe { self.strings.chunk_unchecked(self.lengths.decode()) };
126        debug_assert!(from_utf8(bytes).is_ok());
127
128        // Safety: `bytes` is valid UTF-8 because populate checked that `self.strings` is valid UTF-8
129        // and that every sub string starts and ends on char boundaries.
130        unsafe { from_utf8_unchecked(bytes) }
131    }
132}
133
134impl<'a> Decoder<'a, String> for StrDecoder<'a> {
135    #[inline(always)]
136    fn decode(&mut self) -> String {
137        let v: &str = self.decode();
138        v.to_owned()
139    }
140}
141
/// Returns `true` if every byte of `v` is ASCII.
///
/// Tests 128 bytes a time instead of `<[u8]>::is_ascii` which only tests 8.
/// 390% faster on 8KB, 27% faster on 1GB (RAM bottleneck).
fn is_ascii_simd(v: &[u8]) -> bool {
    const CHUNK: usize = 128;
    let mut blocks = v.chunks_exact(CHUNK);
    for block in blocks.by_ref() {
        // OR the whole block together without branching: the high bit of the result is set
        // exactly when at least one byte in the block is non-ASCII.
        let combined = block.iter().fold(0u8, |acc, &byte| acc | byte);
        if combined & 0x80 != 0 {
            debug_assert!(!block.is_ascii());
            return false;
        }
    }
    // Fewer than `CHUNK` bytes are left over; the standard check handles those.
    let tail = blocks.remainder();
    debug_assert!(v[..v.len() - tail.len()].is_ascii());
    tail.is_ascii()
}
161
#[cfg(test)]
mod tests {
    use super::is_ascii_simd;
    use crate::u8_char::U8Char;
    use crate::{decode, encode};
    use alloc::borrow::ToOwned;
    use test::{black_box, Bencher};

    /// Decoding must reject invalid UTF-8 and strings split in the middle of a char.
    #[test]
    fn utf8_validation() {
        // Check from_utf8:
        assert!(decode::<&str>(&encode(&vec![U8Char(255u8)])).is_err());
        assert_eq!(decode::<&str>(&encode("\0")).unwrap(), "\0");
        assert_eq!(decode::<&str>(&encode(&"☺".to_owned())).unwrap(), "☺");

        // Split a multi-byte char after its first byte: the concatenation is valid UTF-8 but
        // neither half is.
        let c = "☺";
        let full = super::str_as_u8_chars(c);
        let start = &full[..1];
        let end = &full[1..];

        // Check is_char_boundary:
        assert!(decode::<[&str; 2]>(&encode(&[start.to_vec(), end.to_vec()])).is_err());
        assert_eq!(decode::<[&str; 2]>(&encode(&[c, c])).unwrap(), [c, c]);
    }

    /// Covers the full-chunk path, the remainder path, and inputs that hit both.
    #[test]
    fn test_is_ascii_simd() {
        // Exactly one full chunk.
        assert!(is_ascii_simd(&[0x7F; 128]));
        assert!(!is_ascii_simd(&[0xFF; 128]));

        // Empty input and inputs shorter than one chunk (remainder only).
        assert!(is_ascii_simd(&[0u8; 0]));
        assert!(is_ascii_simd(&[0x7F; 127]));
        assert!(!is_ascii_simd(&[0x80; 1]));

        // Two full chunks followed by a remainder, with the offending byte in either part.
        let mut bytes = [0x7Fu8; 300];
        assert!(is_ascii_simd(&bytes));
        bytes[299] = 0x80;
        assert!(!is_ascii_simd(&bytes));
        bytes[299] = 0x7F;
        bytes[200] = 0x80;
        assert!(!is_ascii_simd(&bytes));
    }

    #[bench]
    fn bench_is_ascii(b: &mut Bencher) {
        b.iter(|| black_box(&[0; 8192]).is_ascii())
    }

    #[bench]
    fn bench_is_ascii_simd(b: &mut Bencher) {
        b.iter(|| is_ascii_simd(black_box(&[0; 8192])))
    }

    type S = &'static str;
    fn bench_data() -> (S, S, S, S, S, S, S) {
        ("a", "b", "c", "d", "e", "f", "g")
    }
    crate::bench_encode_decode!(str_tuple: (&str, &str, &str, &str, &str, &str, &str));
}
209
#[cfg(test)]
mod tests2 {
    use alloc::string::String;
    use alloc::vec::Vec;

    /// Builds 40000 strings made of spaces, each `8 + n / 32` bytes long for a random `u8` `n`.
    fn bench_data() -> Vec<String> {
        let spaces = |n: u8| {
            let len = (8 + n / 32) as usize;
            " ".repeat(len)
        };
        let seeds = crate::random_data::<u8>(40000);
        seeds.into_iter().map(spaces).collect()
    }
    crate::bench_encode_decode!(str_vec: Vec<String>);
}