1use crate::coder::{Buffer, Decoder, Encoder, Result, View};
2use crate::consume::consume_bytes;
3use crate::derive::vec::VecEncoder;
4use crate::error::err;
5use crate::fast::{NextUnchecked, SliceImpl};
6use crate::length::LengthDecoder;
7use crate::u8_char::U8Char;
8use alloc::borrow::ToOwned;
9use alloc::string::String;
10use alloc::vec::Vec;
11use core::num::NonZeroUsize;
12use core::str::{from_utf8, from_utf8_unchecked};
13
14#[derive(Default)]
15pub struct StrEncoder(pub(crate) VecEncoder<U8Char>); #[inline(always)]
18fn str_as_u8_chars(s: &str) -> &[U8Char] {
19 bytemuck::must_cast_slice(s.as_bytes())
20}
21
22impl Buffer for StrEncoder {
23 fn collect_into(&mut self, out: &mut Vec<u8>) {
24 self.0.collect_into(out);
25 }
26
27 fn reserve(&mut self, additional: NonZeroUsize) {
28 self.0.reserve(additional);
29 }
30}
31
32impl Encoder<str> for StrEncoder {
33 #[inline(always)]
34 fn encode(&mut self, t: &str) {
35 self.0.encode(str_as_u8_chars(t));
36 }
37
38 #[inline(always)]
39 fn encode_vectored<'a>(&mut self, i: impl Iterator<Item = &'a str> + Clone) {
40 self.0.encode_vectored(i.map(str_as_u8_chars));
41 }
42}
43
44impl<'b> Encoder<&'b str> for StrEncoder {
46 #[inline(always)]
47 fn encode(&mut self, t: &&str) {
48 self.encode(*t);
49 }
50
51 #[inline(always)]
52 fn encode_vectored<'a>(&mut self, i: impl Iterator<Item = &'a &'b str> + Clone)
53 where
54 &'b str: 'a,
55 {
56 self.encode_vectored(i.copied());
57 }
58}
59
60impl Encoder<String> for StrEncoder {
61 #[inline(always)]
62 fn encode(&mut self, t: &String) {
63 self.encode(t.as_str());
64 }
65
66 #[inline(always)]
67 fn encode_vectored<'a>(&mut self, i: impl Iterator<Item = &'a String> + Clone)
68 where
69 String: 'a,
70 {
71 self.encode_vectored(i.map(String::as_str));
72 }
73}
74
75#[derive(Default)]
77pub struct StrDecoder<'a> {
78 pub(crate) lengths: LengthDecoder<'a>,
80 strings: SliceImpl<'a, u8>,
81}
82
83impl<'a> View<'a> for StrDecoder<'a> {
84 fn populate(&mut self, input: &mut &'a [u8], length: usize) -> Result<()> {
85 self.lengths.populate(input, length)?;
86 let bytes = consume_bytes(input, self.lengths.length())?;
87
88 if is_ascii_simd(bytes)
93 || from_utf8(bytes).is_ok_and(|s| {
94 let Some(length) = NonZeroUsize::new(length) else {
97 debug_assert_eq!(bytes.len(), 0);
98 return true;
99 };
100 let mut length_decoder = self.lengths.borrowed_clone();
103 let mut end = 0;
104 for _ in 0..length.get() - 1 {
105 end += length_decoder.decode();
106 if !s.is_char_boundary(end) {
108 return false;
109 }
110 }
111 true
112 })
113 {
114 self.strings = bytes.into();
115 Ok(())
116 } else {
117 err("invalid utf8")
118 }
119 }
120}
121
122impl<'a> Decoder<'a, &'a str> for StrDecoder<'a> {
123 #[inline(always)]
124 fn decode(&mut self) -> &'a str {
125 let bytes = unsafe { self.strings.chunk_unchecked(self.lengths.decode()) };
126 debug_assert!(from_utf8(bytes).is_ok());
127
128 unsafe { from_utf8_unchecked(bytes) }
131 }
132}
133
134impl<'a> Decoder<'a, String> for StrDecoder<'a> {
135 #[inline(always)]
136 fn decode(&mut self) -> String {
137 let v: &str = self.decode();
138 v.to_owned()
139 }
140}
141
/// Returns `true` when every byte of `v` is ASCII (high bit clear); an empty slice is ASCII.
///
/// Full 128-byte chunks are reduced without any per-byte branch (presumably so the
/// compiler can vectorize the reduction, as the name suggests); the leftover tail is
/// checked with `[u8]::is_ascii`.
fn is_ascii_simd(v: &[u8]) -> bool {
    const CHUNK: usize = 128;
    const HIGH_BIT: u8 = 0x80;

    let mut blocks = v.chunks_exact(CHUNK);
    for block in blocks.by_ref() {
        // OR all bytes together: the high bit survives iff some byte in the chunk has it.
        let combined = block.iter().fold(0u8, |acc, &byte| acc | byte);
        if combined & HIGH_BIT != 0 {
            debug_assert!(!block.is_ascii());
            return false;
        }
    }

    // Everything before the tail was covered by the loop above.
    let tail = blocks.remainder();
    debug_assert!(v[..v.len() - tail.len()].is_ascii());
    tail.is_ascii()
}
161
#[cfg(test)]
mod tests {
    use super::is_ascii_simd;
    use crate::u8_char::U8Char;
    use crate::{decode, encode};
    use alloc::borrow::ToOwned;
    use test::{black_box, Bencher};

    #[test]
    fn utf8_validation() {
        // A lone 0xFF byte is never valid UTF-8, so decoding it as a string must fail.
        let invalid = vec![U8Char(255u8)];
        assert!(decode::<&str>(&encode(&invalid)).is_err());

        // Valid single strings round-trip, both ASCII and multi-byte.
        assert_eq!(decode::<&str>(&encode("\0")).unwrap(), "\0");
        assert_eq!(decode::<&str>(&encode(&"☺".to_owned())).unwrap(), "☺");

        // Split a multi-byte character across two strings: the concatenated bytes are
        // valid UTF-8, but the boundary between the strings is not a char boundary.
        let smiley = "☺";
        let smiley_bytes = super::str_as_u8_chars(smiley);
        let (head, tail) = smiley_bytes.split_at(1);
        let split_mid_char = [head.to_vec(), tail.to_vec()];
        assert!(decode::<[&str; 2]>(&encode(&split_mid_char)).is_err());

        // Two whole multi-byte strings next to each other are fine.
        let whole = [smiley, smiley];
        assert_eq!(decode::<[&str; 2]>(&encode(&whole)).unwrap(), [smiley, smiley]);
    }

    #[test]
    fn test_is_ascii_simd() {
        // 0x7F is the largest ASCII byte; 0xFF has the high bit set.
        assert!(is_ascii_simd(&[0x7F; 128]));
        assert!(!is_ascii_simd(&[0xFF; 128]));
    }

    // Baseline: the standard library's `is_ascii` on 8 KiB of zero bytes.
    #[bench]
    fn bench_is_ascii(b: &mut Bencher) {
        b.iter(|| black_box(&[0; 8192]).is_ascii())
    }

    // Same input as `bench_is_ascii`, through the chunked implementation.
    #[bench]
    fn bench_is_ascii_simd(b: &mut Bencher) {
        b.iter(|| is_ascii_simd(black_box(&[0; 8192])))
    }

    type S = &'static str;

    // NOTE(review): presumably picked up by `bench_encode_decode!` below — confirm
    // against the macro definition.
    fn bench_data() -> (S, S, S, S, S, S, S) {
        ("a", "b", "c", "d", "e", "f", "g")
    }
    crate::bench_encode_decode!(str_tuple: (&str, &str, &str, &str, &str, &str, &str));
}
209
#[cfg(test)]
mod tests2 {
    use alloc::string::String;
    use alloc::vec::Vec;

    // Builds 40000 strings of spaces whose lengths are derived from random bytes:
    // `8 + byte / 32` gives a length between 8 and 15.
    // NOTE(review): presumably picked up by `bench_encode_decode!` below — confirm
    // against the macro definition.
    fn bench_data() -> Vec<String> {
        let seeds = crate::random_data::<u8>(40000);
        seeds
            .into_iter()
            .map(|seed| " ".repeat(usize::from(8 + seed / 32)))
            .collect()
    }
    crate::bench_encode_decode!(str_vec: Vec<String>);
}