// zerovec/ule/unvalidated.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use super::{AsULE, RawBytesULE, VarULE};
use crate::ule::EqULE;
use crate::{map::ZeroMapKV, VarZeroSlice, VarZeroVec, ZeroVecError};
use alloc::boxed::Box;
use core::cmp::Ordering;
use core::fmt;
use core::ops::Deref;

/// A byte slice that is expected to be a UTF-8 string but does not enforce that invariant.
///
/// Use this type instead of `str` if you don't need to enforce UTF-8 during deserialization. For
/// example, strings that are keys of a map don't need to ever be reified as `str`s.
///
/// [`UnvalidatedStr`] derefs to `[u8]`. To obtain a `str`, use [`Self::try_as_str()`].
///
/// The main advantage of this type over `[u8]` is that it serializes as a string in
/// human-readable formats like JSON.
///
/// # Examples
///
/// Using an [`UnvalidatedStr`] as the key of a [`ZeroMap`]:
///
/// ```
/// use zerovec::ule::UnvalidatedStr;
/// use zerovec::ZeroMap;
///
/// let map: ZeroMap<UnvalidatedStr, usize> = [
///     (UnvalidatedStr::from_str("abc"), 11),
///     (UnvalidatedStr::from_str("def"), 22),
///     (UnvalidatedStr::from_str("ghi"), 33),
/// ]
/// .into_iter()
/// .collect();
///
/// let key = "abc";
/// let value = map.get_copied_by(|uvstr| uvstr.as_bytes().cmp(key.as_bytes()));
/// assert_eq!(Some(11), value);
/// ```
///
/// [`ZeroMap`]: crate::ZeroMap
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct UnvalidatedStr([u8]);

impl fmt::Debug for UnvalidatedStr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Show a quoted string when the bytes are valid UTF-8; otherwise fall
        // back to the raw byte-slice representation.
        match self.try_as_str() {
            Ok(s) => fmt::Debug::fmt(s, f),
            Err(_) => fmt::Debug::fmt(&self.0, f),
        }
    }
}

impl UnvalidatedStr {
    /// Create a [`UnvalidatedStr`] from a byte slice.
    ///
    /// No validation is performed; any byte sequence is accepted.
    #[inline]
    pub const fn from_bytes(other: &[u8]) -> &Self {
        // SAFETY: `UnvalidatedStr` is `#[repr(transparent)]` over `[u8]`, so both
        // reference types have the same layout, and the type has no validity
        // invariant beyond that of `[u8]`.
        unsafe { core::mem::transmute(other) }
    }

    /// Create a [`UnvalidatedStr`] from a string slice.
    #[inline]
    pub const fn from_str(s: &str) -> &Self {
        Self::from_bytes(s.as_bytes())
    }

    /// Create a [`UnvalidatedStr`] from boxed bytes.
    ///
    /// The allocation is reused; no bytes are copied.
    #[inline]
    pub fn from_boxed_bytes(other: Box<[u8]>) -> Box<Self> {
        // Round-trip through a raw pointer rather than transmuting the `Box` itself.
        let raw = Box::into_raw(other) as *mut Self;
        // SAFETY: `raw` comes from `Box::into_raw`, and `UnvalidatedStr` is
        // `#[repr(transparent)]` over `[u8]`, so the pointee has the same size,
        // alignment and slice length as the original allocation.
        unsafe { Box::from_raw(raw) }
    }

    /// Create a [`UnvalidatedStr`] from a boxed `str`.
    #[inline]
    pub fn from_boxed_str(other: Box<str>) -> Box<Self> {
        Self::from_boxed_bytes(other.into_boxed_bytes())
    }

    /// Get the bytes from a [`UnvalidatedStr`].
    #[inline]
    pub const fn as_bytes(&self) -> &[u8] {
        &self.0
    }

    /// Attempt to convert a [`UnvalidatedStr`] to a `str`.
    ///
    /// # Errors
    ///
    /// Returns the [`core::str::Utf8Error`] produced by [`core::str::from_utf8`]
    /// if the bytes are not valid UTF-8.
    ///
    /// # Examples
    ///
    /// ```
    /// use zerovec::ule::UnvalidatedStr;
    ///
    /// static A: &UnvalidatedStr = UnvalidatedStr::from_bytes(b"abc");
    ///
    /// let b = A.try_as_str().unwrap();
    /// assert_eq!(b, "abc");
    /// ```
    // Note: this is const starting in 1.63
    #[inline]
    pub fn try_as_str(&self) -> Result<&str, core::str::Utf8Error> {
        core::str::from_utf8(&self.0)
    }
}

112impl<'a> From<&'a str> for &'a UnvalidatedStr {
113    #[inline]
114    fn from(other: &'a str) -> Self {
115        UnvalidatedStr::from_str(other)
116    }
117}
118
119impl From<Box<str>> for Box<UnvalidatedStr> {
120    #[inline]
121    fn from(other: Box<str>) -> Self {
122        UnvalidatedStr::from_boxed_str(other)
123    }
124}
125
126impl Deref for UnvalidatedStr {
127    type Target = [u8];
128    fn deref(&self) -> &Self::Target {
129        &self.0
130    }
131}
132
133impl<'a> ZeroMapKV<'a> for UnvalidatedStr {
134    type Container = VarZeroVec<'a, UnvalidatedStr>;
135    type Slice = VarZeroSlice<UnvalidatedStr>;
136    type GetType = UnvalidatedStr;
137    type OwnedType = Box<UnvalidatedStr>;
138}
139
140// Safety (based on the safety checklist on the VarULE trait):
141//  1. UnvalidatedStr does not include any uninitialized or padding bytes (transparent over a ULE)
142//  2. UnvalidatedStr is aligned to 1 byte (transparent over a ULE)
143//  3. The impl of `validate_byte_slice()` returns an error if any byte is not valid (impossible)
144//  4. The impl of `validate_byte_slice()` returns an error if the slice cannot be used in its entirety (impossible)
145//  5. The impl of `from_byte_slice_unchecked()` returns a reference to the same data (returns the argument directly)
146//  6. All other methods are defaulted
147//  7. `[T]` byte equality is semantic equality (transparent over a ULE)
148unsafe impl VarULE for UnvalidatedStr {
149    #[inline]
150    fn validate_byte_slice(_: &[u8]) -> Result<(), ZeroVecError> {
151        Ok(())
152    }
153    #[inline]
154    unsafe fn from_byte_slice_unchecked(bytes: &[u8]) -> &Self {
155        UnvalidatedStr::from_bytes(bytes)
156    }
157}
158
/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
///
/// Serialization fails with a custom error if the bytes are not valid UTF-8,
/// in both human-readable and binary formats.
#[cfg(feature = "serde")]
impl serde::Serialize for UnvalidatedStr {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        // Validate up front so that both output formats reject non-UTF-8 data.
        let s = self
            .try_as_str()
            .map_err(|_| S::Error::custom("invalid UTF-8 in UnvalidatedStr"))?;
        if serializer.is_human_readable() {
            serializer.serialize_str(s)
        } else {
            serializer.serialize_bytes(s.as_bytes())
        }
    }
}

/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
///
/// Human-readable formats are read as a string; other formats are read as
/// bytes and are not checked for UTF-8 validity.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for Box<UnvalidatedStr> {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let boxed_str = Box::<str>::deserialize(deserializer)?;
            Ok(UnvalidatedStr::from_boxed_str(boxed_str))
        } else {
            let boxed_bytes = Box::<[u8]>::deserialize(deserializer)?;
            Ok(UnvalidatedStr::from_boxed_bytes(boxed_bytes))
        }
    }
}

/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
///
/// Borrows from the deserializer input: human-readable formats are read as a
/// `&str`; other formats are read as `&[u8]` without UTF-8 validation.
#[cfg(feature = "serde")]
impl<'de, 'a> serde::Deserialize<'de> for &'a UnvalidatedStr
where
    'de: 'a,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let s = <&str>::deserialize(deserializer)?;
            Ok(UnvalidatedStr::from_str(s))
        } else {
            let bytes = <&[u8]>::deserialize(deserializer)?;
            Ok(UnvalidatedStr::from_bytes(bytes))
        }
    }
}

/// A u8 array of little-endian data that is expected to be a Unicode scalar value, but is not
/// validated as such.
///
/// Use this type instead of `char` when you want to deal with data that is expected to be valid
/// Unicode scalar values, but you want control over when or if you validate that assumption.
///
/// # Examples
///
/// ```
/// use zerovec::ule::UnvalidatedChar;
/// use zerovec::{ZeroSlice, ZeroVec};
///
/// // data known to be little-endian three-byte chunks of valid Unicode scalar values
/// let data = [0x68, 0x00, 0x00, 0x69, 0x00, 0x00, 0x4B, 0xF4, 0x01];
/// // ground truth expectation
/// let real = ['h', 'i', '👋'];
///
/// let chars: &ZeroSlice<UnvalidatedChar> = ZeroSlice::parse_byte_slice(&data).expect("invalid data length");
/// let parsed: Vec<_> = chars.iter().map(|c| unsafe { c.to_char_unchecked() }).collect();
/// assert_eq!(&parsed, &real);
///
/// let real_chars: ZeroVec<_> = real.iter().copied().map(UnvalidatedChar::from_char).collect();
/// let serialized_data = real_chars.as_bytes();
/// assert_eq!(serialized_data, &data);
/// ```
#[repr(transparent)]
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
pub struct UnvalidatedChar([u8; 3]);

impl UnvalidatedChar {
    /// Create a [`UnvalidatedChar`] from a `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use zerovec::ule::UnvalidatedChar;
    ///
    /// let a = UnvalidatedChar::from_char('a');
    /// assert_eq!(a.try_to_char().unwrap(), 'a');
    /// ```
    #[inline]
    pub const fn from_char(c: char) -> Self {
        // A `char` never exceeds U+10FFFF, so the byte dropped by `from_u24` is always zero.
        Self::from_u24(c as u32)
    }

    /// Create a [`UnvalidatedChar`] from the low 24 bits of `c`.
    ///
    /// The most significant byte of `c` is discarded without any check.
    #[inline]
    #[doc(hidden)]
    pub const fn from_u24(c: u32) -> Self {
        let [u0, u1, u2, _u3] = c.to_le_bytes();
        Self([u0, u1, u2])
    }

    /// Attempt to convert a [`UnvalidatedChar`] to a `char`.
    ///
    /// # Errors
    ///
    /// Returns an error if the stored value is not a valid Unicode scalar value.
    ///
    /// # Examples
    ///
    /// ```
    /// use zerovec::ule::{AsULE, UnvalidatedChar};
    ///
    /// let a = UnvalidatedChar::from_char('a');
    /// assert_eq!(a.try_to_char(), Ok('a'));
    ///
    /// let b = UnvalidatedChar::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert!(matches!(b.try_to_char(), Err(_)));
    /// ```
    #[inline]
    pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> {
        let [u0, u1, u2] = self.0;
        // Zero-extend the three little-endian bytes to a full `u32`.
        let scalar = u32::from_le_bytes([u0, u1, u2, 0]);
        char::try_from(scalar)
    }

    /// Convert a [`UnvalidatedChar`] to a `char`, returning [`char::REPLACEMENT_CHARACTER`]
    /// if the `UnvalidatedChar` does not represent a valid Unicode scalar value.
    ///
    /// # Examples
    ///
    /// ```
    /// use zerovec::ule::{AsULE, UnvalidatedChar};
    ///
    /// let a = UnvalidatedChar::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER);
    /// ```
    #[inline]
    pub fn to_char_lossy(self) -> char {
        self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
    }

    /// Convert a [`UnvalidatedChar`] to a `char` without checking that it is
    /// a valid Unicode scalar value.
    ///
    /// # Safety
    ///
    /// The `UnvalidatedChar` must be a valid Unicode scalar value in little-endian order.
    ///
    /// # Examples
    ///
    /// ```
    /// use zerovec::ule::UnvalidatedChar;
    ///
    /// let a = UnvalidatedChar::from_char('a');
    /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a');
    /// ```
    #[inline]
    pub unsafe fn to_char_unchecked(self) -> char {
        let [u0, u1, u2] = self.0;
        let scalar = u32::from_le_bytes([u0, u1, u2, 0]);
        // SAFETY: the caller guarantees that the stored value is a valid Unicode scalar value.
        char::from_u32_unchecked(scalar)
    }
}

325impl RawBytesULE<3> {
326    /// Converts a [`UnvalidatedChar`] to its ULE type. This is equivalent to calling
327    /// [`AsULE::to_unaligned`].
328    #[inline]
329    pub const fn from_unvalidated_char(uc: UnvalidatedChar) -> Self {
330        RawBytesULE(uc.0)
331    }
332}
333
334impl AsULE for UnvalidatedChar {
335    type ULE = RawBytesULE<3>;
336
337    #[inline]
338    fn to_unaligned(self) -> Self::ULE {
339        RawBytesULE(self.0)
340    }
341
342    #[inline]
343    fn from_unaligned(unaligned: Self::ULE) -> Self {
344        Self(unaligned.0)
345    }
346}
347
348// Safety: UnvalidatedChar is always the little-endian representation of a char,
349// which corresponds to its AsULE::ULE type
350unsafe impl EqULE for UnvalidatedChar {}
351
352impl fmt::Debug for UnvalidatedChar {
353    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
354        // Debug as a char if possible
355        match self.try_to_char() {
356            Ok(c) => fmt::Debug::fmt(&c, f),
357            Err(_) => fmt::Debug::fmt(&self.0, f),
358        }
359    }
360}
361
362impl PartialOrd for UnvalidatedChar {
363    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
364        Some(self.cmp(other))
365    }
366}
367
368impl Ord for UnvalidatedChar {
369    // custom implementation, as derived Ord would compare lexicographically
370    fn cmp(&self, other: &Self) -> Ordering {
371        let [a0, a1, a2] = self.0;
372        let a = u32::from_le_bytes([a0, a1, a2, 0]);
373        let [b0, b1, b2] = other.0;
374        let b = u32::from_le_bytes([b0, b1, b2, 0]);
375        a.cmp(&b)
376    }
377}
378
379impl From<char> for UnvalidatedChar {
380    #[inline]
381    fn from(value: char) -> Self {
382        Self::from_char(value)
383    }
384}
385
386impl TryFrom<UnvalidatedChar> for char {
387    type Error = core::char::CharTryFromError;
388
389    #[inline]
390    fn try_from(value: UnvalidatedChar) -> Result<char, Self::Error> {
391        value.try_to_char()
392    }
393}
394
/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
///
/// Serialization fails with a custom error if the value is not a valid Unicode
/// scalar value, in both human-readable and binary formats.
#[cfg(feature = "serde")]
impl serde::Serialize for UnvalidatedChar {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        // Validate up front so that both output formats reject invalid values.
        let c = self
            .try_to_char()
            .map_err(|_| S::Error::custom("invalid Unicode scalar value in UnvalidatedChar"))?;
        if serializer.is_human_readable() {
            serializer.serialize_char(c)
        } else {
            // Binary formats get the raw three-byte array.
            self.0.serialize(serializer)
        }
    }
}

/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
///
/// Human-readable formats are read as a `char`; other formats are read as a
/// three-byte array and are not checked for being a valid scalar value.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for UnvalidatedChar {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let c = <char>::deserialize(deserializer)?;
            Ok(UnvalidatedChar::from_char(c))
        } else {
            let bytes = <[u8; 3]>::deserialize(deserializer)?;
            Ok(UnvalidatedChar(bytes))
        }
    }
}

/// Emits a `from_char` call when the value is a valid `char`, and a `from_u24`
/// call with the raw integer otherwise.
#[cfg(feature = "databake")]
impl databake::Bake for UnvalidatedChar {
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        // Both output forms reference the `zerovec` crate.
        env.insert("zerovec");
        match self.try_to_char() {
            Ok(ch) => {
                let ch = ch.bake(env);
                databake::quote! {
                    zerovec::ule::UnvalidatedChar::from_char(#ch)
                }
            }
            Err(_) => {
                // Not a valid scalar value: bake the zero-extended 24-bit integer instead.
                let u24 = u32::from_le_bytes([self.0[0], self.0[1], self.0[2], 0]);
                databake::quote! {
                    zerovec::ule::UnvalidatedChar::from_u24(#u24)
                }
            }
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::ZeroVec;

    /// Invalid scalar values must be rejected by both serde formats.
    #[test]
    fn test_serde_fail() {
        let uc = UnvalidatedChar([0xFF, 0xFF, 0xFF]);
        serde_json::to_string(&uc).expect_err("serialize invalid char bytes");
        bincode::serialize(&uc).expect_err("serialize invalid char bytes");
    }

    /// Human-readable round trip: serialized as a JSON string containing the char.
    #[test]
    fn test_serde_json() {
        let c = '🙃';
        let uc = UnvalidatedChar::from_char(c);
        let json_ser = serde_json::to_string(&uc).unwrap();

        assert_eq!(json_ser, r#""🙃""#);

        let json_de: UnvalidatedChar = serde_json::from_str(&json_ser).unwrap();

        assert_eq!(uc, json_de);
    }

    /// Binary round trip: serialized as the three raw little-endian bytes.
    #[test]
    fn test_serde_bincode() {
        let c = '🙃';
        let uc = UnvalidatedChar::from_char(c);
        let bytes_ser = bincode::serialize(&uc).unwrap();

        assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]);

        let bytes_de: UnvalidatedChar = bincode::deserialize(&bytes_ser).unwrap();

        assert_eq!(uc, bytes_de);
    }

    /// The in-memory layout of `[UnvalidatedChar]` must equal its ULE byte form.
    #[test]
    fn test_representation() {
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];

        // backed by [UnvalidatedChar]
        let uvchars: Vec<_> = chars
            .iter()
            .copied()
            .map(UnvalidatedChar::from_char)
            .collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = uvchars.iter().copied().collect();

        let ule_bytes = zvec.as_bytes();
        // Both views hold three bytes per element.
        assert_eq!(ule_bytes.len(), uvchars.len() * 3);
        // SAFETY: `UnvalidatedChar` is `#[repr(transparent)]` over `[u8; 3]`, so the
        // Vec's buffer is `3 * uvchars.len()` initialized bytes, which equals
        // `ule_bytes.len()` as asserted above.
        let uvbytes =
            unsafe { core::slice::from_raw_parts(uvchars.as_ptr().cast::<u8>(), ule_bytes.len()) };

        // UnvalidatedChar is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(uvbytes, ule_bytes);

        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            ule_bytes
        );
    }

    /// Baking covers both the valid-char path and the raw-integer fallback.
    #[test]
    fn test_char_bake() {
        databake::test_bake!(UnvalidatedChar, const: crate::ule::UnvalidatedChar::from_char('b'), zerovec);
        // surrogate code point
        databake::test_bake!(UnvalidatedChar, const: crate::ule::UnvalidatedChar::from_u24(55296u32), zerovec);
    }
}