icu_locid/extensions/
mod.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! Unicode Extensions provide a mechanism to extend the [`LanguageIdentifier`] with
6//! additional bits of information - a combination of a [`LanguageIdentifier`] and [`Extensions`]
7//! is called [`Locale`].
8//!
9//! There are four types of extensions:
10//!
11//!  * [`Unicode Extensions`] - marked as `u`.
12//!  * [`Transform Extensions`] - marked as `t`.
13//!  * [`Private Use Extensions`] - marked as `x`.
//!  * [`Other Extensions`] - marked as any `a-z` except for `u`, `t` and `x`.
15//!
//! One can think of extensions as a bag of extra information on top of the four basic [`subtags`].
17//!
18//! Notice: `Other` extension type is currently not supported.
19//!
20//! # Examples
21//!
22//! ```
23//! use icu::locid::extensions::unicode::{Key, Value};
24//! use icu::locid::Locale;
25//!
26//! let loc: Locale = "en-US-u-ca-buddhist-t-en-us-h0-hybrid-x-foo"
27//!     .parse()
28//!     .expect("Failed to parse.");
29//!
30//! assert_eq!(loc.id.language, "en".parse().unwrap());
31//! assert_eq!(loc.id.script, None);
32//! assert_eq!(loc.id.region, Some("US".parse().unwrap()));
33//! assert_eq!(loc.id.variants.len(), 0);
34//!
35//! let key: Key = "ca".parse().expect("Parsing key failed.");
36//! let value: Value = "buddhist".parse().expect("Parsing value failed.");
37//! assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value));
38//! ```
39//!
40//! [`LanguageIdentifier`]: super::LanguageIdentifier
41//! [`Locale`]: super::Locale
42//! [`subtags`]: super::subtags
43//! [`Other Extensions`]: other
44//! [`Private Use Extensions`]: private
45//! [`Transform Extensions`]: transform
46//! [`Unicode Extensions`]: unicode
47pub mod other;
48pub mod private;
49pub mod transform;
50pub mod unicode;
51
52use core::cmp::Ordering;
53
54use other::Other;
55use private::Private;
56use transform::Transform;
57use unicode::Unicode;
58
59use alloc::vec::Vec;
60
61use crate::parser::ParserError;
62use crate::parser::SubtagIterator;
63use crate::subtags;
64
/// Identifies which kind of extension a singleton subtag introduces.
///
/// The derived ordering follows declaration order: `Transform`, `Unicode`,
/// `Private`, and finally `Other` (ordered by the byte it carries).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum ExtensionType {
    /// The transform extension, marked as `t`.
    Transform,
    /// The Unicode extension, marked as `u`.
    Unicode,
    /// The private-use extension, marked as `x`.
    Private,
    /// Any other extension type; the payload is the byte identifying it.
    Other(u8),
}
78
79impl ExtensionType {
80    pub(crate) const fn try_from_byte(key: u8) -> Result<Self, ParserError> {
81        let key = key.to_ascii_lowercase();
82        match key {
83            b'u' => Ok(Self::Unicode),
84            b't' => Ok(Self::Transform),
85            b'x' => Ok(Self::Private),
86            b'a'..=b'z' => Ok(Self::Other(key)),
87            _ => Err(ParserError::InvalidExtension),
88        }
89    }
90
91    pub(crate) const fn try_from_bytes_manual_slice(
92        bytes: &[u8],
93        start: usize,
94        end: usize,
95    ) -> Result<Self, ParserError> {
96        if end - start != 1 {
97            return Err(ParserError::InvalidExtension);
98        }
99        #[allow(clippy::indexing_slicing)]
100        Self::try_from_byte(bytes[start])
101    }
102}
103
104/// A map of extensions associated with a given [`Locale`](crate::Locale).
105#[derive(Debug, Default, PartialEq, Eq, Clone, Hash)]
106#[non_exhaustive]
107pub struct Extensions {
108    /// A representation of the data for a Unicode extension, when present in the locale identifier.
109    pub unicode: Unicode,
110    /// A representation of the data for a transform extension, when present in the locale identifier.
111    pub transform: Transform,
112    /// A representation of the data for a private-use extension, when present in the locale identifier.
113    pub private: Private,
114    /// A sequence of any other extensions that are present in the locale identifier but are not formally
115    /// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`],
116    /// and [`Private`] are.
117    pub other: Vec<Other>,
118}
119
120impl Extensions {
121    /// Returns a new empty map of extensions. Same as [`default()`](Default::default()), but is `const`.
122    ///
123    /// # Examples
124    ///
125    /// ```
126    /// use icu::locid::extensions::Extensions;
127    ///
128    /// assert_eq!(Extensions::new(), Extensions::default());
129    /// ```
130    #[inline]
131    pub const fn new() -> Self {
132        Self {
133            unicode: Unicode::new(),
134            transform: Transform::new(),
135            private: Private::new(),
136            other: Vec::new(),
137        }
138    }
139
140    /// Function to create a new map of extensions containing exactly one unicode extension, callable in `const`
141    /// context.
142    #[inline]
143    pub const fn from_unicode(unicode: Unicode) -> Self {
144        Self {
145            unicode,
146            transform: Transform::new(),
147            private: Private::new(),
148            other: Vec::new(),
149        }
150    }
151
152    /// Returns whether there are no extensions present.
153    ///
154    /// # Examples
155    ///
156    /// ```
157    /// use icu::locid::Locale;
158    ///
159    /// let loc: Locale = "en-US-u-foo".parse().expect("Parsing failed.");
160    ///
161    /// assert!(!loc.extensions.is_empty());
162    /// ```
163    pub fn is_empty(&self) -> bool {
164        self.unicode.is_empty()
165            && self.transform.is_empty()
166            && self.private.is_empty()
167            && self.other.is_empty()
168    }
169
170    #[allow(clippy::type_complexity)]
171    pub(crate) fn as_tuple(
172        &self,
173    ) -> (
174        (&unicode::Attributes, &unicode::Keywords),
175        (
176            Option<(
177                subtags::Language,
178                Option<subtags::Script>,
179                Option<subtags::Region>,
180                &subtags::Variants,
181            )>,
182            &transform::Fields,
183        ),
184        &private::Private,
185        &[other::Other],
186    ) {
187        (
188            self.unicode.as_tuple(),
189            self.transform.as_tuple(),
190            &self.private,
191            &self.other,
192        )
193    }
194
195    /// Returns an ordering suitable for use in [`BTreeSet`].
196    ///
197    /// The ordering may or may not be equivalent to string ordering, and it
198    /// may or may not be stable across ICU4X releases.
199    ///
200    /// [`BTreeSet`]: alloc::collections::BTreeSet
201    pub fn total_cmp(&self, other: &Self) -> Ordering {
202        self.as_tuple().cmp(&other.as_tuple())
203    }
204
205    /// Retains the specified extension types, clearing all others.
206    ///
207    /// # Examples
208    ///
209    /// ```
210    /// use icu::locid::extensions::ExtensionType;
211    /// use icu::locid::Locale;
212    ///
213    /// let loc: Locale =
214    ///     "und-a-hello-t-mul-u-world-z-zzz-x-extra".parse().unwrap();
215    ///
216    /// let mut only_unicode = loc.clone();
217    /// only_unicode
218    ///     .extensions
219    ///     .retain_by_type(|t| t == ExtensionType::Unicode);
220    /// assert_eq!(only_unicode, "und-u-world".parse().unwrap());
221    ///
222    /// let mut only_t_z = loc.clone();
223    /// only_t_z.extensions.retain_by_type(|t| {
224    ///     t == ExtensionType::Transform || t == ExtensionType::Other(b'z')
225    /// });
226    /// assert_eq!(only_t_z, "und-t-mul-z-zzz".parse().unwrap());
227    /// ```
228    pub fn retain_by_type<F>(&mut self, mut predicate: F)
229    where
230        F: FnMut(ExtensionType) -> bool,
231    {
232        if !predicate(ExtensionType::Unicode) {
233            self.unicode.clear();
234        }
235        if !predicate(ExtensionType::Transform) {
236            self.transform.clear();
237        }
238        if !predicate(ExtensionType::Private) {
239            self.private.clear();
240        }
241        self.other
242            .retain(|o| predicate(ExtensionType::Other(o.get_ext_byte())));
243    }
244
245    pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParserError> {
246        let mut unicode = None;
247        let mut transform = None;
248        let mut private = None;
249        let mut other = Vec::new();
250
251        while let Some(subtag) = iter.next() {
252            if subtag.is_empty() {
253                return Err(ParserError::InvalidExtension);
254            }
255            match subtag.first().map(|b| ExtensionType::try_from_byte(*b)) {
256                Some(Ok(ExtensionType::Unicode)) => {
257                    if unicode.is_some() {
258                        return Err(ParserError::DuplicatedExtension);
259                    }
260                    unicode = Some(Unicode::try_from_iter(iter)?);
261                }
262                Some(Ok(ExtensionType::Transform)) => {
263                    if transform.is_some() {
264                        return Err(ParserError::DuplicatedExtension);
265                    }
266                    transform = Some(Transform::try_from_iter(iter)?);
267                }
268                Some(Ok(ExtensionType::Private)) => {
269                    if private.is_some() {
270                        return Err(ParserError::DuplicatedExtension);
271                    }
272                    private = Some(Private::try_from_iter(iter)?);
273                }
274                Some(Ok(ExtensionType::Other(ext))) => {
275                    if other.iter().any(|o: &Other| o.get_ext_byte() == ext) {
276                        return Err(ParserError::DuplicatedExtension);
277                    }
278                    let parsed = Other::try_from_iter(ext, iter)?;
279                    if let Err(idx) = other.binary_search(&parsed) {
280                        other.insert(idx, parsed);
281                    } else {
282                        return Err(ParserError::InvalidExtension);
283                    }
284                }
285                _ => return Err(ParserError::InvalidExtension),
286            }
287        }
288
289        Ok(Self {
290            unicode: unicode.unwrap_or_default(),
291            transform: transform.unwrap_or_default(),
292            private: private.unwrap_or_default(),
293            other,
294        })
295    }
296
297    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
298    where
299        F: FnMut(&str) -> Result<(), E>,
300    {
301        let mut wrote_tu = false;
302        // Alphabetic by singleton
303        self.other.iter().try_for_each(|other| {
304            if other.get_ext() > 't' && !wrote_tu {
305                // Since 't' and 'u' are next to each other in alphabetical
306                // order, write both now.
307                self.transform.for_each_subtag_str(f)?;
308                self.unicode.for_each_subtag_str(f)?;
309                wrote_tu = true;
310            }
311            other.for_each_subtag_str(f)?;
312            Ok(())
313        })?;
314
315        if !wrote_tu {
316            self.transform.for_each_subtag_str(f)?;
317            self.unicode.for_each_subtag_str(f)?;
318        }
319
320        // Private must be written last, since it allows single character
321        // keys. Extensions must also be written in alphabetical order,
322        // which would seem to imply that other extensions `y` and `z` are
323        // invalid, but this is not specified.
324        self.private.for_each_subtag_str(f)?;
325        Ok(())
326    }
327}
328
// NOTE(review): presumably generates the writeable/display impls for `Extensions` on top of
// `Extensions::for_each_subtag_str`; the `_no_test` suffix suggests the macro skips emitting
// its own test, which `test_writeable` below covers instead. Confirm against the macro
// definition.
impl_writeable_for_each_subtag_str_no_test!(Extensions);
330
/// Checks that the extensions of parsed locales serialize to the expected strings.
#[test]
fn test_writeable() {
    use crate::Locale;
    use writeable::assert_writeable_eq;

    // A map without extensions writes nothing.
    assert_writeable_eq!(Extensions::new(), "");

    // (locale to parse, expected serialization of its extensions)
    let cases = [
        ("my-t-my-d0-zawgyi", "t-my-d0-zawgyi"),
        ("ar-SA-u-ca-islamic-civil", "u-ca-islamic-civil"),
        ("en-001-x-foo-bar", "x-foo-bar"),
        ("und-t-m0-true", "t-m0-true"),
        (
            "und-a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
            "a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
        ),
    ];
    for &(input, expected) in &cases {
        let locale = input.parse::<Locale>().unwrap();
        assert_writeable_eq!(locale.extensions, expected);
    }
}