icu_locid/
locale.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#[allow(deprecated)]
6use crate::ordering::SubtagOrderingResult;
7use crate::parser::{
8    parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension,
9    ParserError, ParserMode, SubtagIterator,
10};
11use crate::{extensions, subtags, LanguageIdentifier};
12use alloc::string::String;
13use core::cmp::Ordering;
14use core::str::FromStr;
15use tinystr::TinyAsciiStr;
16use writeable::Writeable;
17
18/// A core struct representing a [`Unicode Locale Identifier`].
19///
20/// A locale is made of two parts:
21///  * Unicode Language Identifier
22///  * A set of Unicode Extensions
23///
24/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
25/// on top of that is able to parse, manipulate and serialize unicode extension fields.
26///
27///
28/// # Examples
29///
30/// ```
31/// use icu::locid::{
32///     extensions::unicode::{key, value},
33///     locale,
34///     subtags::{language, region},
35/// };
36///
37/// let loc = locale!("en-US-u-ca-buddhist");
38///
39/// assert_eq!(loc.id.language, language!("en"));
40/// assert_eq!(loc.id.script, None);
41/// assert_eq!(loc.id.region, Some(region!("US")));
42/// assert_eq!(loc.id.variants.len(), 0);
43/// assert_eq!(
44///     loc.extensions.unicode.keywords.get(&key!("ca")),
45///     Some(&value!("buddhist"))
46/// );
47/// ```
48///
49/// # Parsing
50///
51/// Unicode recognizes three levels of standard conformance for a locale:
52///
53///  * *well-formed* - syntactically correct
54///  * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
55///  * *canonical* - valid and no deprecated codes or structure.
56///
57/// At the moment parsing normalizes a well-formed locale identifier converting
58/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
59///
60/// Any bogus subtags will cause the parsing to fail with an error.
61///
62/// No subtag validation or alias resolution is performed.
63///
64/// # Examples
65///
66/// ```
67/// use icu::locid::{subtags::*, Locale};
68///
69/// let loc: Locale = "eN_latn_Us-Valencia_u-hC-H12"
70///     .parse()
71///     .expect("Failed to parse.");
72///
73/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap());
74/// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok());
75/// assert_eq!(loc.id.region, "US".parse::<Region>().ok());
76/// assert_eq!(
77///     loc.id.variants.get(0),
78///     "valencia".parse::<Variant>().ok().as_ref()
79/// );
80/// ```
81/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier
82#[derive(Default, PartialEq, Eq, Clone, Hash)]
83#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
84pub struct Locale {
85    /// The basic language/script/region components in the locale identifier along with any variants.
86    pub id: LanguageIdentifier,
87    /// Any extensions present in the locale identifier.
88    pub extensions: extensions::Extensions,
89}
90
/// Pins the in-memory size of the locale types so that accidental growth is noticed.
#[test]
// The expected sizes assume 64-bit pointers (for example, `Vec<_>` is asserted to be
// 24 bytes), so the test is compiled only for targets where that holds.
#[cfg(target_pointer_width = "64")]
fn test_sizes() {
    // Language identifier and its subtags.
    assert_eq!(core::mem::size_of::<subtags::Language>(), 3);
    assert_eq!(core::mem::size_of::<subtags::Script>(), 4);
    assert_eq!(core::mem::size_of::<subtags::Region>(), 3);
    assert_eq!(core::mem::size_of::<subtags::Variant>(), 8);
    assert_eq!(core::mem::size_of::<subtags::Variants>(), 16);
    assert_eq!(core::mem::size_of::<LanguageIdentifier>(), 32);

    // Transform extension.
    assert_eq!(core::mem::size_of::<extensions::transform::Transform>(), 56);
    assert_eq!(core::mem::size_of::<Option<LanguageIdentifier>>(), 32);
    assert_eq!(core::mem::size_of::<extensions::transform::Fields>(), 24);

    // Unicode, other and private extensions, and the aggregate.
    assert_eq!(core::mem::size_of::<extensions::unicode::Attributes>(), 16);
    assert_eq!(core::mem::size_of::<extensions::unicode::Keywords>(), 24);
    assert_eq!(core::mem::size_of::<Vec<extensions::other::Other>>(), 24);
    assert_eq!(core::mem::size_of::<extensions::private::Private>(), 16);
    assert_eq!(core::mem::size_of::<extensions::Extensions>(), 136);

    assert_eq!(core::mem::size_of::<Locale>(), 168);
}
112
113impl Locale {
114    /// A constructor which takes a utf8 slice, parses it and
115    /// produces a well-formed [`Locale`].
116    ///
117    /// # Examples
118    ///
119    /// ```
120    /// use icu::locid::Locale;
121    ///
122    /// Locale::try_from_bytes(b"en-US-u-hc-h12").unwrap();
123    /// ```
124    pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> {
125        parse_locale(v)
126    }
127
128    /// The default undefined locale "und". Same as [`default()`](Default::default()).
129    ///
130    /// # Examples
131    ///
132    /// ```
133    /// use icu::locid::Locale;
134    ///
135    /// assert_eq!(Locale::default(), Locale::UND);
136    /// ```
137    pub const UND: Self = Self {
138        id: LanguageIdentifier::UND,
139        extensions: extensions::Extensions::new(),
140    };
141
142    /// This is a best-effort operation that performs all available levels of canonicalization.
143    ///
144    /// At the moment the operation will normalize casing and the separator, but in the future
145    /// it may also validate and update from deprecated subtags to canonical ones.
146    ///
147    /// # Examples
148    ///
149    /// ```
150    /// use icu::locid::Locale;
151    ///
152    /// assert_eq!(
153    ///     Locale::canonicalize("pL_latn_pl-U-HC-H12").as_deref(),
154    ///     Ok("pl-Latn-PL-u-hc-h12")
155    /// );
156    /// ```
157    pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> {
158        let locale = Self::try_from_bytes(input.as_ref())?;
159        Ok(locale.write_to_string().into_owned())
160    }
161
162    /// Compare this [`Locale`] with BCP-47 bytes.
163    ///
164    /// The return value is equivalent to what would happen if you first converted this
165    /// [`Locale`] to a BCP-47 string and then performed a byte comparison.
166    ///
167    /// This function is case-sensitive and results in a *total order*, so it is appropriate for
168    /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
169    ///
170    /// # Examples
171    ///
172    /// ```
173    /// use icu::locid::Locale;
174    /// use std::cmp::Ordering;
175    ///
176    /// let bcp47_strings: &[&str] = &[
177    ///     "pl-Latn-PL",
178    ///     "und",
179    ///     "und-fonipa",
180    ///     "und-t-m0-true",
181    ///     "und-u-ca-hebrew",
182    ///     "und-u-ca-japanese",
183    ///     "zh",
184    /// ];
185    ///
186    /// for ab in bcp47_strings.windows(2) {
187    ///     let a = ab[0];
188    ///     let b = ab[1];
189    ///     assert!(a.cmp(b) == Ordering::Less);
190    ///     let a_loc = a.parse::<Locale>().unwrap();
191    ///     assert!(a_loc.strict_cmp(a.as_bytes()) == Ordering::Equal);
192    ///     assert!(a_loc.strict_cmp(b.as_bytes()) == Ordering::Less);
193    /// }
194    /// ```
195    pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
196        self.writeable_cmp_bytes(other)
197    }
198
199    #[allow(clippy::type_complexity)]
200    pub(crate) fn as_tuple(
201        &self,
202    ) -> (
203        (
204            subtags::Language,
205            Option<subtags::Script>,
206            Option<subtags::Region>,
207            &subtags::Variants,
208        ),
209        (
210            (
211                &extensions::unicode::Attributes,
212                &extensions::unicode::Keywords,
213            ),
214            (
215                Option<(
216                    subtags::Language,
217                    Option<subtags::Script>,
218                    Option<subtags::Region>,
219                    &subtags::Variants,
220                )>,
221                &extensions::transform::Fields,
222            ),
223            &extensions::private::Private,
224            &[extensions::other::Other],
225        ),
226    ) {
227        (self.id.as_tuple(), self.extensions.as_tuple())
228    }
229
230    /// Returns an ordering suitable for use in [`BTreeSet`].
231    ///
232    /// The ordering may or may not be equivalent to string ordering, and it
233    /// may or may not be stable across ICU4X releases.
234    ///
235    /// [`BTreeSet`]: alloc::collections::BTreeSet
236    pub fn total_cmp(&self, other: &Self) -> Ordering {
237        self.as_tuple().cmp(&other.as_tuple())
238    }
239
240    /// Compare this [`Locale`] with an iterator of BCP-47 subtags.
241    ///
242    /// This function has the same equality semantics as [`Locale::strict_cmp`]. It is intended as
243    /// a more modular version that allows multiple subtag iterators to be chained together.
244    ///
245    /// For an additional example, see [`SubtagOrderingResult`].
246    ///
247    /// # Examples
248    ///
249    /// ```
250    /// use icu::locid::locale;
251    /// use std::cmp::Ordering;
252    ///
253    /// let subtags: &[&[u8]] =
254    ///     &[b"ca", b"ES", b"valencia", b"u", b"ca", b"hebrew"];
255    ///
256    /// let loc = locale!("ca-ES-valencia-u-ca-hebrew");
257    /// assert_eq!(
258    ///     Ordering::Equal,
259    ///     loc.strict_cmp_iter(subtags.iter().copied()).end()
260    /// );
261    ///
262    /// let loc = locale!("ca-ES-valencia");
263    /// assert_eq!(
264    ///     Ordering::Less,
265    ///     loc.strict_cmp_iter(subtags.iter().copied()).end()
266    /// );
267    ///
268    /// let loc = locale!("ca-ES-valencia-u-nu-arab");
269    /// assert_eq!(
270    ///     Ordering::Greater,
271    ///     loc.strict_cmp_iter(subtags.iter().copied()).end()
272    /// );
273    /// ```
274    #[deprecated(since = "1.5.0", note = "if you need this, please file an issue")]
275    #[allow(deprecated)]
276    pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
277    where
278        I: Iterator<Item = &'l [u8]>,
279    {
280        let r = self.for_each_subtag_str(&mut |subtag| {
281            if let Some(other) = subtags.next() {
282                match subtag.as_bytes().cmp(other) {
283                    Ordering::Equal => Ok(()),
284                    not_equal => Err(not_equal),
285                }
286            } else {
287                Err(Ordering::Greater)
288            }
289        });
290        match r {
291            Ok(_) => SubtagOrderingResult::Subtags(subtags),
292            Err(o) => SubtagOrderingResult::Ordering(o),
293        }
294    }
295
296    /// Compare this `Locale` with a potentially unnormalized BCP-47 string.
297    ///
298    /// The return value is equivalent to what would happen if you first parsed the
299    /// BCP-47 string to a `Locale` and then performed a structural comparison.
300    ///
301    /// # Examples
302    ///
303    /// ```
304    /// use icu::locid::Locale;
305    ///
306    /// let bcp47_strings: &[&str] = &[
307    ///     "pl-LaTn-pL",
308    ///     "uNd",
309    ///     "UND-FONIPA",
310    ///     "UnD-t-m0-TrUe",
311    ///     "uNd-u-CA-Japanese",
312    ///     "ZH",
313    /// ];
314    ///
315    /// for a in bcp47_strings {
316    ///     assert!(a.parse::<Locale>().unwrap().normalizing_eq(a));
317    /// }
318    /// ```
319    pub fn normalizing_eq(&self, other: &str) -> bool {
320        macro_rules! subtag_matches {
321            ($T:ty, $iter:ident, $expected:expr) => {
322                $iter
323                    .next()
324                    .map(|b| <$T>::try_from_bytes(b) == Ok($expected))
325                    .unwrap_or(false)
326            };
327        }
328
329        let mut iter = SubtagIterator::new(other.as_bytes());
330        if !subtag_matches!(subtags::Language, iter, self.id.language) {
331            return false;
332        }
333        if let Some(ref script) = self.id.script {
334            if !subtag_matches!(subtags::Script, iter, *script) {
335                return false;
336            }
337        }
338        if let Some(ref region) = self.id.region {
339            if !subtag_matches!(subtags::Region, iter, *region) {
340                return false;
341            }
342        }
343        for variant in self.id.variants.iter() {
344            if !subtag_matches!(subtags::Variant, iter, *variant) {
345                return false;
346            }
347        }
348        if !self.extensions.is_empty() {
349            match extensions::Extensions::try_from_iter(&mut iter) {
350                Ok(exts) => {
351                    if self.extensions != exts {
352                        return false;
353                    }
354                }
355                Err(_) => {
356                    return false;
357                }
358            }
359        }
360        iter.next().is_none()
361    }
362
363    #[doc(hidden)]
364    #[allow(clippy::type_complexity)]
365    pub const fn try_from_bytes_with_single_variant_single_keyword_unicode_extension(
366        v: &[u8],
367    ) -> Result<
368        (
369            subtags::Language,
370            Option<subtags::Script>,
371            Option<subtags::Region>,
372            Option<subtags::Variant>,
373            Option<(extensions::unicode::Key, Option<TinyAsciiStr<8>>)>,
374        ),
375        ParserError,
376    > {
377        parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
378            v,
379            ParserMode::Locale,
380        )
381    }
382
383    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
384    where
385        F: FnMut(&str) -> Result<(), E>,
386    {
387        self.id.for_each_subtag_str(f)?;
388        self.extensions.for_each_subtag_str(f)?;
389        Ok(())
390    }
391}
392
393impl FromStr for Locale {
394    type Err = ParserError;
395
396    fn from_str(source: &str) -> Result<Self, Self::Err> {
397        Self::try_from_bytes(source.as_bytes())
398    }
399}
400
401impl From<LanguageIdentifier> for Locale {
402    fn from(id: LanguageIdentifier) -> Self {
403        Self {
404            id,
405            extensions: extensions::Extensions::default(),
406        }
407    }
408}
409
410impl From<Locale> for LanguageIdentifier {
411    fn from(loc: Locale) -> Self {
412        loc.id
413    }
414}
415
416impl AsRef<LanguageIdentifier> for Locale {
417    #[inline(always)]
418    fn as_ref(&self) -> &LanguageIdentifier {
419        &self.id
420    }
421}
422
423impl AsMut<LanguageIdentifier> for Locale {
424    fn as_mut(&mut self) -> &mut LanguageIdentifier {
425        &mut self.id
426    }
427}
428
429impl core::fmt::Debug for Locale {
430    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
431        writeable::Writeable::write_to(self, f)
432    }
433}
434
// NOTE(review): crate-local macro, defined outside this file. It presumably generates
// the `Writeable` impl for `Locale` on top of `for_each_subtag_str`; the trailing
// `condition => expression` arm looks like a shortcut that reuses the language
// identifier's string when there are no extensions — confirm against the macro definition.
impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.write_to_string());
436
/// Checks that locales are written back exactly as the strings they were parsed from.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    assert_writeable_eq!(Locale::UND, "und");

    // Every tag below is already in the form the writer produces, so parsing it and
    // writing it back must yield the same string.
    for tag in [
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
        "my-t-my-d0-zawgyi",
        "ar-SA-u-ca-islamic-civil",
        "en-001-x-foo-bar",
        "und-t-m0-true",
    ] {
        assert_writeable_eq!(tag.parse::<Locale>().unwrap(), tag);
    }
}
466
467/// # Examples
468///
469/// ```
470/// use icu::locid::Locale;
471/// use icu::locid::{locale, subtags::language};
472///
473/// assert_eq!(Locale::from(language!("en")), locale!("en"));
474/// ```
475impl From<subtags::Language> for Locale {
476    fn from(language: subtags::Language) -> Self {
477        Self {
478            id: language.into(),
479            ..Default::default()
480        }
481    }
482}
483
484/// # Examples
485///
486/// ```
487/// use icu::locid::Locale;
488/// use icu::locid::{locale, subtags::script};
489///
490/// assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn"));
491/// ```
492impl From<Option<subtags::Script>> for Locale {
493    fn from(script: Option<subtags::Script>) -> Self {
494        Self {
495            id: script.into(),
496            ..Default::default()
497        }
498    }
499}
500
501/// # Examples
502///
503/// ```
504/// use icu::locid::Locale;
505/// use icu::locid::{locale, subtags::region};
506///
507/// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US"));
508/// ```
509impl From<Option<subtags::Region>> for Locale {
510    fn from(region: Option<subtags::Region>) -> Self {
511        Self {
512            id: region.into(),
513            ..Default::default()
514        }
515    }
516}
517
518/// # Examples
519///
520/// ```
521/// use icu::locid::Locale;
522/// use icu::locid::{
523///     locale,
524///     subtags::{language, region, script},
525/// };
526///
527/// assert_eq!(
528///     Locale::from((
529///         language!("en"),
530///         Some(script!("Latn")),
531///         Some(region!("US"))
532///     )),
533///     locale!("en-Latn-US")
534/// );
535/// ```
536impl
537    From<(
538        subtags::Language,
539        Option<subtags::Script>,
540        Option<subtags::Region>,
541    )> for Locale
542{
543    fn from(
544        lsr: (
545            subtags::Language,
546            Option<subtags::Script>,
547            Option<subtags::Region>,
548        ),
549    ) -> Self {
550        Self {
551            id: lsr.into(),
552            ..Default::default()
553        }
554    }
555}