icu_locid/
langid.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use core::cmp::Ordering;
6use core::str::FromStr;
7
8#[allow(deprecated)]
9use crate::ordering::SubtagOrderingResult;
10use crate::parser::{
11    parse_language_identifier, parse_language_identifier_with_single_variant, ParserError,
12    ParserMode, SubtagIterator,
13};
14use crate::subtags;
15use alloc::string::String;
16use writeable::Writeable;
17
18/// A core struct representing a [`Unicode BCP47 Language Identifier`].
19///
20/// # Examples
21///
22/// ```
23/// use icu::locid::{
24///     langid,
25///     subtags::{language, region},
26/// };
27///
28/// let li = langid!("en-US");
29///
30/// assert_eq!(li.language, language!("en"));
31/// assert_eq!(li.script, None);
32/// assert_eq!(li.region, Some(region!("US")));
33/// assert_eq!(li.variants.len(), 0);
34/// ```
35///
36/// # Parsing
37///
38/// Unicode recognizes three levels of standard conformance for any language identifier:
39///
40///  * *well-formed* - syntactically correct
41///  * *valid* - well-formed and only uses registered language, region, script and variant subtags...
42///  * *canonical* - valid and no deprecated codes or structure.
43///
44/// At the moment parsing normalizes a well-formed language identifier converting
45/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
46///
47/// Any bogus subtags will cause the parsing to fail with an error.
48/// No subtag validation is performed.
49///
50/// # Examples
51///
52/// ```
53/// use icu::locid::{
54///     langid,
55///     subtags::{language, region, script, variant},
56/// };
57///
58/// let li = langid!("eN_latn_Us-Valencia");
59///
60/// assert_eq!(li.language, language!("en"));
61/// assert_eq!(li.script, Some(script!("Latn")));
62/// assert_eq!(li.region, Some(region!("US")));
63/// assert_eq!(li.variants.get(0), Some(&variant!("valencia")));
64/// ```
65///
66/// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier
67#[derive(Default, PartialEq, Eq, Clone, Hash)]
68#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
69pub struct LanguageIdentifier {
70    /// Language subtag of the language identifier.
71    pub language: subtags::Language,
72    /// Script subtag of the language identifier.
73    pub script: Option<subtags::Script>,
74    /// Region subtag of the language identifier.
75    pub region: Option<subtags::Region>,
76    /// Variant subtags of the language identifier.
77    pub variants: subtags::Variants,
78}
79
80impl LanguageIdentifier {
81    /// A constructor which takes a utf8 slice, parses it and
82    /// produces a well-formed [`LanguageIdentifier`].
83    ///
84    /// # Examples
85    ///
86    /// ```
87    /// use icu::locid::LanguageIdentifier;
88    ///
89    /// LanguageIdentifier::try_from_bytes(b"en-US").expect("Parsing failed");
90    /// ```
91    pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> {
92        parse_language_identifier(v, ParserMode::LanguageIdentifier)
93    }
94
95    #[doc(hidden)]
96    #[allow(clippy::type_complexity)]
97    // The return type should be `Result<Self, ParserError>` once the `const_precise_live_drops`
98    // is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)).
99    pub const fn try_from_bytes_with_single_variant(
100        v: &[u8],
101    ) -> Result<
102        (
103            subtags::Language,
104            Option<subtags::Script>,
105            Option<subtags::Region>,
106            Option<subtags::Variant>,
107        ),
108        ParserError,
109    > {
110        parse_language_identifier_with_single_variant(v, ParserMode::LanguageIdentifier)
111    }
112
113    /// A constructor which takes a utf8 slice which may contain extension keys,
114    /// parses it and produces a well-formed [`LanguageIdentifier`].
115    ///
116    /// # Examples
117    ///
118    /// ```
119    /// use icu::locid::{langid, LanguageIdentifier};
120    ///
121    /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix")
122    ///     .expect("Parsing failed.");
123    ///
124    /// assert_eq!(li, langid!("en-US"));
125    /// ```
126    ///
127    /// This method should be used for input that may be a locale identifier.
128    /// All extensions will be lost.
129    pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParserError> {
130        parse_language_identifier(v, ParserMode::Locale)
131    }
132
133    /// The default undefined language "und". Same as [`default()`](Default::default()).
134    ///
135    /// # Examples
136    ///
137    /// ```
138    /// use icu::locid::LanguageIdentifier;
139    ///
140    /// assert_eq!(LanguageIdentifier::default(), LanguageIdentifier::UND);
141    /// ```
142    pub const UND: Self = Self {
143        language: subtags::Language::UND,
144        script: None,
145        region: None,
146        variants: subtags::Variants::new(),
147    };
148
149    /// This is a best-effort operation that performs all available levels of canonicalization.
150    ///
151    /// At the moment the operation will normalize casing and the separator, but in the future
152    /// it may also validate and update from deprecated subtags to canonical ones.
153    ///
154    /// # Examples
155    ///
156    /// ```
157    /// use icu::locid::LanguageIdentifier;
158    ///
159    /// assert_eq!(
160    ///     LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
161    ///     Ok("pl-Latn-PL")
162    /// );
163    /// ```
164    pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> {
165        let lang_id = Self::try_from_bytes(input.as_ref())?;
166        Ok(lang_id.write_to_string().into_owned())
167    }
168
169    /// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
170    ///
171    /// The return value is equivalent to what would happen if you first converted this
172    /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
173    ///
174    /// This function is case-sensitive and results in a *total order*, so it is appropriate for
175    /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
176    ///
177    /// # Examples
178    ///
179    /// ```
180    /// use icu::locid::LanguageIdentifier;
181    /// use std::cmp::Ordering;
182    ///
183    /// let bcp47_strings: &[&str] = &[
184    ///     "pl-Latn-PL",
185    ///     "und",
186    ///     "und-Adlm",
187    ///     "und-GB",
188    ///     "und-ZA",
189    ///     "und-fonipa",
190    ///     "zh",
191    /// ];
192    ///
193    /// for ab in bcp47_strings.windows(2) {
194    ///     let a = ab[0];
195    ///     let b = ab[1];
196    ///     assert!(a.cmp(b) == Ordering::Less);
197    ///     let a_langid = a.parse::<LanguageIdentifier>().unwrap();
198    ///     assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal);
199    ///     assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less);
200    /// }
201    /// ```
202    pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
203        self.writeable_cmp_bytes(other)
204    }
205
206    pub(crate) fn as_tuple(
207        &self,
208    ) -> (
209        subtags::Language,
210        Option<subtags::Script>,
211        Option<subtags::Region>,
212        &subtags::Variants,
213    ) {
214        (self.language, self.script, self.region, &self.variants)
215    }
216
217    /// Compare this [`LanguageIdentifier`] with another [`LanguageIdentifier`] field-by-field.
218    /// The result is a total ordering sufficient for use in a [`BTreeMap`].
219    ///
220    /// Unlike [`Self::strict_cmp`], this function's ordering may not equal string ordering.
221    ///
222    /// [`BTreeMap`]: alloc::collections::BTreeMap
223    pub fn total_cmp(&self, other: &Self) -> Ordering {
224        self.as_tuple().cmp(&other.as_tuple())
225    }
226
227    /// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags.
228    ///
229    /// This function has the same equality semantics as [`LanguageIdentifier::strict_cmp`]. It is intended as
230    /// a more modular version that allows multiple subtag iterators to be chained together.
231    ///
232    /// For an additional example, see [`SubtagOrderingResult`].
233    ///
234    /// # Examples
235    ///
236    /// ```
237    /// use icu::locid::LanguageIdentifier;
238    /// use std::cmp::Ordering;
239    ///
240    /// let subtags: &[&[u8]] = &[b"ca", b"ES", b"valencia"];
241    ///
242    /// let loc = "ca-ES-valencia".parse::<LanguageIdentifier>().unwrap();
243    /// assert_eq!(
244    ///     Ordering::Equal,
245    ///     loc.strict_cmp_iter(subtags.iter().copied()).end()
246    /// );
247    ///
248    /// let loc = "ca-ES".parse::<LanguageIdentifier>().unwrap();
249    /// assert_eq!(
250    ///     Ordering::Less,
251    ///     loc.strict_cmp_iter(subtags.iter().copied()).end()
252    /// );
253    ///
254    /// let loc = "ca-ZA".parse::<LanguageIdentifier>().unwrap();
255    /// assert_eq!(
256    ///     Ordering::Greater,
257    ///     loc.strict_cmp_iter(subtags.iter().copied()).end()
258    /// );
259    /// ```
260    #[deprecated(since = "1.5.0", note = "if you need this, please file an issue")]
261    #[allow(deprecated)]
262    pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
263    where
264        I: Iterator<Item = &'l [u8]>,
265    {
266        let r = self.for_each_subtag_str(&mut |subtag| {
267            if let Some(other) = subtags.next() {
268                match subtag.as_bytes().cmp(other) {
269                    Ordering::Equal => Ok(()),
270                    not_equal => Err(not_equal),
271                }
272            } else {
273                Err(Ordering::Greater)
274            }
275        });
276        match r {
277            Ok(_) => SubtagOrderingResult::Subtags(subtags),
278            Err(o) => SubtagOrderingResult::Ordering(o),
279        }
280    }
281
282    /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string.
283    ///
284    /// The return value is equivalent to what would happen if you first parsed the
285    /// BCP-47 string to a `LanguageIdentifier` and then performed a structural comparison.
286    ///
287    /// # Examples
288    ///
289    /// ```
290    /// use icu::locid::LanguageIdentifier;
291    ///
292    /// let bcp47_strings: &[&str] = &[
293    ///     "pl-LaTn-pL",
294    ///     "uNd",
295    ///     "UnD-adlm",
296    ///     "uNd-GB",
297    ///     "UND-FONIPA",
298    ///     "ZH",
299    /// ];
300    ///
301    /// for a in bcp47_strings {
302    ///     assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a));
303    /// }
304    /// ```
305    pub fn normalizing_eq(&self, other: &str) -> bool {
306        macro_rules! subtag_matches {
307            ($T:ty, $iter:ident, $expected:expr) => {
308                $iter
309                    .next()
310                    .map(|b| <$T>::try_from_bytes(b) == Ok($expected))
311                    .unwrap_or(false)
312            };
313        }
314
315        let mut iter = SubtagIterator::new(other.as_bytes());
316        if !subtag_matches!(subtags::Language, iter, self.language) {
317            return false;
318        }
319        if let Some(ref script) = self.script {
320            if !subtag_matches!(subtags::Script, iter, *script) {
321                return false;
322            }
323        }
324        if let Some(ref region) = self.region {
325            if !subtag_matches!(subtags::Region, iter, *region) {
326                return false;
327            }
328        }
329        for variant in self.variants.iter() {
330            if !subtag_matches!(subtags::Variant, iter, *variant) {
331                return false;
332            }
333        }
334        iter.next().is_none()
335    }
336
337    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
338    where
339        F: FnMut(&str) -> Result<(), E>,
340    {
341        f(self.language.as_str())?;
342        if let Some(ref script) = self.script {
343            f(script.as_str())?;
344        }
345        if let Some(ref region) = self.region {
346            f(region.as_str())?;
347        }
348        for variant in self.variants.iter() {
349            f(variant.as_str())?;
350        }
351        Ok(())
352    }
353
354    /// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
355    /// lowercase ascii form.
356    ///
357    /// The default canonicalization of language identifiers uses titlecase scripts and uppercase
358    /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
359    ///
360    /// > _The canonical form for all subtags in the extension is lowercase, with the fields
361    /// ordered by the separators, alphabetically._
362    ///
363    /// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
364    /// canonicalization of the language identifier.
365    ///
366    /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
367    /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
368    /// but titlecased and uppercased outside T extensions respectively.
369    ///
370    /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
371    /// [`Transform extensions`]: crate::extensions::transform
372    pub(crate) fn for_each_subtag_str_lowercased<E, F>(&self, f: &mut F) -> Result<(), E>
373    where
374        F: FnMut(&str) -> Result<(), E>,
375    {
376        f(self.language.as_str())?;
377        if let Some(ref script) = self.script {
378            f(script.into_tinystr().to_ascii_lowercase().as_str())?;
379        }
380        if let Some(ref region) = self.region {
381            f(region.into_tinystr().to_ascii_lowercase().as_str())?;
382        }
383        for variant in self.variants.iter() {
384            f(variant.as_str())?;
385        }
386        Ok(())
387    }
388
389    /// Writes this `LanguageIdentifier` to a sink, replacing uppercase ascii chars with
390    /// lowercase ascii chars.
391    ///
392    /// The default canonicalization of language identifiers uses titlecase scripts and uppercase
393    /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
394    ///
395    /// > _The canonical form for all subtags in the extension is lowercase, with the fields
396    /// ordered by the separators, alphabetically._
397    ///
398    /// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
399    /// canonicalization of the language identifier.
400    ///
401    /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
402    /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
403    /// but titlecased and uppercased outside T extensions respectively.
404    ///
405    /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
406    /// [`Transform extensions`]: crate::extensions::transform
407    pub(crate) fn write_lowercased_to<W: core::fmt::Write + ?Sized>(
408        &self,
409        sink: &mut W,
410    ) -> core::fmt::Result {
411        let mut initial = true;
412        self.for_each_subtag_str_lowercased(&mut |subtag| {
413            if initial {
414                initial = false;
415            } else {
416                sink.write_char('-')?;
417            }
418            sink.write_str(subtag)
419        })
420    }
421}
422
423impl AsRef<LanguageIdentifier> for LanguageIdentifier {
424    #[inline(always)]
425    fn as_ref(&self) -> &Self {
426        self
427    }
428}
429
430impl AsMut<LanguageIdentifier> for LanguageIdentifier {
431    fn as_mut(&mut self) -> &mut Self {
432        self
433    }
434}
435
436impl core::fmt::Debug for LanguageIdentifier {
437    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
438        core::fmt::Display::fmt(&self, f)
439    }
440}
441
442impl FromStr for LanguageIdentifier {
443    type Err = ParserError;
444
445    fn from_str(source: &str) -> Result<Self, Self::Err> {
446        Self::try_from_bytes(source.as_bytes())
447    }
448}
449
// Invokes a crate-local macro (its definition is not in this file) for `LanguageIdentifier`.
// NOTE(review): judging by its name and by the `write_to_string` calls elsewhere in this
// file, it presumably generates the `Writeable` impl from the per-subtag walker — confirm
// against the macro definition. The trailing `cond => expr` arm binds the value as `selff`
// and supplies `selff.language.write_to_string()` for the case where script, region and
// variants are all absent.
impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => selff.language.write_to_string());
451
// Checks that parsing a canonical string and writing it back yields the same string.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    // Every input here must parse, so unwrapping is an assertion in its own right.
    let parse = |s: &str| s.parse::<LanguageIdentifier>().unwrap();

    assert_writeable_eq!(LanguageIdentifier::UND, "und");
    assert_writeable_eq!(parse("und-001"), "und-001");
    assert_writeable_eq!(parse("und-Mymr"), "und-Mymr");
    assert_writeable_eq!(parse("my-Mymr-MM"), "my-Mymr-MM");
    assert_writeable_eq!(parse("my-Mymr-MM-posix"), "my-Mymr-MM-posix");
    assert_writeable_eq!(parse("zh-macos-posix"), "zh-macos-posix");
}
474
475/// # Examples
476///
477/// ```
478/// use icu::locid::{langid, subtags::language, LanguageIdentifier};
479///
480/// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en"));
481/// ```
482impl From<subtags::Language> for LanguageIdentifier {
483    fn from(language: subtags::Language) -> Self {
484        Self {
485            language,
486            ..Default::default()
487        }
488    }
489}
490
491/// # Examples
492///
493/// ```
494/// use icu::locid::{langid, subtags::script, LanguageIdentifier};
495///
496/// assert_eq!(
497///     LanguageIdentifier::from(Some(script!("latn"))),
498///     langid!("und-Latn")
499/// );
500/// ```
501impl From<Option<subtags::Script>> for LanguageIdentifier {
502    fn from(script: Option<subtags::Script>) -> Self {
503        Self {
504            script,
505            ..Default::default()
506        }
507    }
508}
509
510/// # Examples
511///
512/// ```
513/// use icu::locid::{langid, subtags::region, LanguageIdentifier};
514///
515/// assert_eq!(
516///     LanguageIdentifier::from(Some(region!("US"))),
517///     langid!("und-US")
518/// );
519/// ```
520impl From<Option<subtags::Region>> for LanguageIdentifier {
521    fn from(region: Option<subtags::Region>) -> Self {
522        Self {
523            region,
524            ..Default::default()
525        }
526    }
527}
528
529/// Convert from an LSR tuple to a [`LanguageIdentifier`].
530///
531/// # Examples
532///
533/// ```
534/// use icu::locid::{
535///     langid,
536///     subtags::{language, region, script},
537///     LanguageIdentifier,
538/// };
539///
540/// let lang = language!("en");
541/// let script = script!("Latn");
542/// let region = region!("US");
543/// assert_eq!(
544///     LanguageIdentifier::from((lang, Some(script), Some(region))),
545///     langid!("en-Latn-US")
546/// );
547/// ```
548impl
549    From<(
550        subtags::Language,
551        Option<subtags::Script>,
552        Option<subtags::Region>,
553    )> for LanguageIdentifier
554{
555    fn from(
556        lsr: (
557            subtags::Language,
558            Option<subtags::Script>,
559            Option<subtags::Region>,
560        ),
561    ) -> Self {
562        Self {
563            language: lsr.0,
564            script: lsr.1,
565            region: lsr.2,
566            ..Default::default()
567        }
568    }
569}
570
571/// Convert from a [`LanguageIdentifier`] to an LSR tuple.
572///
573/// # Examples
574///
575/// ```
576/// use icu::locid::{
577///     langid,
578///     subtags::{language, region, script},
579/// };
580///
581/// let lid = langid!("en-Latn-US");
582/// let (lang, script, region) = (&lid).into();
583///
584/// assert_eq!(lang, language!("en"));
585/// assert_eq!(script, Some(script!("Latn")));
586/// assert_eq!(region, Some(region!("US")));
587/// ```
588impl From<&LanguageIdentifier>
589    for (
590        subtags::Language,
591        Option<subtags::Script>,
592        Option<subtags::Region>,
593    )
594{
595    fn from(langid: &LanguageIdentifier) -> Self {
596        (langid.language, langid.script, langid.region)
597    }
598}