icu_locid/langid.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use core::cmp::Ordering;
6use core::str::FromStr;
7
8#[allow(deprecated)]
9use crate::ordering::SubtagOrderingResult;
10use crate::parser::{
11 parse_language_identifier, parse_language_identifier_with_single_variant, ParserError,
12 ParserMode, SubtagIterator,
13};
14use crate::subtags;
15use alloc::string::String;
16use writeable::Writeable;
17
18/// A core struct representing a [`Unicode BCP47 Language Identifier`].
19///
20/// # Examples
21///
22/// ```
23/// use icu::locid::{
24/// langid,
25/// subtags::{language, region},
26/// };
27///
28/// let li = langid!("en-US");
29///
30/// assert_eq!(li.language, language!("en"));
31/// assert_eq!(li.script, None);
32/// assert_eq!(li.region, Some(region!("US")));
33/// assert_eq!(li.variants.len(), 0);
34/// ```
35///
36/// # Parsing
37///
38/// Unicode recognizes three levels of standard conformance for any language identifier:
39///
40/// * *well-formed* - syntactically correct
41/// * *valid* - well-formed and only uses registered language, region, script and variant subtags...
42/// * *canonical* - valid and no deprecated codes or structure.
43///
44/// At the moment parsing normalizes a well-formed language identifier converting
45/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
46///
47/// Any bogus subtags will cause the parsing to fail with an error.
48/// No subtag validation is performed.
49///
50/// # Examples
51///
52/// ```
53/// use icu::locid::{
54/// langid,
55/// subtags::{language, region, script, variant},
56/// };
57///
58/// let li = langid!("eN_latn_Us-Valencia");
59///
60/// assert_eq!(li.language, language!("en"));
61/// assert_eq!(li.script, Some(script!("Latn")));
62/// assert_eq!(li.region, Some(region!("US")));
63/// assert_eq!(li.variants.get(0), Some(&variant!("valencia")));
64/// ```
65///
66/// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier
67#[derive(Default, PartialEq, Eq, Clone, Hash)]
68#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
69pub struct LanguageIdentifier {
70 /// Language subtag of the language identifier.
71 pub language: subtags::Language,
72 /// Script subtag of the language identifier.
73 pub script: Option<subtags::Script>,
74 /// Region subtag of the language identifier.
75 pub region: Option<subtags::Region>,
76 /// Variant subtags of the language identifier.
77 pub variants: subtags::Variants,
78}
79
80impl LanguageIdentifier {
81 /// A constructor which takes a utf8 slice, parses it and
82 /// produces a well-formed [`LanguageIdentifier`].
83 ///
84 /// # Examples
85 ///
86 /// ```
87 /// use icu::locid::LanguageIdentifier;
88 ///
89 /// LanguageIdentifier::try_from_bytes(b"en-US").expect("Parsing failed");
90 /// ```
91 pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> {
92 parse_language_identifier(v, ParserMode::LanguageIdentifier)
93 }
94
95 #[doc(hidden)]
96 #[allow(clippy::type_complexity)]
97 // The return type should be `Result<Self, ParserError>` once the `const_precise_live_drops`
98 // is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)).
99 pub const fn try_from_bytes_with_single_variant(
100 v: &[u8],
101 ) -> Result<
102 (
103 subtags::Language,
104 Option<subtags::Script>,
105 Option<subtags::Region>,
106 Option<subtags::Variant>,
107 ),
108 ParserError,
109 > {
110 parse_language_identifier_with_single_variant(v, ParserMode::LanguageIdentifier)
111 }
112
113 /// A constructor which takes a utf8 slice which may contain extension keys,
114 /// parses it and produces a well-formed [`LanguageIdentifier`].
115 ///
116 /// # Examples
117 ///
118 /// ```
119 /// use icu::locid::{langid, LanguageIdentifier};
120 ///
121 /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix")
122 /// .expect("Parsing failed.");
123 ///
124 /// assert_eq!(li, langid!("en-US"));
125 /// ```
126 ///
127 /// This method should be used for input that may be a locale identifier.
128 /// All extensions will be lost.
129 pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParserError> {
130 parse_language_identifier(v, ParserMode::Locale)
131 }
132
133 /// The default undefined language "und". Same as [`default()`](Default::default()).
134 ///
135 /// # Examples
136 ///
137 /// ```
138 /// use icu::locid::LanguageIdentifier;
139 ///
140 /// assert_eq!(LanguageIdentifier::default(), LanguageIdentifier::UND);
141 /// ```
142 pub const UND: Self = Self {
143 language: subtags::Language::UND,
144 script: None,
145 region: None,
146 variants: subtags::Variants::new(),
147 };
148
149 /// This is a best-effort operation that performs all available levels of canonicalization.
150 ///
151 /// At the moment the operation will normalize casing and the separator, but in the future
152 /// it may also validate and update from deprecated subtags to canonical ones.
153 ///
154 /// # Examples
155 ///
156 /// ```
157 /// use icu::locid::LanguageIdentifier;
158 ///
159 /// assert_eq!(
160 /// LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
161 /// Ok("pl-Latn-PL")
162 /// );
163 /// ```
164 pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> {
165 let lang_id = Self::try_from_bytes(input.as_ref())?;
166 Ok(lang_id.write_to_string().into_owned())
167 }
168
169 /// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
170 ///
171 /// The return value is equivalent to what would happen if you first converted this
172 /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
173 ///
174 /// This function is case-sensitive and results in a *total order*, so it is appropriate for
175 /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
176 ///
177 /// # Examples
178 ///
179 /// ```
180 /// use icu::locid::LanguageIdentifier;
181 /// use std::cmp::Ordering;
182 ///
183 /// let bcp47_strings: &[&str] = &[
184 /// "pl-Latn-PL",
185 /// "und",
186 /// "und-Adlm",
187 /// "und-GB",
188 /// "und-ZA",
189 /// "und-fonipa",
190 /// "zh",
191 /// ];
192 ///
193 /// for ab in bcp47_strings.windows(2) {
194 /// let a = ab[0];
195 /// let b = ab[1];
196 /// assert!(a.cmp(b) == Ordering::Less);
197 /// let a_langid = a.parse::<LanguageIdentifier>().unwrap();
198 /// assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal);
199 /// assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less);
200 /// }
201 /// ```
202 pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
203 self.writeable_cmp_bytes(other)
204 }
205
206 pub(crate) fn as_tuple(
207 &self,
208 ) -> (
209 subtags::Language,
210 Option<subtags::Script>,
211 Option<subtags::Region>,
212 &subtags::Variants,
213 ) {
214 (self.language, self.script, self.region, &self.variants)
215 }
216
217 /// Compare this [`LanguageIdentifier`] with another [`LanguageIdentifier`] field-by-field.
218 /// The result is a total ordering sufficient for use in a [`BTreeMap`].
219 ///
220 /// Unlike [`Self::strict_cmp`], this function's ordering may not equal string ordering.
221 ///
222 /// [`BTreeMap`]: alloc::collections::BTreeMap
223 pub fn total_cmp(&self, other: &Self) -> Ordering {
224 self.as_tuple().cmp(&other.as_tuple())
225 }
226
227 /// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags.
228 ///
229 /// This function has the same equality semantics as [`LanguageIdentifier::strict_cmp`]. It is intended as
230 /// a more modular version that allows multiple subtag iterators to be chained together.
231 ///
232 /// For an additional example, see [`SubtagOrderingResult`].
233 ///
234 /// # Examples
235 ///
236 /// ```
237 /// use icu::locid::LanguageIdentifier;
238 /// use std::cmp::Ordering;
239 ///
240 /// let subtags: &[&[u8]] = &[b"ca", b"ES", b"valencia"];
241 ///
242 /// let loc = "ca-ES-valencia".parse::<LanguageIdentifier>().unwrap();
243 /// assert_eq!(
244 /// Ordering::Equal,
245 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
246 /// );
247 ///
248 /// let loc = "ca-ES".parse::<LanguageIdentifier>().unwrap();
249 /// assert_eq!(
250 /// Ordering::Less,
251 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
252 /// );
253 ///
254 /// let loc = "ca-ZA".parse::<LanguageIdentifier>().unwrap();
255 /// assert_eq!(
256 /// Ordering::Greater,
257 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
258 /// );
259 /// ```
260 #[deprecated(since = "1.5.0", note = "if you need this, please file an issue")]
261 #[allow(deprecated)]
262 pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
263 where
264 I: Iterator<Item = &'l [u8]>,
265 {
266 let r = self.for_each_subtag_str(&mut |subtag| {
267 if let Some(other) = subtags.next() {
268 match subtag.as_bytes().cmp(other) {
269 Ordering::Equal => Ok(()),
270 not_equal => Err(not_equal),
271 }
272 } else {
273 Err(Ordering::Greater)
274 }
275 });
276 match r {
277 Ok(_) => SubtagOrderingResult::Subtags(subtags),
278 Err(o) => SubtagOrderingResult::Ordering(o),
279 }
280 }
281
282 /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string.
283 ///
284 /// The return value is equivalent to what would happen if you first parsed the
285 /// BCP-47 string to a `LanguageIdentifier` and then performed a structural comparison.
286 ///
287 /// # Examples
288 ///
289 /// ```
290 /// use icu::locid::LanguageIdentifier;
291 ///
292 /// let bcp47_strings: &[&str] = &[
293 /// "pl-LaTn-pL",
294 /// "uNd",
295 /// "UnD-adlm",
296 /// "uNd-GB",
297 /// "UND-FONIPA",
298 /// "ZH",
299 /// ];
300 ///
301 /// for a in bcp47_strings {
302 /// assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a));
303 /// }
304 /// ```
305 pub fn normalizing_eq(&self, other: &str) -> bool {
306 macro_rules! subtag_matches {
307 ($T:ty, $iter:ident, $expected:expr) => {
308 $iter
309 .next()
310 .map(|b| <$T>::try_from_bytes(b) == Ok($expected))
311 .unwrap_or(false)
312 };
313 }
314
315 let mut iter = SubtagIterator::new(other.as_bytes());
316 if !subtag_matches!(subtags::Language, iter, self.language) {
317 return false;
318 }
319 if let Some(ref script) = self.script {
320 if !subtag_matches!(subtags::Script, iter, *script) {
321 return false;
322 }
323 }
324 if let Some(ref region) = self.region {
325 if !subtag_matches!(subtags::Region, iter, *region) {
326 return false;
327 }
328 }
329 for variant in self.variants.iter() {
330 if !subtag_matches!(subtags::Variant, iter, *variant) {
331 return false;
332 }
333 }
334 iter.next().is_none()
335 }
336
337 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
338 where
339 F: FnMut(&str) -> Result<(), E>,
340 {
341 f(self.language.as_str())?;
342 if let Some(ref script) = self.script {
343 f(script.as_str())?;
344 }
345 if let Some(ref region) = self.region {
346 f(region.as_str())?;
347 }
348 for variant in self.variants.iter() {
349 f(variant.as_str())?;
350 }
351 Ok(())
352 }
353
354 /// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
355 /// lowercase ascii form.
356 ///
357 /// The default canonicalization of language identifiers uses titlecase scripts and uppercase
358 /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
359 ///
360 /// > _The canonical form for all subtags in the extension is lowercase, with the fields
361 /// ordered by the separators, alphabetically._
362 ///
363 /// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
364 /// canonicalization of the language identifier.
365 ///
366 /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
367 /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
368 /// but titlecased and uppercased outside T extensions respectively.
369 ///
370 /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
371 /// [`Transform extensions`]: crate::extensions::transform
372 pub(crate) fn for_each_subtag_str_lowercased<E, F>(&self, f: &mut F) -> Result<(), E>
373 where
374 F: FnMut(&str) -> Result<(), E>,
375 {
376 f(self.language.as_str())?;
377 if let Some(ref script) = self.script {
378 f(script.into_tinystr().to_ascii_lowercase().as_str())?;
379 }
380 if let Some(ref region) = self.region {
381 f(region.into_tinystr().to_ascii_lowercase().as_str())?;
382 }
383 for variant in self.variants.iter() {
384 f(variant.as_str())?;
385 }
386 Ok(())
387 }
388
389 /// Writes this `LanguageIdentifier` to a sink, replacing uppercase ascii chars with
390 /// lowercase ascii chars.
391 ///
392 /// The default canonicalization of language identifiers uses titlecase scripts and uppercase
393 /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
394 ///
395 /// > _The canonical form for all subtags in the extension is lowercase, with the fields
396 /// ordered by the separators, alphabetically._
397 ///
398 /// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
399 /// canonicalization of the language identifier.
400 ///
401 /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
402 /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
403 /// but titlecased and uppercased outside T extensions respectively.
404 ///
405 /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
406 /// [`Transform extensions`]: crate::extensions::transform
407 pub(crate) fn write_lowercased_to<W: core::fmt::Write + ?Sized>(
408 &self,
409 sink: &mut W,
410 ) -> core::fmt::Result {
411 let mut initial = true;
412 self.for_each_subtag_str_lowercased(&mut |subtag| {
413 if initial {
414 initial = false;
415 } else {
416 sink.write_char('-')?;
417 }
418 sink.write_str(subtag)
419 })
420 }
421}
422
423impl AsRef<LanguageIdentifier> for LanguageIdentifier {
424 #[inline(always)]
425 fn as_ref(&self) -> &Self {
426 self
427 }
428}
429
430impl AsMut<LanguageIdentifier> for LanguageIdentifier {
431 fn as_mut(&mut self) -> &mut Self {
432 self
433 }
434}
435
436impl core::fmt::Debug for LanguageIdentifier {
437 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
438 core::fmt::Display::fmt(&self, f)
439 }
440}
441
442impl FromStr for LanguageIdentifier {
443 type Err = ParserError;
444
445 fn from_str(source: &str) -> Result<Self, Self::Err> {
446 Self::try_from_bytes(source.as_bytes())
447 }
448}
449
450impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => selff.language.write_to_string());
451
/// Checks that writing a parsed identifier reproduces the normalized input, across
/// identifiers with different combinations of script, region and variants.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    assert_writeable_eq!(LanguageIdentifier::UND, "und");

    // Every entry is already in normalized form, so parsing and writing must round-trip.
    let round_trips: &[&str] = &[
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
    ];
    for &tag in round_trips {
        assert_writeable_eq!(tag.parse::<LanguageIdentifier>().unwrap(), tag);
    }
}
474
475/// # Examples
476///
477/// ```
478/// use icu::locid::{langid, subtags::language, LanguageIdentifier};
479///
480/// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en"));
481/// ```
482impl From<subtags::Language> for LanguageIdentifier {
483 fn from(language: subtags::Language) -> Self {
484 Self {
485 language,
486 ..Default::default()
487 }
488 }
489}
490
491/// # Examples
492///
493/// ```
494/// use icu::locid::{langid, subtags::script, LanguageIdentifier};
495///
496/// assert_eq!(
497/// LanguageIdentifier::from(Some(script!("latn"))),
498/// langid!("und-Latn")
499/// );
500/// ```
501impl From<Option<subtags::Script>> for LanguageIdentifier {
502 fn from(script: Option<subtags::Script>) -> Self {
503 Self {
504 script,
505 ..Default::default()
506 }
507 }
508}
509
510/// # Examples
511///
512/// ```
513/// use icu::locid::{langid, subtags::region, LanguageIdentifier};
514///
515/// assert_eq!(
516/// LanguageIdentifier::from(Some(region!("US"))),
517/// langid!("und-US")
518/// );
519/// ```
520impl From<Option<subtags::Region>> for LanguageIdentifier {
521 fn from(region: Option<subtags::Region>) -> Self {
522 Self {
523 region,
524 ..Default::default()
525 }
526 }
527}
528
529/// Convert from an LSR tuple to a [`LanguageIdentifier`].
530///
531/// # Examples
532///
533/// ```
534/// use icu::locid::{
535/// langid,
536/// subtags::{language, region, script},
537/// LanguageIdentifier,
538/// };
539///
540/// let lang = language!("en");
541/// let script = script!("Latn");
542/// let region = region!("US");
543/// assert_eq!(
544/// LanguageIdentifier::from((lang, Some(script), Some(region))),
545/// langid!("en-Latn-US")
546/// );
547/// ```
548impl
549 From<(
550 subtags::Language,
551 Option<subtags::Script>,
552 Option<subtags::Region>,
553 )> for LanguageIdentifier
554{
555 fn from(
556 lsr: (
557 subtags::Language,
558 Option<subtags::Script>,
559 Option<subtags::Region>,
560 ),
561 ) -> Self {
562 Self {
563 language: lsr.0,
564 script: lsr.1,
565 region: lsr.2,
566 ..Default::default()
567 }
568 }
569}
570
571/// Convert from a [`LanguageIdentifier`] to an LSR tuple.
572///
573/// # Examples
574///
575/// ```
576/// use icu::locid::{
577/// langid,
578/// subtags::{language, region, script},
579/// };
580///
581/// let lid = langid!("en-Latn-US");
582/// let (lang, script, region) = (&lid).into();
583///
584/// assert_eq!(lang, language!("en"));
585/// assert_eq!(script, Some(script!("Latn")));
586/// assert_eq!(region, Some(region!("US")));
587/// ```
588impl From<&LanguageIdentifier>
589 for (
590 subtags::Language,
591 Option<subtags::Script>,
592 Option<subtags::Region>,
593 )
594{
595 fn from(langid: &LanguageIdentifier) -> Self {
596 (langid.language, langid.script, langid.region)
597 }
598}