icu_locid/locale.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#[allow(deprecated)]
6use crate::ordering::SubtagOrderingResult;
7use crate::parser::{
8 parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension,
9 ParserError, ParserMode, SubtagIterator,
10};
11use crate::{extensions, subtags, LanguageIdentifier};
12use alloc::string::String;
13use core::cmp::Ordering;
14use core::str::FromStr;
15use tinystr::TinyAsciiStr;
16use writeable::Writeable;
17
/// A core struct representing a [`Unicode Locale Identifier`].
///
/// A locale is made of two parts:
///  * a Unicode Language Identifier (the [`id`](Locale::id) field)
///  * a set of Unicode Extensions (the [`extensions`](Locale::extensions) field)
///
/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
/// on top of that is able to parse, manipulate and serialize Unicode extension fields.
///
/// # Examples
///
/// ```
/// use icu::locid::{
///     extensions::unicode::{key, value},
///     locale,
///     subtags::{language, region},
/// };
///
/// let loc = locale!("en-US-u-ca-buddhist");
///
/// assert_eq!(loc.id.language, language!("en"));
/// assert_eq!(loc.id.script, None);
/// assert_eq!(loc.id.region, Some(region!("US")));
/// assert_eq!(loc.id.variants.len(), 0);
/// assert_eq!(
///     loc.extensions.unicode.keywords.get(&key!("ca")),
///     Some(&value!("buddhist"))
/// );
/// ```
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for a locale:
///
///  * *well-formed* - syntactically correct
///  * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
///  * *canonical* - valid and no deprecated codes or structure.
///
/// At the moment, parsing normalizes a well-formed locale identifier by converting
/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
///
/// Any bogus subtags will cause the parsing to fail with an error.
///
/// No subtag validation or alias resolution is performed.
///
/// # Examples
///
/// ```
/// use icu::locid::{subtags::*, Locale};
///
/// let loc: Locale = "eN_latn_Us-Valencia_u-hC-H12"
///     .parse()
///     .expect("Failed to parse.");
///
/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap());
/// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok());
/// assert_eq!(loc.id.region, "US".parse::<Region>().ok());
/// assert_eq!(
///     loc.id.variants.get(0),
///     "valencia".parse::<Variant>().ok().as_ref()
/// );
/// ```
///
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier
#[derive(Default, PartialEq, Eq, Clone, Hash)]
#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
pub struct Locale {
    /// The basic language/script/region components in the locale identifier along with any variants.
    pub id: LanguageIdentifier,
    /// Any extensions present in the locale identifier.
    pub extensions: extensions::Extensions,
}
90
/// Pins the in-memory size of the locale data structures so that accidental
/// growth shows up as a test failure.
///
/// The expected byte counts are only meaningful on 64-bit targets: several of
/// the measured types hold pointer-sized words (a `Vec`, for instance, is a
/// pointer/capacity/length triplet, i.e. 24 bytes with 8-byte pointers). On
/// targets with a different pointer width the numbers below would fail without
/// indicating a real regression, so the test is compiled for 64-bit only.
#[test]
#[cfg(target_pointer_width = "64")]
fn test_sizes() {
    // Language identifier and its subtags.
    assert_eq!(core::mem::size_of::<subtags::Language>(), 3);
    assert_eq!(core::mem::size_of::<subtags::Script>(), 4);
    assert_eq!(core::mem::size_of::<subtags::Region>(), 3);
    assert_eq!(core::mem::size_of::<subtags::Variant>(), 8);
    assert_eq!(core::mem::size_of::<subtags::Variants>(), 16);
    assert_eq!(core::mem::size_of::<LanguageIdentifier>(), 32);

    // Transform extension (`-t-`).
    assert_eq!(core::mem::size_of::<extensions::transform::Transform>(), 56);
    assert_eq!(core::mem::size_of::<Option<LanguageIdentifier>>(), 32);
    assert_eq!(core::mem::size_of::<extensions::transform::Fields>(), 24);

    // Unicode (`-u-`), other and private-use (`-x-`) extensions.
    assert_eq!(core::mem::size_of::<extensions::unicode::Attributes>(), 16);
    assert_eq!(core::mem::size_of::<extensions::unicode::Keywords>(), 24);
    assert_eq!(core::mem::size_of::<Vec<extensions::other::Other>>(), 24);
    assert_eq!(core::mem::size_of::<extensions::private::Private>(), 16);
    assert_eq!(core::mem::size_of::<extensions::Extensions>(), 136);

    // The full locale: language identifier plus extensions.
    assert_eq!(core::mem::size_of::<Locale>(), 168);
}
112
113impl Locale {
    /// A constructor which takes a UTF-8 byte slice, parses it and
    /// produces a well-formed [`Locale`].
    ///
    /// # Errors
    ///
    /// Returns the [`ParserError`] reported by the parser when `v` cannot be
    /// parsed as a locale identifier.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locid::Locale;
    ///
    /// Locale::try_from_bytes(b"en-US-u-hc-h12").unwrap();
    /// ```
    pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> {
        // All parsing work is delegated to the crate's locale parser.
        parse_locale(v)
    }
127
    /// The default undefined locale "und". Same as [`default()`](Default::default()).
    ///
    /// It is built from [`LanguageIdentifier::UND`] and a newly constructed
    /// set of extensions, and, being a `const`, can be used in constant contexts.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locid::Locale;
    ///
    /// assert_eq!(Locale::default(), Locale::UND);
    /// ```
    pub const UND: Self = Self {
        id: LanguageIdentifier::UND,
        extensions: extensions::Extensions::new(),
    };
141
142 /// This is a best-effort operation that performs all available levels of canonicalization.
143 ///
144 /// At the moment the operation will normalize casing and the separator, but in the future
145 /// it may also validate and update from deprecated subtags to canonical ones.
146 ///
147 /// # Examples
148 ///
149 /// ```
150 /// use icu::locid::Locale;
151 ///
152 /// assert_eq!(
153 /// Locale::canonicalize("pL_latn_pl-U-HC-H12").as_deref(),
154 /// Ok("pl-Latn-PL-u-hc-h12")
155 /// );
156 /// ```
157 pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> {
158 let locale = Self::try_from_bytes(input.as_ref())?;
159 Ok(locale.write_to_string().into_owned())
160 }
161
    /// Compare this [`Locale`] with BCP-47 bytes.
    ///
    /// The return value is equivalent to what would happen if you first converted this
    /// [`Locale`] to a BCP-47 string and then performed a byte comparison.
    ///
    /// This function is case-sensitive and results in a *total order*, so it is appropriate for
    /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locid::Locale;
    /// use std::cmp::Ordering;
    ///
    /// let bcp47_strings: &[&str] = &[
    ///     "pl-Latn-PL",
    ///     "und",
    ///     "und-fonipa",
    ///     "und-t-m0-true",
    ///     "und-u-ca-hebrew",
    ///     "und-u-ca-japanese",
    ///     "zh",
    /// ];
    ///
    /// for ab in bcp47_strings.windows(2) {
    ///     let a = ab[0];
    ///     let b = ab[1];
    ///     assert!(a.cmp(b) == Ordering::Less);
    ///     let a_loc = a.parse::<Locale>().unwrap();
    ///     assert!(a_loc.strict_cmp(a.as_bytes()) == Ordering::Equal);
    ///     assert!(a_loc.strict_cmp(b.as_bytes()) == Ordering::Less);
    /// }
    /// ```
    pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
        // The comparison against the written-out form is delegated entirely
        // to `writeable_cmp_bytes`.
        self.writeable_cmp_bytes(other)
    }
198
199 #[allow(clippy::type_complexity)]
200 pub(crate) fn as_tuple(
201 &self,
202 ) -> (
203 (
204 subtags::Language,
205 Option<subtags::Script>,
206 Option<subtags::Region>,
207 &subtags::Variants,
208 ),
209 (
210 (
211 &extensions::unicode::Attributes,
212 &extensions::unicode::Keywords,
213 ),
214 (
215 Option<(
216 subtags::Language,
217 Option<subtags::Script>,
218 Option<subtags::Region>,
219 &subtags::Variants,
220 )>,
221 &extensions::transform::Fields,
222 ),
223 &extensions::private::Private,
224 &[extensions::other::Other],
225 ),
226 ) {
227 (self.id.as_tuple(), self.extensions.as_tuple())
228 }
229
230 /// Returns an ordering suitable for use in [`BTreeSet`].
231 ///
232 /// The ordering may or may not be equivalent to string ordering, and it
233 /// may or may not be stable across ICU4X releases.
234 ///
235 /// [`BTreeSet`]: alloc::collections::BTreeSet
236 pub fn total_cmp(&self, other: &Self) -> Ordering {
237 self.as_tuple().cmp(&other.as_tuple())
238 }
239
240 /// Compare this [`Locale`] with an iterator of BCP-47 subtags.
241 ///
242 /// This function has the same equality semantics as [`Locale::strict_cmp`]. It is intended as
243 /// a more modular version that allows multiple subtag iterators to be chained together.
244 ///
245 /// For an additional example, see [`SubtagOrderingResult`].
246 ///
247 /// # Examples
248 ///
249 /// ```
250 /// use icu::locid::locale;
251 /// use std::cmp::Ordering;
252 ///
253 /// let subtags: &[&[u8]] =
254 /// &[b"ca", b"ES", b"valencia", b"u", b"ca", b"hebrew"];
255 ///
256 /// let loc = locale!("ca-ES-valencia-u-ca-hebrew");
257 /// assert_eq!(
258 /// Ordering::Equal,
259 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
260 /// );
261 ///
262 /// let loc = locale!("ca-ES-valencia");
263 /// assert_eq!(
264 /// Ordering::Less,
265 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
266 /// );
267 ///
268 /// let loc = locale!("ca-ES-valencia-u-nu-arab");
269 /// assert_eq!(
270 /// Ordering::Greater,
271 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
272 /// );
273 /// ```
274 #[deprecated(since = "1.5.0", note = "if you need this, please file an issue")]
275 #[allow(deprecated)]
276 pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
277 where
278 I: Iterator<Item = &'l [u8]>,
279 {
280 let r = self.for_each_subtag_str(&mut |subtag| {
281 if let Some(other) = subtags.next() {
282 match subtag.as_bytes().cmp(other) {
283 Ordering::Equal => Ok(()),
284 not_equal => Err(not_equal),
285 }
286 } else {
287 Err(Ordering::Greater)
288 }
289 });
290 match r {
291 Ok(_) => SubtagOrderingResult::Subtags(subtags),
292 Err(o) => SubtagOrderingResult::Ordering(o),
293 }
294 }
295
296 /// Compare this `Locale` with a potentially unnormalized BCP-47 string.
297 ///
298 /// The return value is equivalent to what would happen if you first parsed the
299 /// BCP-47 string to a `Locale` and then performed a structural comparison.
300 ///
301 /// # Examples
302 ///
303 /// ```
304 /// use icu::locid::Locale;
305 ///
306 /// let bcp47_strings: &[&str] = &[
307 /// "pl-LaTn-pL",
308 /// "uNd",
309 /// "UND-FONIPA",
310 /// "UnD-t-m0-TrUe",
311 /// "uNd-u-CA-Japanese",
312 /// "ZH",
313 /// ];
314 ///
315 /// for a in bcp47_strings {
316 /// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a));
317 /// }
318 /// ```
319 pub fn normalizing_eq(&self, other: &str) -> bool {
320 macro_rules! subtag_matches {
321 ($T:ty, $iter:ident, $expected:expr) => {
322 $iter
323 .next()
324 .map(|b| <$T>::try_from_bytes(b) == Ok($expected))
325 .unwrap_or(false)
326 };
327 }
328
329 let mut iter = SubtagIterator::new(other.as_bytes());
330 if !subtag_matches!(subtags::Language, iter, self.id.language) {
331 return false;
332 }
333 if let Some(ref script) = self.id.script {
334 if !subtag_matches!(subtags::Script, iter, *script) {
335 return false;
336 }
337 }
338 if let Some(ref region) = self.id.region {
339 if !subtag_matches!(subtags::Region, iter, *region) {
340 return false;
341 }
342 }
343 for variant in self.id.variants.iter() {
344 if !subtag_matches!(subtags::Variant, iter, *variant) {
345 return false;
346 }
347 }
348 if !self.extensions.is_empty() {
349 match extensions::Extensions::try_from_iter(&mut iter) {
350 Ok(exts) => {
351 if self.extensions != exts {
352 return false;
353 }
354 }
355 Err(_) => {
356 return false;
357 }
358 }
359 }
360 iter.next().is_none()
361 }
362
    /// Parses `v` in [`ParserMode::Locale`] into the raw components of a locale
    /// (language, optional script, optional region, at most one variant and at
    /// most one Unicode extension keyword with an optional value) instead of a
    /// full [`Locale`].
    ///
    /// This is a `const fn`, so it can be evaluated at compile time.
    /// NOTE(review): presumably this backs the `locale!` macro — confirm at the
    /// macro definition.
    #[doc(hidden)]
    #[allow(clippy::type_complexity)]
    pub const fn try_from_bytes_with_single_variant_single_keyword_unicode_extension(
        v: &[u8],
    ) -> Result<
        (
            subtags::Language,
            Option<subtags::Script>,
            Option<subtags::Region>,
            Option<subtags::Variant>,
            Option<(extensions::unicode::Key, Option<TinyAsciiStr<8>>)>,
        ),
        ParserError,
    > {
        // Delegates to the dedicated parser entry point.
        parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
            v,
            ParserMode::Locale,
        )
    }
382
383 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
384 where
385 F: FnMut(&str) -> Result<(), E>,
386 {
387 self.id.for_each_subtag_str(f)?;
388 self.extensions.for_each_subtag_str(f)?;
389 Ok(())
390 }
391}
392
393impl FromStr for Locale {
394 type Err = ParserError;
395
396 fn from_str(source: &str) -> Result<Self, Self::Err> {
397 Self::try_from_bytes(source.as_bytes())
398 }
399}
400
401impl From<LanguageIdentifier> for Locale {
402 fn from(id: LanguageIdentifier) -> Self {
403 Self {
404 id,
405 extensions: extensions::Extensions::default(),
406 }
407 }
408}
409
410impl From<Locale> for LanguageIdentifier {
411 fn from(loc: Locale) -> Self {
412 loc.id
413 }
414}
415
416impl AsRef<LanguageIdentifier> for Locale {
417 #[inline(always)]
418 fn as_ref(&self) -> &LanguageIdentifier {
419 &self.id
420 }
421}
422
423impl AsMut<LanguageIdentifier> for Locale {
424 fn as_mut(&mut self) -> &mut LanguageIdentifier {
425 &mut self.id
426 }
427}
428
429impl core::fmt::Debug for Locale {
430 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
431 writeable::Writeable::write_to(self, f)
432 }
433}
434
// Presumably generates the `Writeable` implementation for `Locale` on top of
// `for_each_subtag_str` (the macro is defined elsewhere in the crate).
// NOTE(review): the trailing `condition => expression` argument looks like a
// shortcut that reuses the language identifier's own string output when the
// locale has no extensions — confirm against the macro definition.
impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.write_to_string());
436
/// Checks that locales are written out as the expected BCP-47 strings.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    // The undefined locale has no string form to parse from here, so it is checked directly.
    assert_writeable_eq!(Locale::UND, "und");

    // Each of these already-normalized tags must be written back unchanged after parsing.
    let round_trip_tags = [
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
        "my-t-my-d0-zawgyi",
        "ar-SA-u-ca-islamic-civil",
        "en-001-x-foo-bar",
        "und-t-m0-true",
    ];
    for tag in round_trip_tags {
        assert_writeable_eq!(tag.parse::<Locale>().unwrap(), tag);
    }
}
466
467/// # Examples
468///
469/// ```
470/// use icu::locid::Locale;
471/// use icu::locid::{locale, subtags::language};
472///
473/// assert_eq!(Locale::from(language!("en")), locale!("en"));
474/// ```
475impl From<subtags::Language> for Locale {
476 fn from(language: subtags::Language) -> Self {
477 Self {
478 id: language.into(),
479 ..Default::default()
480 }
481 }
482}
483
484/// # Examples
485///
486/// ```
487/// use icu::locid::Locale;
488/// use icu::locid::{locale, subtags::script};
489///
490/// assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn"));
491/// ```
492impl From<Option<subtags::Script>> for Locale {
493 fn from(script: Option<subtags::Script>) -> Self {
494 Self {
495 id: script.into(),
496 ..Default::default()
497 }
498 }
499}
500
501/// # Examples
502///
503/// ```
504/// use icu::locid::Locale;
505/// use icu::locid::{locale, subtags::region};
506///
507/// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US"));
508/// ```
509impl From<Option<subtags::Region>> for Locale {
510 fn from(region: Option<subtags::Region>) -> Self {
511 Self {
512 id: region.into(),
513 ..Default::default()
514 }
515 }
516}
517
518/// # Examples
519///
520/// ```
521/// use icu::locid::Locale;
522/// use icu::locid::{
523/// locale,
524/// subtags::{language, region, script},
525/// };
526///
527/// assert_eq!(
528/// Locale::from((
529/// language!("en"),
530/// Some(script!("Latn")),
531/// Some(region!("US"))
532/// )),
533/// locale!("en-Latn-US")
534/// );
535/// ```
536impl
537 From<(
538 subtags::Language,
539 Option<subtags::Script>,
540 Option<subtags::Region>,
541 )> for Locale
542{
543 fn from(
544 lsr: (
545 subtags::Language,
546 Option<subtags::Script>,
547 Option<subtags::Region>,
548 ),
549 ) -> Self {
550 Self {
551 id: lsr.into(),
552 ..Default::default()
553 }
554 }
555}