encode_unicode/errors.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309
/* Copyright 2016-2022 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Boilerplate-y error types.
//!
//! The discriminant values of the enums might change in minor releases.
//! (to reduce the size of the `Result<>` types they are returned in)
extern crate core;
use core::fmt::{self,Display,Formatter};
use core::ops::RangeInclusive;
#[cfg(feature="std")]
use std::error::Error;
macro_rules! description {($err:ty, $desc:expr) => {
#[cfg(not(feature="std"))]
impl $err {
#[allow(missing_docs)]
pub fn description(&self) -> &'static str {
($desc)(self)
}
}
#[cfg(feature="std")]
impl Error for $err {
fn description(&self) -> &'static str {
($desc)(self)
}
}
impl Display for $err {
fn fmt(&self, fmtr: &mut Formatter) -> fmt::Result {
#![allow(deprecated)] // calling our own function
write!(fmtr, "{}", self.description())
}
}
}}
macro_rules! single_cause {($(#[$doc:meta])* $err:ident => $desc:expr) => {
$(#[$doc])*
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub struct $err;
description!{$err, |_| $desc }
}}
single_cause!{
/// Error returned by [`U16UtfExt::utf16_needs_extra_unit()`](../trait.U16UtfExt.html#tymethod.utf16_needs_extra_unit)
/// when called on an `u16` that's a trailing surrogate.
Utf16FirstUnitError => "is a trailing surrogate"
}
single_cause!{
/// Error returned by [`Utf8Char::from_ascii()`](../struct.Utf8Char.html#method.from_ascii)
/// for bytes that are not ASCII characters.
NonAsciiError => "not an ASCII character"
}
single_cause!{
/// Error returned by [`Utf16Char::from_bmp()`](../struct.Utf16Char.html#method.from_bmp)
/// for units that are not a standalone codepoint.
NonBmpError => "not a codepoint in the basic multilingual plane"
}
single_cause!{
/// Error returned by [`Utf8Char::from_str_start()`](../struct.Utf8Char.html#method.from_str_start)
/// and [`Utf16Char::from_str_start()`](../struct.Utf16Char.html#method.from_str_start)
/// when called with an empty string.
EmptyStrError => "is empty"
}
macro_rules! simple {($(#[$tydoc:meta])* $err:ident {
$( $(#[$vardoc:meta])* $variant:ident => $string:expr, )+
} ) => {
$(#[$tydoc])*
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub enum $err {
$( $(#[$vardoc])* $variant, )*
}
description!{$err, |e: &$err| match *e {$($err::$variant => $string),*} }
}}
simple!{
/// Error returned when an `u32` is not a valid unicode codepoint.
CodepointError {
/// It's reserved for UTF-16 surrogate pairs.
Utf16Reserved => "is reserved for UTF-16 surrogate pairs",
/// It's higher than the highest codepoint (which is 0x10ffff).
TooHigh => "is higher than the highest codepoint",
}}
use CodepointError::*;
impl CodepointError {
/// Get the range of values for which this error would be given.
pub const fn error_range(self) -> RangeInclusive<u32> {match self {
Utf16Reserved => 0xd8_00..=0xdf_ff,
TooHigh => 0x00_10_ff_ff..=0xff_ff_ff_ff,
}}
}
simple!{
/// Error returned when an `[u16; 2]` doesn't form a valid UTF-16 codepoint.
Utf16ArrayError {
/// The first element is a trailing / low surrogate, which is never valid.
FirstIsTrailingSurrogate => "the first element is a trailing surrogate",
/// The second element is needed, but is not a trailing surrogate.
SecondIsNotTrailingSurrogate => "the second element is needed but is not a trailing surrogate",
}}
simple!{
/// Error returned when one or two `u16`s are not valid UTF-16.
///
/// They are returned in sinking precedence;
/// The condition that causes the first variant to be returned is checked
/// for before the condition the next variant is returned for.
Utf16TupleError {
/// The first unit is a trailing / low surrogate, which is never valid.
FirstIsTrailingSurrogate => "the first unit is a trailing surrogate",
/// The provided second unit is not necessary.
SuperfluousSecond => "the second unit is superfluous",
/// The first and only unit requires a second unit.
MissingSecond => "the first unit requires a second unit",
/// The second unit is needed and was provided, but is not a trailing surrogate.
SecondIsNotTrailingSurrogate => "the required second unit is not a trailing surrogate",
}}
simple!{
/// Error returned when a slice of `u16`s doesn't start with valid UTF-16.
Utf16SliceError {
/// The slice is empty.
EmptySlice => "the slice is empty",
/// The first unit is a trailing surrogate.
FirstIsTrailingSurrogate => "the first unit is a trailing surrogate",
/// The first and only unit requires a second unit.
MissingSecond => "the first and only unit requires a second one",
/// The first unit requires a second one, but it's not a trailing surrogate.
SecondIsNotTrailingSurrogate => "the required second unit is not a trailing surrogate",
}}
simple!{
/// Error returned by [`Utf16CharDecoder`](../iterator/struct.Utf16CharMerger.html#impl-Iterator)
/// when it encounters an invalid sequence.
Utf16PairError {
/// A trailing surrogate was not preceeded by a leading surrogate.
UnexpectedTrailingSurrogate => "a trailing surrogate was not preceeded by a leading surrogate",
/// A leading surrogate was followed by an unit that was not a trailing surrogate.
UnmatchedLeadingSurrogate => "a leading surrogate was followed by an unit that was not a trailing surrogate",
/// A trailing surrogate was expected when the end was reached.
Incomplete => "a trailing surrogate was expected when the end was reached",
}}
simple!{
/// Error returned when [`Utf8Char::from_str()`](../struct.Utf8Char.html#impl-FromStr)
/// or [`Utf16Char::from_str()`](../struct.Utf16Char.html#impl-FromStr) fails.
FromStrError {
/// `Utf8Char` and `Utf16Char` cannot store more than a single codepoint.
MultipleCodepoints => "contains more than one codepoint",
/// `Utf8Char` and `Utf16Char` cannot be empty.
Empty => "is empty",
}
}
/// Error returned when an invalid UTF-8 sequence is encountered.
///
/// See [`Utf8ErrorKind`](enum.Utf8ErrorKind.html) for the types of errors
/// that this type can be returned for.
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub struct Utf8Error {
pub(crate) kind: Utf8ErrorKind,
}
impl Utf8Error {
/// Get the type of error.
pub const fn kind(&self) -> Utf8ErrorKind {
self.kind
}
#[cfg(not(feature="std"))]
#[allow(missing_docs)]
pub const fn description(&self) -> &'static str {
utf8_error_description(self.kind)
}
}
#[cfg(feature="std")]
impl Error for Utf8Error {
fn description(&self) -> &'static str {
utf8_error_description(self.kind)
}
}
impl Display for Utf8Error {
fn fmt(&self, fmtr: &mut Formatter) -> fmt::Result {
fmtr.write_str(utf8_error_description(self.kind))
}
}
/// The types of errors that can occur when decoding a UTF-8 codepoint.
///
/// The variants are more technical than what an end user is likely interested
/// in, but might be useful for deciding how to handle the error.
///
/// They can be grouped into three categories:
/// * Will happen regularly if decoding chunked or buffered text: `TooFewBytes`.
/// * Input might be binary, a different encoding or corrupted, `UnexpectedContinuationByte`
/// and `InterruptedSequence`.
/// (Broken UTF-8 sequence).
/// * Less likely to happen accidentaly and might be malicious:
/// `OverlongEncoding`, `Utf16ReservedCodepoint` and `TooHighCodepoint`.
/// Note that theese can still be caused by certain valid latin-1 strings
/// such as `"Á©"` (`b"\xC1\xA9"`).
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub enum Utf8ErrorKind {
/// There are too few bytes to decode the codepoint.
///
/// This can happen when a slice is empty or too short, or an iterator
/// returned `None` while in the middle of a codepoint.
/// This error is never produced by functions accepting fixed-size
/// `[u8; 4]` arrays.
///
/// If decoding text coming chunked (such as in buffers passed to `Read`),
/// the remaing bytes should be carried over into the next chunk or buffer.
/// (including the byte this error was produced for.)
TooFewBytes,
/// A byte which is never used by well-formed UTF-8 was encountered.
///
/// This means that the input is using a different encoding,
/// is corrupted or binary.
///
/// This error is returned when a byte in the following ranges
/// is encountered anywhere in an UTF-8 sequence:
///
/// * `192` and `193` (`0b1100_000x`): Indicates an overlong encoding
/// of a single-byte, ASCII, character, and should therefore never occur.
/// * `248..` (`0b1111_1xxx`): Sequences cannot be longer than 4 bytes.
/// * `245..=247` (`0b1111_0101 | 0b1111_0110`): Indicates a too high
/// codepoint. (above `\u10ffff`)
NonUtf8Byte,
/// The first byte is not a valid start of a codepoint.
///
/// This might happen as a result of slicing into the middle of a codepoint,
/// the input not being UTF-8 encoded or being corrupted.
/// Errors of this type coming right after another error should probably
/// be ignored, unless returned more than three times in a row.
///
/// This error is returned when the first byte has a value in the range
/// `128..=191` (`0b1000_0000..=0b1011_1111`).
UnexpectedContinuationByte,
/// The byte at index 1..=3 should be a continuation byte,
/// but doesn't fit the pattern `0b10xx_xxxx`.
///
/// When the input slice or iterator has too few bytes,
/// [`TooFewBytes`](#Incomplete) is returned instead.
InterruptedSequence,
/// The encoding of the codepoint has so many leading zeroes that it
/// could be a byte shorter.
///
/// [Successfully decoding this can present a security issue](https://tools.ietf.org/html/rfc3629#section-10):
/// Doing so could allow an attacker to circumvent input validation that
/// only checks for ASCII characters, and input characters or strings that
/// would otherwise be rejected, such as `/../`.
///
/// This error is only returned for 3 and 4-byte encodings;
/// `NonUtf8Byte` is returned for bytes that start longer or shorter
/// overlong encodings.
OverlongEncoding,
/// The codepoint is reserved for UTF-16 surrogate pairs.
///
/// (`Utf8Char` cannot be used to work with the
/// [WTF-8](https://simonsapin.github.io/wtf-8) encoding for UCS-2 strings.)
///
/// This error is returned for codepoints in the range `\ud800`..=`\udfff`.
/// (which are three bytes long as UTF-8)
Utf16ReservedCodepoint,
/// The codepoint is higher than `\u10ffff`, which is the highest codepoint
/// unicode permits.
TooHighCodepoint,
}
const fn utf8_error_description(kind: Utf8ErrorKind) -> &'static str {
match kind {
Utf8ErrorKind::TooFewBytes => "too few bytes",
Utf8ErrorKind::NonUtf8Byte => "not UTF-8",
Utf8ErrorKind::UnexpectedContinuationByte => "not UTF-8",
Utf8ErrorKind::InterruptedSequence => "not UTF-8",
Utf8ErrorKind::OverlongEncoding => "malformed input",
Utf8ErrorKind::Utf16ReservedCodepoint => "malformed input",
Utf8ErrorKind::TooHighCodepoint => "invalid character",
}
}
impl PartialEq<Utf8ErrorKind> for Utf8Error {
fn eq(&self, kind: &Utf8ErrorKind) -> bool {
self.kind == *kind
}
}
impl PartialEq<Utf8Error> for Utf8ErrorKind {
fn eq(&self, error: &Utf8Error) -> bool {
*self == error.kind
}
}