aws_smithy_xml/
unescape.rs

1/*
2 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 * SPDX-License-Identifier: Apache-2.0
4 */
5
6use crate::decode::XmlDecodeError;
7use std::borrow::Cow;
8
9/// Unescape XML encoded characters
10///
11/// This function will unescape the 4 literal escapes:
12/// - `<`, `>`, `&`, `"`, and `'`
13/// - Decimal escapes: `{`
14/// - Hex escapes: `
`
15///
16/// If no escape sequences are present, Cow<&'str> will be returned, avoiding the need
17/// to copy the String.
18pub(crate) fn unescape(s: &str) -> Result<Cow<'_, str>, XmlDecodeError> {
19    // no &, no need to escape anything
20    if !s.contains('&') {
21        return Ok(Cow::Borrowed(s));
22    }
23    // this will be strictly larger than required avoiding the need for another allocation
24    let mut res = String::with_capacity(s.len());
25    // could consider memchr as performance optimization
26    let mut sections = s.split('&');
27    // push content before the first &
28    if let Some(prefix) = sections.next() {
29        res.push_str(prefix);
30    }
31    for section in sections {
32        // entities look like &<somedata>;
33        match section.find(';') {
34            Some(idx) => {
35                let entity = &section[..idx];
36                match entity {
37                    "lt" => res.push('<'),
38                    "gt" => res.push('>'),
39                    "amp" => res.push('&'),
40                    "quot" => res.push('"'),
41                    "apos" => res.push('\''),
42                    entity => {
43                        // e.g. &#xD;
44                        let (entity, radix) = if let Some(entity) = entity.strip_prefix("#x") {
45                            (entity, 16)
46                        } else if let Some(entity) = entity.strip_prefix('#') {
47                            // e.g. &#123;
48                            (entity, 10)
49                        } else {
50                            return Err(XmlDecodeError::invalid_escape(entity));
51                        };
52                        let char_code = u32::from_str_radix(entity, radix).map_err(|_| {
53                            XmlDecodeError::invalid_escape(format!(
54                                "expected numeric escape in base {}; got: {}",
55                                radix, &entity
56                            ))
57                        })?;
58                        let chr = std::char::from_u32(char_code).ok_or_else(|| {
59                            XmlDecodeError::invalid_escape(format!(
60                                "invalid char code: {}",
61                                char_code
62                            ))
63                        })?;
64                        res.push(chr);
65                    }
66                }
67                // push everything from the `;` to the next `&`
68                res.push_str(&section[idx + 1..])
69            }
70            None => return Err(XmlDecodeError::invalid_escape("unterminated pattern")),
71        }
72    }
73    Ok(Cow::Owned(res))
74}
75
#[cfg(test)]
mod test {
    use crate::unescape::unescape;
    use proptest::prelude::*;
    use std::borrow::Cow;

    /// The named entities decode, both in isolation and embedded in text.
    #[test]
    fn basic_unescape() {
        assert_eq!(
            unescape("&lt; &gt; &apos; &quot; &amp;").unwrap(),
            "< > ' \" &"
        );
        assert_eq!(
            unescape("Since a &gt; b, b is less than a").unwrap(),
            "Since a > b, b is less than a"
        );
    }

    /// Input without any `&` is handed back as a borrow.
    #[test]
    fn no_need_to_escape() {
        assert_eq!(unescape("hello 🍕!").unwrap(), Cow::Borrowed("hello 🍕!"));
    }

    #[test]
    fn complex_unescape() {
        // Test cases adapted from Apache Commons StringEscapeUtilsTest.java
        assert_eq!(
            unescape("a&lt;b&gt;c&quot;d&apos;e&amp;f;;").unwrap(),
            "a<b>c\"d'e&f;;"
        );
        // decoding is single-pass: the `&` produced by `&amp;` is not re-interpreted
        assert_eq!(unescape("&amp;lt;").unwrap(), "&lt;")
    }

    /// Decimal and hex character references.
    #[test]
    fn newline_encoding() {
        assert_eq!(unescape("&#10;").unwrap(), "\n");
        assert_eq!(unescape("&#xD;").unwrap(), "\r");
    }

    /// Character references for the various end-of-line characters.
    #[test]
    fn xml_eol_encoding() {
        assert_eq!(unescape("&#xA; &#xA;").unwrap(), "\n \n");
        assert_eq!(
            unescape("a&#xD;&#xA; b&#xA; c&#xD;").unwrap(),
            "a\r\n b\n c\r"
        );
        assert_eq!(
            unescape("a&#xD;&#x85; b&#x85;").unwrap(),
            "a\r\u{0085} b\u{0085}"
        );
        assert_eq!(
            unescape("a&#xD;&#x2028; b&#x85; c&#x2028;").unwrap(),
            "a\r\u{2028} b\u{0085} c\u{2028}"
        );
    }

    #[test]
    fn invalid_escapes() {
        unescape("&lte;").expect_err("lte does not make a ≤");
        unescape("&lt").expect_err("unterminated escape sequence");
        unescape("&#Q1234;").expect_err("Q does not began a numeric sequence");
        unescape("&#3.14;").expect_err("decimal escape");
        // terminated, so the failure comes from the digits rather than the missing `;`
        unescape("&#xZZ;").expect_err("Z is not hex");
        unescape("&#xD").expect_err("unterminated numeric escape sequence");
        unescape("here is a & but without an escape sequence...").expect_err("naked &");
    }

    proptest! {
        #[test]
        fn no_panics(s: String) {
            let unescaped = unescape(&s);
            // if the string contained an `&`, the result must be either a freshly
            // allocated string or an error; it can never be a borrow of the input
            if s.contains('&') {
                assert!(
                    matches!(unescaped, Ok(Cow::Owned(_)) | Err(_))
                );
            }
        }
    }
}