aws_smithy_xml/
decode.rs

1/*
2 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 * SPDX-License-Identifier: Apache-2.0
4 */
5
6use crate::unescape::unescape;
7use std::borrow::Cow;
8use std::error::Error;
9use std::fmt::{Display, Formatter};
10use xmlparser::{ElementEnd, Token, Tokenizer};
11
/// Nesting level of a token within a document; the root element starts and ends at depth 0
pub type Depth = usize;
13
// In general, these errors exist only to report what happened, so there is
// little value in having many distinct variants to match on.
16
17#[derive(Debug)]
18enum XmlDecodeErrorKind {
19    InvalidXml(xmlparser::Error),
20    InvalidEscape { esc: String },
21    Custom(Cow<'static, str>),
22    Unhandled(Box<dyn std::error::Error + Send + Sync + 'static>),
23}
24
25#[derive(Debug)]
26pub struct XmlDecodeError {
27    kind: XmlDecodeErrorKind,
28}
29
30impl Display for XmlDecodeError {
31    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
32        match &self.kind {
33            XmlDecodeErrorKind::InvalidXml(_) => write!(f, "XML parse error"),
34            XmlDecodeErrorKind::InvalidEscape { esc } => write!(f, "invalid XML escape: {}", esc),
35            XmlDecodeErrorKind::Custom(msg) => write!(f, "error parsing XML: {}", msg),
36            XmlDecodeErrorKind::Unhandled(_) => write!(f, "error parsing XML"),
37        }
38    }
39}
40
41impl Error for XmlDecodeError {
42    fn source(&self) -> Option<&(dyn Error + 'static)> {
43        match &self.kind {
44            XmlDecodeErrorKind::InvalidXml(source) => Some(source as _),
45            XmlDecodeErrorKind::Unhandled(source) => Some(source.as_ref() as _),
46            XmlDecodeErrorKind::InvalidEscape { .. } | XmlDecodeErrorKind::Custom(..) => None,
47        }
48    }
49}
50
51impl XmlDecodeError {
52    pub(crate) fn invalid_xml(error: xmlparser::Error) -> Self {
53        Self {
54            kind: XmlDecodeErrorKind::InvalidXml(error),
55        }
56    }
57
58    pub(crate) fn invalid_escape(esc: impl Into<String>) -> Self {
59        Self {
60            kind: XmlDecodeErrorKind::InvalidEscape { esc: esc.into() },
61        }
62    }
63
64    pub fn custom(msg: impl Into<Cow<'static, str>>) -> Self {
65        Self {
66            kind: XmlDecodeErrorKind::Custom(msg.into()),
67        }
68    }
69
70    pub fn unhandled(error: impl Into<Box<dyn Error + Send + Sync + 'static>>) -> Self {
71        Self {
72            kind: XmlDecodeErrorKind::Unhandled(error.into()),
73        }
74    }
75}
76
/// An XML name, split into its prefix and local parts
///
/// ```xml
/// <foo:bar>
///  ^^^ ^^^
///  |   local
///  prefix
/// ```
#[derive(Debug, PartialEq)]
pub struct Name<'a> {
    pub prefix: &'a str,
    pub local: &'a str,
}

impl Name<'_> {
    /// Check if a given name matches a tag name composed of `prefix:local` or just `local`
    ///
    /// When `tag_name` contains no `:`, only the local part is compared and this
    /// name's prefix is ignored. Otherwise `tag_name` is split at its first `:`
    /// and both parts must be equal.
    pub fn matches(&self, tag_name: &str) -> bool {
        match tag_name.split_once(':') {
            Some((prefix, local)) => self.prefix == prefix && self.local == local,
            None => self.local == tag_name,
        }
    }
}
97
98#[derive(Debug, PartialEq)]
99pub struct Attr<'a> {
100    name: Name<'a>,
101    // attribute values can be escaped (e.g. with double quotes, so we need a Cow)
102    value: Cow<'a, str>,
103}
104
105#[derive(Debug, PartialEq)]
106pub struct StartEl<'a> {
107    name: Name<'a>,
108    attributes: Vec<Attr<'a>>,
109    closed: bool,
110    depth: Depth,
111}
112
113/// Xml Start Element
114///
115/// ```xml
116/// <a:b   c="d">
117///  ^^^   ^^^^^
118///  name  attributes
119/// ```
120impl<'a> StartEl<'a> {
121    pub fn depth(&self) -> Depth {
122        self.depth
123    }
124
125    fn new(local: &'a str, prefix: &'a str, depth: Depth) -> Self {
126        Self {
127            name: Name { prefix, local },
128            attributes: vec![],
129            closed: false,
130            depth,
131        }
132    }
133
134    /// Retrieve an attribute with a given key
135    ///
136    /// key `prefix:local` combined as a str, joined by a `:`
137    pub fn attr<'b>(&'b self, key: &'b str) -> Option<&'b str> {
138        self.attributes
139            .iter()
140            .find(|attr| attr.name.matches(key))
141            .map(|attr| attr.value.as_ref())
142    }
143
144    /// Returns whether this `StartEl` matches a given name
145    /// in `prefix:local` form.
146    pub fn matches(&self, pat: &str) -> bool {
147        self.name.matches(pat)
148    }
149
150    /// Local component of this element's name
151    ///
152    /// ```xml
153    /// <foo:bar>
154    ///      ^^^
155    /// ```
156    pub fn local(&self) -> &str {
157        self.name.local
158    }
159
160    /// Prefix component of this elements name (or empty string)
161    /// ```xml
162    /// <foo:bar>
163    ///  ^^^
164    /// ```
165    pub fn prefix(&self) -> &str {
166        self.name.prefix
167    }
168
169    /// Returns true of `el` at `depth` is a match for this `start_el`
170    fn end_el(&self, el: ElementEnd<'_>, depth: Depth) -> bool {
171        if depth != self.depth {
172            return false;
173        }
174        match el {
175            ElementEnd::Open => false,
176            ElementEnd::Close(prefix, local) => {
177                prefix.as_str() == self.name.prefix && local.as_str() == self.name.local
178            }
179            ElementEnd::Empty => false,
180        }
181    }
182}
183
184/// Xml Document abstraction
185///
186/// This document wraps a lazy tokenizer with depth tracking.
187/// Constructing a document is essentially free.
188pub struct Document<'a> {
189    tokenizer: Tokenizer<'a>,
190    depth: Depth,
191}
192
193impl<'a> TryFrom<&'a [u8]> for Document<'a> {
194    type Error = XmlDecodeError;
195
196    fn try_from(value: &'a [u8]) -> Result<Self, Self::Error> {
197        Ok(Document::new(
198            std::str::from_utf8(value).map_err(XmlDecodeError::unhandled)?,
199        ))
200    }
201}
202
203impl<'inp> Document<'inp> {
204    pub fn new(doc: &'inp str) -> Self {
205        Document {
206            tokenizer: Tokenizer::from(doc),
207            depth: 0,
208        }
209    }
210
211    /// "Depth first" iterator
212    ///
213    /// Unlike [`next_tag()`](ScopedDecoder::next_tag), this method returns the next
214    /// start element regardless of depth. This is useful to give a pointer into the middle
215    /// of a document to start reading.
216    ///
217    /// ```xml
218    /// <Response> <-- first call returns this:
219    ///    <A> <-- next call
220    ///      <Nested /> <-- next call returns this
221    ///      <MoreNested>hello</MoreNested> <-- then this:
222    ///    </A>
223    ///    <B/> <-- second call to next_tag returns this
224    /// </Response>
225    /// ```
226    pub fn next_start_element<'a>(&'a mut self) -> Option<StartEl<'inp>> {
227        next_start_element(self)
228    }
229
230    /// A scoped reader for the entire document
231    pub fn root_element<'a>(&'a mut self) -> Result<ScopedDecoder<'inp, 'a>, XmlDecodeError> {
232        let start_el = self
233            .next_start_element()
234            .ok_or_else(|| XmlDecodeError::custom("no root element"))?;
235        Ok(ScopedDecoder {
236            doc: self,
237            start_el,
238            terminated: false,
239        })
240    }
241
242    /// A scoped reader for a specific tag
243    ///
244    /// This method is necessary for when you need to return a ScopedDecoder from a function
245    /// since normally the stacked-ownership that `next_tag()` uses would prevent returning a reference
246    /// to a field owned by the current function
247    pub fn scoped_to<'a>(&'a mut self, start_el: StartEl<'inp>) -> ScopedDecoder<'inp, 'a> {
248        ScopedDecoder {
249            doc: self,
250            start_el,
251            terminated: false,
252        }
253    }
254}
255
/// A new-type wrapper around `Token` that keeps the wrapped third-party type out of the
/// public API
#[derive(Debug)]
pub struct XmlToken<'inp>(Token<'inp>);
260
261/// Depth tracking iterator
262///
263/// ```xml
264/// <a> <- startel depth 0
265///   <b> <- startel depth 1
266///     <c> <- startel depth 2
267///     </c> <- endel depth 2
268///   </b> <- endel depth 1
269/// </a> <- endel depth 0
270/// ```
271impl<'inp> Iterator for Document<'inp> {
272    type Item = Result<(XmlToken<'inp>, Depth), XmlDecodeError>;
273    fn next<'a>(&'a mut self) -> Option<Result<(XmlToken<'inp>, Depth), XmlDecodeError>> {
274        let tok = self.tokenizer.next()?;
275        let tok = match tok {
276            Err(e) => return Some(Err(XmlDecodeError::invalid_xml(e))),
277            Ok(tok) => tok,
278        };
279        // depth bookkeeping
280        match tok {
281            Token::ElementEnd {
282                end: ElementEnd::Close(_, _),
283                ..
284            } => {
285                self.depth -= 1;
286            }
287            Token::ElementEnd {
288                end: ElementEnd::Empty,
289                ..
290            } => self.depth -= 1,
291            t @ Token::ElementStart { .. } => {
292                self.depth += 1;
293                // We want the startel and endel to have the same depth, but after the opener,
294                // the parser will be at depth 1. Return the previous depth:
295                return Some(Ok((XmlToken(t), self.depth - 1)));
296            }
297            _ => {}
298        }
299        Some(Ok((XmlToken(tok), self.depth)))
300    }
301}
302
303/// XmlTag Abstraction
304///
305/// ScopedDecoder represents a tag-scoped view into an XML document. Methods
306/// on `ScopedDecoder` return `None` when the current tag has been exhausted.
307pub struct ScopedDecoder<'inp, 'a> {
308    doc: &'a mut Document<'inp>,
309    start_el: StartEl<'inp>,
310    terminated: bool,
311}
312
313/// When a scoped decoder is dropped, its entire scope is consumed so that the
314/// next read begins at the next tag at the same depth.
315impl Drop for ScopedDecoder<'_, '_> {
316    fn drop(&mut self) {
317        for _ in self {}
318    }
319}
320
321impl<'inp> ScopedDecoder<'inp, '_> {
322    /// The start element for this scope
323    pub fn start_el<'a>(&'a self) -> &'a StartEl<'inp> {
324        &self.start_el
325    }
326
327    /// Returns the next top-level tag in this scope
328    /// The returned reader will fully read the tag during its lifetime. If it is dropped without
329    /// the data being read, the reader will be advanced until the matching close tag. If you read
330    /// an element with `next_tag()` and you want to ignore it, simply drop the resulting `ScopeDecoder`.
331    ///
332    /// ```xml
333    /// <Response> <-- scoped reader on this tag
334    ///    <A> <-- first call to next_tag returns this
335    ///      <Nested /> <-- to get inner data, call `next_tag` on the returned decoder for `A`
336    ///      <MoreNested>hello</MoreNested>
337    ///    </A>
338    ///    <B/> <-- second call to next_tag returns this
339    /// </Response>
340    /// ```
341    pub fn next_tag<'a>(&'a mut self) -> Option<ScopedDecoder<'inp, 'a>> {
342        let next_tag = next_start_element(self)?;
343        Some(self.nested_decoder(next_tag))
344    }
345
346    fn nested_decoder<'a>(&'a mut self, start_el: StartEl<'inp>) -> ScopedDecoder<'inp, 'a> {
347        ScopedDecoder {
348            doc: self.doc,
349            start_el,
350            terminated: false,
351        }
352    }
353}
354
355impl<'inp, 'a> Iterator for ScopedDecoder<'inp, 'a> {
356    type Item = Result<(XmlToken<'inp>, Depth), XmlDecodeError>;
357
358    fn next(&mut self) -> Option<Self::Item> {
359        if self.start_el.closed {
360            self.terminated = true;
361        }
362        if self.terminated {
363            return None;
364        }
365        let (tok, depth) = match self.doc.next() {
366            Some(Ok((tok, depth))) => (tok, depth),
367            other => return other,
368        };
369
370        match tok.0 {
371            Token::ElementEnd { end, .. } if self.start_el.end_el(end, depth) => {
372                self.terminated = true;
373                return None;
374            }
375            _ => {}
376        }
377        Some(Ok((tok, depth)))
378    }
379}
380
381/// Load the next start element out of a depth-tagged token iterator
382fn next_start_element<'a, 'inp>(
383    tokens: &'a mut impl Iterator<Item = Result<(XmlToken<'inp>, Depth), XmlDecodeError>>,
384) -> Option<StartEl<'inp>> {
385    let mut out = StartEl::new("", "", 0);
386    loop {
387        match tokens.next()? {
388            Ok((XmlToken(Token::ElementStart { local, prefix, .. }), depth)) => {
389                out.name.local = local.as_str();
390                out.name.prefix = prefix.as_str();
391                out.depth = depth;
392            }
393            Ok((
394                XmlToken(Token::Attribute {
395                    prefix,
396                    local,
397                    value,
398                    ..
399                }),
400                _,
401            )) => out.attributes.push(Attr {
402                name: Name {
403                    local: local.as_str(),
404                    prefix: prefix.as_str(),
405                },
406                value: unescape(value.as_str()).ok()?,
407            }),
408            Ok((
409                XmlToken(Token::ElementEnd {
410                    end: ElementEnd::Open,
411                    ..
412                }),
413                _,
414            )) => break,
415            Ok((
416                XmlToken(Token::ElementEnd {
417                    end: ElementEnd::Empty,
418                    ..
419                }),
420                _,
421            )) => {
422                out.closed = true;
423                break;
424            }
425            _ => {}
426        }
427    }
428    Some(out)
429}
430
431/// Returns the data element at the current position
432///
433/// If the current position is not a data element (and is instead a `<start-element>`) an error
434/// will be returned
435pub fn try_data<'a, 'inp>(
436    tokens: &'a mut impl Iterator<Item = Result<(XmlToken<'inp>, Depth), XmlDecodeError>>,
437) -> Result<Cow<'inp, str>, XmlDecodeError> {
438    loop {
439        match tokens.next().map(|opt| opt.map(|opt| opt.0)) {
440            None => return Ok(Cow::Borrowed("")),
441            Some(Ok(XmlToken(Token::Text { text }))) => return unescape(text.as_str()),
442            Some(Ok(e @ XmlToken(Token::ElementStart { .. }))) => {
443                return Err(XmlDecodeError::custom(format!(
444                    "looking for a data element, found: {:?}",
445                    e
446                )))
447            }
448            Some(Err(e)) => return Err(e),
449            _ => {}
450        }
451    }
452}
453
#[cfg(test)]
mod test {
    use crate::decode::{try_data, Attr, Depth, Document, Name, StartEl};

    /// Test helper: builds the `StartEl` expected for a self-closing tag
    fn closed<'a>(local: &'a str, prefix: &'a str, depth: Depth) -> StartEl<'a> {
        let mut start_el = StartEl::new(local, prefix, depth);
        start_el.closed = true;
        start_el
    }

    /// A scope yields its direct children and then reports that it is exhausted
    #[test]
    fn scoped_tokens() {
        let mut doc = Document::new(r#"<Response><A></A></Response>"#);
        let mut root = doc.root_element().expect("valid document");
        assert_eq!(root.start_el().local(), "Response");
        {
            let child = root.next_tag().expect("tag exists");
            assert_eq!(child.start_el().local(), "A");
        }
        assert!(root.next_tag().is_none());
    }

    /// A nested element with the same name as its parent must not close the parent
    #[test]
    fn handle_depth_properly() {
        let mut doc = Document::new(r#"<Response><Response></Response><A/></Response>"#);
        let mut scoped = doc.root_element().expect("valid document");
        let inner_response = StartEl::new("Response", "", 1);
        assert_eq!(scoped.next_tag().unwrap().start_el(), &inner_response);
        let closed_a = closed("A", "", 1);
        assert_eq!(scoped.next_tag().unwrap().start_el(), &closed_a);
        assert!(scoped.next_tag().is_none())
    }

    /// A self-closing root is marked as closed and has no children
    #[test]
    fn self_closing() {
        let mut doc = Document::new(r#"<Response/>"#);
        let mut scoped = doc.root_element().expect("valid doc");
        assert!(scoped.start_el.closed);
        assert!(scoped.next_tag().is_none())
    }

    #[test]
    fn terminate_scope() {
        let mut doc =
            Document::new(r#"<Response><Struct><A></A><Also/></Struct><More/></Response>"#);
        let mut response_iter = doc.root_element().expect("valid doc");
        let mut struct_iter = response_iter.next_tag().unwrap();
        let expected_a = StartEl::new("A", "", 2);
        assert_eq!(
            struct_iter.next_tag().as_ref().map(|t| t.start_el()),
            Some(&expected_a)
        );
        // When the inner iter is dropped, it will read to the end of its scope
        // prevent accidental behavior where we didn't read a full node
        drop(struct_iter);
        let expected_more = closed("More", "", 1);
        assert_eq!(response_iter.next_tag().unwrap().start_el(), &expected_more);
    }

    /// Asking for data where a start element follows is an error
    #[test]
    fn read_data_invalid() {
        let mut doc = Document::new(r#"<Response><A></A></Response>"#);
        let mut resp = doc.root_element().unwrap();
        try_data(&mut resp).expect_err("no data");
    }

    #[test]
    fn read_data() {
        let mut doc = Document::new(r#"<Response>hello</Response>"#);
        let mut scoped = doc.root_element().unwrap();
        let data = try_data(&mut scoped).unwrap();
        assert_eq!(data, "hello");
    }

    /// Whitespace within an element is preserved
    #[test]
    fn read_data_whitespace() {
        let mut doc = Document::new(r#"<Response> hello </Response>"#);
        let mut scoped = doc.root_element().unwrap();
        let data = try_data(&mut scoped).unwrap();
        assert_eq!(data, " hello ");
    }

    /// Whitespace between tags is skipped when looking for the next tag, while
    /// whitespace that is the content of an element is returned as its data
    #[test]
    fn ignore_insignificant_whitespace() {
        let mut doc = Document::new(r#"<Response>   <A>  </A>    </Response>"#);
        let mut resp = doc.root_element().unwrap();
        let mut a = resp.next_tag().expect("should be a");
        let data = try_data(&mut a).expect("valid");
        assert_eq!(data, "  ");
    }

    #[test]
    fn read_attributes() {
        let mut tokenizer = Document::new(r#"<Response xsi:type="CanonicalUser">hello</Response>"#);
        let root = tokenizer.root_element().unwrap();

        let expected = vec![Attr {
            name: Name {
                prefix: "xsi",
                local: "type",
            },
            value: "CanonicalUser".into(),
        }];
        assert_eq!(root.start_el().attributes, expected)
    }

    /// Escapes are resolved in both element data and attribute values
    #[test]
    fn unescape_data() {
        let mut doc = Document::new(r#"<Response key="&quot;hey&quot;>">&gt;</Response>"#);
        let mut root = doc.root_element().unwrap();
        assert_eq!(try_data(&mut root).unwrap(), ">");
        assert_eq!(root.start_el().attr("key"), Some("\"hey\">"));
    }

    #[test]
    fn nested_self_closer() {
        let xml = r#"<XmlListsInputOutput>
                <stringList/>
                <stringSet></stringSet>
        </XmlListsInputOutput>"#;
        let mut doc = Document::new(xml);
        let mut root = doc.root_element().unwrap();
        let mut string_list = root.next_tag().unwrap();
        assert_eq!(string_list.start_el(), &closed("stringList", "", 1));
        assert!(string_list.next_tag().is_none());
        drop(string_list);
        let expected_set = StartEl::new("stringSet", "", 1);
        assert_eq!(root.next_tag().unwrap().start_el(), &expected_set);
    }

    #[test]
    fn confusing_nested_same_name_tag() {
        // an inner b which could be confused as closing the outer b if depth
        // is not properly tracked:
        let root_tags = &["a", "b", "c", "d"];
        let xml = r#"<XmlListsInputOutput>
                <a/>
                <b>
                  <c/>
                  <b></b>
                  <here/>
                </b>
                <c></c>
                <d>more</d>
        </XmlListsInputOutput>"#;
        let mut doc = Document::new(xml);
        let mut root = doc.root_element().unwrap();
        let mut seen = Vec::new();
        while let Some(tag) = root.next_tag() {
            seen.push(tag.start_el().local().to_owned());
        }
        assert_eq!(root_tags, seen.as_slice());
    }
}