xmlparser/
lib.rs

/*!
*xmlparser* is a low-level, pull-based, zero-allocation
[XML 1.0](https://www.w3.org/TR/xml/) parser.

## Example

```rust
for token in xmlparser::Tokenizer::from("<tagname name='value'/>") {
    println!("{:?}", token);
}
```

## Why a new library?

This library is basically a low-level XML tokenizer that preserves the positions of the tokens
and is not intended to be used directly.
If you are looking for a higher level solution, check out
[roxmltree](https://github.com/RazrFalcon/roxmltree).

## Benefits

- All tokens contain `StrSpan` structs which represent the position of the substring
  in the original document.
- Good error processing. All error types contain the position (line:column)
  where the error occurred.
- No heap allocations.
- No dependencies.
- Tiny. ~1400 LOC and ~30KiB in the release build according to `cargo-bloat`.
- Supports `no_std` builds. To use without the standard library, disable the default features.

## Limitations

- Currently, only ENTITY objects are parsed from the DOCTYPE. All others are ignored.
- No tree structure validation. So an XML like `<root><child></root></child>`
  or a string without a root element
  will be parsed without errors. You should check for this manually.
  On the other hand `<a/><a/>` will lead to an error.
- Duplicated attributes are not an error. So XML like `<item a="v1" a="v2"/>`
  will be parsed without errors. You should check for this manually.
- UTF-8 only.

## Safety

- The library must not panic. Any panic is considered a critical bug
  and should be reported.
- The library forbids unsafe code.
*/
47
48#![no_std]
49
50#![forbid(unsafe_code)]
51#![warn(missing_docs)]
52#![allow(ellipsis_inclusive_range_patterns)]
53
54#[cfg(feature = "std")]
55#[macro_use]
56extern crate std;
57
58
/// Evaluates to `true` when the expression matches the given pattern
/// (alternatives and `if` guards are allowed) and to `false` otherwise.
macro_rules! matches {
    ($value:expr, $($arm:tt)+) => {
        match $value {
            $($arm)+ => true,
            _ => false,
        }
    };
}
67
68
69mod error;
70mod stream;
71mod strspan;
72mod xmlchar;
73
74pub use crate::error::*;
75pub use crate::stream::*;
76pub use crate::strspan::*;
77pub use crate::xmlchar::*;
78
79
80/// An XML token.
81#[allow(missing_docs)]
82#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
83pub enum Token<'a> {
84    /// Declaration token.
85    ///
86    /// ```text
87    /// <?xml version='1.0' encoding='UTF-8' standalone='yes'?>
88    ///                ---                                      - version
89    ///                               -----                     - encoding?
90    ///                                                  ---    - standalone?
91    /// ------------------------------------------------------- - span
92    /// ```
93    Declaration {
94        version: StrSpan<'a>,
95        encoding: Option<StrSpan<'a>>,
96        standalone: Option<bool>,
97        span: StrSpan<'a>,
98    },
99
100    /// Processing instruction token.
101    ///
102    /// ```text
103    /// <?target content?>
104    ///   ------           - target
105    ///          -------   - content?
106    /// ------------------ - span
107    /// ```
108    ProcessingInstruction {
109        target: StrSpan<'a>,
110        content: Option<StrSpan<'a>>,
111        span: StrSpan<'a>,
112    },
113
114    /// Comment token.
115    ///
116    /// ```text
117    /// <!-- text -->
118    ///     ------    - text
119    /// ------------- - span
120    /// ```
121    Comment {
122        text: StrSpan<'a>,
123        span: StrSpan<'a>,
124    },
125
126    /// DOCTYPE start token.
127    ///
128    /// ```text
129    /// <!DOCTYPE greeting SYSTEM "hello.dtd" [
130    ///           --------                      - name
131    ///                    ------------------   - external_id?
132    /// --------------------------------------- - span
133    /// ```
134    DtdStart {
135        name: StrSpan<'a>,
136        external_id: Option<ExternalId<'a>>,
137        span: StrSpan<'a>,
138    },
139
140    /// Empty DOCTYPE token.
141    ///
142    /// ```text
143    /// <!DOCTYPE greeting SYSTEM "hello.dtd">
144    ///           --------                     - name
145    ///                    ------------------  - external_id?
146    /// -------------------------------------- - span
147    /// ```
148    EmptyDtd {
149        name: StrSpan<'a>,
150        external_id: Option<ExternalId<'a>>,
151        span: StrSpan<'a>,
152    },
153
154    /// ENTITY token.
155    ///
156    /// Can appear only inside the DTD.
157    ///
158    /// ```text
159    /// <!ENTITY ns_extend "http://test.com">
160    ///          ---------                    - name
161    ///                     ---------------   - definition
162    /// ------------------------------------- - span
163    /// ```
164    EntityDeclaration {
165        name: StrSpan<'a>,
166        definition: EntityDefinition<'a>,
167        span: StrSpan<'a>,
168    },
169
170    /// DOCTYPE end token.
171    ///
172    /// ```text
173    /// <!DOCTYPE svg [
174    ///    ...
175    /// ]>
176    /// -- - span
177    /// ```
178    DtdEnd {
179        span: StrSpan<'a>,
180    },
181
182    /// Element start token.
183    ///
184    /// ```text
185    /// <ns:elem attr="value"/>
186    ///  --                     - prefix
187    ///     ----                - local
188    /// --------                - span
189    /// ```
190    ElementStart {
191        prefix: StrSpan<'a>,
192        local: StrSpan<'a>,
193        span: StrSpan<'a>,
194    },
195
196    /// Attribute token.
197    ///
198    /// ```text
199    /// <elem ns:attr="value"/>
200    ///       --              - prefix
201    ///          ----         - local
202    ///                -----  - value
203    ///       --------------- - span
204    /// ```
205    Attribute {
206        prefix: StrSpan<'a>,
207        local: StrSpan<'a>,
208        value: StrSpan<'a>,
209        span: StrSpan<'a>,
210    },
211
212    /// Element end token.
213    ///
214    /// ```text
215    /// <ns:elem>text</ns:elem>
216    ///                         - ElementEnd::Open
217    ///         -               - span
218    /// ```
219    ///
220    /// ```text
221    /// <ns:elem>text</ns:elem>
222    ///                -- ----  - ElementEnd::Close(prefix, local)
223    ///              ---------- - span
224    /// ```
225    ///
226    /// ```text
227    /// <ns:elem/>
228    ///                         - ElementEnd::Empty
229    ///         --              - span
230    /// ```
231    ElementEnd {
232        end: ElementEnd<'a>,
233        span: StrSpan<'a>,
234    },
235
236    /// Text token.
237    ///
238    /// Contains text between elements including whitespaces.
239    /// Basically everything between `>` and `<`.
240    /// Except `]]>`, which is not allowed and will lead to an error.
241    ///
242    /// ```text
243    /// <p> text </p>
244    ///    ------     - text
245    /// ```
246    ///
247    /// The token span is equal to the `text`.
248    Text {
249        text: StrSpan<'a>,
250    },
251
252    /// CDATA token.
253    ///
254    /// ```text
255    /// <p><![CDATA[text]]></p>
256    ///             ----        - text
257    ///    ----------------     - span
258    /// ```
259    Cdata {
260        text: StrSpan<'a>,
261        span: StrSpan<'a>,
262    },
263}
264
265impl<'a> Token<'a> {
266    /// Returns the [`StrSpan`] encompassing all of the token.
267    pub fn span(&self) -> StrSpan<'a> {
268        let span = match self {
269            Token::Declaration { span, .. } => span,
270            Token::ProcessingInstruction { span, .. } => span,
271            Token::Comment { span, .. } => span,
272            Token::DtdStart { span, .. } => span,
273            Token::EmptyDtd { span, .. } => span,
274            Token::EntityDeclaration { span, .. } => span,
275            Token::DtdEnd { span, .. } => span,
276            Token::ElementStart { span, .. } => span,
277            Token::Attribute { span, .. } => span,
278            Token::ElementEnd { span, .. } => span,
279            Token::Text { text, .. } => text,
280            Token::Cdata { span, .. } => span,
281        };
282        *span
283    }
284}
285
286/// `ElementEnd` token.
287#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
288pub enum ElementEnd<'a> {
289    /// Indicates `>`
290    Open,
291    /// Indicates `</name>`
292    Close(StrSpan<'a>, StrSpan<'a>),
293    /// Indicates `/>`
294    Empty,
295}
296
297
298/// Representation of the [ExternalID](https://www.w3.org/TR/xml/#NT-ExternalID) value.
299#[allow(missing_docs)]
300#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
301pub enum ExternalId<'a> {
302    System(StrSpan<'a>),
303    Public(StrSpan<'a>, StrSpan<'a>),
304}
305
306
307/// Representation of the [EntityDef](https://www.w3.org/TR/xml/#NT-EntityDef) value.
308#[allow(missing_docs)]
309#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
310pub enum EntityDefinition<'a> {
311    EntityValue(StrSpan<'a>),
312    ExternalId(ExternalId<'a>),
313}
314
315
316type Result<T> = core::result::Result<T, Error>;
317type StreamResult<T> = core::result::Result<T, StreamError>;
318
319
// Position of the tokenizer within the document structure.
#[derive(Clone, Copy, PartialEq, Debug)]
enum State {
    // Initial state: an optional `<?xml ` declaration is looked for.
    Declaration,
    // Before the DOCTYPE: DOCTYPE, comments, PIs and spaces are accepted.
    AfterDeclaration,
    // Inside the DOCTYPE internal subset, until the closing `]>`.
    Dtd,
    // After the DOCTYPE (or when there is none), before the root element.
    AfterDtd,
    // Inside element content.
    Elements,
    // Inside a start tag, after its name: attributes until the tag ends.
    Attributes,
    // After the root element has been closed.
    AfterElements,
    // Terminal state, entered after an error; no more tokens are produced.
    End,
}
331
332
333/// Tokenizer for the XML structure.
334#[derive(Clone)]
335pub struct Tokenizer<'a> {
336    stream: Stream<'a>,
337    state: State,
338    depth: usize,
339    fragment_parsing: bool,
340}
341
342impl core::fmt::Debug for Tokenizer<'_> {
343    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
344        write!(f, "Tokenizer {{ ... }}")
345    }
346}
347
348impl<'a> From<&'a str> for Tokenizer<'a> {
349    #[inline]
350    fn from(text: &'a str) -> Self {
351        let mut stream = Stream::from(text);
352
353        // Skip UTF-8 BOM.
354        if stream.starts_with(&[0xEF, 0xBB, 0xBF]) {
355            stream.advance(3);
356        }
357
358        Tokenizer {
359            stream,
360            state: State::Declaration,
361            depth: 0,
362            fragment_parsing: false,
363        }
364    }
365}
366
367
// Evaluates `$call` and, on failure, wraps the stream error into `Error::$variant`
// together with the text position at which the call started.
macro_rules! map_err_at {
    ($call:expr, $stream:expr, $variant:ident) => {{
        // Must be captured before `$call` advances the stream.
        let origin = $stream.pos();
        match $call {
            Ok(value) => Ok(value),
            Err(cause) => Err(Error::$variant(cause, $stream.gen_text_pos_from(origin))),
        }
    }};
}
376
377impl<'a> Tokenizer<'a> {
378    /// Enables document fragment parsing.
379    ///
380    /// By default, `xmlparser` will check for DTD, root element, etc.
381    /// But if we have to parse an XML fragment, it will lead to an error.
382    /// This method switches the parser to the root element content parsing mode,
383    /// so it will treat any data as a content of the root element.
384    pub fn from_fragment(full_text: &'a str, fragment: core::ops::Range<usize>) -> Self {
385        Tokenizer {
386            stream: Stream::from_substr(full_text, fragment),
387            state: State::Elements,
388            depth: 0,
389            fragment_parsing: true,
390        }
391    }
392
393    fn parse_next_impl(&mut self) -> Option<Result<Token<'a>>> {
394        let s = &mut self.stream;
395
396        if s.at_end() {
397            return None;
398        }
399
400        let start = s.pos();
401
402        match self.state {
403            State::Declaration => {
404                self.state = State::AfterDeclaration;
405                if s.starts_with(b"<?xml ") {
406                    Some(Self::parse_declaration(s))
407                } else {
408                    None
409                }
410            }
411            State::AfterDeclaration => {
412                if s.starts_with(b"<!DOCTYPE") {
413                    let t = Self::parse_doctype(s);
414                    match t {
415                        Ok(Token::DtdStart { .. }) => self.state = State::Dtd,
416                        Ok(Token::EmptyDtd { .. }) => self.state = State::AfterDtd,
417                        _ => {}
418                    }
419
420                    Some(t)
421                } else if s.starts_with(b"<!--") {
422                    Some(Self::parse_comment(s))
423                } else if s.starts_with(b"<?") {
424                    if s.starts_with(b"<?xml ") {
425                        Some(Err(Error::UnknownToken(s.gen_text_pos())))
426                    } else {
427                        Some(Self::parse_pi(s))
428                    }
429                } else if s.starts_with_space() {
430                    s.skip_spaces();
431                    None
432                } else {
433                    self.state = State::AfterDtd;
434                    None
435                }
436            }
437            State::Dtd => {
438                if s.starts_with(b"<!ENTITY") {
439                    Some(Self::parse_entity_decl(s))
440                } else if s.starts_with(b"<!--") {
441                    Some(Self::parse_comment(s))
442                } else if s.starts_with(b"<?") {
443                    if s.starts_with(b"<?xml ") {
444                        Some(Err(Error::UnknownToken(s.gen_text_pos())))
445                    } else {
446                        Some(Self::parse_pi(s))
447                    }
448                } else if s.starts_with(b"]") {
449                    // DTD ends with ']' S? '>', therefore we have to skip possible spaces.
450                    s.advance(1);
451                    s.skip_spaces();
452                    match s.curr_byte() {
453                        Ok(b'>') => {
454                            self.state = State::AfterDtd;
455                            s.advance(1);
456                            Some(Ok(Token::DtdEnd { span: s.slice_back(start) }))
457                        }
458                        Ok(c) => {
459                            let e = StreamError::InvalidChar(c, b'>', s.gen_text_pos());
460                            Some(Err(Error::InvalidDoctype(e, s.gen_text_pos_from(start))))
461                        }
462                        Err(_) => {
463                            let e = StreamError::UnexpectedEndOfStream;
464                            Some(Err(Error::InvalidDoctype(e, s.gen_text_pos_from(start))))
465                        }
466                    }
467                } else if s.starts_with_space() {
468                    s.skip_spaces();
469                    None
470                } else if    s.starts_with(b"<!ELEMENT")
471                          || s.starts_with(b"<!ATTLIST")
472                          || s.starts_with(b"<!NOTATION")
473                {
474                    if Self::consume_decl(s).is_err() {
475                        let pos = s.gen_text_pos_from(start);
476                        Some(Err(Error::UnknownToken(pos)))
477                    } else {
478                        None
479                    }
480                } else {
481                    Some(Err(Error::UnknownToken(s.gen_text_pos())))
482                }
483            }
484            State::AfterDtd => {
485                if s.starts_with(b"<!--") {
486                    Some(Self::parse_comment(s))
487                } else if s.starts_with(b"<?") {
488                    if s.starts_with(b"<?xml ") {
489                        Some(Err(Error::UnknownToken(s.gen_text_pos())))
490                    } else {
491                        Some(Self::parse_pi(s))
492                    }
493                } else if s.starts_with(b"<!") {
494                    Some(Err(Error::UnknownToken(s.gen_text_pos())))
495                } else if s.starts_with(b"<") {
496                    self.state = State::Attributes;
497                    Some(Self::parse_element_start(s))
498                } else if s.starts_with_space() {
499                    s.skip_spaces();
500                    None
501                } else {
502                    Some(Err(Error::UnknownToken(s.gen_text_pos())))
503                }
504            }
505            State::Elements => {
506                // Use `match` only here, because only this section is performance-critical.
507                match s.curr_byte() {
508                    Ok(b'<') => {
509                        match s.next_byte() {
510                            Ok(b'!') => {
511                                if s.starts_with(b"<!--") {
512                                    Some(Self::parse_comment(s))
513                                } else if s.starts_with(b"<![CDATA[") {
514                                    Some(Self::parse_cdata(s))
515                                } else {
516                                    Some(Err(Error::UnknownToken(s.gen_text_pos())))
517                                }
518                            }
519                            Ok(b'?') => {
520                                if !s.starts_with(b"<?xml ") {
521                                    Some(Self::parse_pi(s))
522                                } else {
523                                    Some(Err(Error::UnknownToken(s.gen_text_pos())))
524                                }
525                            }
526                            Ok(b'/') => {
527                                if self.depth > 0 {
528                                    self.depth -= 1;
529                                }
530
531                                if self.depth == 0 && !self.fragment_parsing {
532                                    self.state = State::AfterElements;
533                                } else {
534                                    self.state = State::Elements;
535                                }
536
537                                Some(Self::parse_close_element(s))
538                            }
539                            Ok(_) => {
540                                self.state = State::Attributes;
541                                Some(Self::parse_element_start(s))
542                            }
543                            Err(_) => {
544                                return Some(Err(Error::UnknownToken(s.gen_text_pos())));
545                            }
546                        }
547                    }
548                    Ok(_) => {
549                        Some(Self::parse_text(s))
550                    }
551                    Err(_) => {
552                        Some(Err(Error::UnknownToken(s.gen_text_pos())))
553                    }
554                }
555            }
556            State::Attributes => {
557                let t = Self::parse_attribute(s);
558
559                if let Ok(Token::ElementEnd { end, .. }) = t {
560                    if end == ElementEnd::Open {
561                        self.depth += 1;
562                    }
563
564                    if self.depth == 0 && !self.fragment_parsing {
565                        self.state = State::AfterElements;
566                    } else {
567                        self.state = State::Elements;
568                    }
569                }
570
571                Some(t.map_err(|e| Error::InvalidAttribute(e, s.gen_text_pos_from(start))))
572            }
573            State::AfterElements => {
574                if s.starts_with(b"<!--") {
575                    Some(Self::parse_comment(s))
576                } else if s.starts_with(b"<?") {
577                    if s.starts_with(b"<?xml ") {
578                        Some(Err(Error::UnknownToken(s.gen_text_pos())))
579                    } else {
580                        Some(Self::parse_pi(s))
581                    }
582                } else if s.starts_with_space() {
583                    s.skip_spaces();
584                    None
585                } else {
586                    Some(Err(Error::UnknownToken(s.gen_text_pos())))
587                }
588            }
589            State::End => {
590                None
591            }
592        }
593    }
594
595    fn parse_declaration(s: &mut Stream<'a>) -> Result<Token<'a>> {
596        map_err_at!(Self::parse_declaration_impl(s), s, InvalidDeclaration)
597    }
598
599    // XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
600    fn parse_declaration_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
601        fn consume_spaces(s: &mut Stream) -> StreamResult<()> {
602            if s.starts_with_space() {
603                s.skip_spaces();
604            } else if !s.starts_with(b"?>") && !s.at_end() {
605                return Err(StreamError::InvalidSpace(s.curr_byte_unchecked(), s.gen_text_pos()));
606            }
607
608            Ok(())
609        }
610
611        let start = s.pos();
612        s.advance(6);
613
614        let version = Self::parse_version_info(s)?;
615        consume_spaces(s)?;
616
617        let encoding = Self::parse_encoding_decl(s)?;
618        if encoding.is_some() {
619            consume_spaces(s)?;
620        }
621
622        let standalone = Self::parse_standalone(s)?;
623
624        s.skip_spaces();
625        s.skip_string(b"?>")?;
626
627        let span = s.slice_back(start);
628        Ok(Token::Declaration { version, encoding, standalone, span })
629    }
630
631    // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
632    // VersionNum  ::= '1.' [0-9]+
633    fn parse_version_info(s: &mut Stream<'a>) -> StreamResult<StrSpan<'a>> {
634        s.skip_spaces();
635        s.skip_string(b"version")?;
636        s.consume_eq()?;
637        let quote = s.consume_quote()?;
638
639        let start = s.pos();
640        s.skip_string(b"1.")?;
641        s.skip_bytes(|_, c| c.is_xml_digit());
642        let ver = s.slice_back(start);
643
644        s.consume_byte(quote)?;
645
646        Ok(ver)
647    }
648
649    // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
650    // EncName      ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
651    fn parse_encoding_decl(s: &mut Stream<'a>) -> StreamResult<Option<StrSpan<'a>>> {
652        if !s.starts_with(b"encoding") {
653            return Ok(None);
654        }
655
656        s.advance(8);
657        s.consume_eq()?;
658        let quote = s.consume_quote()?;
659        // [A-Za-z] ([A-Za-z0-9._] | '-')*
660        // TODO: check that first byte is [A-Za-z]
661        let name = s.consume_bytes(|_, c| {
662               c.is_xml_letter()
663            || c.is_xml_digit()
664            || c == b'.'
665            || c == b'-'
666            || c == b'_'
667        });
668        s.consume_byte(quote)?;
669
670        Ok(Some(name))
671    }
672
673    // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
674    fn parse_standalone(s: &mut Stream<'a>) -> StreamResult<Option<bool>> {
675        if !s.starts_with(b"standalone") {
676            return Ok(None);
677        }
678
679        s.advance(10);
680        s.consume_eq()?;
681        let quote = s.consume_quote()?;
682
683        let start = s.pos();
684        let value = s.consume_name()?.as_str();
685
686        let flag = match value {
687            "yes" => true,
688            "no" => false,
689            _ => {
690                let pos = s.gen_text_pos_from(start);
691
692                return Err(StreamError::InvalidString("yes', 'no", pos));
693            }
694        };
695
696        s.consume_byte(quote)?;
697
698        Ok(Some(flag))
699    }
700
701    fn parse_comment(s: &mut Stream<'a>) -> Result<Token<'a>> {
702        let start = s.pos();
703        Self::parse_comment_impl(s)
704            .map_err(|e| Error::InvalidComment(e, s.gen_text_pos_from(start)))
705    }
706
707    // '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
708    fn parse_comment_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
709        let start = s.pos();
710        s.advance(4);
711        let text = s.consume_chars(|s, c| !(c == '-' && s.starts_with(b"-->")))?;
712        s.skip_string(b"-->")?;
713
714        if text.as_str().contains("--") {
715            return Err(StreamError::InvalidCommentData);
716        }
717
718        if text.as_str().ends_with('-') {
719            return Err(StreamError::InvalidCommentEnd);
720        }
721
722        let span = s.slice_back(start);
723
724        Ok(Token::Comment { text, span })
725    }
726
727    fn parse_pi(s: &mut Stream<'a>) -> Result<Token<'a>> {
728        map_err_at!(Self::parse_pi_impl(s), s, InvalidPI)
729    }
730
731    // PI       ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
732    // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
733    fn parse_pi_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
734        let start = s.pos();
735        s.advance(2);
736        let target = s.consume_name()?;
737        s.skip_spaces();
738        let content = s.consume_chars(|s, c| !(c == '?' && s.starts_with(b"?>")))?;
739        let content = if !content.is_empty() {
740            Some(content)
741        } else {
742            None
743        };
744
745        s.skip_string(b"?>")?;
746
747        let span = s.slice_back(start);
748
749        Ok(Token::ProcessingInstruction { target, content, span })
750    }
751
752    fn parse_doctype(s: &mut Stream<'a>) -> Result<Token<'a>> {
753        map_err_at!(Self::parse_doctype_impl(s), s, InvalidDoctype)
754    }
755
756    // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
757    fn parse_doctype_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
758        let start = s.pos();
759        s.advance(9);
760
761        s.consume_spaces()?;
762        let name = s.consume_name()?;
763        s.skip_spaces();
764
765        let external_id = Self::parse_external_id(s)?;
766        s.skip_spaces();
767
768        let c = s.curr_byte()?;
769        if c != b'[' && c !=  b'>' {
770            static EXPECTED: &[u8] = &[b'[', b'>'];
771            return Err(StreamError::InvalidCharMultiple(c, EXPECTED, s.gen_text_pos()));
772        }
773
774        s.advance(1);
775
776        let span = s.slice_back(start);
777        if c == b'[' {
778            Ok(Token::DtdStart { name, external_id, span })
779        } else {
780            Ok(Token::EmptyDtd { name, external_id, span })
781        }
782    }
783
784    // ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
785    fn parse_external_id(s: &mut Stream<'a>) -> StreamResult<Option<ExternalId<'a>>> {
786        let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") {
787            let start = s.pos();
788            s.advance(6);
789            let id = s.slice_back(start);
790
791            s.consume_spaces()?;
792            let quote = s.consume_quote()?;
793            let literal1 = s.consume_bytes(|_, c| c != quote);
794            s.consume_byte(quote)?;
795
796            let v = if id.as_str() == "SYSTEM" {
797                ExternalId::System(literal1)
798            } else {
799                s.consume_spaces()?;
800                let quote = s.consume_quote()?;
801                let literal2 = s.consume_bytes(|_, c| c != quote);
802                s.consume_byte(quote)?;
803
804                ExternalId::Public(literal1, literal2)
805            };
806
807            Some(v)
808        } else {
809            None
810        };
811
812        Ok(v)
813    }
814
815    fn parse_entity_decl(s: &mut Stream<'a>) -> Result<Token<'a>> {
816        map_err_at!(Self::parse_entity_decl_impl(s), s, InvalidEntity)
817    }
818
819    // EntityDecl  ::= GEDecl | PEDecl
820    // GEDecl      ::= '<!ENTITY' S Name S EntityDef S? '>'
821    // PEDecl      ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
822    fn parse_entity_decl_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
823        let start = s.pos();
824        s.advance(8);
825
826        s.consume_spaces()?;
827
828        let is_ge = if s.try_consume_byte(b'%') {
829            s.consume_spaces()?;
830            false
831        } else {
832            true
833        };
834
835        let name = s.consume_name()?;
836        s.consume_spaces()?;
837        let definition = Self::parse_entity_def(s, is_ge)?;
838        s.skip_spaces();
839        s.consume_byte(b'>')?;
840
841        let span = s.slice_back(start);
842
843        Ok(Token::EntityDeclaration { name, definition, span })
844    }
845
846    // EntityDef   ::= EntityValue | (ExternalID NDataDecl?)
847    // PEDef       ::= EntityValue | ExternalID
848    // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' |  "'" ([^%&']
849    //                             | PEReference | Reference)* "'"
850    // ExternalID  ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
851    // NDataDecl   ::= S 'NDATA' S Name
852    fn parse_entity_def(s: &mut Stream<'a>, is_ge: bool) -> StreamResult<EntityDefinition<'a>> {
853        let c = s.curr_byte()?;
854        match c {
855            b'"' | b'\'' => {
856                let quote = s.consume_quote()?;
857                let value = s.consume_bytes(|_, c| c != quote);
858                s.consume_byte(quote)?;
859
860                Ok(EntityDefinition::EntityValue(value))
861            }
862            b'S' | b'P' => {
863                if let Some(id) = Self::parse_external_id(s)? {
864                    if is_ge {
865                        s.skip_spaces();
866                        if s.starts_with(b"NDATA") {
867                            s.advance(5);
868                            s.consume_spaces()?;
869                            s.skip_name()?;
870                            // TODO: NDataDecl is not supported
871                        }
872                    }
873
874                    Ok(EntityDefinition::ExternalId(id))
875                } else {
876                    Err(StreamError::InvalidExternalID)
877                }
878            }
879            _ => {
880                static EXPECTED: &[u8] = &[b'"', b'\'', b'S', b'P'];
881                let pos = s.gen_text_pos();
882                Err(StreamError::InvalidCharMultiple(c, EXPECTED, pos))
883            }
884        }
885    }
886
887    fn consume_decl(s: &mut Stream) -> StreamResult<()> {
888        s.skip_bytes(|_, c| c != b'>');
889        s.consume_byte(b'>')?;
890        Ok(())
891    }
892
893    fn parse_cdata(s: &mut Stream<'a>) -> Result<Token<'a>> {
894        map_err_at!(Self::parse_cdata_impl(s), s, InvalidCdata)
895    }
896
897    // CDSect  ::= CDStart CData CDEnd
898    // CDStart ::= '<![CDATA['
899    // CData   ::= (Char* - (Char* ']]>' Char*))
900    // CDEnd   ::= ']]>'
901    fn parse_cdata_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
902        let start = s.pos();
903        s.advance(9);
904        let text = s.consume_chars(|s, c| !(c == ']' && s.starts_with(b"]]>")))?;
905        s.skip_string(b"]]>")?;
906        let span = s.slice_back(start);
907        Ok(Token::Cdata { text, span })
908    }
909
910    fn parse_element_start(s: &mut Stream<'a>) -> Result<Token<'a>> {
911        map_err_at!(Self::parse_element_start_impl(s), s, InvalidElement)
912    }
913
914    // '<' Name (S Attribute)* S? '>'
915    fn parse_element_start_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
916        let start = s.pos();
917        s.advance(1);
918        let (prefix, local) = s.consume_qname()?;
919        let span = s.slice_back(start);
920
921        Ok(Token::ElementStart { prefix, local, span })
922    }
923
924    fn parse_close_element(s: &mut Stream<'a>) -> Result<Token<'a>> {
925        map_err_at!(Self::parse_close_element_impl(s), s, InvalidElement)
926    }
927
928    // '</' Name S? '>'
929    fn parse_close_element_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
930        let start = s.pos();
931        s.advance(2);
932
933        let (prefix, tag_name) = s.consume_qname()?;
934        s.skip_spaces();
935        s.consume_byte(b'>')?;
936
937        let span = s.slice_back(start);
938
939        Ok(Token::ElementEnd { end: ElementEnd::Close(prefix, tag_name), span })
940    }
941
942    // Name Eq AttValue
943    fn parse_attribute(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
944        let attr_start = s.pos();
945        let has_space = s.starts_with_space();
946        s.skip_spaces();
947
948        if let Ok(c) = s.curr_byte() {
949            let start = s.pos();
950
951            match c {
952                b'/' => {
953                    s.advance(1);
954                    s.consume_byte(b'>')?;
955                    let span = s.slice_back(start);
956                    return Ok(Token::ElementEnd { end: ElementEnd::Empty, span });
957                }
958                b'>' => {
959                    s.advance(1);
960                    let span = s.slice_back(start);
961                    return Ok(Token::ElementEnd { end: ElementEnd::Open, span });
962                }
963                _ => {}
964            }
965        }
966
967        if !has_space {
968            if !s.at_end() {
969                return Err(StreamError::InvalidSpace(
970                    s.curr_byte_unchecked(), s.gen_text_pos_from(attr_start))
971                );
972            } else {
973                return Err(StreamError::UnexpectedEndOfStream);
974            }
975        }
976
977        let start = s.pos();
978
979        let (prefix, local) = s.consume_qname()?;
980        s.consume_eq()?;
981        let quote = s.consume_quote()?;
982        let quote_c = quote as char;
983        // The attribute value must not contain the < character.
984        let value = s.consume_chars(|_, c| c != quote_c && c != '<')?;
985        s.consume_byte(quote)?;
986        let span = s.slice_back(start);
987
988        Ok(Token::Attribute { prefix, local, value, span })
989    }
990
991    fn parse_text(s: &mut Stream<'a>) -> Result<Token<'a>> {
992        map_err_at!(Self::parse_text_impl(s), s, InvalidCharData)
993    }
994
995    fn parse_text_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
996        let text = s.consume_chars(|_, c| c != '<')?;
997
998        // According to the spec, `]]>` must not appear inside a Text node.
999        // https://www.w3.org/TR/xml/#syntax
1000        //
1001        // Search for `>` first, since it's a bit faster than looking for `]]>`.
1002        if text.as_str().contains('>') {
1003            if text.as_str().contains("]]>") {
1004                return Err(StreamError::InvalidCharacterData);
1005            }
1006        }
1007
1008        Ok(Token::Text { text })
1009    }
1010
1011    /// Returns a copy of the tokenizer's stream.
1012    pub fn stream(&self) -> Stream<'a> {
1013        self.stream
1014    }
1015}
1016
1017impl<'a> Iterator for Tokenizer<'a> {
1018    type Item = Result<Token<'a>>;
1019
1020    #[inline]
1021    fn next(&mut self) -> Option<Self::Item> {
1022        let mut t = None;
1023        while !self.stream.at_end() && self.state != State::End && t.is_none() {
1024            t = self.parse_next_impl();
1025        }
1026
1027        if let Some(Err(_)) = t {
1028            self.stream.jump_to_end();
1029            self.state = State::End;
1030        }
1031
1032        t
1033    }
1034}