roxmltree/
tokenizer.rs

1use core::ops::Range;
2use core::str;
3use alloc::string::String;
4
5use memchr::memchr2;
6
7use crate::{Error, TextPos};
8
9type Result<T> = core::result::Result<T, Error>;
10
11/// Extension methods for XML-subset only operations.
12trait XmlCharExt {
13    /// Checks if the value is within the
14    /// [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) range.
15    fn is_xml_name_start(&self) -> bool;
16
17    /// Checks if the value is within the
18    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
19    fn is_xml_name(&self) -> bool;
20
21    /// Checks if the value is within the
22    /// [Char](https://www.w3.org/TR/xml/#NT-Char) range.
23    fn is_xml_char(&self) -> bool;
24}
25
26impl XmlCharExt for char {
27    #[inline]
28    fn is_xml_name_start(&self) -> bool {
29        // Check for ASCII first.
30        if *self as u32 <= 128 {
31            return (*self as u8).is_xml_name_start();
32        }
33
34        matches!(*self as u32,
35            0x0000C0..=0x0000D6
36            | 0x0000D8..=0x0000F6
37            | 0x0000F8..=0x0002FF
38            | 0x000370..=0x00037D
39            | 0x00037F..=0x001FFF
40            | 0x00200C..=0x00200D
41            | 0x002070..=0x00218F
42            | 0x002C00..=0x002FEF
43            | 0x003001..=0x00D7FF
44            | 0x00F900..=0x00FDCF
45            | 0x00FDF0..=0x00FFFD
46            | 0x010000..=0x0EFFFF)
47    }
48
49    #[inline]
50    fn is_xml_name(&self) -> bool {
51        // Check for ASCII first.
52        if *self as u32 <= 128 {
53            return (*self as u8).is_xml_name();
54        }
55
56        matches!(*self as u32, 0x0000B7
57                | 0x0000C0..=0x0000D6
58                | 0x0000D8..=0x0000F6
59                | 0x0000F8..=0x0002FF
60                | 0x000300..=0x00036F
61                | 0x000370..=0x00037D
62                | 0x00037F..=0x001FFF
63                | 0x00200C..=0x00200D
64                | 0x00203F..=0x002040
65                | 0x002070..=0x00218F
66                | 0x002C00..=0x002FEF
67                | 0x003001..=0x00D7FF
68                | 0x00F900..=0x00FDCF
69                | 0x00FDF0..=0x00FFFD
70                | 0x010000..=0x0EFFFF)
71    }
72
73    #[inline]
74    fn is_xml_char(&self) -> bool {
75        // Does not check for surrogate code points U+D800-U+DFFF,
76        // since that check was performed by Rust when the `&str` was constructed.
77        if (*self as u32) < 0x20 {
78            return (*self as u8).is_xml_space();
79        }
80
81        !matches!(*self as u32, 0xFFFF | 0xFFFE)
82    }
83}
84
/// Byte-level (ASCII) counterparts of the XML character class checks.
trait XmlByteExt {
    /// Checks if byte is a space.
    ///
    /// `[ \r\n\t]`
    fn is_xml_space(&self) -> bool;

    /// Checks if the byte is within the ASCII part of the
    /// [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) range.
    fn is_xml_name_start(&self) -> bool;

    /// Checks if the byte is within the ASCII part of the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
    fn is_xml_name(&self) -> bool;

    /// Checks if the byte is accepted as a
    /// [Char](https://www.w3.org/TR/xml/#NT-Char): either an XML space
    /// or any value above `0x20`.
    fn is_xml_char(&self) -> bool;
}

impl XmlByteExt for u8 {
    #[inline]
    fn is_xml_space(&self) -> bool {
        let byte = *self;
        byte == b' ' || byte == b'\t' || byte == b'\n' || byte == b'\r'
    }

    #[inline]
    fn is_xml_name_start(&self) -> bool {
        self.is_ascii_alphabetic() || matches!(*self, b':' | b'_')
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        self.is_ascii_alphanumeric() || matches!(*self, b':' | b'_' | b'-' | b'.')
    }

    #[inline]
    fn is_xml_char(&self) -> bool {
        self.is_xml_space() || *self > 0x20
    }
}
125
126#[inline]
127fn is_xml_str(s: &str, value_start: usize, stream: &mut Stream<'_>) -> Result<()> {
128    if s.as_bytes().is_ascii() {
129        for (i, b) in s.as_bytes().iter().enumerate() {
130            if !b.is_xml_char() {
131                return Err(Error::NonXmlChar(*b as char, stream.gen_text_pos_from(value_start + i)));
132            }
133        }
134
135        Ok(())
136    } else {
137        is_xml_str_unicode(s, value_start, stream)
138    }
139}
140
141
142#[cold]
143#[inline(never)]
144fn is_xml_str_unicode(s: &str, value_start: usize, stream: &mut Stream<'_>) -> Result<()> {
145    for (i, ch) in s.char_indices() {
146        if !ch.is_xml_char() {
147            return Err(Error::NonXmlChar(ch, stream.gen_text_pos_from(value_start + i)));
148        }
149    }
150
151    Ok(())
152}
153
/// A string slice that remembers where it came from.
///
/// Behaves like `&str`, but additionally stores the byte offset in the
/// input XML at which the slice starts.
#[must_use]
#[derive(Clone, Copy)]
pub struct StrSpan<'input> {
    // The sliced text itself.
    text: &'input str,
    // Byte offset of `text` inside the original input.
    start: usize,
}

impl<'input> From<&'input str> for StrSpan<'input> {
    /// Wraps a whole string; the resulting span starts at offset 0.
    #[inline]
    fn from(text: &'input str) -> Self {
        Self { start: 0, text }
    }
}

impl<'input> StrSpan<'input> {
    /// Creates a span covering `text[start..end]`.
    ///
    /// Panics if the range is not a valid slice of `text`.
    #[inline]
    pub fn from_substr(text: &'input str, start: usize, end: usize) -> Self {
        debug_assert!(start <= end);
        let fragment = &text[start..end];
        Self {
            text: fragment,
            start,
        }
    }

    /// Returns the byte range this span occupies in the original input.
    #[inline]
    pub fn range(&self) -> Range<usize> {
        let end = self.start + self.text.len();
        Range {
            start: self.start,
            end,
        }
    }

    /// Returns the underlying text.
    #[inline]
    pub fn as_str(&self) -> &'input str {
        self.text
    }

    /// Returns `start..end` of the span's own text (offsets are relative
    /// to the span, not to the original input).
    #[inline]
    fn slice_region(&self, start: usize, end: usize) -> &'input str {
        let whole: &'input str = self.text;
        &whole[start..end]
    }
}
197
/// A token reported to `XmlEvents::token` while the input is being parsed.
///
/// `Range<usize>` payloads are byte ranges in the stream's text, measured
/// from where the construct starts to the stream position after it.
pub enum Token<'input> {
    /// `<?target content?>`
    ///
    /// Fields: target, content (`None` when empty), range.
    ProcessingInstruction(&'input str, Option<&'input str>, Range<usize>),

    /// `<!-- text -->`
    ///
    /// Fields: comment text (without the delimiters), range.
    Comment(&'input str, Range<usize>),

    /// `<!ENTITY ns_extend "http://test.com">`
    ///
    /// Fields: entity name, entity definition.
    EntityDeclaration(&'input str, StrSpan<'input>),

    /// `<ns:elem`
    ///
    /// Fields: prefix, local name, position of the leading `<`.
    ElementStart(&'input str, &'input str, usize),

    /// `ns:attr="value"`
    ///
    /// Fields: range of the whole attribute, length of the qualified name
    /// (saturated at `u16::MAX`), length of the `=` part including
    /// surrounding whitespace (saturated at `u8::MAX`), prefix, local name,
    /// value (without quotes).
    Attribute(Range<usize>, u16, u8, &'input str, &'input str, StrSpan<'input>),

    /// The end of an element tag; see [`ElementEnd`] for the variants.
    ElementEnd(ElementEnd<'input>, Range<usize>),

    /// Contains text between elements including whitespaces.
    /// Basically everything between `>` and `<`.
    /// Except `]]>`, which is not allowed and will lead to an error.
    Text(&'input str, Range<usize>),

    /// `<![CDATA[text]]>`
    ///
    /// Fields: the text between the delimiters, range of the whole section.
    Cdata(&'input str, Range<usize>),
}
224
/// `ElementEnd` token.
///
/// Describes how an element tag was terminated.
#[derive(Clone, Copy)]
pub enum ElementEnd<'input> {
    /// Indicates `>`, the end of a start tag; element content follows.
    Open,
    /// Indicates `</ns:name>`. Carries the prefix and the local name
    /// of the closing tag.
    Close(&'input str, &'input str),
    /// Indicates `/>`, the end of an empty element.
    Empty,
}
235
/// Receiver of the events produced by [`parse`].
pub trait XmlEvents<'input> {
    /// Called for every token found in the input, in document order.
    ///
    /// Returning an error aborts parsing; the error is propagated
    /// to the caller of the parse function.
    fn token(&mut self, token: Token<'input>) -> Result<()>;

    /// Called for an entity declared with an external ID (`SYSTEM`/`PUBLIC`).
    ///
    /// `_pub_id` is the public identifier literal (only present for `PUBLIC`),
    /// `_uri` is the system literal. Returning `Ok(Some(text))` makes `text`
    /// (minus a leading XML declaration, if any) the entity definition;
    /// `Ok(None)` (the default) skips the declaration, so no
    /// `Token::EntityDeclaration` is emitted for it. An `Err(message)` is
    /// reported as `Error::EntityResolver`.
    fn resolve_entity(&mut self, _pub_id: Option<&str>, _uri: &str) -> core::result::Result<Option<&'input str>, String> { Ok(None) }
}
241
242// document ::= prolog element Misc*
243pub fn parse<'input>(
244    text: &'input str,
245    allow_dtd: bool,
246    events: &mut impl XmlEvents<'input>,
247) -> Result<()> {
248    let s = &mut Stream::new(text);
249
250    parse_declaration(s)?;
251
252    parse_misc(s, events)?;
253
254    s.skip_spaces();
255    if s.starts_with(b"<!DOCTYPE") {
256        if !allow_dtd {
257            return Err(Error::DtdDetected);
258        }
259
260        parse_doctype(s, events)?;
261        parse_misc(s, events)?;
262    }
263
264    s.skip_spaces();
265    if s.curr_byte().ok() == Some(b'<') {
266        parse_element(s, events)?;
267    }
268
269    parse_misc(s, events)?;
270
271    if !s.at_end() {
272        return Err(Error::UnknownToken(s.gen_text_pos()));
273    }
274
275    Ok(())
276}
277
278// Misc ::= Comment | PI | S
279fn parse_misc<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
280    while !s.at_end() {
281        s.skip_spaces();
282        if s.starts_with(b"<!--") {
283            parse_comment(s, events)?;
284        } else if s.starts_with(b"<?") {
285            parse_pi(s, events)?;
286        } else {
287            break;
288        }
289    }
290
291    Ok(())
292}
293
294// XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
295//
296// We don't actually return a token for the XML declaration and only validate it.
297fn parse_declaration(s: &mut Stream) -> Result<()> {
298    // Skip UTF-8 BOM.
299    if s.starts_with(&[0xEF, 0xBB, 0xBF]) {
300        s.advance(3);
301    }
302
303    if !s.starts_with(b"<?xml ") {
304        return Ok(())
305    }
306
307    fn consume_spaces(s: &mut Stream) -> Result<()> {
308        if s.starts_with_space() {
309            s.skip_spaces();
310        } else if !s.starts_with(b"?>") && !s.at_end() {
311            return Err(Error::InvalidChar2(
312                "a whitespace",
313                s.curr_byte_unchecked(),
314                s.gen_text_pos(),
315            ));
316        }
317
318        Ok(())
319    }
320
321    s.advance(5); // <?xml
322    consume_spaces(s)?;
323
324    // The `version` "attribute" is mandatory.
325    if !s.starts_with(b"version") {
326        // Will trigger the InvalidString error, which is what we want.
327        return s.skip_string(b"version");
328    }
329    let _ = parse_attribute(s)?;
330    consume_spaces(s)?;
331
332    if s.starts_with(b"encoding") {
333        let _ = parse_attribute(s)?;
334        consume_spaces(s)?;
335    }
336
337    if s.starts_with(b"standalone") {
338        let _ = parse_attribute(s)?;
339    }
340
341    s.skip_spaces();
342    s.skip_string(b"?>")?;
343
344    Ok(())
345}
346
347// '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
348fn parse_comment<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
349    let start = s.pos();
350    s.advance(4);
351    let text = s.consume_chars(|s, c| !(c == '-' && s.starts_with(b"-->")))?;
352    s.skip_string(b"-->")?;
353
354    if text.contains("--") {
355        return Err(Error::InvalidComment(s.gen_text_pos_from(start)));
356    }
357
358    if text.ends_with('-') {
359        return Err(Error::InvalidComment(s.gen_text_pos_from(start)));
360    }
361
362    let range = s.range_from(start);
363    events.token(Token::Comment(text, range))?;
364
365    Ok(())
366}
367
368// PI       ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
369// PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
370fn parse_pi<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
371    if s.starts_with(b"<?xml ") {
372        return Err(Error::UnexpectedDeclaration(s.gen_text_pos()));
373    }
374
375    let start = s.pos();
376    s.advance(2);
377    let target = s.consume_name()?;
378    s.skip_spaces();
379    let content = s.consume_chars(|s, c| !(c == '?' && s.starts_with(b"?>")))?;
380    let content = if !content.is_empty() {
381        Some(content)
382    } else {
383        None
384    };
385
386    s.skip_string(b"?>")?;
387
388    let range = s.range_from(start);
389    events.token(Token::ProcessingInstruction(target, content, range))?;
390    Ok(())
391}
392
393fn parse_doctype<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
394    let start = s.pos();
395    parse_doctype_start(s)?;
396    s.skip_spaces();
397
398    if s.curr_byte() == Ok(b'>') {
399        s.advance(1);
400        return Ok(());
401    }
402
403    s.advance(1); // [
404    while !s.at_end() {
405        s.skip_spaces();
406        if s.starts_with(b"<!ENTITY") {
407            parse_entity_decl(s, events)?;
408        } else if s.starts_with(b"<!--") {
409            parse_comment(s, events)?;
410        } else if s.starts_with(b"<?") {
411            parse_pi(s, events)?;
412        } else if s.starts_with(b"]") {
413            // DTD ends with ']' S? '>', therefore we have to skip possible spaces.
414            s.advance(1);
415            s.skip_spaces();
416            match s.curr_byte() {
417                Ok(b'>') => {
418                    s.advance(1);
419                    break;
420                }
421                Ok(c) => {
422                    return Err(Error::InvalidChar2("'>'", c, s.gen_text_pos()));
423                }
424                Err(_) => {
425                    return Err(Error::UnexpectedEndOfStream);
426                }
427            }
428        } else if s.starts_with(b"<!ELEMENT")
429            || s.starts_with(b"<!ATTLIST")
430            || s.starts_with(b"<!NOTATION")
431        {
432            if consume_decl(s).is_err() {
433                let pos = s.gen_text_pos_from(start);
434                return Err(Error::UnknownToken(pos));
435            }
436        } else {
437            return Err(Error::UnknownToken(s.gen_text_pos()));
438        }
439    }
440
441    Ok(())
442}
443
444// doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
445fn parse_doctype_start(s: &mut Stream) -> Result<()> {
446    s.advance(9);
447
448    s.consume_spaces()?;
449    s.skip_name()?;
450    s.skip_spaces();
451
452    let _ = parse_external_id(s)?;
453    s.skip_spaces();
454
455    let c = s.curr_byte()?;
456    if c != b'[' && c != b'>' {
457        return Err(Error::InvalidChar2("'[' or '>'", c, s.gen_text_pos()));
458    }
459
460    Ok(())
461}
462
463// ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
464fn parse_external_id<'input>(s: &mut Stream<'input>) -> Result<Option<(Option<&'input str>, &'input str)>> {
465    let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") {
466        let start = s.pos();
467        s.advance(6);
468        let id = s.slice_back(start);
469
470        s.consume_spaces()?;
471        let quote = s.consume_quote()?;
472        let first = s.consume_bytes(|c| c != quote);
473        s.consume_byte(quote)?;
474
475        if id == "SYSTEM" {
476            Some((None, first))
477        } else {
478            s.consume_spaces()?;
479            let quote = s.consume_quote()?;
480            let second = s.consume_bytes(|c| c != quote);
481            s.consume_byte(quote)?;
482
483            Some((Some(first), second))
484        }
485    } else {
486        None
487    };
488
489    Ok(v)
490}
491
492// EntityDecl  ::= GEDecl | PEDecl
493// GEDecl      ::= '<!ENTITY' S Name S EntityDef S? '>'
494// PEDecl      ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
495fn parse_entity_decl<'input>(
496    s: &mut Stream<'input>,
497    events: &mut impl XmlEvents<'input>,
498) -> Result<()> {
499    s.advance(8);
500    s.consume_spaces()?;
501
502    let is_ge = if s.try_consume_byte(b'%') {
503        s.consume_spaces()?;
504        false
505    } else {
506        true
507    };
508
509    let name = s.consume_name()?;
510    s.consume_spaces()?;
511    if let Some(definition) = parse_entity_def(s, events, is_ge)? {
512        events.token(Token::EntityDeclaration(name, definition))?;
513    }
514    s.skip_spaces();
515    s.consume_byte(b'>')?;
516
517    Ok(())
518}
519
520// EntityDef   ::= EntityValue | (ExternalID NDataDecl?)
521// PEDef       ::= EntityValue | ExternalID
522// EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' |  "'" ([^%&']
523//                             | PEReference | Reference)* "'"
524// ExternalID  ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
525// NDataDecl   ::= S 'NDATA' S Name
526fn parse_entity_def<'input>(
527    s: &mut Stream<'input>,
528    events: &mut impl XmlEvents<'input>,
529    is_ge: bool,
530) -> Result<Option<StrSpan<'input>>> {
531    let c = s.curr_byte()?;
532    match c {
533        b'"' | b'\'' => {
534            let quote = s.consume_quote()?;
535            let start = s.pos();
536            s.skip_bytes(|c| c != quote);
537            let value = s.slice_back_span(start);
538            s.consume_byte(quote)?;
539            Ok(Some(value))
540        }
541        b'S' | b'P' => {
542            if let Some((pub_id, uri)) = parse_external_id(s)? {
543                if is_ge {
544                    s.skip_spaces();
545                    if s.starts_with(b"NDATA") {
546                        s.advance(5);
547                        s.consume_spaces()?;
548                        s.skip_name()?;
549                        // TODO: NDataDecl is not supported
550                    }
551                }
552
553                let value = events.resolve_entity(pub_id, uri).map_err(|msg| Error::EntityResolver(s.gen_text_pos(), msg))?;
554
555                match value {
556                    Some(value) => {
557                        let mut stream = Stream::new(value);
558                        parse_declaration(&mut stream)?;
559                        let value = StrSpan::from(&value[stream.pos..]);
560
561                        Ok(Some(value))
562                    }
563                    None => Ok(None),
564                }
565            } else {
566                Err(Error::InvalidExternalID(s.gen_text_pos()))
567            }
568        }
569        _ => {
570            let pos = s.gen_text_pos();
571            Err(Error::InvalidChar2("a quote, SYSTEM or PUBLIC", c, pos))
572        }
573    }
574}
575
576fn consume_decl(s: &mut Stream) -> Result<()> {
577    s.skip_bytes(|c| c != b'>');
578    s.consume_byte(b'>')?;
579    Ok(())
580}
581
// element ::= EmptyElemTag | STag content ETag
// '<' Name (S Attribute)* S? '>'
/// Parses an element starting at `<`: its name, attributes and, for an
/// open tag, its content (via `parse_content`).
///
/// Emits `Token::ElementStart`, one `Token::Attribute` per attribute and
/// finally `Token::ElementEnd` with either `Empty` (`/>`) or `Open` (`>`).
fn parse_element<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
    let start = s.pos();
    s.advance(1); // <
    let (prefix, local) = s.consume_qname()?;
    events.token(Token::ElementStart(prefix, local, start))?;

    // Set once the tag is closed with a plain `>`, i.e. content follows.
    let mut open = false;
    while !s.at_end() {
        // Remember whether whitespace preceded this item; attributes require it.
        let has_space = s.starts_with_space();
        s.skip_spaces();
        // Shadows the element start: from here on `start` is the item start.
        let start = s.pos();
        // Fails with an end-of-stream error if only whitespace was left.
        match s.curr_byte()? {
            b'/' => {
                s.advance(1);
                s.consume_byte(b'>')?;
                let range = s.range_from(start);
                events.token(Token::ElementEnd(ElementEnd::Empty, range))?;
                break;
            }
            b'>' => {
                s.advance(1);
                let range = s.range_from(start);
                events.token(Token::ElementEnd(ElementEnd::Open, range))?;
                open = true;
                break;
            }
            _ => {
                // An attribute must be preceded with a whitespace.
                if !has_space {
                    // Will always trigger an error. Which is what we want.
                    s.consume_spaces()?;
                }

                // Manual inlining of `parse_attribute` for performance.
                // We cannot mark `parse_attribute` as `#[inline(always)]`
                // because it will blow up the binary size.
                let (prefix, local) = s.consume_qname()?;
                let qname_end = s.pos();
                // Lengths are stored in narrow integers and saturate on overflow.
                let qname_len = u16::try_from(qname_end - start).unwrap_or(u16::MAX);
                s.consume_eq()?;
                let eq_len = u8::try_from(s.pos() - qname_end).unwrap_or(u8::MAX);
                let quote = s.consume_quote()?;
                // The attribute value must not contain the < character.
                let value_start = s.pos();
                // Stops at the closing quote or at a `<`; in the latter case
                // the `consume_byte(quote)` below reports the error.
                s.advance_until2(quote, b'<')?;
                let value = s.slice_back_span(value_start);
                is_xml_str(value.as_str(), value_start, s)?;
                s.consume_byte(quote)?;
                let end = s.pos();
                events.token(Token::Attribute(start..end, qname_len, eq_len, prefix, local, value))?;
            }
        }
    }

    if open {
        parse_content(s, events)?;
    }

    Ok(())
}
644
645// Attribute ::= Name Eq AttValue
646fn parse_attribute<'input>(
647    s: &mut Stream<'input>,
648) -> Result<(&'input str, &'input str, StrSpan<'input>)> {
649    let (prefix, local) = s.consume_qname()?;
650    s.consume_eq()?;
651    let quote = s.consume_quote()?;
652    let quote_c = quote as char;
653    // The attribute value must not contain the < character.
654    let value_start = s.pos();
655    s.skip_chars(|_, c| c != quote_c && c != '<')?;
656    let value = s.slice_back_span(value_start);
657    s.consume_byte(quote)?;
658    Ok((prefix, local, value))
659}
660
661// content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
662pub fn parse_content<'input>(
663    s: &mut Stream<'input>,
664    events: &mut impl XmlEvents<'input>,
665) -> Result<()> {
666    while !s.at_end() {
667        match s.curr_byte() {
668            Ok(b'<') => match s.next_byte() {
669                Ok(b'!') => {
670                    if s.starts_with(b"<!--") {
671                        parse_comment(s, events)?;
672                    } else if s.starts_with(b"<![CDATA[") {
673                        parse_cdata(s, events)?;
674                    } else {
675                        return Err(Error::UnknownToken(s.gen_text_pos()));
676                    }
677                }
678                Ok(b'?') => parse_pi(s, events)?,
679                Ok(b'/') => {
680                    parse_close_element(s, events)?;
681                    break;
682                }
683                Ok(_) => parse_element(s, events)?,
684                Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
685            },
686            Ok(_) => parse_text(s, events)?,
687            Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
688        }
689    }
690
691    Ok(())
692}
693
694// CDSect  ::= CDStart CData CDEnd
695// CDStart ::= '<![CDATA['
696// CData   ::= (Char* - (Char* ']]>' Char*))
697// CDEnd   ::= ']]>'
698fn parse_cdata<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
699    let start = s.pos();
700    s.advance(9); // <![CDATA[
701    let text = s.consume_chars(|s, c| !(c == ']' && s.starts_with(b"]]>")))?;
702    s.skip_string(b"]]>")?;
703    let range = s.range_from(start);
704    events.token(Token::Cdata(text, range))?;
705    Ok(())
706}
707
708// '</' Name S? '>'
709fn parse_close_element<'input>(
710    s: &mut Stream<'input>,
711    events: &mut impl XmlEvents<'input>,
712) -> Result<()> {
713    let start = s.pos();
714    s.advance(2); // </
715
716    let (prefix, tag_name) = s.consume_qname()?;
717    s.skip_spaces();
718    s.consume_byte(b'>')?;
719
720    let range = s.range_from(start);
721    events.token(Token::ElementEnd(
722        ElementEnd::Close(prefix, tag_name),
723        range,
724    ))?;
725    Ok(())
726}
727
728fn parse_text<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
729    let start = s.pos();
730    let text = s.consume_chars(|_, c| c != '<')?;
731
732    // According to the spec, `]]>` must not appear inside a Text node.
733    // https://www.w3.org/TR/xml/#syntax
734    //
735    // Search for `>` first, since it's a bit faster than looking for `]]>`.
736    if text.contains('>') && text.contains("]]>") {
737        return Err(Error::InvalidCharacterData(s.gen_text_pos()));
738    }
739
740    let range = s.range_from(start);
741    events.token(Token::Text(text, range))?;
742    Ok(())
743}
744
/// Representation of the [Reference](https://www.w3.org/TR/xml/#NT-Reference) value.
///
/// Produced by `Stream::consume_reference`.
#[derive(Clone, Copy)]
pub enum Reference<'input> {
    /// An entity reference. Carries the entity name.
    ///
    /// The predefined entities (`quot`, `amp`, `apos`, `lt`, `gt`) are never
    /// reported this way; they are resolved to `Char` instead.
    ///
    /// <https://www.w3.org/TR/xml/#NT-EntityRef>
    Entity(&'input str),

    /// A character reference, or a resolved predefined entity.
    ///
    /// <https://www.w3.org/TR/xml/#NT-CharRef>
    Char(char),
}
758
/// A cursor over the input text (or a fragment of it).
#[derive(Clone)]
pub struct Stream<'input> {
    // Current byte offset into `span`'s text.
    pos: usize,
    // Exclusive end offset; the stream is exhausted once `pos >= end`.
    end: usize,
    // The text being read. Offsets above are relative to its start.
    span: StrSpan<'input>,
}
765
766impl<'input> Stream<'input> {
767    #[inline]
768    pub fn new(text: &'input str) -> Self {
769        Stream {
770            pos: 0,
771            end: text.len(),
772            span: text.into(),
773        }
774    }
775
776    #[inline]
777    pub fn from_substr(text: &'input str, fragment: Range<usize>) -> Self {
778        Stream {
779            pos: fragment.start,
780            end: fragment.end,
781            span: text.into(),
782        }
783    }
784
    /// Returns the current byte offset of the stream.
    #[inline]
    pub fn pos(&self) -> usize {
        self.pos
    }
789
790    #[inline]
791    pub fn at_end(&self) -> bool {
792        self.pos >= self.end
793    }
794
795    #[inline]
796    pub fn curr_byte(&self) -> Result<u8> {
797        if self.at_end() {
798            return Err(Error::UnexpectedEndOfStream);
799        }
800
801        Ok(self.curr_byte_unchecked())
802    }
803
804    #[inline]
805    pub fn curr_byte_unchecked(&self) -> u8 {
806        self.span.text.as_bytes()[self.pos]
807    }
808
809    #[inline]
810    fn next_byte(&self) -> Result<u8> {
811        if self.pos + 1 >= self.end {
812            return Err(Error::UnexpectedEndOfStream);
813        }
814
815        Ok(self.span.as_str().as_bytes()[self.pos + 1])
816    }
817
818    #[inline]
819    fn as_bytes(&self) -> &[u8] {
820        &self.span.text.as_bytes()[self.pos..self.end]
821    }
822
823    #[inline]
824    pub fn advance(&mut self, n: usize) {
825        debug_assert!(self.pos + n <= self.end);
826        self.pos += n;
827    }
828
829    #[inline]
830    fn starts_with(&self, text: &[u8]) -> bool {
831        self.span.text.as_bytes()[self.pos..self.end].starts_with(text)
832    }
833
834    fn consume_byte(&mut self, c: u8) -> Result<()> {
835        let curr = self.curr_byte()?;
836        if curr != c {
837            return Err(Error::InvalidChar(c, curr, self.gen_text_pos()));
838        }
839
840        self.advance(1);
841        Ok(())
842    }
843
844    // Unlike `consume_byte()` will not return any errors.
845    fn try_consume_byte(&mut self, c: u8) -> bool {
846        match self.curr_byte() {
847            Ok(b) if b == c => {
848                self.advance(1);
849                true
850            }
851            _ => false,
852        }
853    }
854
855    fn skip_string(&mut self, text: &'static [u8]) -> Result<()> {
856        if !self.starts_with(text) {
857            let pos = self.gen_text_pos();
858
859            // Assume that all input `text` are valid UTF-8 strings, so unwrap is safe.
860            let expected = str::from_utf8(text).unwrap();
861
862            return Err(Error::InvalidString(expected, pos));
863        }
864
865        self.advance(text.len());
866        Ok(())
867    }
868
869    #[inline]
870    fn consume_bytes<F: Fn(u8) -> bool>(&mut self, f: F) -> &'input str {
871        let start = self.pos;
872        self.skip_bytes(f);
873        self.slice_back(start)
874    }
875
876    fn skip_bytes<F: Fn(u8) -> bool>(&mut self, f: F) {
877        while !self.at_end() && f(self.curr_byte_unchecked()) {
878            self.advance(1);
879        }
880    }
881
882    #[inline]
883    fn consume_chars<F>(&mut self, f: F) -> Result<&'input str>
884    where
885        F: Fn(&Stream, char) -> bool,
886    {
887        let start = self.pos;
888        self.skip_chars(f)?;
889        Ok(self.slice_back(start))
890    }
891
892    #[inline]
893    fn skip_chars<F>(&mut self, f: F) -> Result<()>
894    where
895        F: Fn(&Stream, char) -> bool,
896    {
897        for c in self.chars() {
898            if !c.is_xml_char() {
899                return Err(Error::NonXmlChar(c, self.gen_text_pos()));
900            } else if f(self, c) {
901                self.advance(c.len_utf8());
902            } else {
903                break;
904            }
905        }
906
907        Ok(())
908    }
909
910    #[inline]
911    fn advance_until2(&mut self, needle1: u8, needle2: u8) -> Result<()> {
912        match memchr2(needle1, needle2, self.as_bytes()) {
913            Some(pos) => {
914                self.advance(pos);
915                Ok(())
916            }
917            None => Err(Error::UnexpectedEndOfStream),
918        }
919    }
920
921    #[inline]
922    fn chars(&self) -> str::Chars<'input> {
923        self.span.as_str()[self.pos..self.end].chars()
924    }
925
926    #[inline]
927    fn slice_back(&self, pos: usize) -> &'input str {
928        self.span.slice_region(pos, self.pos)
929    }
930
931    #[inline]
932    fn slice_back_span(&self, pos: usize) -> StrSpan<'input> {
933        StrSpan::from_substr(self.span.text, pos, self.pos)
934    }
935
936    #[inline]
937    fn range_from(&self, start: usize) -> Range<usize> {
938        start..self.pos
939    }
940
941    #[inline]
942    fn skip_spaces(&mut self) {
943        while self.starts_with_space() {
944            self.advance(1);
945        }
946    }
947
948    #[inline]
949    fn starts_with_space(&self) -> bool {
950        !self.at_end() && self.curr_byte_unchecked().is_xml_space()
951    }
952
953    // Like `skip_spaces()`, but checks that first char is actually a space.
954    fn consume_spaces(&mut self) -> Result<()> {
955        if self.at_end() {
956            return Err(Error::UnexpectedEndOfStream);
957        }
958
959        if !self.starts_with_space() {
960            return Err(Error::InvalidChar2(
961                "a whitespace",
962                self.curr_byte_unchecked(),
963                self.gen_text_pos(),
964            ));
965        }
966
967        self.skip_spaces();
968        Ok(())
969    }
970
971    /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Reference>
972    #[inline(never)]
973    pub fn consume_reference(&mut self) -> Option<Reference<'input>> {
974        if !self.try_consume_byte(b'&') {
975            return None;
976        }
977
978        let reference = if self.try_consume_byte(b'#') {
979            let (value, radix) = if self.try_consume_byte(b'x') {
980                let value =
981                    self.consume_bytes(|c| c.is_ascii_hexdigit());
982                (value, 16)
983            } else {
984                let value = self.consume_bytes(|c| c.is_ascii_digit());
985                (value, 10)
986            };
987
988            let n = u32::from_str_radix(value, radix).ok()?;
989
990            let c = char::from_u32(n).unwrap_or('\u{FFFD}');
991            if !c.is_xml_char() {
992                return None;
993            }
994
995            Reference::Char(c)
996        } else {
997            let name = self.consume_name().ok()?;
998            match name {
999                "quot" => Reference::Char('"'),
1000                "amp" => Reference::Char('&'),
1001                "apos" => Reference::Char('\''),
1002                "lt" => Reference::Char('<'),
1003                "gt" => Reference::Char('>'),
1004                _ => Reference::Entity(name),
1005            }
1006        };
1007
1008        self.consume_byte(b';').ok()?;
1009
1010        Some(reference)
1011    }
1012
1013    /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Name>
1014    fn consume_name(&mut self) -> Result<&'input str> {
1015        let start = self.pos();
1016        self.skip_name()?;
1017
1018        let name = self.slice_back(start);
1019        if name.is_empty() {
1020            return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1021        }
1022
1023        Ok(name)
1024    }
1025
1026    /// The same as `consume_name()`, but does not return a consumed name.
1027    fn skip_name(&mut self) -> Result<()> {
1028        let start = self.pos();
1029        let mut iter = self.chars();
1030        if let Some(c) = iter.next() {
1031            if c.is_xml_name_start() {
1032                self.advance(c.len_utf8());
1033            } else {
1034                return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1035            }
1036        }
1037
1038        for c in iter {
1039            if c.is_xml_name() {
1040                self.advance(c.len_utf8());
1041            } else {
1042                break;
1043            }
1044        }
1045
1046        Ok(())
1047    }
1048
1049    /// Consumes a qualified XML name and returns it.
1050    ///
1051    /// Consumes according to: <https://www.w3.org/TR/xml-names/#ns-qualnames>
1052    #[inline(never)]
1053    fn consume_qname(&mut self) -> Result<(&'input str, &'input str)> {
1054        let start = self.pos();
1055
1056        let mut splitter = None;
1057
1058        while !self.at_end() {
1059            // Check for ASCII first for performance reasons.
1060            let b = self.curr_byte_unchecked();
1061            if b < 128 {
1062                if b == b':' {
1063                    if splitter.is_none() {
1064                        splitter = Some(self.pos());
1065                        self.advance(1);
1066                    } else {
1067                        // Multiple `:` is an error.
1068                        return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1069                    }
1070                } else if b.is_xml_name() {
1071                    self.advance(1);
1072                } else {
1073                    break;
1074                }
1075            } else {
1076                // Fallback to Unicode code point.
1077                match self.chars().next() {
1078                    Some(c) if c.is_xml_name() => {
1079                        self.advance(c.len_utf8());
1080                    }
1081                    _ => break,
1082                }
1083            }
1084        }
1085
1086        let (prefix, local) = if let Some(splitter) = splitter {
1087            let prefix = self.span.slice_region(start, splitter);
1088            let local = self.slice_back(splitter + 1);
1089            (prefix, local)
1090        } else {
1091            let local = self.slice_back(start);
1092            // Slice an empty prefix. This way we can preserve attribute start position.
1093            (self.span.slice_region(start, start), local)
1094        };
1095
1096        fn is_xml_name_start(name: &str) -> bool {
1097            if let Some(b) = name.as_bytes().first() {
1098                if *b < 128 {
1099                    return b.is_xml_name_start();
1100                } else if let Some(c) = name.chars().next() {
1101                    return c.is_xml_name_start();
1102                }
1103            }
1104
1105            false
1106        }
1107
1108        // Prefix must be empty or start with a `NameStartChar`.
1109        if !prefix.is_empty() && !is_xml_name_start(prefix) {
1110            return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1111        }
1112
1113        // Local name must start with a `NameStartChar`.
1114        if !is_xml_name_start(local) {
1115            return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1116        }
1117
1118        Ok((prefix, local))
1119    }
1120
1121    fn consume_eq(&mut self) -> Result<()> {
1122        self.skip_spaces();
1123        self.consume_byte(b'=')?;
1124        self.skip_spaces();
1125
1126        Ok(())
1127    }
1128
1129    fn consume_quote(&mut self) -> Result<u8> {
1130        let c = self.curr_byte()?;
1131        if c == b'\'' || c == b'"' {
1132            self.advance(1);
1133            Ok(c)
1134        } else {
1135            Err(Error::InvalidChar2("a quote", c, self.gen_text_pos()))
1136        }
1137    }
1138
1139    /// Calculates a current absolute position.
1140    ///
1141    /// This operation is very expensive. Use only for errors.
1142    #[inline(never)]
1143    pub fn gen_text_pos(&self) -> TextPos {
1144        let text = self.span.as_str();
1145        let end = self.pos;
1146
1147        let row = Self::calc_curr_row(text, end);
1148        let col = Self::calc_curr_col(text, end);
1149        TextPos::new(row, col)
1150    }
1151
1152    /// Calculates an absolute position at `pos`.
1153    ///
1154    /// This operation is very expensive. Use only for errors.
1155    #[inline(never)]
1156    pub fn gen_text_pos_from(&self, pos: usize) -> TextPos {
1157        let mut s = self.clone();
1158        s.pos = core::cmp::min(pos, s.span.as_str().len());
1159        s.gen_text_pos()
1160    }
1161
1162    fn calc_curr_row(text: &str, end: usize) -> u32 {
1163        let mut row = 1;
1164        for c in &text.as_bytes()[..end] {
1165            if *c == b'\n' {
1166                row += 1;
1167            }
1168        }
1169
1170        row
1171    }
1172
1173    fn calc_curr_col(text: &str, end: usize) -> u32 {
1174        let mut col = 1;
1175        for c in text[..end].chars().rev() {
1176            if c == '\n' {
1177                break;
1178            } else {
1179                col += 1;
1180            }
1181        }
1182
1183        col
1184    }
1185}
roxmltree/tokenizer.rs

roxmltree/
tokenizer.rs