1use core::ops::Range;
2use core::str;
3use alloc::string::String;
4
5use memchr::memchr2;
6
7use crate::{Error, TextPos};
8
9type Result<T> = core::result::Result<T, Error>;
10
11trait XmlCharExt {
13 fn is_xml_name_start(&self) -> bool;
16
17 fn is_xml_name(&self) -> bool;
20
21 fn is_xml_char(&self) -> bool;
24}
25
26impl XmlCharExt for char {
27 #[inline]
28 fn is_xml_name_start(&self) -> bool {
29 if *self as u32 <= 128 {
31 return (*self as u8).is_xml_name_start();
32 }
33
34 matches!(*self as u32,
35 0x0000C0..=0x0000D6
36 | 0x0000D8..=0x0000F6
37 | 0x0000F8..=0x0002FF
38 | 0x000370..=0x00037D
39 | 0x00037F..=0x001FFF
40 | 0x00200C..=0x00200D
41 | 0x002070..=0x00218F
42 | 0x002C00..=0x002FEF
43 | 0x003001..=0x00D7FF
44 | 0x00F900..=0x00FDCF
45 | 0x00FDF0..=0x00FFFD
46 | 0x010000..=0x0EFFFF)
47 }
48
49 #[inline]
50 fn is_xml_name(&self) -> bool {
51 if *self as u32 <= 128 {
53 return (*self as u8).is_xml_name();
54 }
55
56 matches!(*self as u32, 0x0000B7
57 | 0x0000C0..=0x0000D6
58 | 0x0000D8..=0x0000F6
59 | 0x0000F8..=0x0002FF
60 | 0x000300..=0x00036F
61 | 0x000370..=0x00037D
62 | 0x00037F..=0x001FFF
63 | 0x00200C..=0x00200D
64 | 0x00203F..=0x002040
65 | 0x002070..=0x00218F
66 | 0x002C00..=0x002FEF
67 | 0x003001..=0x00D7FF
68 | 0x00F900..=0x00FDCF
69 | 0x00FDF0..=0x00FFFD
70 | 0x010000..=0x0EFFFF)
71 }
72
73 #[inline]
74 fn is_xml_char(&self) -> bool {
75 if (*self as u32) < 0x20 {
78 return (*self as u8).is_xml_space();
79 }
80
81 !matches!(*self as u32, 0xFFFF | 0xFFFE)
82 }
83}
84
/// XML 1.0 character classification for single bytes (the ASCII subset).
trait XmlByteExt {
    /// Returns `true` for the XML whitespace bytes: space, tab, LF and CR.
    fn is_xml_space(&self) -> bool;

    /// Returns `true` if the byte is an ASCII character that may start an XML name.
    fn is_xml_name_start(&self) -> bool;

    /// Returns `true` if the byte is an ASCII character that may appear inside an XML name.
    fn is_xml_name(&self) -> bool;

    /// Returns `true` unless the byte is a control character other than XML whitespace.
    fn is_xml_char(&self) -> bool;
}

impl XmlByteExt for u8 {
    #[inline]
    fn is_xml_space(&self) -> bool {
        [b' ', b'\t', b'\n', b'\r'].contains(self)
    }

    #[inline]
    fn is_xml_name_start(&self) -> bool {
        self.is_ascii_alphabetic() || *self == b':' || *self == b'_'
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        self.is_ascii_alphanumeric() || matches!(*self, b':' | b'_' | b'-' | b'.')
    }

    #[inline]
    fn is_xml_char(&self) -> bool {
        match *self {
            // The only control bytes XML permits.
            b'\t' | b'\n' | b'\r' => true,
            other => other >= 0x20,
        }
    }
}
125
126#[inline]
127fn is_xml_str(s: &str, value_start: usize, stream: &mut Stream<'_>) -> Result<()> {
128 if s.as_bytes().is_ascii() {
129 for (i, b) in s.as_bytes().iter().enumerate() {
130 if !b.is_xml_char() {
131 return Err(Error::NonXmlChar(*b as char, stream.gen_text_pos_from(value_start + i)));
132 }
133 }
134
135 Ok(())
136 } else {
137 is_xml_str_unicode(s, value_start, stream)
138 }
139}
140
141
142#[cold]
143#[inline(never)]
144fn is_xml_str_unicode(s: &str, value_start: usize, stream: &mut Stream<'_>) -> Result<()> {
145 for (i, ch) in s.char_indices() {
146 if !ch.is_xml_char() {
147 return Err(Error::NonXmlChar(ch, stream.gen_text_pos_from(value_start + i)));
148 }
149 }
150
151 Ok(())
152}
153
/// A string slice paired with the byte offset at which it starts.
#[must_use]
#[derive(Clone, Copy)]
pub struct StrSpan<'input> {
    /// The slice itself.
    text: &'input str,
    /// Byte offset of `text` (0 when built from a whole string via `From`).
    start: usize,
}

impl<'input> From<&'input str> for StrSpan<'input> {
    /// Wraps a whole string; the span starts at offset 0.
    #[inline]
    fn from(text: &'input str) -> Self {
        Self { start: 0, text }
    }
}

impl<'input> StrSpan<'input> {
    /// Creates a span covering `text[start..end]` that remembers `start`.
    ///
    /// Panics if the range is out of bounds or not on character boundaries.
    #[inline]
    pub fn from_substr(text: &'input str, start: usize, end: usize) -> Self {
        debug_assert!(start <= end);
        let text = &text[start..end];
        Self { text, start }
    }

    /// Returns the byte range this span occupies, beginning at its start offset.
    #[inline]
    pub fn range(&self) -> Range<usize> {
        let end = self.start + self.text.len();
        self.start..end
    }

    /// Returns the underlying slice.
    #[inline]
    pub fn as_str(&self) -> &'input str {
        self.text
    }

    /// Returns `start..end` of the span's own text (offsets relative to the span).
    #[inline]
    fn slice_region(&self, start: usize, end: usize) -> &'input str {
        let text: &'input str = self.text;
        &text[start..end]
    }
}
197
198pub enum Token<'input> {
199 ProcessingInstruction(&'input str, Option<&'input str>, Range<usize>),
201
202 Comment(&'input str, Range<usize>),
204
205 EntityDeclaration(&'input str, StrSpan<'input>),
207
208 ElementStart(&'input str, &'input str, usize),
210
211 Attribute(Range<usize>, u16, u8, &'input str, &'input str, StrSpan<'input>),
213
214 ElementEnd(ElementEnd<'input>, Range<usize>),
215
216 Text(&'input str, Range<usize>),
220
221 Cdata(&'input str, Range<usize>),
223}
224
/// The way an element tag was terminated.
#[derive(Clone, Copy)]
pub enum ElementEnd<'input> {
    /// A start tag closed with `>`; element content follows.
    Open,
    /// A closing tag `</prefix:local>`, carrying the prefix and local name.
    Close(&'input str, &'input str),
    /// A start tag closed with `/>`; the element has no content.
    Empty,
}
235
236pub trait XmlEvents<'input> {
237 fn token(&mut self, token: Token<'input>) -> Result<()>;
238
239 fn resolve_entity(&mut self, _pub_id: Option<&str>, _uri: &str) -> core::result::Result<Option<&'input str>, String> { Ok(None) }
240}
241
242pub fn parse<'input>(
244 text: &'input str,
245 allow_dtd: bool,
246 events: &mut impl XmlEvents<'input>,
247) -> Result<()> {
248 let s = &mut Stream::new(text);
249
250 parse_declaration(s)?;
251
252 parse_misc(s, events)?;
253
254 s.skip_spaces();
255 if s.starts_with(b"<!DOCTYPE") {
256 if !allow_dtd {
257 return Err(Error::DtdDetected);
258 }
259
260 parse_doctype(s, events)?;
261 parse_misc(s, events)?;
262 }
263
264 s.skip_spaces();
265 if s.curr_byte().ok() == Some(b'<') {
266 parse_element(s, events)?;
267 }
268
269 parse_misc(s, events)?;
270
271 if !s.at_end() {
272 return Err(Error::UnknownToken(s.gen_text_pos()));
273 }
274
275 Ok(())
276}
277
278fn parse_misc<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
280 while !s.at_end() {
281 s.skip_spaces();
282 if s.starts_with(b"<!--") {
283 parse_comment(s, events)?;
284 } else if s.starts_with(b"<?") {
285 parse_pi(s, events)?;
286 } else {
287 break;
288 }
289 }
290
291 Ok(())
292}
293
294fn parse_declaration(s: &mut Stream) -> Result<()> {
298 if s.starts_with(&[0xEF, 0xBB, 0xBF]) {
300 s.advance(3);
301 }
302
303 if !s.starts_with(b"<?xml ") {
304 return Ok(())
305 }
306
307 fn consume_spaces(s: &mut Stream) -> Result<()> {
308 if s.starts_with_space() {
309 s.skip_spaces();
310 } else if !s.starts_with(b"?>") && !s.at_end() {
311 return Err(Error::InvalidChar2(
312 "a whitespace",
313 s.curr_byte_unchecked(),
314 s.gen_text_pos(),
315 ));
316 }
317
318 Ok(())
319 }
320
321 s.advance(5); consume_spaces(s)?;
323
324 if !s.starts_with(b"version") {
326 return s.skip_string(b"version");
328 }
329 let _ = parse_attribute(s)?;
330 consume_spaces(s)?;
331
332 if s.starts_with(b"encoding") {
333 let _ = parse_attribute(s)?;
334 consume_spaces(s)?;
335 }
336
337 if s.starts_with(b"standalone") {
338 let _ = parse_attribute(s)?;
339 }
340
341 s.skip_spaces();
342 s.skip_string(b"?>")?;
343
344 Ok(())
345}
346
347fn parse_comment<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
349 let start = s.pos();
350 s.advance(4);
351 let text = s.consume_chars(|s, c| !(c == '-' && s.starts_with(b"-->")))?;
352 s.skip_string(b"-->")?;
353
354 if text.contains("--") {
355 return Err(Error::InvalidComment(s.gen_text_pos_from(start)));
356 }
357
358 if text.ends_with('-') {
359 return Err(Error::InvalidComment(s.gen_text_pos_from(start)));
360 }
361
362 let range = s.range_from(start);
363 events.token(Token::Comment(text, range))?;
364
365 Ok(())
366}
367
368fn parse_pi<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
371 if s.starts_with(b"<?xml ") {
372 return Err(Error::UnexpectedDeclaration(s.gen_text_pos()));
373 }
374
375 let start = s.pos();
376 s.advance(2);
377 let target = s.consume_name()?;
378 s.skip_spaces();
379 let content = s.consume_chars(|s, c| !(c == '?' && s.starts_with(b"?>")))?;
380 let content = if !content.is_empty() {
381 Some(content)
382 } else {
383 None
384 };
385
386 s.skip_string(b"?>")?;
387
388 let range = s.range_from(start);
389 events.token(Token::ProcessingInstruction(target, content, range))?;
390 Ok(())
391}
392
393fn parse_doctype<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
394 let start = s.pos();
395 parse_doctype_start(s)?;
396 s.skip_spaces();
397
398 if s.curr_byte() == Ok(b'>') {
399 s.advance(1);
400 return Ok(());
401 }
402
403 s.advance(1); while !s.at_end() {
405 s.skip_spaces();
406 if s.starts_with(b"<!ENTITY") {
407 parse_entity_decl(s, events)?;
408 } else if s.starts_with(b"<!--") {
409 parse_comment(s, events)?;
410 } else if s.starts_with(b"<?") {
411 parse_pi(s, events)?;
412 } else if s.starts_with(b"]") {
413 s.advance(1);
415 s.skip_spaces();
416 match s.curr_byte() {
417 Ok(b'>') => {
418 s.advance(1);
419 break;
420 }
421 Ok(c) => {
422 return Err(Error::InvalidChar2("'>'", c, s.gen_text_pos()));
423 }
424 Err(_) => {
425 return Err(Error::UnexpectedEndOfStream);
426 }
427 }
428 } else if s.starts_with(b"<!ELEMENT")
429 || s.starts_with(b"<!ATTLIST")
430 || s.starts_with(b"<!NOTATION")
431 {
432 if consume_decl(s).is_err() {
433 let pos = s.gen_text_pos_from(start);
434 return Err(Error::UnknownToken(pos));
435 }
436 } else {
437 return Err(Error::UnknownToken(s.gen_text_pos()));
438 }
439 }
440
441 Ok(())
442}
443
444fn parse_doctype_start(s: &mut Stream) -> Result<()> {
446 s.advance(9);
447
448 s.consume_spaces()?;
449 s.skip_name()?;
450 s.skip_spaces();
451
452 let _ = parse_external_id(s)?;
453 s.skip_spaces();
454
455 let c = s.curr_byte()?;
456 if c != b'[' && c != b'>' {
457 return Err(Error::InvalidChar2("'[' or '>'", c, s.gen_text_pos()));
458 }
459
460 Ok(())
461}
462
463fn parse_external_id<'input>(s: &mut Stream<'input>) -> Result<Option<(Option<&'input str>, &'input str)>> {
465 let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") {
466 let start = s.pos();
467 s.advance(6);
468 let id = s.slice_back(start);
469
470 s.consume_spaces()?;
471 let quote = s.consume_quote()?;
472 let first = s.consume_bytes(|c| c != quote);
473 s.consume_byte(quote)?;
474
475 if id == "SYSTEM" {
476 Some((None, first))
477 } else {
478 s.consume_spaces()?;
479 let quote = s.consume_quote()?;
480 let second = s.consume_bytes(|c| c != quote);
481 s.consume_byte(quote)?;
482
483 Some((Some(first), second))
484 }
485 } else {
486 None
487 };
488
489 Ok(v)
490}
491
492fn parse_entity_decl<'input>(
496 s: &mut Stream<'input>,
497 events: &mut impl XmlEvents<'input>,
498) -> Result<()> {
499 s.advance(8);
500 s.consume_spaces()?;
501
502 let is_ge = if s.try_consume_byte(b'%') {
503 s.consume_spaces()?;
504 false
505 } else {
506 true
507 };
508
509 let name = s.consume_name()?;
510 s.consume_spaces()?;
511 if let Some(definition) = parse_entity_def(s, events, is_ge)? {
512 events.token(Token::EntityDeclaration(name, definition))?;
513 }
514 s.skip_spaces();
515 s.consume_byte(b'>')?;
516
517 Ok(())
518}
519
520fn parse_entity_def<'input>(
527 s: &mut Stream<'input>,
528 events: &mut impl XmlEvents<'input>,
529 is_ge: bool,
530) -> Result<Option<StrSpan<'input>>> {
531 let c = s.curr_byte()?;
532 match c {
533 b'"' | b'\'' => {
534 let quote = s.consume_quote()?;
535 let start = s.pos();
536 s.skip_bytes(|c| c != quote);
537 let value = s.slice_back_span(start);
538 s.consume_byte(quote)?;
539 Ok(Some(value))
540 }
541 b'S' | b'P' => {
542 if let Some((pub_id, uri)) = parse_external_id(s)? {
543 if is_ge {
544 s.skip_spaces();
545 if s.starts_with(b"NDATA") {
546 s.advance(5);
547 s.consume_spaces()?;
548 s.skip_name()?;
549 }
551 }
552
553 let value = events.resolve_entity(pub_id, uri).map_err(|msg| Error::EntityResolver(s.gen_text_pos(), msg))?;
554
555 match value {
556 Some(value) => {
557 let mut stream = Stream::new(value);
558 parse_declaration(&mut stream)?;
559 let value = StrSpan::from(&value[stream.pos..]);
560
561 Ok(Some(value))
562 }
563 None => Ok(None),
564 }
565 } else {
566 Err(Error::InvalidExternalID(s.gen_text_pos()))
567 }
568 }
569 _ => {
570 let pos = s.gen_text_pos();
571 Err(Error::InvalidChar2("a quote, SYSTEM or PUBLIC", c, pos))
572 }
573 }
574}
575
576fn consume_decl(s: &mut Stream) -> Result<()> {
577 s.skip_bytes(|c| c != b'>');
578 s.consume_byte(b'>')?;
579 Ok(())
580}
581
582fn parse_element<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
585 let start = s.pos();
586 s.advance(1); let (prefix, local) = s.consume_qname()?;
588 events.token(Token::ElementStart(prefix, local, start))?;
589
590 let mut open = false;
591 while !s.at_end() {
592 let has_space = s.starts_with_space();
593 s.skip_spaces();
594 let start = s.pos();
595 match s.curr_byte()? {
596 b'/' => {
597 s.advance(1);
598 s.consume_byte(b'>')?;
599 let range = s.range_from(start);
600 events.token(Token::ElementEnd(ElementEnd::Empty, range))?;
601 break;
602 }
603 b'>' => {
604 s.advance(1);
605 let range = s.range_from(start);
606 events.token(Token::ElementEnd(ElementEnd::Open, range))?;
607 open = true;
608 break;
609 }
610 _ => {
611 if !has_space {
613 s.consume_spaces()?;
615 }
616
617 let (prefix, local) = s.consume_qname()?;
621 let qname_end = s.pos();
622 let qname_len = u16::try_from(qname_end - start).unwrap_or(u16::MAX);
623 s.consume_eq()?;
624 let eq_len = u8::try_from(s.pos() - qname_end).unwrap_or(u8::MAX);
625 let quote = s.consume_quote()?;
626 let value_start = s.pos();
628 s.advance_until2(quote, b'<')?;
629 let value = s.slice_back_span(value_start);
630 is_xml_str(value.as_str(), value_start, s)?;
631 s.consume_byte(quote)?;
632 let end = s.pos();
633 events.token(Token::Attribute(start..end, qname_len, eq_len, prefix, local, value))?;
634 }
635 }
636 }
637
638 if open {
639 parse_content(s, events)?;
640 }
641
642 Ok(())
643}
644
645fn parse_attribute<'input>(
647 s: &mut Stream<'input>,
648) -> Result<(&'input str, &'input str, StrSpan<'input>)> {
649 let (prefix, local) = s.consume_qname()?;
650 s.consume_eq()?;
651 let quote = s.consume_quote()?;
652 let quote_c = quote as char;
653 let value_start = s.pos();
655 s.skip_chars(|_, c| c != quote_c && c != '<')?;
656 let value = s.slice_back_span(value_start);
657 s.consume_byte(quote)?;
658 Ok((prefix, local, value))
659}
660
661pub fn parse_content<'input>(
663 s: &mut Stream<'input>,
664 events: &mut impl XmlEvents<'input>,
665) -> Result<()> {
666 while !s.at_end() {
667 match s.curr_byte() {
668 Ok(b'<') => match s.next_byte() {
669 Ok(b'!') => {
670 if s.starts_with(b"<!--") {
671 parse_comment(s, events)?;
672 } else if s.starts_with(b"<![CDATA[") {
673 parse_cdata(s, events)?;
674 } else {
675 return Err(Error::UnknownToken(s.gen_text_pos()));
676 }
677 }
678 Ok(b'?') => parse_pi(s, events)?,
679 Ok(b'/') => {
680 parse_close_element(s, events)?;
681 break;
682 }
683 Ok(_) => parse_element(s, events)?,
684 Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
685 },
686 Ok(_) => parse_text(s, events)?,
687 Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
688 }
689 }
690
691 Ok(())
692}
693
694fn parse_cdata<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
699 let start = s.pos();
700 s.advance(9); let text = s.consume_chars(|s, c| !(c == ']' && s.starts_with(b"]]>")))?;
702 s.skip_string(b"]]>")?;
703 let range = s.range_from(start);
704 events.token(Token::Cdata(text, range))?;
705 Ok(())
706}
707
708fn parse_close_element<'input>(
710 s: &mut Stream<'input>,
711 events: &mut impl XmlEvents<'input>,
712) -> Result<()> {
713 let start = s.pos();
714 s.advance(2); let (prefix, tag_name) = s.consume_qname()?;
717 s.skip_spaces();
718 s.consume_byte(b'>')?;
719
720 let range = s.range_from(start);
721 events.token(Token::ElementEnd(
722 ElementEnd::Close(prefix, tag_name),
723 range,
724 ))?;
725 Ok(())
726}
727
728fn parse_text<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'input>) -> Result<()> {
729 let start = s.pos();
730 let text = s.consume_chars(|_, c| c != '<')?;
731
732 if text.contains('>') && text.contains("]]>") {
737 return Err(Error::InvalidCharacterData(s.gen_text_pos()));
738 }
739
740 let range = s.range_from(start);
741 events.token(Token::Text(text, range))?;
742 Ok(())
743}
744
/// A reference parsed by [`Stream::consume_reference`].
#[derive(Clone, Copy)]
pub enum Reference<'input> {
    /// A named entity reference (`&name;`) that is not one of the five
    /// predefined entities; carries the entity name.
    Entity(&'input str),

    /// A character: either a numeric reference (`&#N;` / `&#xN;`) or one of
    /// the predefined entities `quot`, `amp`, `apos`, `lt`, `gt`.
    Char(char),
}
758
759#[derive(Clone)]
760pub struct Stream<'input> {
761 pos: usize,
762 end: usize,
763 span: StrSpan<'input>,
764}
765
766impl<'input> Stream<'input> {
767 #[inline]
768 pub fn new(text: &'input str) -> Self {
769 Stream {
770 pos: 0,
771 end: text.len(),
772 span: text.into(),
773 }
774 }
775
776 #[inline]
777 pub fn from_substr(text: &'input str, fragment: Range<usize>) -> Self {
778 Stream {
779 pos: fragment.start,
780 end: fragment.end,
781 span: text.into(),
782 }
783 }
784
785 #[inline]
786 pub fn pos(&self) -> usize {
787 self.pos
788 }
789
790 #[inline]
791 pub fn at_end(&self) -> bool {
792 self.pos >= self.end
793 }
794
795 #[inline]
796 pub fn curr_byte(&self) -> Result<u8> {
797 if self.at_end() {
798 return Err(Error::UnexpectedEndOfStream);
799 }
800
801 Ok(self.curr_byte_unchecked())
802 }
803
804 #[inline]
805 pub fn curr_byte_unchecked(&self) -> u8 {
806 self.span.text.as_bytes()[self.pos]
807 }
808
809 #[inline]
810 fn next_byte(&self) -> Result<u8> {
811 if self.pos + 1 >= self.end {
812 return Err(Error::UnexpectedEndOfStream);
813 }
814
815 Ok(self.span.as_str().as_bytes()[self.pos + 1])
816 }
817
818 #[inline]
819 fn as_bytes(&self) -> &[u8] {
820 &self.span.text.as_bytes()[self.pos..self.end]
821 }
822
823 #[inline]
824 pub fn advance(&mut self, n: usize) {
825 debug_assert!(self.pos + n <= self.end);
826 self.pos += n;
827 }
828
829 #[inline]
830 fn starts_with(&self, text: &[u8]) -> bool {
831 self.span.text.as_bytes()[self.pos..self.end].starts_with(text)
832 }
833
834 fn consume_byte(&mut self, c: u8) -> Result<()> {
835 let curr = self.curr_byte()?;
836 if curr != c {
837 return Err(Error::InvalidChar(c, curr, self.gen_text_pos()));
838 }
839
840 self.advance(1);
841 Ok(())
842 }
843
844 fn try_consume_byte(&mut self, c: u8) -> bool {
846 match self.curr_byte() {
847 Ok(b) if b == c => {
848 self.advance(1);
849 true
850 }
851 _ => false,
852 }
853 }
854
855 fn skip_string(&mut self, text: &'static [u8]) -> Result<()> {
856 if !self.starts_with(text) {
857 let pos = self.gen_text_pos();
858
859 let expected = str::from_utf8(text).unwrap();
861
862 return Err(Error::InvalidString(expected, pos));
863 }
864
865 self.advance(text.len());
866 Ok(())
867 }
868
869 #[inline]
870 fn consume_bytes<F: Fn(u8) -> bool>(&mut self, f: F) -> &'input str {
871 let start = self.pos;
872 self.skip_bytes(f);
873 self.slice_back(start)
874 }
875
876 fn skip_bytes<F: Fn(u8) -> bool>(&mut self, f: F) {
877 while !self.at_end() && f(self.curr_byte_unchecked()) {
878 self.advance(1);
879 }
880 }
881
882 #[inline]
883 fn consume_chars<F>(&mut self, f: F) -> Result<&'input str>
884 where
885 F: Fn(&Stream, char) -> bool,
886 {
887 let start = self.pos;
888 self.skip_chars(f)?;
889 Ok(self.slice_back(start))
890 }
891
892 #[inline]
893 fn skip_chars<F>(&mut self, f: F) -> Result<()>
894 where
895 F: Fn(&Stream, char) -> bool,
896 {
897 for c in self.chars() {
898 if !c.is_xml_char() {
899 return Err(Error::NonXmlChar(c, self.gen_text_pos()));
900 } else if f(self, c) {
901 self.advance(c.len_utf8());
902 } else {
903 break;
904 }
905 }
906
907 Ok(())
908 }
909
910 #[inline]
911 fn advance_until2(&mut self, needle1: u8, needle2: u8) -> Result<()> {
912 match memchr2(needle1, needle2, self.as_bytes()) {
913 Some(pos) => {
914 self.advance(pos);
915 Ok(())
916 }
917 None => Err(Error::UnexpectedEndOfStream),
918 }
919 }
920
921 #[inline]
922 fn chars(&self) -> str::Chars<'input> {
923 self.span.as_str()[self.pos..self.end].chars()
924 }
925
926 #[inline]
927 fn slice_back(&self, pos: usize) -> &'input str {
928 self.span.slice_region(pos, self.pos)
929 }
930
931 #[inline]
932 fn slice_back_span(&self, pos: usize) -> StrSpan<'input> {
933 StrSpan::from_substr(self.span.text, pos, self.pos)
934 }
935
936 #[inline]
937 fn range_from(&self, start: usize) -> Range<usize> {
938 start..self.pos
939 }
940
941 #[inline]
942 fn skip_spaces(&mut self) {
943 while self.starts_with_space() {
944 self.advance(1);
945 }
946 }
947
948 #[inline]
949 fn starts_with_space(&self) -> bool {
950 !self.at_end() && self.curr_byte_unchecked().is_xml_space()
951 }
952
953 fn consume_spaces(&mut self) -> Result<()> {
955 if self.at_end() {
956 return Err(Error::UnexpectedEndOfStream);
957 }
958
959 if !self.starts_with_space() {
960 return Err(Error::InvalidChar2(
961 "a whitespace",
962 self.curr_byte_unchecked(),
963 self.gen_text_pos(),
964 ));
965 }
966
967 self.skip_spaces();
968 Ok(())
969 }
970
971 #[inline(never)]
973 pub fn consume_reference(&mut self) -> Option<Reference<'input>> {
974 if !self.try_consume_byte(b'&') {
975 return None;
976 }
977
978 let reference = if self.try_consume_byte(b'#') {
979 let (value, radix) = if self.try_consume_byte(b'x') {
980 let value =
981 self.consume_bytes(|c| c.is_ascii_hexdigit());
982 (value, 16)
983 } else {
984 let value = self.consume_bytes(|c| c.is_ascii_digit());
985 (value, 10)
986 };
987
988 let n = u32::from_str_radix(value, radix).ok()?;
989
990 let c = char::from_u32(n).unwrap_or('\u{FFFD}');
991 if !c.is_xml_char() {
992 return None;
993 }
994
995 Reference::Char(c)
996 } else {
997 let name = self.consume_name().ok()?;
998 match name {
999 "quot" => Reference::Char('"'),
1000 "amp" => Reference::Char('&'),
1001 "apos" => Reference::Char('\''),
1002 "lt" => Reference::Char('<'),
1003 "gt" => Reference::Char('>'),
1004 _ => Reference::Entity(name),
1005 }
1006 };
1007
1008 self.consume_byte(b';').ok()?;
1009
1010 Some(reference)
1011 }
1012
1013 fn consume_name(&mut self) -> Result<&'input str> {
1015 let start = self.pos();
1016 self.skip_name()?;
1017
1018 let name = self.slice_back(start);
1019 if name.is_empty() {
1020 return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1021 }
1022
1023 Ok(name)
1024 }
1025
1026 fn skip_name(&mut self) -> Result<()> {
1028 let start = self.pos();
1029 let mut iter = self.chars();
1030 if let Some(c) = iter.next() {
1031 if c.is_xml_name_start() {
1032 self.advance(c.len_utf8());
1033 } else {
1034 return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1035 }
1036 }
1037
1038 for c in iter {
1039 if c.is_xml_name() {
1040 self.advance(c.len_utf8());
1041 } else {
1042 break;
1043 }
1044 }
1045
1046 Ok(())
1047 }
1048
1049 #[inline(never)]
1053 fn consume_qname(&mut self) -> Result<(&'input str, &'input str)> {
1054 let start = self.pos();
1055
1056 let mut splitter = None;
1057
1058 while !self.at_end() {
1059 let b = self.curr_byte_unchecked();
1061 if b < 128 {
1062 if b == b':' {
1063 if splitter.is_none() {
1064 splitter = Some(self.pos());
1065 self.advance(1);
1066 } else {
1067 return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1069 }
1070 } else if b.is_xml_name() {
1071 self.advance(1);
1072 } else {
1073 break;
1074 }
1075 } else {
1076 match self.chars().next() {
1078 Some(c) if c.is_xml_name() => {
1079 self.advance(c.len_utf8());
1080 }
1081 _ => break,
1082 }
1083 }
1084 }
1085
1086 let (prefix, local) = if let Some(splitter) = splitter {
1087 let prefix = self.span.slice_region(start, splitter);
1088 let local = self.slice_back(splitter + 1);
1089 (prefix, local)
1090 } else {
1091 let local = self.slice_back(start);
1092 (self.span.slice_region(start, start), local)
1094 };
1095
1096 fn is_xml_name_start(name: &str) -> bool {
1097 if let Some(b) = name.as_bytes().first() {
1098 if *b < 128 {
1099 return b.is_xml_name_start();
1100 } else if let Some(c) = name.chars().next() {
1101 return c.is_xml_name_start();
1102 }
1103 }
1104
1105 false
1106 }
1107
1108 if !prefix.is_empty() && !is_xml_name_start(prefix) {
1110 return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1111 }
1112
1113 if !is_xml_name_start(local) {
1115 return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1116 }
1117
1118 Ok((prefix, local))
1119 }
1120
1121 fn consume_eq(&mut self) -> Result<()> {
1122 self.skip_spaces();
1123 self.consume_byte(b'=')?;
1124 self.skip_spaces();
1125
1126 Ok(())
1127 }
1128
1129 fn consume_quote(&mut self) -> Result<u8> {
1130 let c = self.curr_byte()?;
1131 if c == b'\'' || c == b'"' {
1132 self.advance(1);
1133 Ok(c)
1134 } else {
1135 Err(Error::InvalidChar2("a quote", c, self.gen_text_pos()))
1136 }
1137 }
1138
1139 #[inline(never)]
1143 pub fn gen_text_pos(&self) -> TextPos {
1144 let text = self.span.as_str();
1145 let end = self.pos;
1146
1147 let row = Self::calc_curr_row(text, end);
1148 let col = Self::calc_curr_col(text, end);
1149 TextPos::new(row, col)
1150 }
1151
1152 #[inline(never)]
1156 pub fn gen_text_pos_from(&self, pos: usize) -> TextPos {
1157 let mut s = self.clone();
1158 s.pos = core::cmp::min(pos, s.span.as_str().len());
1159 s.gen_text_pos()
1160 }
1161
1162 fn calc_curr_row(text: &str, end: usize) -> u32 {
1163 let mut row = 1;
1164 for c in &text.as_bytes()[..end] {
1165 if *c == b'\n' {
1166 row += 1;
1167 }
1168 }
1169
1170 row
1171 }
1172
1173 fn calc_curr_col(text: &str, end: usize) -> u32 {
1174 let mut col = 1;
1175 for c in text[..end].chars().rev() {
1176 if c == '\n' {
1177 break;
1178 } else {
1179 col += 1;
1180 }
1181 }
1182
1183 col
1184 }
1185}