1mod char_ref;
11mod interface;
12mod qname;
13pub mod states;
14
15pub use self::interface::{
16 Doctype, EmptyTag, EndTag, Pi, ShortTag, StartTag, Tag, TagKind, Token, TokenSink,
17};
18pub use crate::{LocalName, Namespace, Prefix};
19
20use crate::macros::time;
21use crate::tendril::StrTendril;
22use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
23use log::debug;
24use markup5ever::{local_name, namespace_prefix, ns, small_char_set, TokenizerResult};
25use std::borrow::Cow::{self, Borrowed};
26use std::cell::{Cell, RefCell, RefMut};
27use std::collections::BTreeMap;
28use std::mem::replace;
29
30use buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
31use char_ref::{CharRef, CharRefTokenizer};
32use qname::QualNameTokenizer;
33use states::{AttrValueKind::*, DoctypeKind, DoctypeKind::*, XmlState};
34
/// Configuration for an `XmlTokenizer`.
#[derive(Copy, Clone)]
pub struct XmlTokenizerOpts {
    /// When set, parse errors carry detailed messages (the offending character and the
    /// current state), forbidden code points are reported, and input is always read one
    /// character at a time. When unset, short generic messages are emitted instead.
    pub exact_errors: bool,

    /// When set, `feed` drops a U+FEFF byte order mark found at the front of the input.
    pub discard_bom: bool,

    /// When set, the tokenizer accumulates the time spent in each state and in the token
    /// sink, and logs a report from `end`.
    pub profile: bool,

    /// State the tokenizer starts in; `None` means `XmlState::Data`.
    pub initial_state: Option<XmlState>,
}
54
55fn process_qname(tag_name: StrTendril) -> QualName {
56 let split = if (*tag_name).len() < 3 {
62 None
63 } else {
64 QualNameTokenizer::new((*tag_name).as_bytes()).run()
65 };
66
67 match split {
68 None => QualName::new(None, ns!(), LocalName::from(&*tag_name)),
69 Some(col) => {
70 let len = (*tag_name).len() as u32;
71 let prefix = tag_name.subtendril(0, col);
72 let local = tag_name.subtendril(col + 1, len - col - 1);
73 let ns = ns!(); QualName::new(Some(Prefix::from(&*prefix)), ns, LocalName::from(&*local))
75 },
76 }
77}
78
79fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
80 match *opt_str {
81 Some(ref mut s) => s.push_char(c),
82 None => *opt_str = Some(StrTendril::from_char(c)),
83 }
84}
85
86impl Default for XmlTokenizerOpts {
87 fn default() -> XmlTokenizerOpts {
88 XmlTokenizerOpts {
89 exact_errors: false,
90 discard_bom: true,
91 profile: false,
92 initial_state: None,
93 }
94 }
95}
/// The XML tokenizer: a state machine that reads characters from a `BufferQueue` and
/// hands the resulting tokens to a sink.
pub struct XmlTokenizer<Sink> {
    /// Options supplied to `new`.
    opts: XmlTokenizerOpts,

    /// Destination for every token the tokenizer produces.
    pub sink: Sink,

    /// Current state of the state machine.
    state: Cell<XmlState>,

    /// Set by `end`; lets `eat` give a definite "no match" instead of waiting for input.
    at_eof: Cell<bool>,

    /// Character reference sub-tokenizer; while this is `Some`, `step` delegates to it.
    char_ref_tokenizer: RefCell<Option<Box<CharRefTokenizer>>>,

    /// Most recently preprocessed input character.
    current_char: Cell<char>,

    /// When set, the next `get_char` returns `current_char` again instead of reading input.
    reconsume: Cell<bool>,

    /// Set after a `'\r'` was read so that a directly following `'\n'` is skipped.
    ignore_lf: Cell<bool>,

    /// Whether `feed` should drop a leading U+FEFF byte order mark.
    discard_bom: Cell<bool>,

    /// Input held back by `eat` while a pattern match cannot be decided yet.
    temp_buf: RefCell<StrTendril>,

    /// Kind of the tag currently being built.
    current_tag_kind: Cell<TagKind>,

    /// Raw name of the tag currently being built.
    current_tag_name: RefCell<StrTendril>,

    /// Attributes collected so far for the tag currently being built.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Name of the attribute currently being built.
    current_attr_name: RefCell<StrTendril>,

    /// Value of the attribute currently being built.
    current_attr_value: RefCell<StrTendril>,

    /// Doctype currently being built.
    current_doctype: RefCell<Doctype>,

    /// Text of the comment currently being built.
    current_comment: RefCell<StrTendril>,

    /// Target of the processing instruction currently being built.
    current_pi_target: RefCell<StrTendril>,

    /// Data of the processing instruction currently being built.
    current_pi_data: RefCell<StrTendril>,

    /// Accumulated time per state, filled in by `run` when profiling is enabled
    /// (the profile report labels these values as nanoseconds).
    state_profile: RefCell<BTreeMap<XmlState, u64>>,

    /// Accumulated time spent inside the sink, recorded when profiling is enabled.
    time_in_sink: Cell<u64>,
}
164
165impl<Sink: TokenSink> XmlTokenizer<Sink> {
166 pub fn new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink> {
168 if opts.profile && cfg!(for_c) {
169 panic!("Can't profile tokenizer when built as a C library");
170 }
171
172 let state = *opts.initial_state.as_ref().unwrap_or(&XmlState::Data);
173 let discard_bom = opts.discard_bom;
174 XmlTokenizer {
175 opts,
176 sink,
177 state: Cell::new(state),
178 char_ref_tokenizer: RefCell::new(None),
179 at_eof: Cell::new(false),
180 current_char: Cell::new('\0'),
181 reconsume: Cell::new(false),
182 ignore_lf: Cell::new(false),
183 temp_buf: RefCell::new(StrTendril::new()),
184 discard_bom: Cell::new(discard_bom),
185 current_tag_kind: Cell::new(StartTag),
186 current_tag_name: RefCell::new(StrTendril::new()),
187 current_tag_attrs: RefCell::new(vec![]),
188 current_attr_name: RefCell::new(StrTendril::new()),
189 current_attr_value: RefCell::new(StrTendril::new()),
190 current_comment: RefCell::new(StrTendril::new()),
191 current_pi_data: RefCell::new(StrTendril::new()),
192 current_pi_target: RefCell::new(StrTendril::new()),
193 current_doctype: RefCell::new(Doctype::default()),
194 state_profile: RefCell::new(BTreeMap::new()),
195 time_in_sink: Cell::new(0),
196 }
197 }
198
199 pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
201 if input.is_empty() {
202 return TokenizerResult::Done;
203 }
204
205 if self.discard_bom.get() {
206 if let Some(c) = input.peek() {
207 if c == '\u{feff}' {
208 input.next();
209 }
210 } else {
211 return TokenizerResult::Done;
212 }
213 };
214
215 self.run(input)
216 }
217
218 fn process_token(&self, token: Token) -> ProcessResult<Sink::Handle> {
219 if self.opts.profile {
220 let (result, dt) = time!(self.sink.process_token(token));
221 self.time_in_sink.set(self.time_in_sink.get() + dt);
222 result
223 } else {
224 self.sink.process_token(token)
225 }
226 }
227
228 fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
231 if self.ignore_lf.get() {
232 self.ignore_lf.set(false);
233 if c == '\n' {
234 c = input.next()?;
235 }
236 }
237
238 if c == '\r' {
239 self.ignore_lf.set(true);
240 c = '\n';
241 }
242
243 if c == '\x00' {
245 c = '\u{FFFD}'
246 }
247
248 if self.opts.exact_errors
250 && match c as u32 {
251 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
252 n if (n & 0xFFFE) == 0xFFFE => true,
253 _ => false,
254 }
255 {
256 let msg = format!("Bad character {c}");
257 self.emit_error(Cow::Owned(msg));
258 }
259
260 debug!("got character {c}");
261 self.current_char.set(c);
262 Some(c)
263 }
264
265 fn bad_eof_error(&self) {
266 let msg = if self.opts.exact_errors {
267 Cow::from(format!("Saw EOF in state {:?}", self.state))
268 } else {
269 Cow::from("Unexpected EOF")
270 };
271 self.emit_error(msg);
272 }
273
274 fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
275 if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
280 return self.get_char(input).map(FromSet);
281 }
282
283 let d = input.pop_except_from(set);
284 debug!("got characters {d:?}");
285 match d {
286 Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
287
288 _ => d,
292 }
293 }
294
295 fn eat(&self, input: &BufferQueue, pat: &str) -> Option<bool> {
301 input.push_front(replace(&mut *self.temp_buf.borrow_mut(), StrTendril::new()));
302 match input.eat(pat, u8::eq_ignore_ascii_case) {
303 None if self.at_eof.get() => Some(false),
304 None => {
305 let mut temp_buf = self.temp_buf.borrow_mut();
306 while let Some(data) = input.next() {
307 temp_buf.push_char(data);
308 }
309 None
310 },
311 Some(matched) => Some(matched),
312 }
313 }
314
315 pub fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
317 if self.opts.profile {
318 loop {
319 let state = self.state.get();
320 let old_sink = self.time_in_sink.get();
321 let (run, mut dt) = time!(self.step(input));
322 dt -= self.time_in_sink.get() - old_sink;
323 let new = match self.state_profile.borrow_mut().get_mut(&state) {
324 Some(x) => {
325 *x += dt;
326 false
327 },
328 None => true,
329 };
330 if new {
331 self.state_profile.borrow_mut().insert(state, dt);
333 }
334 match run {
335 ProcessResult::Continue => continue,
336 ProcessResult::Done => return TokenizerResult::Done,
337 ProcessResult::Script(handle) => return TokenizerResult::Script(handle),
338 }
339 }
340 } else {
341 loop {
342 match self.step(input) {
343 ProcessResult::Continue => continue,
344 ProcessResult::Done => return TokenizerResult::Done,
345 ProcessResult::Script(handle) => return TokenizerResult::Script(handle),
346 }
347 }
348 }
349 }
350
351 fn get_char(&self, input: &BufferQueue) -> Option<char> {
354 if self.reconsume.get() {
355 self.reconsume.set(false);
356 Some(self.current_char.get())
357 } else {
358 input
359 .next()
360 .and_then(|c| self.get_preprocessed_char(c, input))
361 }
362 }
363
364 fn bad_char_error(&self) {
365 let msg = if self.opts.exact_errors {
366 let c = self.current_char.get();
367 let state = self.state.get();
368 Cow::from(format!("Saw {c} in state {state:?}"))
369 } else {
370 Cow::from("Bad character")
371 };
372 self.emit_error(msg);
373 }
374
375 fn discard_tag(&self) {
376 *self.current_tag_name.borrow_mut() = StrTendril::new();
377 *self.current_tag_attrs.borrow_mut() = Vec::new();
378 }
379
380 fn create_tag(&self, kind: TagKind, c: char) {
381 self.discard_tag();
382 self.current_tag_name.borrow_mut().push_char(c);
383 self.current_tag_kind.set(kind);
384 }
385
386 fn create_pi(&self, c: char) {
389 *self.current_pi_target.borrow_mut() = StrTendril::new();
390 *self.current_pi_data.borrow_mut() = StrTendril::new();
391 self.current_pi_target.borrow_mut().push_char(c);
392 }
393
394 fn emit_char(&self, c: char) {
395 self.process_token(Token::Characters(StrTendril::from_char(match c {
396 '\0' => '\u{FFFD}',
397 c => c,
398 })));
399 }
400
401 fn emit_short_tag(&self) -> ProcessResult<Sink::Handle> {
402 self.current_tag_kind.set(ShortTag);
403 *self.current_tag_name.borrow_mut() = StrTendril::new();
404 self.emit_current_tag()
405 }
406
407 fn emit_empty_tag(&self) -> ProcessResult<Sink::Handle> {
408 self.current_tag_kind.set(EmptyTag);
409 self.emit_current_tag()
410 }
411
412 fn set_empty_tag(&self) {
413 self.current_tag_kind.set(EmptyTag);
414 }
415
416 fn emit_start_tag(&self) -> ProcessResult<Sink::Handle> {
417 self.current_tag_kind.set(StartTag);
418 self.emit_current_tag()
419 }
420
421 fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
422 self.finish_attribute();
423
424 let qname = process_qname(replace(
425 &mut *self.current_tag_name.borrow_mut(),
426 StrTendril::new(),
427 ));
428
429 match self.current_tag_kind.get() {
430 StartTag | EmptyTag => {},
431 EndTag => {
432 if !self.current_tag_attrs.borrow().is_empty() {
433 self.emit_error(Borrowed("Attributes on an end tag"));
434 }
435 },
436 ShortTag => {
437 if !self.current_tag_attrs.borrow().is_empty() {
438 self.emit_error(Borrowed("Attributes on a short tag"));
439 }
440 },
441 }
442
443 let token = Token::Tag(Tag {
444 kind: self.current_tag_kind.get(),
445 name: qname,
446 attrs: self.current_tag_attrs.take(),
447 });
448
449 self.process_token(token)
450 }
451
452 fn emit_chars(&self, b: StrTendril) {
454 self.process_token(Token::Characters(b));
455 }
456
457 fn emit_pi(&self) -> ProcessResult<<Sink as TokenSink>::Handle> {
459 let token = Token::ProcessingInstruction(Pi {
460 target: replace(&mut *self.current_pi_target.borrow_mut(), StrTendril::new()),
461 data: replace(&mut *self.current_pi_data.borrow_mut(), StrTendril::new()),
462 });
463 self.process_token(token)
464 }
465
466 fn consume_char_ref(&self, addnl_allowed: Option<char>) {
467 *self.char_ref_tokenizer.borrow_mut() =
470 Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
471 }
472
473 fn emit_eof(&self) {
474 self.process_token(Token::EndOfFile);
475 }
476
477 fn emit_error(&self, error: Cow<'static, str>) {
478 self.process_token(Token::ParseError(error));
479 }
480
481 fn emit_current_comment(&self) {
482 let comment = self.current_comment.take();
483 self.process_token(Token::Comment(comment));
484 }
485
486 fn emit_current_doctype(&self) {
487 let doctype = self.current_doctype.take();
488 self.process_token(Token::Doctype(doctype));
489 }
490
491 fn doctype_id(&self, kind: DoctypeKind) -> RefMut<'_, Option<StrTendril>> {
492 let current_doctype = self.current_doctype.borrow_mut();
493 match kind {
494 DoctypeKind::Public => RefMut::map(current_doctype, |d| &mut d.public_id),
495 DoctypeKind::System => RefMut::map(current_doctype, |d| &mut d.system_id),
496 }
497 }
498
499 fn clear_doctype_id(&self, kind: DoctypeKind) {
500 let mut id = self.doctype_id(kind);
501 match *id {
502 Some(ref mut s) => s.clear(),
503 None => *id = Some(StrTendril::new()),
504 }
505 }
506
507 fn peek(&self, input: &BufferQueue) -> Option<char> {
508 if self.reconsume.get() {
509 Some(self.current_char.get())
510 } else {
511 input.peek()
512 }
513 }
514
    /// Consumes one character and throws it away.
    ///
    /// # Panics
    ///
    /// Panics when no character is available.
    fn discard_char(&self, input: &BufferQueue) {
        let c = self.get_char(input);
        assert!(c.is_some());
    }
519
    /// Puts `buf` back at the front of `input` so it is read again.
    fn unconsume(&self, input: &BufferQueue, buf: StrTendril) {
        input.push_front(buf);
    }
523}
524
// Maps each short command used in the state machine to the tokenizer call or buffer
// operation that carries it out. `$me` is the tokenizer; commands never change state.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr ) => ( $me.emit_char($c) );
    ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) );
    ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.borrow_mut().push_char($c) );
    // `discard_tag` takes no argument and `discard_char` needs the input queue; the arms
    // mirror those method signatures so that using either command compiles.
    ( $me:ident : discard_tag ) => ( $me.discard_tag() );
    ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) );
    ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.borrow_mut().push_char($c) );
    ( $me:ident : emit_temp ) => ( $me.emit_temp_buf() );
    ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() );
    ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) );
    ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.borrow_mut().push_char($c) );
    ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_char($c) );
    ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
    ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_char($c) );
    ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_slice($c) );
    ( $me:ident : emit_comment ) => ( $me.emit_current_comment() );
    ( $me:ident : clear_comment ) => ( $me.current_comment.borrow_mut().clear() );
    ( $me:ident : create_doctype ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
    ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c) );
    ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) );
    ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() );
    ( $me:ident : error ) => ( $me.bad_char_error() );
    ( $me:ident : error_eof ) => ( $me.bad_eof_error() );
    ( $me:ident : create_pi $c:expr ) => ( $me.create_pi($c) );
    ( $me:ident : push_pi_target $c:expr ) => ( $me.current_pi_target.borrow_mut().push_char($c) );
    ( $me:ident : push_pi_data $c:expr ) => ( $me.current_pi_data.borrow_mut().push_char($c) );
    ( $me:ident : set_empty_tag ) => ( $me.set_empty_tag() );
);
555
// Runs a `shorthand!` command. With the `trace_tokenizer` feature the command text is
// logged through `debug!` before it runs.
#[cfg(feature = "trace_tokenizer")]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    debug!(" {:?}", stringify!($($cmds)*));
    shorthand!($me : $($cmds)*);
}));

// Without the `trace_tokenizer` feature the command is forwarded to `shorthand!` unchanged.
#[cfg(not(feature = "trace_tokenizer"))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
566
// Runs a `;`-separated list of tokenizer commands. Plain commands are forwarded to
// `shorthand!` through `sh_trace!`; the commands handled here change state or leave the
// current step by returning a `ProcessResult`, so they only make sense as the last command.
macro_rules! go (
    // A command of one to four token trees followed by `;`: run it, then handle the rest.
    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // Switch to another state (optionally with arguments) and return `Continue`.
    ( $me:ident : to $s:ident ) => ({ $me.state.set(XmlState::$s); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(XmlState::$s($k1)); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(XmlState::$s($k1($k2))); return ProcessResult::Continue; });

    // Like `to`, but the current character is processed again in the new state.
    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume.set(true); go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });

    // Start the character reference sub-tokenizer and return `Continue`.
    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });

    // Switch state, emit a token, and return whatever the sink answered.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state.set(XmlState::$s);
        return $me.emit_current_tag();
    });

    ( $me:ident : emit_short_tag $s:ident ) => ({
        $me.state.set(XmlState::$s);
        return $me.emit_short_tag();
    });

    ( $me:ident : emit_empty_tag $s:ident ) => ({
        $me.state.set(XmlState::$s);
        return $me.emit_empty_tag();
    });

    ( $me:ident : emit_start_tag $s:ident ) => ({
        $me.state.set(XmlState::$s);
        return $me.emit_start_tag();
    });

    ( $me:ident : emit_pi $s:ident ) => ({
        $me.state.set(XmlState::$s);
        return $me.emit_pi();
    });

    // Emit the end-of-file token and return `Done`.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Done; });

    // Anything else is a single shorthand command.
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // Nothing left to run.
    ( $me:ident : ) => (());
);
625
// Reads the next character from the tokenizer, or returns `ProcessResult::Done` from
// the enclosing function when none is available.
macro_rules! get_char ( ($me:expr, $input:expr) => {{
    match $me.get_char($input) {
        Some(character) => character,
        None => return ProcessResult::Done,
    }
}});
634
// Pops a character or a run of characters via `pop_except_from`, or returns
// `ProcessResult::Done` from the enclosing function when no input is available.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => {{
    match $me.pop_except_from($input, $set) {
        Some(popped_element) => popped_element,
        None => return ProcessResult::Done,
    }
}});
641
// Evaluates to whether `$pat` was consumed from the input, or returns
// `ProcessResult::Done` from the enclosing function when that cannot be decided yet.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => {{
    match $me.eat($input, $pat) {
        Some(value) => value,
        None => return ProcessResult::Done,
    }
}});
648
/// Outcome of a single tokenizer step or of handing a token to the sink.
pub enum ProcessResult<Handle> {
    /// Stop running; `run` reports this as `TokenizerResult::Done`.
    Done,
    /// Keep going; `run` performs another step.
    Continue,
    /// Stop running and pass the handle on; `run` reports this as
    /// `TokenizerResult::Script`.
    Script(Handle),
}
659
660impl<Sink: TokenSink> XmlTokenizer<Sink> {
    /// Performs one step of the state machine on `input`.
    ///
    /// While a character reference sub-tokenizer is installed, the step is delegated to it.
    /// Otherwise the arm for the current state consumes input until a `go!` command
    /// returns: `Continue` after a state change or command that ends the step, the sink's
    /// answer after emitting a tag or processing instruction, or `Done` when the input
    /// macros find no more input.
    // Most arms leave their `loop` on the first iteration through a `go!` return.
    #[allow(clippy::never_loop)]
    fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
        if self.char_ref_tokenizer.borrow().is_some() {
            return self.step_char_ref_tokenizer(input);
        }

        debug!("processing in state {:?}", self.state);
        match self.state.get() {
            // Character data: '&' starts a character reference, '<' opens a tag, and
            // everything else is emitted as text.
            XmlState::Data => loop {
                match pop_except_from!(self, input, small_char_set!('\r' '&' '<')) {
                    FromSet('&') => go!(self: consume_char_ref),
                    FromSet('<') => go!(self: to TagState),
                    FromSet(c) => go!(self: emit c),
                    NotFromSet(b) => self.emit_chars(b),
                }
            },
            // Just after '<': decide between markup declaration, end tag, processing
            // instruction, and start tag.
            XmlState::TagState => loop {
                match get_char!(self, input) {
                    '!' => go!(self: to MarkupDecl),
                    '/' => go!(self: to EndTagState),
                    '?' => go!(self: to Pi),
                    '\t' | '\n' | ' ' | ':' | '<' | '>' => {
                        go!(self: error; emit '<'; reconsume Data)
                    },
                    cl => go!(self: create_tag StartTag cl; to TagName),
                }
            },
            // Just after "</": "</>" is a short tag, otherwise an end tag name begins.
            XmlState::EndTagState => loop {
                match get_char!(self, input) {
                    '>' => go!(self: emit_short_tag Data),
                    '\t' | '\n' | ' ' | '<' | ':' => {
                        go!(self: error; emit '<'; emit '/'; reconsume Data)
                    },
                    cl => go!(self: create_tag EndTag cl; to EndTagName),
                }
            },
            // Collecting the name of an end tag.
            XmlState::EndTagName => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | ' ' => go!(self: to EndTagNameAfter),
                    '/' => go!(self: error; to EndTagNameAfter),
                    '>' => go!(self: emit_tag Data),
                    cl => go!(self: push_tag cl),
                }
            },
            // After an end tag name: only whitespace is accepted before '>'.
            XmlState::EndTagNameAfter => loop {
                match get_char!(self, input) {
                    '>' => go!(self: emit_tag Data),
                    '\t' | '\n' | ' ' => (),
                    _ => self.emit_error(Borrowed("Unexpected element in tag name")),
                }
            },
            // Just after "<?": the processing instruction target must start right away.
            XmlState::Pi => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | ' ' => go!(self: error; reconsume BogusComment),
                    cl => go!(self: create_pi cl; to PiTarget),
                }
            },
            // Collecting the processing instruction target.
            XmlState::PiTarget => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | ' ' => go!(self: to PiTargetAfter),
                    '?' => go!(self: to PiAfter),
                    cl => go!(self: push_pi_target cl),
                }
            },
            // Skipping whitespace between the target and the data.
            XmlState::PiTargetAfter => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | ' ' => (),
                    _ => go!(self: reconsume PiData),
                }
            },
            // Collecting the processing instruction data.
            XmlState::PiData => loop {
                match get_char!(self, input) {
                    '?' => go!(self: to PiAfter),
                    cl => go!(self: push_pi_data cl),
                }
            },
            // After a '?' inside a processing instruction: '>' ends it.
            XmlState::PiAfter => loop {
                match get_char!(self, input) {
                    '>' => go!(self: emit_pi Data),
                    '?' => go!(self: to PiAfter),
                    cl => go!(self: push_pi_data cl),
                }
            },
            // Just after "<!": comment, CDATA section, or doctype; anything else is
            // treated as a bogus comment.
            XmlState::MarkupDecl => loop {
                if eat!(self, input, "--") {
                    go!(self: clear_comment; to CommentStart);
                } else if eat!(self, input, "[CDATA[") {
                    go!(self: to Cdata);
                } else if eat!(self, input, "DOCTYPE") {
                    go!(self: to Doctype);
                } else {
                    go!(self: error; to BogusComment);
                }
            },
            // Comment states: track dashes, '<' and '!' to find where the comment ends.
            XmlState::CommentStart => loop {
                match get_char!(self, input) {
                    '-' => go!(self: to CommentStartDash),
                    '>' => go!(self: error; emit_comment; to Data),
                    _ => go!(self: reconsume Comment),
                }
            },
            XmlState::CommentStartDash => loop {
                match get_char!(self, input) {
                    '-' => go!(self: to CommentEnd),
                    '>' => go!(self: error; emit_comment; to Data),
                    _ => go!(self: push_comment '-'; reconsume Comment),
                }
            },
            XmlState::Comment => loop {
                match get_char!(self, input) {
                    '<' => go!(self: push_comment '<'; to CommentLessThan),
                    '-' => go!(self: to CommentEndDash),
                    c => go!(self: push_comment c),
                }
            },
            XmlState::CommentLessThan => loop {
                match get_char!(self, input) {
                    '!' => go!(self: push_comment '!';to CommentLessThanBang),
                    '<' => go!(self: push_comment '<'),
                    _ => go!(self: reconsume Comment),
                }
            },
            XmlState::CommentLessThanBang => loop {
                match get_char!(self, input) {
                    '-' => go!(self: to CommentLessThanBangDash),
                    _ => go!(self: reconsume Comment),
                }
            },
            XmlState::CommentLessThanBangDash => loop {
                match get_char!(self, input) {
                    '-' => go!(self: to CommentLessThanBangDashDash),
                    _ => go!(self: reconsume CommentEndDash),
                }
            },
            XmlState::CommentLessThanBangDashDash => loop {
                match get_char!(self, input) {
                    '>' => go!(self: reconsume CommentEnd),
                    _ => go!(self: error; reconsume CommentEnd),
                }
            },
            XmlState::CommentEndDash => loop {
                match get_char!(self, input) {
                    '-' => go!(self: to CommentEnd),
                    _ => go!(self: push_comment '-'; reconsume Comment),
                }
            },
            XmlState::CommentEnd => loop {
                match get_char!(self, input) {
                    '>' => go!(self: emit_comment; to Data),
                    '!' => go!(self: to CommentEndBang),
                    '-' => go!(self: push_comment '-'),
                    _ => go!(self: append_comment "--"; reconsume Comment),
                }
            },
            XmlState::CommentEndBang => loop {
                match get_char!(self, input) {
                    '-' => go!(self: append_comment "--!"; to CommentEndDash),
                    '>' => go!(self: error; emit_comment; to Data),
                    _ => go!(self: append_comment "--!"; reconsume Comment),
                }
            },
            // Bogus comment: everything up to '>' becomes comment text.
            XmlState::BogusComment => loop {
                match get_char!(self, input) {
                    '>' => go!(self: emit_comment; to Data),
                    c => go!(self: push_comment c),
                }
            },
            // CDATA section: characters are emitted as text until "]]>" is seen.
            XmlState::Cdata => loop {
                match get_char!(self, input) {
                    ']' => go!(self: to CdataBracket),
                    cl => go!(self: emit cl),
                }
            },
            XmlState::CdataBracket => loop {
                match get_char!(self, input) {
                    ']' => go!(self: to CdataEnd),
                    cl => go!(self: emit ']'; emit cl; to Cdata),
                }
            },
            XmlState::CdataEnd => loop {
                match get_char!(self, input) {
                    '>' => go!(self: to Data),
                    ']' => go!(self: emit ']'),
                    cl => go!(self: emit ']'; emit ']'; emit cl; to Cdata),
                }
            },
            // Collecting the name of a start tag.
            XmlState::TagName => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | ' ' => go!(self: to TagAttrNameBefore),
                    '>' => go!(self: emit_tag Data),
                    '/' => go!(self: set_empty_tag; to TagEmpty),
                    cl => go!(self: push_tag cl),
                }
            },
            // After '/' inside a tag: '>' completes an empty tag.
            XmlState::TagEmpty => loop {
                match get_char!(self, input) {
                    '>' => go!(self: emit_empty_tag Data),
                    _ => go!(self: reconsume TagAttrValueBefore),
                }
            },
            // Attribute states: name, optional whitespace, '=', then the value.
            XmlState::TagAttrNameBefore => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | ' ' => (),
                    '>' => go!(self: emit_tag Data),
                    '/' => go!(self: set_empty_tag; to TagEmpty),
                    ':' => go!(self: error),
                    cl => go!(self: create_attr cl; to TagAttrName),
                }
            },
            XmlState::TagAttrName => loop {
                match get_char!(self, input) {
                    '=' => go!(self: to TagAttrValueBefore),
                    '>' => go!(self: emit_tag Data),
                    '\t' | '\n' | ' ' => go!(self: to TagAttrNameAfter),
                    '/' => go!(self: set_empty_tag; to TagEmpty),
                    cl => go!(self: push_name cl),
                }
            },
            XmlState::TagAttrNameAfter => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | ' ' => (),
                    '=' => go!(self: to TagAttrValueBefore),
                    '>' => go!(self: emit_tag Data),
                    '/' => go!(self: set_empty_tag; to TagEmpty),
                    cl => go!(self: create_attr cl; to TagAttrName),
                }
            },
            XmlState::TagAttrValueBefore => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | ' ' => (),
                    '"' => go!(self: to TagAttrValue DoubleQuoted),
                    '\'' => go!(self: to TagAttrValue SingleQuoted),
                    '&' => go!(self: reconsume TagAttrValue(Unquoted)),
                    '>' => go!(self: emit_tag Data),
                    cl => go!(self: push_value cl; to TagAttrValue(Unquoted)),
                }
            },
            // Attribute values: the closing delimiter depends on the quoting kind; '&'
            // starts a character reference.
            XmlState::TagAttrValue(DoubleQuoted) => loop {
                match pop_except_from!(self, input, small_char_set!('\n' '"' '&')) {
                    FromSet('"') => go!(self: to TagAttrNameBefore),
                    FromSet('&') => go!(self: consume_char_ref '"' ),
                    FromSet(c) => go!(self: push_value c),
                    NotFromSet(ref b) => go!(self: append_value b),
                }
            },
            XmlState::TagAttrValue(SingleQuoted) => loop {
                match pop_except_from!(self, input, small_char_set!('\n' '\'' '&')) {
                    FromSet('\'') => go!(self: to TagAttrNameBefore),
                    FromSet('&') => go!(self: consume_char_ref '\''),
                    FromSet(c) => go!(self: push_value c),
                    NotFromSet(ref b) => go!(self: append_value b),
                }
            },
            XmlState::TagAttrValue(Unquoted) => loop {
                match pop_except_from!(self, input, small_char_set!('\n' '\t' ' ' '&' '>')) {
                    FromSet('\t') | FromSet('\n') | FromSet(' ') => go!(self: to TagAttrNameBefore),
                    FromSet('&') => go!(self: consume_char_ref),
                    FromSet('>') => go!(self: emit_tag Data),
                    FromSet(c) => go!(self: push_value c),
                    NotFromSet(ref b) => go!(self: append_value b),
                }
            },

            // Doctype states: name, optional PUBLIC/SYSTEM keyword, then quoted identifiers.
            XmlState::Doctype => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
                    _ => go!(self: error; reconsume BeforeDoctypeName),
                }
            },
            XmlState::BeforeDoctypeName => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | '\x0C' | ' ' => (),
                    '>' => go!(self: error; emit_doctype; to Data),
                    c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
                        to DoctypeName),
                }
            },
            // The doctype name is stored in ASCII lowercase.
            XmlState::DoctypeName => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterDoctypeName),
                    '>' => go!(self: emit_doctype; to Data),
                    c => go!(self: push_doctype_name (c.to_ascii_lowercase());
                        to DoctypeName),
                }
            },
            XmlState::AfterDoctypeName => loop {
                if eat!(self, input, "public") {
                    go!(self: to AfterDoctypeKeyword Public);
                } else if eat!(self, input, "system") {
                    go!(self: to AfterDoctypeKeyword System);
                } else {
                    match get_char!(self, input) {
                        '\t' | '\n' | '\x0C' | ' ' => (),
                        '>' => go!(self: emit_doctype; to Data),
                        _ => go!(self: error; to BogusDoctype),
                    }
                }
            },
            // A quote directly after the keyword (no whitespace) is accepted with an error.
            XmlState::AfterDoctypeKeyword(Public) => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier Public),
                    '"' => {
                        go!(self: error; clear_doctype_id Public; to DoctypeIdentifierDoubleQuoted Public)
                    },
                    '\'' => {
                        go!(self: error; clear_doctype_id Public; to DoctypeIdentifierSingleQuoted Public)
                    },
                    '>' => go!(self: error; emit_doctype; to Data),
                    _ => go!(self: error; to BogusDoctype),
                }
            },
            XmlState::AfterDoctypeKeyword(System) => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier System),
                    '"' => {
                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
                    },
                    '\'' => {
                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
                    },
                    '>' => go!(self: error; emit_doctype; to Data),
                    _ => go!(self: error; to BogusDoctype),
                }
            },
            XmlState::BeforeDoctypeIdentifier(kind) => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | '\x0C' | ' ' => (),
                    '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
                    '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
                    '>' => go!(self: error; emit_doctype; to Data),
                    _ => go!(self: error; to BogusDoctype),
                }
            },
            // Collecting a quoted identifier until its closing quote.
            XmlState::DoctypeIdentifierDoubleQuoted(kind) => loop {
                match get_char!(self, input) {
                    '"' => go!(self: to AfterDoctypeIdentifier kind),
                    '>' => go!(self: error; emit_doctype; to Data),
                    c => go!(self: push_doctype_id kind c),
                }
            },
            XmlState::DoctypeIdentifierSingleQuoted(kind) => loop {
                match get_char!(self, input) {
                    '\'' => go!(self: to AfterDoctypeIdentifier kind),
                    '>' => go!(self: error; emit_doctype; to Data),
                    c => go!(self: push_doctype_id kind c),
                }
            },
            // After the public identifier a system identifier may follow.
            XmlState::AfterDoctypeIdentifier(Public) => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | '\x0C' | ' ' => {
                        go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
                    },
                    '\'' => {
                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted(System))
                    },
                    '"' => {
                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted(System))
                    },
                    '>' => go!(self: emit_doctype; to Data),
                    _ => go!(self: error; to BogusDoctype),
                }
            },
            XmlState::AfterDoctypeIdentifier(System) => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | '\x0C' | ' ' => (),
                    '>' => go!(self: emit_doctype; to Data),
                    _ => go!(self: error; to BogusDoctype),
                }
            },
            XmlState::BetweenDoctypePublicAndSystemIdentifiers => loop {
                match get_char!(self, input) {
                    '\t' | '\n' | '\x0C' | ' ' => (),
                    '>' => go!(self: emit_doctype; to Data),
                    '\'' => go!(self: to DoctypeIdentifierSingleQuoted System),
                    '"' => go!(self: to DoctypeIdentifierDoubleQuoted System),
                    _ => go!(self: error; to BogusDoctype),
                }
            },
            // Bogus doctype: skip everything up to '>' and emit what was collected.
            XmlState::BogusDoctype => loop {
                if get_char!(self, input) == '>' {
                    go!(self: emit_doctype; to Data);
                }
            },
        }
    }
1095
1096 pub fn end(&self) {
1098 let input = BufferQueue::default();
1101 match self.char_ref_tokenizer.take() {
1102 None => (),
1103 Some(mut tok) => {
1104 tok.end_of_file(self, &input);
1105 self.process_char_ref(tok.get_result());
1106 },
1107 }
1108
1109 self.at_eof.set(true);
1112 let _ = self.run(&input);
1113
1114 loop {
1115 if !matches!(self.eof_step(), ProcessResult::Continue) {
1116 break;
1117 }
1118 }
1119
1120 self.sink.end();
1121
1122 if self.opts.profile {
1123 self.dump_profile();
1124 }
1125 }
1126
    /// Profiling variant for builds configured with `for_c`; `new` panics when profiling
    /// is requested in such a build, so this is not expected to be called.
    #[cfg(for_c)]
    fn dump_profile(&self) {
        unreachable!();
    }
1131
1132 #[cfg(not(for_c))]
1133 fn dump_profile(&self) {
1134 let mut results: Vec<(XmlState, u64)> = self
1135 .state_profile
1136 .borrow()
1137 .iter()
1138 .map(|(s, t)| (*s, *t))
1139 .collect();
1140 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1141
1142 let total: u64 = results
1143 .iter()
1144 .map(|&(_, t)| t)
1145 .fold(0, ::std::ops::Add::add);
1146 debug!("\nTokenizer profile, in nanoseconds");
1147 debug!(
1148 "\n{:12} total in token sink",
1149 self.time_in_sink.get()
1150 );
1151 debug!("\n{total:12} total in tokenizer");
1152
1153 for (k, v) in results.into_iter() {
1154 let pct = 100.0 * (v as f64) / (total as f64);
1155 debug!("{v:12} {pct:4.1}% {k:?}");
1156 }
1157 }
1158
1159 fn eof_step(&self) -> ProcessResult<Sink::Handle> {
1160 debug!("processing EOF in state {:?}", self.state.get());
1161 match self.state.get() {
1162 XmlState::Data => go!(self: eof),
1163 XmlState::CommentStart | XmlState::CommentLessThan | XmlState::CommentLessThanBang => {
1164 go!(self: reconsume Comment)
1165 },
1166 XmlState::CommentLessThanBangDash => go!(self: reconsume CommentEndDash),
1167 XmlState::CommentLessThanBangDashDash => go!(self: reconsume CommentEnd),
1168 XmlState::CommentStartDash
1169 | XmlState::Comment
1170 | XmlState::CommentEndDash
1171 | XmlState::CommentEnd
1172 | XmlState::CommentEndBang => go!(self: error_eof; emit_comment; eof),
1173 XmlState::TagState => go!(self: error_eof; emit '<'; to Data),
1174 XmlState::EndTagState => go!(self: error_eof; emit '<'; emit '/'; to Data),
1175 XmlState::TagEmpty => go!(self: error_eof; to TagAttrNameBefore),
1176 XmlState::Cdata | XmlState::CdataBracket | XmlState::CdataEnd => {
1177 go!(self: error_eof; to Data)
1178 },
1179 XmlState::Pi => go!(self: error_eof; to BogusComment),
1180 XmlState::PiTargetAfter | XmlState::PiAfter => go!(self: reconsume PiData),
1181 XmlState::MarkupDecl => go!(self: error_eof; to BogusComment),
1182 XmlState::TagName
1183 | XmlState::TagAttrNameBefore
1184 | XmlState::EndTagName
1185 | XmlState::TagAttrNameAfter
1186 | XmlState::EndTagNameAfter
1187 | XmlState::TagAttrValueBefore
1188 | XmlState::TagAttrValue(_) => go!(self: error_eof; emit_tag Data),
1189 XmlState::PiData | XmlState::PiTarget => go!(self: error_eof; emit_pi Data),
1190 XmlState::TagAttrName => go!(self: error_eof; emit_start_tag Data),
1191 XmlState::BeforeDoctypeName
1192 | XmlState::Doctype
1193 | XmlState::DoctypeName
1194 | XmlState::AfterDoctypeName
1195 | XmlState::AfterDoctypeKeyword(_)
1196 | XmlState::BeforeDoctypeIdentifier(_)
1197 | XmlState::AfterDoctypeIdentifier(_)
1198 | XmlState::DoctypeIdentifierSingleQuoted(_)
1199 | XmlState::DoctypeIdentifierDoubleQuoted(_)
1200 | XmlState::BetweenDoctypePublicAndSystemIdentifiers => {
1201 go!(self: error_eof; emit_doctype; to Data)
1202 },
1203 XmlState::BogusDoctype => go!(self: emit_doctype; to Data),
1204 XmlState::BogusComment => go!(self: emit_comment; to Data),
1205 }
1206 }
1207
1208 fn process_char_ref(&self, char_ref: CharRef) {
1209 let CharRef {
1210 mut chars,
1211 mut num_chars,
1212 } = char_ref;
1213
1214 if num_chars == 0 {
1215 chars[0] = '&';
1216 num_chars = 1;
1217 }
1218
1219 for i in 0..num_chars {
1220 let c = chars[i as usize];
1221 match self.state.get() {
1222 XmlState::Data | XmlState::Cdata => go!(self: emit c),
1223
1224 XmlState::TagAttrValue(_) => go!(self: push_value c),
1225
1226 _ => panic!(
1227 "state {:?} should not be reachable in process_char_ref",
1228 self.state.get()
1229 ),
1230 }
1231 }
1232 }
1233
1234 fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1235 let mut tok = self.char_ref_tokenizer.take().unwrap();
1236 let outcome = tok.step(self, input);
1237
1238 let progress = match outcome {
1239 char_ref::Done => {
1240 self.process_char_ref(tok.get_result());
1241 return ProcessResult::Continue;
1242 },
1243
1244 char_ref::Stuck => ProcessResult::Done,
1245 char_ref::Progress => ProcessResult::Continue,
1246 };
1247
1248 *self.char_ref_tokenizer.borrow_mut() = Some(tok);
1249 progress
1250 }
1251
1252 fn finish_attribute(&self) {
1253 if self.current_attr_name.borrow().is_empty() {
1254 return;
1255 }
1256
1257 let dup = {
1261 let current_attr_name = self.current_attr_name.borrow();
1262 let name = ¤t_attr_name[..];
1263 self.current_tag_attrs
1264 .borrow()
1265 .iter()
1266 .any(|a| &*a.name.local == name)
1267 };
1268
1269 if dup {
1270 self.emit_error(Borrowed("Duplicate attribute"));
1271 self.current_attr_name.borrow_mut().clear();
1272 self.current_attr_value.borrow_mut().clear();
1273 } else {
1274 let qname = process_qname(replace(
1275 &mut self.current_attr_name.borrow_mut(),
1276 StrTendril::new(),
1277 ));
1278 let attr = Attribute {
1279 name: qname.clone(),
1280 value: replace(&mut self.current_attr_value.borrow_mut(), StrTendril::new()),
1281 };
1282
1283 if qname.local == local_name!("xmlns")
1284 || qname.prefix == Some(namespace_prefix!("xmlns"))
1285 {
1286 self.current_tag_attrs.borrow_mut().insert(0, attr);
1287 } else {
1288 self.current_tag_attrs.borrow_mut().push(attr);
1289 }
1290 }
1291 }
1292
1293 fn create_attribute(&self, c: char) {
1294 self.finish_attribute();
1295
1296 self.current_attr_name.borrow_mut().push_char(c);
1297 }
1298}
1299
#[cfg(test)]
mod test {

    use super::process_qname;
    use crate::tendril::SliceExt;
    use crate::{LocalName, Prefix};

    /// Asserts that `input` is split into the given prefix and local name.
    fn assert_split(input: &str, prefix: &str, local: &str) {
        let qname = process_qname(input.to_tendril());
        assert_eq!(qname.prefix, Some(Prefix::from(prefix)));
        assert_eq!(qname.local, LocalName::from(local));
    }

    /// Asserts that `input` is kept whole as the local name, with no prefix.
    fn assert_not_split(input: &str) {
        let qname = process_qname(input.to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from(input));
    }

    #[test]
    fn simple_namespace() {
        assert_split("prefix:local", "prefix", "local");
        assert_split("a:b", "a", "b");
    }

    #[test]
    fn wrong_namespaces() {
        // Names with a misplaced or repeated colon must not yield a prefix.
        let malformed = [":local", "::local", "a::local", "fake::", ":::", ":a:b:"];

        for input in malformed {
            assert_not_split(input);
        }
    }
}