1mod char_ref;
11mod interface;
12mod qname;
13pub mod states;
14
15pub use self::interface::{
16 Doctype, EmptyTag, EndTag, Pi, ShortTag, StartTag, Tag, TagKind, Token, TokenSink,
17};
18pub use crate::{LocalName, Namespace, Prefix};
19
20use crate::macros::{time, unwrap_or_return};
21use crate::tendril::StrTendril;
22use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
23use log::debug;
24use markup5ever::{local_name, namespace_prefix, ns, small_char_set, TokenizerResult};
25use std::borrow::Cow::{self, Borrowed};
26use std::cell::{Cell, RefCell, RefMut};
27use std::collections::BTreeMap;
28use std::mem::replace;
29
30use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
31use self::char_ref::{CharRef, CharRefTokenizer};
32use self::qname::QualNameTokenizer;
33use self::states::XmlState;
34use self::states::{DoctypeKind, Public, System};
35use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
36
/// Options that configure an `XmlTokenizer`; see the `Default` impl for the defaults.
#[derive(Copy, Clone)]
pub struct XmlTokenizerOpts {
    /// When set, parse-error tokens carry detailed messages (offending character,
    /// current state) and every character is routed through the per-character
    /// path so that forbidden characters can be reported.
    pub exact_errors: bool,

    /// When set, `feed` drops a U+FEFF (byte order mark) found at the front of the input.
    pub discard_bom: bool,

    /// When set, the tokenizer records the time spent in each state and in the
    /// sink, and logs the totals from `end`. Not available under `cfg(for_c)`.
    pub profile: bool,

    /// State the tokenizer starts in; `None` means the `Data` state.
    pub initial_state: Option<states::XmlState>,
}
56
57fn process_qname(tag_name: StrTendril) -> QualName {
58 let split = if (*tag_name).len() < 3 {
64 None
65 } else {
66 QualNameTokenizer::new((*tag_name).as_bytes()).run()
67 };
68
69 match split {
70 None => QualName::new(None, ns!(), LocalName::from(&*tag_name)),
71 Some(col) => {
72 let len = (*tag_name).len() as u32;
73 let prefix = tag_name.subtendril(0, col);
74 let local = tag_name.subtendril(col + 1, len - col - 1);
75 let ns = ns!(); QualName::new(Some(Prefix::from(&*prefix)), ns, LocalName::from(&*local))
77 },
78 }
79}
80
81fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
82 match *opt_str {
83 Some(ref mut s) => s.push_char(c),
84 None => *opt_str = Some(StrTendril::from_char(c)),
85 }
86}
87
88impl Default for XmlTokenizerOpts {
89 fn default() -> XmlTokenizerOpts {
90 XmlTokenizerOpts {
91 exact_errors: false,
92 discard_bom: true,
93 profile: false,
94 initial_state: None,
95 }
96 }
97}
/// The XML tokenizer: a state machine that turns buffered input into tokens
/// and hands them to a `TokenSink`.
///
/// All mutable state lives in `Cell`/`RefCell` fields, so the methods take `&self`.
pub struct XmlTokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: XmlTokenizerOpts,

    /// Destination for the tokens we emit.
    pub sink: Sink,

    /// The state the machine is currently in.
    state: Cell<states::XmlState>,

    /// Set by `end`; lets `eat` treat an incomplete match as a definite
    /// non-match instead of waiting for more input.
    at_eof: Cell<bool>,

    /// Sub-tokenizer for a character reference in progress, if any.
    /// While this is `Some`, `step` delegates to it.
    char_ref_tokenizer: RefCell<Option<Box<CharRefTokenizer>>>,

    /// The most recent character produced by `get_preprocessed_char`.
    current_char: Cell<char>,

    /// When set, the next `get_char` returns `current_char` again instead of
    /// reading from the input.
    reconsume: Cell<bool>,

    /// Set after a `'\r'` has been turned into `'\n'`, so that an immediately
    /// following `'\n'` is skipped.
    ignore_lf: Cell<bool>,

    /// Whether `feed` should drop a U+FEFF at the front of its input.
    discard_bom: Cell<bool>,

    /// Holds input that `eat` consumed without being able to decide on a
    /// match; it is pushed back onto the input on the next `eat`.
    temp_buf: RefCell<StrTendril>,

    /// Kind of the tag currently being built.
    current_tag_kind: Cell<TagKind>,

    /// Name of the tag currently being built.
    current_tag_name: RefCell<StrTendril>,

    /// Attributes collected so far for the tag currently being built.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Name of the attribute currently being built.
    current_attr_name: RefCell<StrTendril>,

    /// Value of the attribute currently being built.
    current_attr_value: RefCell<StrTendril>,

    /// Doctype currently being built.
    current_doctype: RefCell<Doctype>,

    /// Text of the comment currently being built.
    current_comment: RefCell<StrTendril>,

    /// Target of the processing instruction currently being built.
    current_pi_target: RefCell<StrTendril>,

    /// Data of the processing instruction currently being built.
    current_pi_data: RefCell<StrTendril>,

    /// Time spent in each state, collected by `run` when profiling is enabled.
    state_profile: RefCell<BTreeMap<states::XmlState, u64>>,

    /// Time spent inside the sink, collected by `process_token` when profiling
    /// is enabled.
    time_in_sink: Cell<u64>,
}
166
167impl<Sink: TokenSink> XmlTokenizer<Sink> {
168 pub fn new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink> {
170 if opts.profile && cfg!(for_c) {
171 panic!("Can't profile tokenizer when built as a C library");
172 }
173
174 let state = *opts.initial_state.as_ref().unwrap_or(&states::Data);
175 let discard_bom = opts.discard_bom;
176 XmlTokenizer {
177 opts,
178 sink,
179 state: Cell::new(state),
180 char_ref_tokenizer: RefCell::new(None),
181 at_eof: Cell::new(false),
182 current_char: Cell::new('\0'),
183 reconsume: Cell::new(false),
184 ignore_lf: Cell::new(false),
185 temp_buf: RefCell::new(StrTendril::new()),
186 discard_bom: Cell::new(discard_bom),
187 current_tag_kind: Cell::new(StartTag),
188 current_tag_name: RefCell::new(StrTendril::new()),
189 current_tag_attrs: RefCell::new(vec![]),
190 current_attr_name: RefCell::new(StrTendril::new()),
191 current_attr_value: RefCell::new(StrTendril::new()),
192 current_comment: RefCell::new(StrTendril::new()),
193 current_pi_data: RefCell::new(StrTendril::new()),
194 current_pi_target: RefCell::new(StrTendril::new()),
195 current_doctype: RefCell::new(Doctype::default()),
196 state_profile: RefCell::new(BTreeMap::new()),
197 time_in_sink: Cell::new(0),
198 }
199 }
200
201 pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
203 if input.is_empty() {
204 return TokenizerResult::Done;
205 }
206
207 if self.discard_bom.get() {
208 if let Some(c) = input.peek() {
209 if c == '\u{feff}' {
210 input.next();
211 }
212 } else {
213 return TokenizerResult::Done;
214 }
215 };
216
217 self.run(input)
218 }
219
220 fn process_token(&self, token: Token) -> ProcessResult<Sink::Handle> {
221 if self.opts.profile {
222 let (result, dt) = time!(self.sink.process_token(token));
223 self.time_in_sink.set(self.time_in_sink.get() + dt);
224 result
225 } else {
226 self.sink.process_token(token)
227 }
228 }
229
230 fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
233 if self.ignore_lf.get() {
234 self.ignore_lf.set(false);
235 if c == '\n' {
236 c = input.next()?;
237 }
238 }
239
240 if c == '\r' {
241 self.ignore_lf.set(true);
242 c = '\n';
243 }
244
245 if c == '\x00' {
247 c = '\u{FFFD}'
248 }
249
250 if self.opts.exact_errors
252 && match c as u32 {
253 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
254 n if (n & 0xFFFE) == 0xFFFE => true,
255 _ => false,
256 }
257 {
258 let msg = format!("Bad character {c}");
259 self.emit_error(Cow::Owned(msg));
260 }
261
262 debug!("got character {c}");
263 self.current_char.set(c);
264 Some(c)
265 }
266
267 fn bad_eof_error(&self) {
268 let msg = if self.opts.exact_errors {
269 Cow::from(format!("Saw EOF in state {:?}", self.state))
270 } else {
271 Cow::from("Unexpected EOF")
272 };
273 self.emit_error(msg);
274 }
275
276 fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
277 if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
282 return self.get_char(input).map(FromSet);
283 }
284
285 let d = input.pop_except_from(set);
286 debug!("got characters {d:?}");
287 match d {
288 Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
289
290 _ => d,
294 }
295 }
296
297 fn eat(&self, input: &BufferQueue, pat: &str) -> Option<bool> {
303 input.push_front(replace(&mut *self.temp_buf.borrow_mut(), StrTendril::new()));
304 match input.eat(pat, u8::eq_ignore_ascii_case) {
305 None if self.at_eof.get() => Some(false),
306 None => {
307 let mut temp_buf = self.temp_buf.borrow_mut();
308 while let Some(data) = input.next() {
309 temp_buf.push_char(data);
310 }
311 None
312 },
313 Some(matched) => Some(matched),
314 }
315 }
316
317 pub fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
319 if self.opts.profile {
320 loop {
321 let state = self.state.get();
322 let old_sink = self.time_in_sink.get();
323 let (run, mut dt) = time!(self.step(input));
324 dt -= self.time_in_sink.get() - old_sink;
325 let new = match self.state_profile.borrow_mut().get_mut(&state) {
326 Some(x) => {
327 *x += dt;
328 false
329 },
330 None => true,
331 };
332 if new {
333 self.state_profile.borrow_mut().insert(state, dt);
335 }
336 match run {
337 ProcessResult::Continue => continue,
338 ProcessResult::Done => return TokenizerResult::Done,
339 ProcessResult::Script(handle) => return TokenizerResult::Script(handle),
340 }
341 }
342 } else {
343 loop {
344 match self.step(input) {
345 ProcessResult::Continue => continue,
346 ProcessResult::Done => return TokenizerResult::Done,
347 ProcessResult::Script(handle) => return TokenizerResult::Script(handle),
348 }
349 }
350 }
351 }
352
353 fn get_char(&self, input: &BufferQueue) -> Option<char> {
356 if self.reconsume.get() {
357 self.reconsume.set(false);
358 Some(self.current_char.get())
359 } else {
360 input
361 .next()
362 .and_then(|c| self.get_preprocessed_char(c, input))
363 }
364 }
365
366 fn bad_char_error(&self) {
367 let msg = if self.opts.exact_errors {
368 let c = self.current_char.get();
369 let state = self.state.get();
370 Cow::from(format!("Saw {c} in state {state:?}"))
371 } else {
372 Cow::from("Bad character")
373 };
374 self.emit_error(msg);
375 }
376
377 fn discard_tag(&self) {
378 *self.current_tag_name.borrow_mut() = StrTendril::new();
379 *self.current_tag_attrs.borrow_mut() = Vec::new();
380 }
381
382 fn create_tag(&self, kind: TagKind, c: char) {
383 self.discard_tag();
384 self.current_tag_name.borrow_mut().push_char(c);
385 self.current_tag_kind.set(kind);
386 }
387
388 fn create_pi(&self, c: char) {
391 *self.current_pi_target.borrow_mut() = StrTendril::new();
392 *self.current_pi_data.borrow_mut() = StrTendril::new();
393 self.current_pi_target.borrow_mut().push_char(c);
394 }
395
396 fn emit_char(&self, c: char) {
397 self.process_token(Token::Characters(StrTendril::from_char(match c {
398 '\0' => '\u{FFFD}',
399 c => c,
400 })));
401 }
402
403 fn emit_short_tag(&self) -> ProcessResult<Sink::Handle> {
404 self.current_tag_kind.set(ShortTag);
405 *self.current_tag_name.borrow_mut() = StrTendril::new();
406 self.emit_current_tag()
407 }
408
409 fn emit_empty_tag(&self) -> ProcessResult<Sink::Handle> {
410 self.current_tag_kind.set(EmptyTag);
411 self.emit_current_tag()
412 }
413
414 fn set_empty_tag(&self) {
415 self.current_tag_kind.set(EmptyTag);
416 }
417
418 fn emit_start_tag(&self) -> ProcessResult<Sink::Handle> {
419 self.current_tag_kind.set(StartTag);
420 self.emit_current_tag()
421 }
422
423 fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
424 self.finish_attribute();
425
426 let qname = process_qname(replace(
427 &mut *self.current_tag_name.borrow_mut(),
428 StrTendril::new(),
429 ));
430
431 match self.current_tag_kind.get() {
432 StartTag | EmptyTag => {},
433 EndTag => {
434 if !self.current_tag_attrs.borrow().is_empty() {
435 self.emit_error(Borrowed("Attributes on an end tag"));
436 }
437 },
438 ShortTag => {
439 if !self.current_tag_attrs.borrow().is_empty() {
440 self.emit_error(Borrowed("Attributes on a short tag"));
441 }
442 },
443 }
444
445 let token = Token::Tag(Tag {
446 kind: self.current_tag_kind.get(),
447 name: qname,
448 attrs: self.current_tag_attrs.take(),
449 });
450
451 self.process_token(token)
452 }
453
454 fn emit_chars(&self, b: StrTendril) {
456 self.process_token(Token::Characters(b));
457 }
458
459 fn emit_pi(&self) -> ProcessResult<<Sink as TokenSink>::Handle> {
461 let token = Token::ProcessingInstruction(Pi {
462 target: replace(&mut *self.current_pi_target.borrow_mut(), StrTendril::new()),
463 data: replace(&mut *self.current_pi_data.borrow_mut(), StrTendril::new()),
464 });
465 self.process_token(token)
466 }
467
468 fn consume_char_ref(&self, addnl_allowed: Option<char>) {
469 *self.char_ref_tokenizer.borrow_mut() =
472 Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
473 }
474
475 fn emit_eof(&self) {
476 self.process_token(Token::EndOfFile);
477 }
478
479 fn emit_error(&self, error: Cow<'static, str>) {
480 self.process_token(Token::ParseError(error));
481 }
482
483 fn emit_current_comment(&self) {
484 let comment = self.current_comment.take();
485 self.process_token(Token::Comment(comment));
486 }
487
488 fn emit_current_doctype(&self) {
489 let doctype = self.current_doctype.take();
490 self.process_token(Token::Doctype(doctype));
491 }
492
493 fn doctype_id(&self, kind: DoctypeKind) -> RefMut<'_, Option<StrTendril>> {
494 let current_doctype = self.current_doctype.borrow_mut();
495 match kind {
496 Public => RefMut::map(current_doctype, |d| &mut d.public_id),
497 System => RefMut::map(current_doctype, |d| &mut d.system_id),
498 }
499 }
500
501 fn clear_doctype_id(&self, kind: DoctypeKind) {
502 let mut id = self.doctype_id(kind);
503 match *id {
504 Some(ref mut s) => s.clear(),
505 None => *id = Some(StrTendril::new()),
506 }
507 }
508
509 fn peek(&self, input: &BufferQueue) -> Option<char> {
510 if self.reconsume.get() {
511 Some(self.current_char.get())
512 } else {
513 input.peek()
514 }
515 }
516
517 fn discard_char(&self, input: &BufferQueue) {
518 let c = self.get_char(input);
519 assert!(c.is_some());
520 }
521
522 fn unconsume(&self, input: &BufferQueue, buf: StrTendril) {
523 input.push_front(buf);
524 }
525}
526
// Shorthand for common state machine behaviors: each arm maps a small command
// (`emit c`, `push_tag c`, `error`, ...) onto the tokenizer method or field
// update that implements it. `$me` is the tokenizer.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr                     ) => ( $me.emit_char($c)                                   );
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c)                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.borrow_mut().push_char($c)     );
    // `discard_tag` takes no argument besides `self`.
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag()                                   );
    // `discard_char` needs the input queue to read from.
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input)                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.borrow_mut().push_char($c)             );
    ( $me:ident : emit_temp                        ) => ( $me.emit_temp_buf()                                 );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf()                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c)                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.borrow_mut().push_char($c)    );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.borrow_mut().push_char($c)   );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.borrow_mut().push_char($c)      );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.borrow_mut().push_slice($c)     );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment()                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.borrow_mut().clear()            );
    ( $me:ident : create_doctype                   ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c)            );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k)                            );
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype()                          );
    ( $me:ident : error                            ) => ( $me.bad_char_error()                                );
    ( $me:ident : error_eof                        ) => ( $me.bad_eof_error()                                 );
    ( $me:ident : create_pi $c:expr                ) => ( $me.create_pi($c)                                   );
    ( $me:ident : push_pi_target $c:expr           ) => ( $me.current_pi_target.borrow_mut().push_char($c)    );
    ( $me:ident : push_pi_data $c:expr             ) => ( $me.current_pi_data.borrow_mut().push_char($c)      );
    ( $me:ident : set_empty_tag                    ) => ( $me.set_empty_tag()                                 );
);
557
// Tracing of tokenizer actions. With the `trace_tokenizer` feature enabled,
// every shorthand command is logged (as its stringified tokens) before it is
// executed.
#[cfg(feature = "trace_tokenizer")]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    debug!(" {:?}", stringify!($($cmds)*));
    shorthand!($me : $($cmds)*);
}));

// Without the feature, `sh_trace!` simply forwards to `shorthand!`.
#[cfg(not(feature = "trace_tokenizer"))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
568
// A little DSL for sequencing shorthand actions: `go!(self: a; b c; to State)`
// runs the `;`-separated commands in order.
macro_rules! go (
    // A `;`-separated sequence: run the first command (one to four token
    // trees long) through `sh_trace!`, then recurse on the rest.
    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // State transitions. These set the new state and return
    // `ProcessResult::Continue` from the enclosing function, so they can only
    // come at the end of a command sequence. The extra arguments are passed to
    // the state's constructor.
    ( $me:ident : to $s:ident ) => ({ $me.state.set(states::$s); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });

    // Like `to`, but first flags the current character to be consumed again.
    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume.set(true); go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });

    // Start a character reference, optionally with an additional allowed
    // character, and return so that the next step handles it.
    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });

    // Emitting a tag or processing instruction switches to state `$s` and
    // returns whatever the sink answered for the emitted token.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_current_tag();
    });

    ( $me:ident : emit_short_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_short_tag();
    });

    ( $me:ident : emit_empty_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_empty_tag();
    });

    ( $me:ident : emit_start_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_start_tag();
    });

    ( $me:ident : emit_pi $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_pi();
    });

    // Emit the end-of-file token and stop.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Done; });

    // Anything else is a single shorthand command.
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // An empty command sequence (e.g. after a trailing `;`) does nothing.
    ( $me:ident : ) => (());
);
627
// Gets the next character via `$me.get_char($input)`; if none is available,
// returns `ProcessResult::Done` from the enclosing function.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Done)
));
633
// Calls `$me.pop_except_from($input, $set)`; if it yields nothing, returns
// `ProcessResult::Done` from the enclosing function.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Done)
));
637
// Calls `$me.eat($input, $pat)`; if no decision is possible yet (`None`),
// returns `ProcessResult::Done` from the enclosing function. Otherwise
// evaluates to whether the pattern matched.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat), ProcessResult::Done)
));
641
/// Outcome of a single tokenizer step or of handing a token to the sink.
pub enum ProcessResult<Handle> {
    /// Stop running for now; `run` turns this into `TokenizerResult::Done`.
    Done,
    /// Keep going: `run` performs another step.
    Continue,
    /// Stop and hand `Handle` back to the caller; `run` turns this into
    /// `TokenizerResult::Script`.
    Script(Handle),
}
652
653impl<Sink: TokenSink> XmlTokenizer<Sink> {
/// Runs the state machine for a while on `input`.
///
/// If a character reference is in progress, the step is delegated to the
/// character-reference tokenizer. Otherwise the arm for the current state
/// consumes input until a `go!` command returns (state transition or token
/// emission) or the input runs dry, in which case the `get_char!`,
/// `pop_except_from!` and `eat!` macros return `ProcessResult::Done`.
// Every state arm is written as a `loop`, but in many of them each match arm
// ends in a `go!` transition that returns, so the loop never iterates twice.
#[allow(clippy::never_loop)]
fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
    if self.char_ref_tokenizer.borrow().is_some() {
        return self.step_char_ref_tokenizer(input);
    }

    debug!("processing in state {:?}", self.state);
    match self.state.get() {
        // Quiescent: switch back to Data and stop for now.
        XmlState::Quiescent => {
            self.state.set(XmlState::Data);
            ProcessResult::Done
        },
        // Data: plain text, until a reference ('&') or markup ('<') starts.
        XmlState::Data => loop {
            match pop_except_from!(self, input, small_char_set!('\r' '&' '<')) {
                FromSet('&') => go!(self: consume_char_ref),
                FromSet('<') => go!(self: to TagState),
                FromSet(c) => go!(self: emit c),
                NotFromSet(b) => self.emit_chars(b),
            }
        },
        // Tag: just after '<'; decides what kind of markup follows.
        XmlState::TagState => loop {
            match get_char!(self, input) {
                '!' => go!(self: to MarkupDecl),
                '/' => go!(self: to EndTagState),
                '?' => go!(self: to Pi),
                '\t' | '\n' | ' ' | ':' | '<' | '>' => {
                    go!(self: error; emit '<'; reconsume Data)
                },
                cl => go!(self: create_tag StartTag cl; to TagName),
            }
        },
        // End tag: just after "</"; "</>" is a short tag.
        XmlState::EndTagState => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_short_tag Data),
                '\t' | '\n' | ' ' | '<' | ':' => {
                    go!(self: error; emit '<'; emit '/'; reconsume Data)
                },
                cl => go!(self: create_tag EndTag cl; to EndTagName),
            }
        },
        // End tag name: collects the name of an end tag.
        XmlState::EndTagName => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => go!(self: to EndTagNameAfter),
                '/' => go!(self: error; to EndTagNameAfter),
                '>' => go!(self: emit_tag Data),
                cl => go!(self: push_tag cl),
            }
        },
        // After end tag name: only whitespace is expected before '>'.
        XmlState::EndTagNameAfter => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_tag Data),
                '\t' | '\n' | ' ' => (),
                _ => self.emit_error(Borrowed("Unexpected element in tag name")),
            }
        },
        // Processing instruction: just after "<?".
        XmlState::Pi => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => go!(self: error; reconsume BogusComment),
                cl => go!(self: create_pi cl; to PiTarget),
            }
        },
        // PI target: collects the target name.
        XmlState::PiTarget => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => go!(self: to PiTargetAfter),
                '?' => go!(self: to PiAfter),
                cl => go!(self: push_pi_target cl),
            }
        },
        // After PI target: skips whitespace before the data.
        XmlState::PiTargetAfter => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => (),
                _ => go!(self: reconsume PiData),
            }
        },
        // PI data: collects data until a '?' is seen.
        XmlState::PiData => loop {
            match get_char!(self, input) {
                '?' => go!(self: to PiAfter),
                cl => go!(self: push_pi_data cl),
            }
        },
        // After '?' inside a PI: '>' ends the instruction.
        XmlState::PiAfter => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_pi Data),
                '?' => go!(self: to PiAfter),
                cl => go!(self: push_pi_data cl),
            }
        },
        // Markup declaration: just after "<!"; comment, CDATA or DOCTYPE.
        XmlState::MarkupDecl => loop {
            if eat!(self, input, "--") {
                go!(self: clear_comment; to CommentStart);
            } else if eat!(self, input, "[CDATA[") {
                go!(self: to Cdata);
            } else if eat!(self, input, "DOCTYPE") {
                go!(self: to Doctype);
            } else {
                go!(self: error; to BogusComment);
            }
        },
        // Comment start: just after "<!--".
        XmlState::CommentStart => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentStartDash),
                '>' => go!(self: error; emit_comment; to Data),
                _ => go!(self: reconsume Comment),
            }
        },
        // Comment start dash: just after "<!---".
        XmlState::CommentStartDash => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentEnd),
                '>' => go!(self: error; emit_comment; to Data),
                _ => go!(self: push_comment '-'; reconsume Comment),
            }
        },
        // Comment: collects comment text.
        XmlState::Comment => loop {
            match get_char!(self, input) {
                '<' => go!(self: push_comment '<'; to CommentLessThan),
                '-' => go!(self: to CommentEndDash),
                c => go!(self: push_comment c),
            }
        },
        // '<' seen inside a comment.
        XmlState::CommentLessThan => loop {
            match get_char!(self, input) {
                '!' => go!(self: push_comment '!';to CommentLessThanBang),
                '<' => go!(self: push_comment '<'),
                _ => go!(self: reconsume Comment),
            }
        },
        // "<!" seen inside a comment.
        XmlState::CommentLessThanBang => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentLessThanBangDash),
                _ => go!(self: reconsume Comment),
            }
        },
        // "<!-" seen inside a comment.
        XmlState::CommentLessThanBangDash => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentLessThanBangDashDash),
                _ => go!(self: reconsume CommentEndDash),
            }
        },
        // "<!--" seen inside a comment; anything but '>' is an error.
        XmlState::CommentLessThanBangDashDash => loop {
            match get_char!(self, input) {
                '>' => go!(self: reconsume CommentEnd),
                _ => go!(self: error; reconsume CommentEnd),
            }
        },
        // '-' seen inside a comment.
        XmlState::CommentEndDash => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentEnd),
                _ => go!(self: push_comment '-'; reconsume Comment),
            }
        },
        // "--" seen inside a comment; '>' closes it.
        XmlState::CommentEnd => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_comment; to Data),
                '!' => go!(self: to CommentEndBang),
                '-' => go!(self: push_comment '-'),
                _ => go!(self: append_comment "--"; reconsume Comment),
            }
        },
        // "--!" seen inside a comment.
        XmlState::CommentEndBang => loop {
            match get_char!(self, input) {
                '-' => go!(self: append_comment "--!"; to CommentEndDash),
                '>' => go!(self: error; emit_comment; to Data),
                _ => go!(self: append_comment "--!"; reconsume Comment),
            }
        },
        // Bogus comment: everything up to '>' becomes comment text.
        XmlState::BogusComment => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_comment; to Data),
                c => go!(self: push_comment c),
            }
        },
        // CDATA section: characters are emitted verbatim until "]]>".
        XmlState::Cdata => loop {
            match get_char!(self, input) {
                ']' => go!(self: to CdataBracket),
                cl => go!(self: emit cl),
            }
        },
        // One ']' seen inside a CDATA section.
        XmlState::CdataBracket => loop {
            match get_char!(self, input) {
                ']' => go!(self: to CdataEnd),
                cl => go!(self: emit ']'; emit cl; to Cdata),
            }
        },
        // "]]" seen inside a CDATA section.
        XmlState::CdataEnd => loop {
            match get_char!(self, input) {
                '>' => go!(self: to Data),
                ']' => go!(self: emit ']'),
                cl => go!(self: emit ']'; emit ']'; emit cl; to Cdata),
            }
        },
        // Tag name: collects the name of a start tag.
        XmlState::TagName => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => go!(self: to TagAttrNameBefore),
                '>' => go!(self: emit_tag Data),
                '/' => go!(self: set_empty_tag; to TagEmpty),
                cl => go!(self: push_tag cl),
            }
        },
        // '/' seen inside a tag; "/>" ends an empty tag.
        XmlState::TagEmpty => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_empty_tag Data),
                _ => go!(self: reconsume TagAttrValueBefore),
            }
        },
        // Before attribute name: skips whitespace inside a tag.
        XmlState::TagAttrNameBefore => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => (),
                '>' => go!(self: emit_tag Data),
                '/' => go!(self: set_empty_tag; to TagEmpty),
                ':' => go!(self: error),
                cl => go!(self: create_attr cl; to TagAttrName),
            }
        },
        // Attribute name: collects the name of an attribute.
        XmlState::TagAttrName => loop {
            match get_char!(self, input) {
                '=' => go!(self: to TagAttrValueBefore),
                '>' => go!(self: emit_tag Data),
                '\t' | '\n' | ' ' => go!(self: to TagAttrNameAfter),
                '/' => go!(self: set_empty_tag; to TagEmpty),
                cl => go!(self: push_name cl),
            }
        },
        // After attribute name: whitespace seen after the name.
        XmlState::TagAttrNameAfter => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => (),
                '=' => go!(self: to TagAttrValueBefore),
                '>' => go!(self: emit_tag Data),
                '/' => go!(self: set_empty_tag; to TagEmpty),
                cl => go!(self: create_attr cl; to TagAttrName),
            }
        },
        // Before attribute value: decides how the value is quoted.
        XmlState::TagAttrValueBefore => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => (),
                '"' => go!(self: to TagAttrValue DoubleQuoted),
                '\'' => go!(self: to TagAttrValue SingleQuoted),
                '&' => go!(self: reconsume TagAttrValue(Unquoted)),
                '>' => go!(self: emit_tag Data),
                cl => go!(self: push_value cl; to TagAttrValue(Unquoted)),
            }
        },
        // Double-quoted attribute value.
        XmlState::TagAttrValue(DoubleQuoted) => loop {
            match pop_except_from!(self, input, small_char_set!('\n' '"' '&')) {
                FromSet('"') => go!(self: to TagAttrNameBefore),
                FromSet('&') => go!(self: consume_char_ref '"' ),
                FromSet(c) => go!(self: push_value c),
                NotFromSet(ref b) => go!(self: append_value b),
            }
        },
        // Single-quoted attribute value.
        XmlState::TagAttrValue(SingleQuoted) => loop {
            match pop_except_from!(self, input, small_char_set!('\n' '\'' '&')) {
                FromSet('\'') => go!(self: to TagAttrNameBefore),
                FromSet('&') => go!(self: consume_char_ref '\''),
                FromSet(c) => go!(self: push_value c),
                NotFromSet(ref b) => go!(self: append_value b),
            }
        },
        // Unquoted attribute value: ends at whitespace or '>'.
        XmlState::TagAttrValue(Unquoted) => loop {
            match pop_except_from!(self, input, small_char_set!('\n' '\t' ' ' '&' '>')) {
                FromSet('\t') | FromSet('\n') | FromSet(' ') => go!(self: to TagAttrNameBefore),
                FromSet('&') => go!(self: consume_char_ref),
                FromSet('>') => go!(self: emit_tag Data),
                FromSet(c) => go!(self: push_value c),
                NotFromSet(ref b) => go!(self: append_value b),
            }
        },

        // Doctype: just after "<!DOCTYPE"; whitespace is expected.
        XmlState::Doctype => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
                _ => go!(self: error; reconsume BeforeDoctypeName),
            }
        },
        // Before doctype name: skips whitespace, then starts a new doctype.
        XmlState::BeforeDoctypeName => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '>' => go!(self: error; emit_doctype; to Data),
                c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
                    to DoctypeName),
            }
        },
        // Doctype name: collects the (ASCII-lowercased) name.
        XmlState::DoctypeName => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterDoctypeName),
                '>' => go!(self: emit_doctype; to Data),
                c => go!(self: push_doctype_name (c.to_ascii_lowercase());
                    to DoctypeName),
            }
        },
        // After doctype name: looks for the PUBLIC / SYSTEM keywords.
        XmlState::AfterDoctypeName => loop {
            if eat!(self, input, "public") {
                go!(self: to AfterDoctypeKeyword Public);
            } else if eat!(self, input, "system") {
                go!(self: to AfterDoctypeKeyword System);
            } else {
                match get_char!(self, input) {
                    '\t' | '\n' | '\x0C' | ' ' => (),
                    '>' => go!(self: emit_doctype; to Data),
                    _ => go!(self: error; to BogusDoctype),
                }
            }
        },
        // After the PUBLIC keyword; a quote without preceding whitespace is an error.
        XmlState::AfterDoctypeKeyword(Public) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier Public),
                '"' => {
                    go!(self: error; clear_doctype_id Public; to DoctypeIdentifierDoubleQuoted Public)
                },
                '\'' => {
                    go!(self: error; clear_doctype_id Public; to DoctypeIdentifierSingleQuoted Public)
                },
                '>' => go!(self: error; emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        // After the SYSTEM keyword; a quote without preceding whitespace is an error.
        XmlState::AfterDoctypeKeyword(System) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier System),
                '"' => {
                    go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
                },
                '\'' => {
                    go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
                },
                '>' => go!(self: error; emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        // Before a doctype identifier: skips whitespace up to the opening quote.
        XmlState::BeforeDoctypeIdentifier(kind) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
                '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
                '>' => go!(self: error; emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        // Double-quoted doctype identifier.
        XmlState::DoctypeIdentifierDoubleQuoted(kind) => loop {
            match get_char!(self, input) {
                '"' => go!(self: to AfterDoctypeIdentifier kind),
                '>' => go!(self: error; emit_doctype; to Data),
                c => go!(self: push_doctype_id kind c),
            }
        },
        // Single-quoted doctype identifier.
        XmlState::DoctypeIdentifierSingleQuoted(kind) => loop {
            match get_char!(self, input) {
                '\'' => go!(self: to AfterDoctypeIdentifier kind),
                '>' => go!(self: error; emit_doctype; to Data),
                c => go!(self: push_doctype_id kind c),
            }
        },
        // After the public identifier; a system identifier may follow.
        XmlState::AfterDoctypeIdentifier(Public) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => {
                    go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
                },
                '\'' => {
                    go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted(System))
                },
                '"' => {
                    go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted(System))
                },
                '>' => go!(self: emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        // After the system identifier; only whitespace and '>' are expected.
        XmlState::AfterDoctypeIdentifier(System) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '>' => go!(self: emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        // Whitespace between the public and the system identifier.
        XmlState::BetweenDoctypePublicAndSystemIdentifiers => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '>' => go!(self: emit_doctype; to Data),
                '\'' => go!(self: to DoctypeIdentifierSingleQuoted System),
                '"' => go!(self: to DoctypeIdentifierDoubleQuoted System),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        // Bogus doctype: skips everything up to '>', then emits the doctype.
        XmlState::BogusDoctype => loop {
            if get_char!(self, input) == '>' {
                go!(self: emit_doctype; to Data);
            }
        },
    }
}
1092
1093 pub fn end(&self) {
1095 let input = BufferQueue::default();
1098 match self.char_ref_tokenizer.take() {
1099 None => (),
1100 Some(mut tok) => {
1101 tok.end_of_file(self, &input);
1102 self.process_char_ref(tok.get_result());
1103 },
1104 }
1105
1106 self.at_eof.set(true);
1109 let _ = self.run(&input);
1110
1111 loop {
1112 if !matches!(self.eof_step(), ProcessResult::Continue) {
1113 break;
1114 }
1115 }
1116
1117 self.sink.end();
1118
1119 if self.opts.profile {
1120 self.dump_profile();
1121 }
1122 }
1123
/// Profile dumping for `cfg(for_c)` builds. `new` panics when profiling is
/// requested in such a build, so this is never expected to run.
#[cfg(for_c)]
fn dump_profile(&self) {
    unreachable!();
}
1128
1129 #[cfg(not(for_c))]
1130 fn dump_profile(&self) {
1131 let mut results: Vec<(states::XmlState, u64)> = self
1132 .state_profile
1133 .borrow()
1134 .iter()
1135 .map(|(s, t)| (*s, *t))
1136 .collect();
1137 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1138
1139 let total: u64 = results
1140 .iter()
1141 .map(|&(_, t)| t)
1142 .fold(0, ::std::ops::Add::add);
1143 debug!("\nTokenizer profile, in nanoseconds");
1144 debug!(
1145 "\n{:12} total in token sink",
1146 self.time_in_sink.get()
1147 );
1148 debug!("\n{total:12} total in tokenizer");
1149
1150 for (k, v) in results.into_iter() {
1151 let pct = 100.0 * (v as f64) / (total as f64);
1152 debug!("{v:12} {pct:4.1}% {k:?}");
1153 }
1154 }
1155
    /// Performs one step of end-of-input handling for the current state.
    ///
    /// Logs the state, then runs the `go!` command sequence associated with
    /// it and returns the result. `end` calls this repeatedly for as long as
    /// it returns `ProcessResult::Continue`.
    ///
    /// NOTE(review): the `go!` macro is defined outside this section; the
    /// per-arm notes below read its commands by name (`error_eof`, `emit_*`,
    /// `to`, `reconsume`, `eof`) — confirm against the macro definition.
    fn eof_step(&self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state.get());
        match self.state.get() {
            // Only the `eof` command runs here; no error is reported.
            XmlState::Data | XmlState::Quiescent => go!(self: eof),
            // Comment-opening states are redirected, without an error, to the
            // corresponding comment state, whose own arm runs on a later step.
            XmlState::CommentStart | XmlState::CommentLessThan | XmlState::CommentLessThanBang => {
                go!(self: reconsume Comment)
            },
            XmlState::CommentLessThanBangDash => go!(self: reconsume CommentEndDash),
            XmlState::CommentLessThanBangDashDash => go!(self: reconsume CommentEnd),
            // Inside a comment: report the EOF error, emit the comment, then `eof`.
            XmlState::CommentStartDash
            | XmlState::Comment
            | XmlState::CommentEndDash
            | XmlState::CommentEnd
            | XmlState::CommentEndBang => go!(self: error_eof; emit_comment; eof),
            // A tag opener with nothing after it: the consumed `<` (and `/`)
            // characters are emitted before moving to Data.
            XmlState::TagState => go!(self: error_eof; emit '<'; to Data),
            XmlState::EndTagState => go!(self: error_eof; emit '<'; emit '/'; to Data),
            XmlState::TagEmpty => go!(self: error_eof; to TagAttrNameBefore),
            // CDATA states report the error and move to Data without emitting here.
            XmlState::Cdata | XmlState::CdataBracket | XmlState::CdataEnd => {
                go!(self: error_eof; to Data)
            },
            XmlState::Pi => go!(self: error_eof; to BogusComment),
            XmlState::PiTargetAfter | XmlState::PiAfter => go!(self: reconsume PiData),
            XmlState::MarkupDecl => go!(self: error_eof; to BogusComment),
            // Partially tokenized tags are emitted with an EOF error.
            XmlState::TagName
            | XmlState::TagAttrNameBefore
            | XmlState::EndTagName
            | XmlState::TagAttrNameAfter
            | XmlState::EndTagNameAfter
            | XmlState::TagAttrValueBefore
            | XmlState::TagAttrValue(_) => go!(self: error_eof; emit_tag Data),
            XmlState::PiData | XmlState::PiTarget => go!(self: error_eof; emit_pi Data),
            // Unlike the other tag states, this one uses `emit_start_tag`.
            XmlState::TagAttrName => go!(self: error_eof; emit_start_tag Data),
            // Any unfinished doctype is emitted with an EOF error.
            XmlState::BeforeDoctypeName
            | XmlState::Doctype
            | XmlState::DoctypeName
            | XmlState::AfterDoctypeName
            | XmlState::AfterDoctypeKeyword(_)
            | XmlState::BeforeDoctypeIdentifier(_)
            | XmlState::AfterDoctypeIdentifier(_)
            | XmlState::DoctypeIdentifierSingleQuoted(_)
            | XmlState::DoctypeIdentifierDoubleQuoted(_)
            | XmlState::BetweenDoctypePublicAndSystemIdentifiers => {
                go!(self: error_eof; emit_doctype; to Data)
            },
            // The bogus states emit what they have without an additional error.
            XmlState::BogusDoctype => go!(self: emit_doctype; to Data),
            XmlState::BogusComment => go!(self: emit_comment; to Data),
        }
    }
1204
1205 fn process_char_ref(&self, char_ref: CharRef) {
1206 let CharRef {
1207 mut chars,
1208 mut num_chars,
1209 } = char_ref;
1210
1211 if num_chars == 0 {
1212 chars[0] = '&';
1213 num_chars = 1;
1214 }
1215
1216 for i in 0..num_chars {
1217 let c = chars[i as usize];
1218 match self.state.get() {
1219 states::Data | states::Cdata => go!(self: emit c),
1220
1221 states::TagAttrValue(_) => go!(self: push_value c),
1222
1223 _ => panic!(
1224 "state {:?} should not be reachable in process_char_ref",
1225 self.state.get()
1226 ),
1227 }
1228 }
1229 }
1230
1231 fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1232 let mut tok = self.char_ref_tokenizer.take().unwrap();
1233 let outcome = tok.step(self, input);
1234
1235 let progress = match outcome {
1236 char_ref::Done => {
1237 self.process_char_ref(tok.get_result());
1238 return ProcessResult::Continue;
1239 },
1240
1241 char_ref::Stuck => ProcessResult::Done,
1242 char_ref::Progress => ProcessResult::Continue,
1243 };
1244
1245 *self.char_ref_tokenizer.borrow_mut() = Some(tok);
1246 progress
1247 }
1248
1249 fn finish_attribute(&self) {
1250 if self.current_attr_name.borrow().is_empty() {
1251 return;
1252 }
1253
1254 let dup = {
1258 let current_attr_name = self.current_attr_name.borrow();
1259 let name = ¤t_attr_name[..];
1260 self.current_tag_attrs
1261 .borrow()
1262 .iter()
1263 .any(|a| &*a.name.local == name)
1264 };
1265
1266 if dup {
1267 self.emit_error(Borrowed("Duplicate attribute"));
1268 self.current_attr_name.borrow_mut().clear();
1269 self.current_attr_value.borrow_mut().clear();
1270 } else {
1271 let qname = process_qname(replace(
1272 &mut self.current_attr_name.borrow_mut(),
1273 StrTendril::new(),
1274 ));
1275 let attr = Attribute {
1276 name: qname.clone(),
1277 value: replace(&mut self.current_attr_value.borrow_mut(), StrTendril::new()),
1278 };
1279
1280 if qname.local == local_name!("xmlns")
1281 || qname.prefix == Some(namespace_prefix!("xmlns"))
1282 {
1283 self.current_tag_attrs.borrow_mut().insert(0, attr);
1284 } else {
1285 self.current_tag_attrs.borrow_mut().push(attr);
1286 }
1287 }
1288 }
1289
    /// Begins a new attribute whose name starts with `c`.
    ///
    /// Any attribute already in progress is completed first via
    /// `finish_attribute`; `c` is then pushed onto `current_attr_name`.
    fn create_attribute(&self, c: char) {
        // Flush the previous attribute before reusing the name buffer.
        self.finish_attribute();

        self.current_attr_name.borrow_mut().push_char(c);
    }
1295}
1296
#[cfg(test)]
mod test {

    use super::process_qname;
    use crate::tendril::SliceExt;
    use crate::{LocalName, Prefix};

    /// A name containing a single, well-placed colon is split into a prefix
    /// and a local part.
    #[test]
    fn simple_namespace() {
        // (raw name, expected prefix, expected local name)
        let cases = [("prefix:local", "prefix", "local"), ("a:b", "a", "b")];

        for (raw, prefix, local) in cases {
            let qname = process_qname(raw.to_tendril());
            assert_eq!(qname.prefix, Some(Prefix::from(prefix)));
            assert_eq!(qname.local, LocalName::from(local));
        }
    }

    /// Names whose colons cannot delimit a prefix are kept whole: there is no
    /// prefix and the entire input becomes the local name.
    #[test]
    fn wrong_namespaces() {
        let unsplittable = [":local", "::local", "a::local", "fake::", ":::", ":a:b:"];

        for raw in unsplittable {
            let qname = process_qname(raw.to_tendril());
            assert_eq!(qname.prefix, None);
            assert_eq!(qname.local, LocalName::from(raw));
        }
    }
}