1pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15pub use self::interface::{TokenSink, TokenSinkResult};
16
17use self::states::{DoctypeIdKind, Public, System};
18use self::states::{DoubleEscaped, Escaped};
19use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22use self::char_ref::{CharRef, CharRefTokenizer};
23
24use crate::util::str::lower_ascii_letter;
25
26use log::{debug, trace};
27use markup5ever::{ns, small_char_set, TokenizerResult};
28use std::borrow::Cow::{self, Borrowed};
29use std::cell::{Cell, RefCell, RefMut};
30use std::collections::BTreeMap;
31use std::mem;
32
33pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
34use crate::macros::{time, unwrap_or_return};
35use crate::tendril::StrTendril;
36use crate::{Attribute, LocalName, QualName, SmallCharSet};
37
38mod char_ref;
39mod interface;
40pub mod states;
41
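/// Result of a single `step()` of the tokenizer: keep going, suspend until
/// more input is available, hand a `<script>` element back to the caller, or
/// surface an encoding indicator reported by the sink.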
42pub enum ProcessResult<Handle> {
44 Continue,
46 Suspend,
49 Script(Handle),
54 EncodingIndicator(StrTendril),
60}
61
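/// Append a character to an optional tendril, creating it if it is `None`.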
62fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
63 match *opt_str {
64 Some(ref mut s) => s.push_char(c),
65 None => *opt_str = Some(StrTendril::from_char(c)),
66 }
67}
68
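/// Options controlling the tokenizer.
///
/// `exact_errors` enables precise (but slower) character-level error checks,
/// `discard_bom` drops a leading U+FEFF byte order mark, `profile` records
/// per-state timing, `initial_state` overrides the starting state, and
/// `last_start_tag_name` seeds the "appropriate end tag" check when
/// tokenizing a fragment.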
69#[derive(Clone)]
71pub struct TokenizerOpts {
72 pub exact_errors: bool,
75
76 pub discard_bom: bool,
79
80 pub profile: bool,
83
84 pub initial_state: Option<states::State>,
87
88 pub last_start_tag_name: Option<String>,
94}
95
96impl Default for TokenizerOpts {
97 fn default() -> TokenizerOpts {
98 TokenizerOpts {
99 exact_errors: false,
100 discard_bom: true,
101 profile: false,
102 initial_state: None,
103 last_start_tag_name: None,
104 }
105 }
106}
107
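/// The HTML tokenizer. All mutable state lives in `Cell`/`RefCell` fields so
/// the state machine can be driven through a shared reference while it calls
/// back into the `TokenSink`.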
108pub struct Tokenizer<Sink> {
110 opts: TokenizerOpts,
112
113 pub sink: Sink,
115
116 state: Cell<states::State>,
118
119 at_eof: Cell<bool>,
122
123 char_ref_tokenizer: RefCell<Option<CharRefTokenizer>>,
126
127 current_char: Cell<char>,
129
130 reconsume: Cell<bool>,
132
133 ignore_lf: Cell<bool>,
136
137 discard_bom: Cell<bool>,
140
141 current_tag_kind: Cell<TagKind>,
143
144 current_tag_name: RefCell<StrTendril>,
146
147 current_tag_self_closing: Cell<bool>,
149
150 current_tag_attrs: RefCell<Vec<Attribute>>,
152
153 current_attr_name: RefCell<StrTendril>,
155
156 current_attr_value: RefCell<StrTendril>,
158
159 current_comment: RefCell<StrTendril>,
161
162 current_doctype: RefCell<Doctype>,
164
165 last_start_tag_name: RefCell<Option<LocalName>>,
167
168 temp_buf: RefCell<StrTendril>,
170
171 state_profile: RefCell<BTreeMap<states::State, u64>>,
173
174 time_in_sink: Cell<u64>,
176
177 current_line: Cell<u64>,
179}
180
181impl<Sink: TokenSink> Tokenizer<Sink> {
182 pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
184 let start_tag_name = opts
185 .last_start_tag_name
186 .take()
187 .map(|s| LocalName::from(&*s));
188 let state = opts.initial_state.unwrap_or(states::Data);
189 let discard_bom = opts.discard_bom;
190 Tokenizer {
191 opts,
192 sink,
193 state: Cell::new(state),
194 char_ref_tokenizer: RefCell::new(None),
195 at_eof: Cell::new(false),
196 current_char: Cell::new('\0'),
197 reconsume: Cell::new(false),
198 ignore_lf: Cell::new(false),
199 discard_bom: Cell::new(discard_bom),
200 current_tag_kind: Cell::new(StartTag),
201 current_tag_name: RefCell::new(StrTendril::new()),
202 current_tag_self_closing: Cell::new(false),
203 current_tag_attrs: RefCell::new(vec![]),
204 current_attr_name: RefCell::new(StrTendril::new()),
205 current_attr_value: RefCell::new(StrTendril::new()),
206 current_comment: RefCell::new(StrTendril::new()),
207 current_doctype: RefCell::new(Doctype::default()),
208 last_start_tag_name: RefCell::new(start_tag_name),
209 temp_buf: RefCell::new(StrTendril::new()),
210 state_profile: RefCell::new(BTreeMap::new()),
211 time_in_sink: Cell::new(0),
212 current_line: Cell::new(1),
213 }
214 }
215
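    /// Feed input from `input`, running the state machine until the queue is
    /// exhausted or the sink asks to pause (for example to run a `<script>`).
    ///
    /// A minimal usage sketch, assuming `MySink` is some caller-provided
    /// `TokenSink` implementation:
    ///
    /// ```ignore
    /// let tokenizer = Tokenizer::new(MySink::default(), TokenizerOpts::default());
    /// let input = BufferQueue::default();
    /// input.push_back(StrTendril::from("<p>Hello</p>"));
    /// while let TokenizerResult::Script(_handle) = tokenizer.feed(&input) {
    ///     // Run the script here, then resume feeding.
    /// }
    /// tokenizer.end();
    /// ```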
216 pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
218 if input.is_empty() {
219 return TokenizerResult::Done;
220 }
221
222 if self.discard_bom.get() {
223 if let Some(c) = input.peek() {
224 if c == '\u{feff}' {
225 input.next();
226 }
227 } else {
228 return TokenizerResult::Done;
229 }
230 };
231
232 self.run(input)
233 }
234
235 pub fn set_plaintext_state(&self) {
236 self.state.set(states::Plaintext);
237 }
238
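    /// Forward a token to the sink, tracking time spent there when profiling
    /// is enabled.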
239 fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
240 if self.opts.profile {
241 let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
242 self.time_in_sink.set(self.time_in_sink.get() + dt);
243 ret
244 } else {
245 self.sink.process_token(token, self.current_line.get())
246 }
247 }
248
249 fn process_token_and_continue(&self, token: Token) {
250 assert!(matches!(
251 self.process_token(token),
252 TokenSinkResult::Continue
253 ));
254 }
255
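    /// Apply the input preprocessing required by the spec: fold CR and CRLF
    /// into LF, keep the current line number up to date, and (with
    /// `exact_errors`) report control characters and non-characters.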
256 fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
260 if self.ignore_lf.get() {
261 self.ignore_lf.set(false);
262 if c == '\n' {
263 c = input.next()?;
264 }
265 }
266
267 if c == '\r' {
268 self.ignore_lf.set(true);
269 c = '\n';
270 }
271
272 if c == '\n' {
273 self.current_line.set(self.current_line.get() + 1);
274 }
275
276 if self.opts.exact_errors
277 && match c as u32 {
278 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
279 n if (n & 0xFFFE) == 0xFFFE => true,
280 _ => false,
281 }
282 {
283 let msg = format!("Bad character {c}");
284 self.emit_error(Cow::Owned(msg));
285 }
286
287 trace!("got character {c}");
288 self.current_char.set(c);
289 Some(c)
290 }
291
292 fn get_char(&self, input: &BufferQueue) -> Option<char> {
295 if self.reconsume.get() {
296 self.reconsume.set(false);
297 Some(self.current_char.get())
298 } else {
299 input
300 .next()
301 .and_then(|c| self.get_preprocessed_char(c, input))
302 }
303 }
304
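    /// Pop either one character from the given small set (`FromSet`) or a whole
    /// run of characters not in the set (`NotFromSet`), so plain text can be
    /// handled in bulk. Falls back to char-at-a-time processing whenever
    /// reconsume, a pending LF, or `exact_errors` is in play.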
305 fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
306 if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
311 return self.get_char(input).map(FromSet);
312 }
313
314 let d = input.pop_except_from(set);
315 trace!("got characters {d:?}");
316 match d {
317 Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
318
319 _ => d,
323 }
324 }
325
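    /// Try to match the string `pat` against the input, comparing bytes with
    /// `eq` (exact or ASCII case-insensitive). Returns `Some(true)` on a match,
    /// `Some(false)` on a mismatch (or when input runs dry at EOF), and `None`
    /// when more input is needed; partially consumed characters are parked in
    /// `temp_buf` and replayed on the next call.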
326 fn eat(&self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> {
331 if self.ignore_lf.get() {
332 self.ignore_lf.set(false);
333 if self.peek(input) == Some('\n') {
334 self.discard_char(input);
335 }
336 }
337
338 input.push_front(mem::take(&mut self.temp_buf.borrow_mut()));
339 match input.eat(pat, eq) {
340 None if self.at_eof.get() => Some(false),
341 None => {
342 while let Some(data) = input.next() {
343 self.temp_buf.borrow_mut().push_char(data);
344 }
345 None
346 },
347 Some(matched) => Some(matched),
348 }
349 }
350
351 fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
353 if self.opts.profile {
354 loop {
355 let state = self.state.get();
356 let old_sink = self.time_in_sink.get();
357 let (run, mut dt) = time!(self.step(input));
                dt -= self.time_in_sink.get() - old_sink;
                *self.state_profile.borrow_mut().entry(state).or_insert(0) += dt;
370 match run {
371 ProcessResult::Continue => (),
372 ProcessResult::Suspend => break,
373 ProcessResult::Script(node) => return TokenizerResult::Script(node),
374 ProcessResult::EncodingIndicator(encoding) => {
375 return TokenizerResult::EncodingIndicator(encoding)
376 },
377 }
378 }
379 } else {
380 loop {
381 match self.step(input) {
382 ProcessResult::Continue => (),
383 ProcessResult::Suspend => break,
384 ProcessResult::Script(node) => return TokenizerResult::Script(node),
385 ProcessResult::EncodingIndicator(encoding) => {
386 return TokenizerResult::EncodingIndicator(encoding)
387 },
388 }
389 }
390 }
391 TokenizerResult::Done
392 }
393
394 #[inline]
395 fn bad_char_error(&self) {
396 #[cfg(feature = "trace_tokenizer")]
397 trace!(" error");
398
399 let msg = if self.opts.exact_errors {
400 Cow::from("Bad character")
401 } else {
402 let c = self.current_char.get();
403 let state = self.state.get();
404 Cow::from(format!("Saw {c} in state {state:?}"))
405 };
406 self.emit_error(msg);
407 }
408
409 #[inline]
410 fn bad_eof_error(&self) {
411 #[cfg(feature = "trace_tokenizer")]
412 trace!(" error_eof");
413
414 let msg = if self.opts.exact_errors {
415 Cow::from("Unexpected EOF")
416 } else {
417 let state = self.state.get();
418 Cow::from(format!("Saw EOF in state {state:?}"))
419 };
420 self.emit_error(msg);
421 }
422
423 fn emit_char(&self, c: char) {
424 #[cfg(feature = "trace_tokenizer")]
425 trace!(" emit");
426
427 self.process_token_and_continue(match c {
428 '\0' => NullCharacterToken,
429 _ => CharacterTokens(StrTendril::from_char(c)),
430 });
431 }
432
433 fn emit_chars(&self, b: StrTendril) {
435 self.process_token_and_continue(CharacterTokens(b));
436 }
437
438 fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
439 self.finish_attribute();
440
441 let name = LocalName::from(&**self.current_tag_name.borrow());
442 self.current_tag_name.borrow_mut().clear();
443
444 match self.current_tag_kind.get() {
445 StartTag => {
446 *self.last_start_tag_name.borrow_mut() = Some(name.clone());
447 },
448 EndTag => {
449 if !self.current_tag_attrs.borrow().is_empty() {
450 self.emit_error(Borrowed("Attributes on an end tag"));
451 }
452 if self.current_tag_self_closing.get() {
453 self.emit_error(Borrowed("Self-closing end tag"));
454 }
455 },
456 }
457
458 let token = TagToken(Tag {
459 kind: self.current_tag_kind.get(),
460 name,
461 self_closing: self.current_tag_self_closing.get(),
462 attrs: std::mem::take(&mut self.current_tag_attrs.borrow_mut()),
463 });
464
465 match self.process_token(token) {
466 TokenSinkResult::Continue => ProcessResult::Continue,
467 TokenSinkResult::Plaintext => {
468 self.state.set(states::Plaintext);
469 ProcessResult::Continue
470 },
471 TokenSinkResult::Script(node) => {
472 self.state.set(states::Data);
473 ProcessResult::Script(node)
474 },
475 TokenSinkResult::RawData(kind) => {
476 self.state.set(states::RawData(kind));
477 ProcessResult::Continue
478 },
479 TokenSinkResult::EncodingIndicator(encoding) => {
480 ProcessResult::EncodingIndicator(encoding)
481 },
482 }
483 }
484
485 fn emit_temp_buf(&self) {
486 #[cfg(feature = "trace_tokenizer")]
487 trace!(" emit_temp");
488
489 let buf = mem::take(&mut *self.temp_buf.borrow_mut());
491 self.emit_chars(buf);
492 }
493
494 fn clear_temp_buf(&self) {
495 self.temp_buf.borrow_mut().clear();
497 }
498
499 fn emit_current_comment(&self) {
500 let comment = mem::take(&mut *self.current_comment.borrow_mut());
501 self.process_token_and_continue(CommentToken(comment));
502 }
503
504 fn discard_tag(&self) {
505 self.current_tag_name.borrow_mut().clear();
506 self.current_tag_self_closing.set(false);
507 *self.current_tag_attrs.borrow_mut() = vec![];
508 }
509
510 fn create_tag(&self, kind: TagKind, c: char) {
511 self.discard_tag();
512 self.current_tag_name.borrow_mut().push_char(c);
513 self.current_tag_kind.set(kind);
514 }
515
516 fn have_appropriate_end_tag(&self) -> bool {
517 match self.last_start_tag_name.borrow().as_ref() {
518 Some(last) => {
519 (self.current_tag_kind.get() == EndTag)
520 && (**self.current_tag_name.borrow() == **last)
521 },
522 None => false,
523 }
524 }
525
526 fn create_attribute(&self, c: char) {
527 self.finish_attribute();
528
529 self.current_attr_name.borrow_mut().push_char(c);
530 }
531
532 fn finish_attribute(&self) {
533 if self.current_attr_name.borrow().is_empty() {
534 return;
535 }
536
537 let dup = {
540 let name = &*self.current_attr_name.borrow();
541 self.current_tag_attrs
542 .borrow()
543 .iter()
544 .any(|a| *a.name.local == **name)
545 };
546
547 if dup {
548 self.emit_error(Borrowed("Duplicate attribute"));
549 self.current_attr_name.borrow_mut().clear();
550 self.current_attr_value.borrow_mut().clear();
551 } else {
552 let name = LocalName::from(&**self.current_attr_name.borrow());
553 self.current_attr_name.borrow_mut().clear();
554 self.current_tag_attrs.borrow_mut().push(Attribute {
555 name: QualName::new(None, ns!(), name),
558 value: mem::take(&mut self.current_attr_value.borrow_mut()),
559 });
560 }
561 }
562
563 fn emit_current_doctype(&self) {
564 let doctype = self.current_doctype.take();
565 self.process_token_and_continue(DoctypeToken(doctype));
566 }
567
568 fn doctype_id(&self, kind: DoctypeIdKind) -> RefMut<'_, Option<StrTendril>> {
569 let current_doctype = self.current_doctype.borrow_mut();
570 match kind {
571 Public => RefMut::map(current_doctype, |d| &mut d.public_id),
572 System => RefMut::map(current_doctype, |d| &mut d.system_id),
573 }
574 }
575
576 fn clear_doctype_id(&self, kind: DoctypeIdKind) {
577 let mut id = self.doctype_id(kind);
578 match *id {
579 Some(ref mut s) => s.clear(),
580 None => *id = Some(StrTendril::new()),
581 }
582 }
583
584 fn start_consuming_character_reference(&self) {
585 debug_assert!(
586 self.char_ref_tokenizer.borrow().is_none(),
587 "Nested character references are impossible"
588 );
589
590 let is_in_attribute = matches!(self.state.get(), states::AttributeValue(_));
591 *self.char_ref_tokenizer.borrow_mut() = Some(CharRefTokenizer::new(is_in_attribute));
592 }
593
594 fn emit_eof(&self) {
595 self.process_token_and_continue(EOFToken);
596 }
597
598 fn peek(&self, input: &BufferQueue) -> Option<char> {
599 if self.reconsume.get() {
600 Some(self.current_char.get())
601 } else {
602 input.peek()
603 }
604 }
605
606 fn discard_char(&self, input: &BufferQueue) {
607 if self.reconsume.get() {
613 self.reconsume.set(false);
614 } else {
615 input.next();
616 }
617 }
618
619 fn emit_error(&self, error: Cow<'static, str>) {
620 self.process_token_and_continue(ParseError(error));
621 }
622}
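
// Single-action shorthands used by the `go!` macro below; each arm performs one
// small mutation of the current tag, attribute, comment, or doctype state.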
623macro_rules! shorthand (
627 ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) );
628 ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.borrow_mut().push_char($c) );
629 ( $me:ident : discard_tag ) => ( $me.discard_tag() );
630 ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) );
631 ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.borrow_mut().push_char($c) );
632 ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() );
633 ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) );
634 ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.borrow_mut().push_char($c) );
635 ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_char($c) );
636 ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
637 ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_char($c) );
638 ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_slice($c) );
639 ( $me:ident : emit_comment ) => ( $me.emit_current_comment() );
640 ( $me:ident : clear_comment ) => ( $me.current_comment.borrow_mut().clear() );
641 ( $me:ident : create_doctype ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
642 ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
643 ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c) );
644 ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) );
645 ( $me:ident : force_quirks ) => ( $me.current_doctype.borrow_mut().force_quirks = true);
646 ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() );
647);
648
649#[cfg(feature = "trace_tokenizer")]
652macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
653 trace!(" {:?}", stringify!($($cmds)*));
654 shorthand!($me : $($cmds)*);
655}));
656
657#[cfg(not(feature = "trace_tokenizer"))]
658macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
659
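// Chains shorthand actions and state transitions inside the step functions.
// The `to`, `reconsume`, `consume_char_ref`, `emit_tag`, and `eof` forms return
// early from the enclosing function; for example
// `go!(self: clear_comment; to CommentStart)` clears the comment buffer and
// then switches to the CommentStart state.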
660macro_rules! go (
662 ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
666 ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
667 ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
668 ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });
669
670 ( $me:ident : to $s:ident ) => ({ $me.state.set(states::$s); return ProcessResult::Continue; });
673 ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue; });
674 ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });
675
676 ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume.set(true); go!($me: to $s); });
677 ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); });
678 ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });
679
680 ( $me:ident : consume_char_ref ) => ({ $me.start_consuming_character_reference(); return ProcessResult::Continue; });
681
682 ( $me:ident : emit_tag $s:ident ) => ({
684 $me.state.set(states::$s);
685 return $me.emit_current_tag();
686 });
687
688 ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });
689
690 ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );
692
693 ( $me:ident : ) => (());
695);
696
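// Each of these macros unwraps the corresponding tokenizer method, returning
// `ProcessResult::Suspend` from the enclosing function when more input is
// needed.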
697macro_rules! get_char ( ($me:expr, $input:expr) => (
700 unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
701));
702
703macro_rules! peek ( ($me:expr, $input:expr) => (
704 unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
705));
706
707macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
708 unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
709));
710
711macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
712 unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
713));
714
715macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
716 unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
717));
718
719impl<Sink: TokenSink> Tokenizer<Sink> {
720 #[allow(clippy::never_loop)]
724 fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
725 if self.char_ref_tokenizer.borrow().is_some() {
726 return self.step_char_ref_tokenizer(input);
727 }
728
729 trace!("processing in state {:?}", self.state);
730 match self.state.get() {
731 states::Data => loop {
733 let set = small_char_set!('\r' '\0' '&' '<' '\n');
734
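                // Fast path: when no preprocessing state (reconsume, pending LF,
                // exact errors) is in play and SIMD is available, scan the front
                // buffer in bulk for the next special character instead of going
                // through `pop_except_from`.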
735 #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
736 let set_result = if !(self.opts.exact_errors
737 || self.reconsume.get()
738 || self.ignore_lf.get())
739 && Self::is_supported_simd_feature_detected()
740 {
741 let front_buffer = input.peek_front_chunk_mut();
742 let Some(mut front_buffer) = front_buffer else {
743 return ProcessResult::Suspend;
744 };
745
746 let first_char = front_buffer
749 .chars()
750 .next()
751 .expect("Input buffers are never empty");
752
753 if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
754 drop(front_buffer);
755 self.pop_except_from(input, set)
756 } else {
757 let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) };
760
761 if front_buffer.is_empty() {
762 drop(front_buffer);
763 input.pop_front();
764 }
765
766 result
767 }
768 } else {
769 self.pop_except_from(input, set)
770 };
771
772 #[cfg(not(any(
773 target_arch = "x86",
774 target_arch = "x86_64",
775 target_arch = "aarch64"
776 )))]
777 let set_result = self.pop_except_from(input, set);
778
779 let Some(set_result) = set_result else {
780 return ProcessResult::Suspend;
781 };
782 match set_result {
783 FromSet('\0') => {
784 self.bad_char_error();
785 self.emit_char('\0');
786 },
787 FromSet('&') => go!(self: consume_char_ref),
788 FromSet('<') => go!(self: to TagOpen),
789 FromSet(c) => {
790 self.emit_char(c);
791 },
792 NotFromSet(b) => self.emit_chars(b),
793 }
794 },
795
796 states::RawData(Rcdata) => loop {
798 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
799 FromSet('\0') => {
800 self.bad_char_error();
801 self.emit_char('\u{fffd}');
802 },
803 FromSet('&') => go!(self: consume_char_ref),
804 FromSet('<') => go!(self: to RawLessThanSign Rcdata),
805 FromSet(c) => self.emit_char(c),
806 NotFromSet(b) => self.emit_chars(b),
807 }
808 },
809
810 states::RawData(Rawtext) => loop {
812 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
813 FromSet('\0') => {
814 self.bad_char_error();
815 self.emit_char('\u{fffd}');
816 },
817 FromSet('<') => go!(self: to RawLessThanSign Rawtext),
818 FromSet(c) => self.emit_char(c),
819 NotFromSet(b) => self.emit_chars(b),
820 }
821 },
822
823 states::RawData(ScriptData) => loop {
825 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
826 FromSet('\0') => {
827 self.bad_char_error();
828 self.emit_char('\u{fffd}');
829 },
830 FromSet('<') => go!(self: to RawLessThanSign ScriptData),
831 FromSet(c) => self.emit_char(c),
832 NotFromSet(b) => self.emit_chars(b),
833 }
834 },
835
836 states::RawData(ScriptDataEscaped(Escaped)) => loop {
838 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
839 FromSet('\0') => {
840 self.bad_char_error();
841 self.emit_char('\u{fffd}');
842 },
843 FromSet('-') => {
844 self.emit_char('-');
845 go!(self: to ScriptDataEscapedDash Escaped);
846 },
847 FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
848 FromSet(c) => self.emit_char(c),
849 NotFromSet(b) => self.emit_chars(b),
850 }
851 },
852
853 states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
855 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
856 FromSet('\0') => {
857 self.bad_char_error();
858 self.emit_char('\u{fffd}');
859 },
860 FromSet('-') => {
861 self.emit_char('-');
862 go!(self: to ScriptDataEscapedDash DoubleEscaped);
863 },
864 FromSet('<') => {
865 self.emit_char('<');
866 go!(self: to RawLessThanSign ScriptDataEscaped DoubleEscaped)
867 },
868 FromSet(c) => self.emit_char(c),
869 NotFromSet(b) => self.emit_chars(b),
870 }
871 },
872
873 states::Plaintext => loop {
875 match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
876 FromSet('\0') => {
877 self.bad_char_error();
878 self.emit_char('\u{fffd}');
879 },
880 FromSet(c) => self.emit_char(c),
881 NotFromSet(b) => self.emit_chars(b),
882 }
883 },
884
885 states::TagOpen => loop {
887 match get_char!(self, input) {
888 '!' => go!(self: to MarkupDeclarationOpen),
889 '/' => go!(self: to EndTagOpen),
890 '?' => {
891 self.bad_char_error();
892 go!(self: clear_comment; reconsume BogusComment)
893 },
894 c => match lower_ascii_letter(c) {
895 Some(cl) => go!(self: create_tag StartTag cl; to TagName),
896 None => {
897 self.bad_char_error();
898 self.emit_char('<');
899 go!(self: reconsume Data)
900 },
901 },
902 }
903 },
904
905 states::EndTagOpen => loop {
907 match get_char!(self, input) {
908 '>' => {
909 self.bad_char_error();
910 go!(self: to Data)
911 },
912 c => match lower_ascii_letter(c) {
913 Some(cl) => go!(self: create_tag EndTag cl; to TagName),
914 None => {
915 self.bad_char_error();
916 go!(self: clear_comment; reconsume BogusComment)
917 },
918 },
919 }
920 },
921
922 states::TagName => loop {
924 match get_char!(self, input) {
925 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
926 '/' => go!(self: to SelfClosingStartTag),
927 '>' => go!(self: emit_tag Data),
928 '\0' => {
929 self.bad_char_error();
930 go!(self: push_tag '\u{fffd}')
931 },
932 c => go!(self: push_tag (c.to_ascii_lowercase())),
933 }
934 },
935
936 states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
938 match get_char!(self, input) {
939 '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
940 c => match lower_ascii_letter(c) {
941 Some(cl) => {
942 go!(self: clear_temp; push_temp cl);
943 self.emit_char('<');
944 self.emit_char(c);
945 go!(self: to ScriptDataEscapeStart DoubleEscaped);
946 },
947 None => {
948 self.emit_char('<');
949 go!(self: reconsume RawData ScriptDataEscaped Escaped);
950 },
951 },
952 }
953 },
954
955 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
957 match get_char!(self, input) {
958 '/' => {
959 go!(self: clear_temp);
960 self.emit_char('/');
961 go!(self: to ScriptDataDoubleEscapeEnd);
962 },
963 _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
964 }
965 },
966
967 states::RawLessThanSign(kind) => loop {
970 match get_char!(self, input) {
971 '/' => go!(self: clear_temp; to RawEndTagOpen kind),
972 '!' if kind == ScriptData => {
973 self.emit_char('<');
974 self.emit_char('!');
975 go!(self: to ScriptDataEscapeStart Escaped);
976 },
977 _ => {
978 self.emit_char('<');
979 go!(self: reconsume RawData kind);
980 },
981 }
982 },
983
984 states::RawEndTagOpen(kind) => loop {
986 let c = get_char!(self, input);
987 match lower_ascii_letter(c) {
988 Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
989 None => {
990 self.emit_char('<');
991 self.emit_char('/');
992 go!(self: reconsume RawData kind);
993 },
994 }
995 },
996
997 states::RawEndTagName(kind) => loop {
999 let c = get_char!(self, input);
1000 if self.have_appropriate_end_tag() {
1001 match c {
1002 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to BeforeAttributeName),
1003 '/' => go!(self: clear_temp; to SelfClosingStartTag),
1004 '>' => go!(self: clear_temp; emit_tag Data),
1005 _ => (),
1006 }
1007 }
1008
1009 match lower_ascii_letter(c) {
1010 Some(cl) => go!(self: push_tag cl; push_temp c),
1011 None => {
1012 go!(self: discard_tag);
1013 self.emit_char('<');
1014 self.emit_char('/');
1015 self.emit_temp_buf();
1016 go!(self: reconsume RawData kind);
1017 },
1018 }
1019 },
1020
1021 states::ScriptDataEscapeStart(DoubleEscaped) => loop {
1023 let c = get_char!(self, input);
1024 match c {
1025 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1026 let esc = if &**self.temp_buf.borrow() == "script" {
1027 DoubleEscaped
1028 } else {
1029 Escaped
1030 };
1031 self.emit_char(c);
1032 go!(self: to RawData ScriptDataEscaped esc);
1033 },
1034 _ => match lower_ascii_letter(c) {
1035 Some(cl) => {
1036 go!(self: push_temp cl);
1037 self.emit_char(c);
1038 },
1039 None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
1040 },
1041 }
1042 },
1043
1044 states::ScriptDataEscapeStart(Escaped) => loop {
1046 match get_char!(self, input) {
1047 '-' => {
1048 self.emit_char('-');
1049 go!(self: to ScriptDataEscapeStartDash);
1050 },
1051 _ => go!(self: reconsume RawData ScriptData),
1052 }
1053 },
1054
1055 states::ScriptDataEscapeStartDash => loop {
1057 match get_char!(self, input) {
1058 '-' => {
1059 self.emit_char('-');
1060 go!(self: to ScriptDataEscapedDashDash Escaped);
1061 },
1062 _ => go!(self: reconsume RawData ScriptData),
1063 }
1064 },
1065
1066 states::ScriptDataEscapedDash(kind) => loop {
1068 match get_char!(self, input) {
1069 '-' => {
1070 self.emit_char('-');
1071 go!(self: to ScriptDataEscapedDashDash kind);
1072 },
1073 '<' => {
1074 if kind == DoubleEscaped {
1075 self.emit_char('<');
1076 }
1077 go!(self: to RawLessThanSign ScriptDataEscaped kind);
1078 },
1079 '\0' => {
1080 self.bad_char_error();
1081 self.emit_char('\u{fffd}');
1082 go!(self: to RawData ScriptDataEscaped kind)
1083 },
1084 c => {
1085 self.emit_char(c);
1086 go!(self: to RawData ScriptDataEscaped kind);
1087 },
1088 }
1089 },
1090
1091 states::ScriptDataEscapedDashDash(kind) => loop {
1093 match get_char!(self, input) {
1094 '-' => {
1095 self.emit_char('-');
1096 },
1097 '<' => {
1098 if kind == DoubleEscaped {
1099 self.emit_char('<');
1100 }
1101 go!(self: to RawLessThanSign ScriptDataEscaped kind);
1102 },
1103 '>' => {
1104 self.emit_char('>');
1105 go!(self: to RawData ScriptData);
1106 },
1107 '\0' => {
1108 self.bad_char_error();
1109 self.emit_char('\u{fffd}');
1110 go!(self: to RawData ScriptDataEscaped kind)
1111 },
1112 c => {
1113 self.emit_char(c);
1114 go!(self: to RawData ScriptDataEscaped kind);
1115 },
1116 }
1117 },
1118
1119 states::ScriptDataDoubleEscapeEnd => loop {
1121 let c = get_char!(self, input);
1122 match c {
1123 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1124 let esc = if &**self.temp_buf.borrow() == "script" {
1125 Escaped
1126 } else {
1127 DoubleEscaped
1128 };
1129 self.emit_char(c);
1130 go!(self: to RawData ScriptDataEscaped esc);
1131 },
1132 _ => match lower_ascii_letter(c) {
1133 Some(cl) => {
1134 go!(self: push_temp cl);
1135 self.emit_char(c);
1136 },
1137 None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
1138 },
1139 }
1140 },
1141
1142 states::BeforeAttributeName => loop {
1144 match get_char!(self, input) {
1145 '\t' | '\n' | '\x0C' | ' ' => (),
1146 '/' => go!(self: to SelfClosingStartTag),
1147 '>' => go!(self: emit_tag Data),
1148 '\0' => {
1149 self.bad_char_error();
1150 go!(self: create_attr '\u{fffd}'; to AttributeName)
1151 },
1152 c => match lower_ascii_letter(c) {
1153 Some(cl) => go!(self: create_attr cl; to AttributeName),
1154 None => {
1155 if matches!(c, '"' | '\'' | '<' | '=') {
1156 self.bad_char_error();
1157 }
1158
1159 go!(self: create_attr c; to AttributeName);
1160 },
1161 },
1162 }
1163 },
1164
1165 states::AttributeName => loop {
1167 match get_char!(self, input) {
1168 '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
1169 '/' => go!(self: to SelfClosingStartTag),
1170 '=' => go!(self: to BeforeAttributeValue),
1171 '>' => go!(self: emit_tag Data),
1172 '\0' => {
1173 self.bad_char_error();
1174 go!(self: push_name '\u{fffd}')
1175 },
1176 c => match lower_ascii_letter(c) {
1177 Some(cl) => go!(self: push_name cl),
1178 None => {
1179 if matches!(c, '"' | '\'' | '<') {
1180 self.bad_char_error();
1181 }
1182 go!(self: push_name c);
1183 },
1184 },
1185 }
1186 },
1187
1188 states::AfterAttributeName => loop {
1190 match get_char!(self, input) {
1191 '\t' | '\n' | '\x0C' | ' ' => (),
1192 '/' => go!(self: to SelfClosingStartTag),
1193 '=' => go!(self: to BeforeAttributeValue),
1194 '>' => go!(self: emit_tag Data),
1195 '\0' => {
1196 self.bad_char_error();
1197 go!(self: create_attr '\u{fffd}'; to AttributeName)
1198 },
1199 c => match lower_ascii_letter(c) {
1200 Some(cl) => go!(self: create_attr cl; to AttributeName),
1201 None => {
1202 if matches!(c, '"' | '\'' | '<') {
1203 self.bad_char_error();
1204 }
1205
1206 go!(self: create_attr c; to AttributeName);
1207 },
1208 },
1209 }
1210 },
1211
1212 states::BeforeAttributeValue => loop {
1216 match peek!(self, input) {
1217 '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1218 '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1219 '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1220 '>' => {
1221 go!(self: discard_char input);
1222 self.bad_char_error();
1223 go!(self: emit_tag Data)
1224 },
1225 _ => go!(self: to AttributeValue Unquoted),
1226 }
1227 },
1228
1229 states::AttributeValue(DoubleQuoted) => loop {
1231 match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1232 FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1233 FromSet('&') => go!(self: consume_char_ref),
1234 FromSet('\0') => {
1235 self.bad_char_error();
1236 go!(self: push_value '\u{fffd}')
1237 },
1238 FromSet(c) => go!(self: push_value c),
1239 NotFromSet(ref b) => go!(self: append_value b),
1240 }
1241 },
1242
1243 states::AttributeValue(SingleQuoted) => loop {
1245 match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1246 FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1247 FromSet('&') => go!(self: consume_char_ref),
1248 FromSet('\0') => {
1249 self.bad_char_error();
1250 go!(self: push_value '\u{fffd}')
1251 },
1252 FromSet(c) => go!(self: push_value c),
1253 NotFromSet(ref b) => go!(self: append_value b),
1254 }
1255 },
1256
1257 states::AttributeValue(Unquoted) => loop {
1259 match pop_except_from!(
1260 self,
1261 input,
1262 small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1263 ) {
1264 FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1265 go!(self: to BeforeAttributeName)
1266 },
1267 FromSet('&') => go!(self: consume_char_ref),
1268 FromSet('>') => go!(self: emit_tag Data),
1269 FromSet('\0') => {
1270 self.bad_char_error();
1271 go!(self: push_value '\u{fffd}')
1272 },
1273 FromSet(c) => {
1274 if matches!(c, '"' | '\'' | '<' | '=' | '`') {
1275 self.bad_char_error();
1276 }
1277 go!(self: push_value c);
1278 },
1279 NotFromSet(ref b) => go!(self: append_value b),
1280 }
1281 },
1282
1283 states::AfterAttributeValueQuoted => loop {
1285 match get_char!(self, input) {
1286 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1287 '/' => go!(self: to SelfClosingStartTag),
1288 '>' => go!(self: emit_tag Data),
1289 _ => {
1290 self.bad_char_error();
1291 go!(self: reconsume BeforeAttributeName)
1292 },
1293 }
1294 },
1295
1296 states::SelfClosingStartTag => loop {
1298 match get_char!(self, input) {
1299 '>' => {
1300 self.current_tag_self_closing.set(true);
1301 go!(self: emit_tag Data);
1302 },
1303 _ => {
1304 self.bad_char_error();
1305 go!(self: reconsume BeforeAttributeName)
1306 },
1307 }
1308 },
1309
1310 states::CommentStart => loop {
1312 match get_char!(self, input) {
1313 '-' => go!(self: to CommentStartDash),
1314 '\0' => {
1315 self.bad_char_error();
1316 go!(self: push_comment '\u{fffd}'; to Comment)
1317 },
1318 '>' => {
1319 self.bad_char_error();
1320 go!(self: emit_comment; to Data)
1321 },
1322 c => go!(self: push_comment c; to Comment),
1323 }
1324 },
1325
1326 states::CommentStartDash => loop {
1328 match get_char!(self, input) {
1329 '-' => go!(self: to CommentEnd),
1330 '\0' => {
1331 self.bad_char_error();
1332 go!(self: append_comment "-\u{fffd}"; to Comment)
1333 },
1334 '>' => {
1335 self.bad_char_error();
1336 go!(self: emit_comment; to Data)
1337 },
1338 c => go!(self: push_comment '-'; push_comment c; to Comment),
1339 }
1340 },
1341
1342 states::Comment => loop {
1344 match get_char!(self, input) {
1345 c @ '<' => go!(self: push_comment c; to CommentLessThanSign),
1346 '-' => go!(self: to CommentEndDash),
1347 '\0' => {
1348 self.bad_char_error();
1349 go!(self: push_comment '\u{fffd}')
1350 },
1351 c => go!(self: push_comment c),
1352 }
1353 },
1354
1355 states::CommentLessThanSign => loop {
1357 match get_char!(self, input) {
1358 c @ '!' => go!(self: push_comment c; to CommentLessThanSignBang),
1359 c @ '<' => go!(self: push_comment c),
1360 _ => go!(self: reconsume Comment),
1361 }
1362 },
1363
1364 states::CommentLessThanSignBang => loop {
1366 match get_char!(self, input) {
1367 '-' => go!(self: to CommentLessThanSignBangDash),
1368 _ => go!(self: reconsume Comment),
1369 }
1370 },
1371
1372 states::CommentLessThanSignBangDash => loop {
1374 match get_char!(self, input) {
1375 '-' => go!(self: to CommentLessThanSignBangDashDash),
1376 _ => go!(self: reconsume CommentEndDash),
1377 }
1378 },
1379
1380 states::CommentLessThanSignBangDashDash => loop {
1382 match get_char!(self, input) {
1383 '>' => go!(self: reconsume CommentEnd),
1384 _ => {
1385 self.bad_char_error();
1386 go!(self: reconsume CommentEnd)
1387 },
1388 }
1389 },
1390
1391 states::CommentEndDash => loop {
1393 match get_char!(self, input) {
1394 '-' => go!(self: to CommentEnd),
1395 '\0' => {
1396 self.bad_char_error();
1397 go!(self: append_comment "-\u{fffd}"; to Comment)
1398 },
1399 c => go!(self: push_comment '-'; push_comment c; to Comment),
1400 }
1401 },
1402
1403 states::CommentEnd => loop {
1405 match get_char!(self, input) {
1406 '>' => go!(self: emit_comment; to Data),
1407 '!' => go!(self: to CommentEndBang),
1408 '-' => go!(self: push_comment '-'),
1409 _ => go!(self: append_comment "--"; reconsume Comment),
1410 }
1411 },
1412
1413 states::CommentEndBang => loop {
1415 match get_char!(self, input) {
1416 '-' => go!(self: append_comment "--!"; to CommentEndDash),
1417 '>' => {
1418 self.bad_char_error();
1419 go!(self: emit_comment; to Data)
1420 },
1421 '\0' => {
1422 self.bad_char_error();
1423 go!(self: append_comment "--!\u{fffd}"; to Comment)
1424 },
1425 c => go!(self: append_comment "--!"; push_comment c; to Comment),
1426 }
1427 },
1428
1429 states::Doctype => loop {
1431 match get_char!(self, input) {
1432 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1433 '>' => go!(self: reconsume BeforeDoctypeName),
1434 _ => {
1435 self.bad_char_error();
1436 go!(self: reconsume BeforeDoctypeName)
1437 },
1438 }
1439 },
1440
1441 states::BeforeDoctypeName => loop {
1443 match get_char!(self, input) {
1444 '\t' | '\n' | '\x0C' | ' ' => (),
1445 '\0' => {
1446 self.bad_char_error();
1447 go!(self: create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1448 },
1449 '>' => {
1450 self.bad_char_error();
1451 go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1452 },
1453 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1454 to DoctypeName),
1455 }
1456 },
1457
1458 states::DoctypeName => loop {
1460 match get_char!(self, input) {
1461 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1462 '>' => go!(self: emit_doctype; to Data),
1463 '\0' => {
1464 self.bad_char_error();
1465 go!(self: push_doctype_name '\u{fffd}')
1466 },
1467 c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1468 }
1469 },
1470
1471 states::AfterDoctypeName => loop {
1473 if eat!(self, input, "public") {
1474 go!(self: to AfterDoctypeKeyword Public);
1475 } else if eat!(self, input, "system") {
1476 go!(self: to AfterDoctypeKeyword System);
1477 } else {
1478 match get_char!(self, input) {
1479 '\t' | '\n' | '\x0C' | ' ' => (),
1480 '>' => go!(self: emit_doctype; to Data),
1481 _ => {
1482 self.bad_char_error();
1483 go!(self: force_quirks; reconsume BogusDoctype)
1484 },
1485 }
1486 }
1487 },
1488
1489 states::AfterDoctypeKeyword(kind) => loop {
1491 match get_char!(self, input) {
1492 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1493 '"' => {
1494 self.bad_char_error();
1495 go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1496 },
1497 '\'' => {
1498 self.bad_char_error();
1499 go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1500 },
1501 '>' => {
1502 self.bad_char_error();
1503 go!(self: force_quirks; emit_doctype; to Data)
1504 },
1505 _ => {
1506 self.bad_char_error();
1507 go!(self: force_quirks; reconsume BogusDoctype)
1508 },
1509 }
1510 },
1511
1512 states::BeforeDoctypeIdentifier(kind) => loop {
1514 match get_char!(self, input) {
1515 '\t' | '\n' | '\x0C' | ' ' => (),
1516 '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1517 '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1518 '>' => {
1519 self.bad_char_error();
1520 go!(self: force_quirks; emit_doctype; to Data)
1521 },
1522 _ => {
1523 self.bad_char_error();
1524 go!(self: force_quirks; reconsume BogusDoctype)
1525 },
1526 }
1527 },
1528
1529 states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1531 match get_char!(self, input) {
1532 '"' => go!(self: to AfterDoctypeIdentifier kind),
1533 '\0' => {
1534 self.bad_char_error();
1535 go!(self: push_doctype_id kind '\u{fffd}')
1536 },
1537 '>' => {
1538 self.bad_char_error();
1539 go!(self: force_quirks; emit_doctype; to Data)
1540 },
1541 c => go!(self: push_doctype_id kind c),
1542 }
1543 },
1544
1545 states::DoctypeIdentifierSingleQuoted(kind) => loop {
1547 match get_char!(self, input) {
1548 '\'' => go!(self: to AfterDoctypeIdentifier kind),
1549 '\0' => {
1550 self.bad_char_error();
1551 go!(self: push_doctype_id kind '\u{fffd}')
1552 },
1553 '>' => {
1554 self.bad_char_error();
1555 go!(self: force_quirks; emit_doctype; to Data)
1556 },
1557 c => go!(self: push_doctype_id kind c),
1558 }
1559 },
1560
1561 states::AfterDoctypeIdentifier(Public) => loop {
1563 match get_char!(self, input) {
1564 '\t' | '\n' | '\x0C' | ' ' => {
1565 go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1566 },
1567 '>' => go!(self: emit_doctype; to Data),
1568 '"' => {
1569 self.bad_char_error();
1570 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1571 },
1572 '\'' => {
1573 self.bad_char_error();
1574 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1575 },
1576 _ => {
1577 self.bad_char_error();
1578 go!(self: force_quirks; reconsume BogusDoctype)
1579 },
1580 }
1581 },
1582
1583 states::AfterDoctypeIdentifier(System) => loop {
1585 match get_char!(self, input) {
1586 '\t' | '\n' | '\x0C' | ' ' => (),
1587 '>' => go!(self: emit_doctype; to Data),
1588 _ => {
1589 self.bad_char_error();
1590 go!(self: reconsume BogusDoctype)
1591 },
1592 }
1593 },
1594
1595 states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1597 match get_char!(self, input) {
1598 '\t' | '\n' | '\x0C' | ' ' => (),
1599 '>' => go!(self: emit_doctype; to Data),
1600 '"' => {
1601 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1602 },
1603 '\'' => {
1604 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1605 },
1606 _ => {
1607 self.bad_char_error();
1608 go!(self: force_quirks; reconsume BogusDoctype)
1609 },
1610 }
1611 },
1612
1613 states::BogusDoctype => loop {
1615 match get_char!(self, input) {
1616 '>' => go!(self: emit_doctype; to Data),
1617 '\0' => {
1618 self.bad_char_error();
1619 },
1620 _ => (),
1621 }
1622 },
1623
1624 states::BogusComment => loop {
1626 match get_char!(self, input) {
1627 '>' => go!(self: emit_comment; to Data),
1628 '\0' => {
1629 self.bad_char_error();
1630 go!(self: push_comment '\u{fffd}')
1631 },
1632 c => go!(self: push_comment c),
1633 }
1634 },
1635
1636 states::MarkupDeclarationOpen => loop {
1638 if eat_exact!(self, input, "--") {
1639 go!(self: clear_comment; to CommentStart);
1640 } else if eat!(self, input, "doctype") {
1641 go!(self: to Doctype);
1642 } else {
1643 if self
1644 .sink
1645 .adjusted_current_node_present_but_not_in_html_namespace()
1646 && eat_exact!(self, input, "[CDATA[")
1647 {
1648 go!(self: clear_temp; to CdataSection);
1649 }
1650 self.bad_char_error();
1651 go!(self: clear_comment; to BogusComment);
1652 }
1653 },
1654
1655 states::CdataSection => loop {
1657 match get_char!(self, input) {
1658 ']' => go!(self: to CdataSectionBracket),
1659 '\0' => {
1660 self.emit_temp_buf();
1661 self.emit_char('\0');
1662 },
1663 c => go!(self: push_temp c),
1664 }
1665 },
1666
1667 states::CdataSectionBracket => match get_char!(self, input) {
1669 ']' => go!(self: to CdataSectionEnd),
1670 _ => go!(self: push_temp ']'; reconsume CdataSection),
1671 },
1672
1673 states::CdataSectionEnd => loop {
1675 match get_char!(self, input) {
1676 ']' => go!(self: push_temp ']'),
1677 '>' => {
1678 self.emit_temp_buf();
1679 go!(self: to Data);
1680 },
1681 _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1682 }
1683 },
1684 }
1686 }
1687
1688 fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1689 let mut char_ref_tokenizer = self.char_ref_tokenizer.borrow_mut();
        match char_ref_tokenizer.as_mut().unwrap().step(self, input) {
            char_ref::Status::Done(char_ref) => {
                self.process_char_ref(char_ref);
                *char_ref_tokenizer = None;
                ProcessResult::Continue
            },
            char_ref::Status::Stuck => ProcessResult::Suspend,
            char_ref::Status::Progress => ProcessResult::Continue,
        }
1702 }
1703
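    /// Emit the characters produced by a finished character reference, either
    /// as character tokens or into the current attribute value; an empty
    /// reference falls back to a literal '&'.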
1704 fn process_char_ref(&self, char_ref: CharRef) {
1705 let CharRef {
1706 mut chars,
1707 mut num_chars,
1708 } = char_ref;
1709
1710 if num_chars == 0 {
1711 chars[0] = '&';
1712 num_chars = 1;
1713 }
1714
1715 for i in 0..num_chars {
1716 let c = chars[i as usize];
1717 match self.state.get() {
1718 states::Data | states::RawData(states::Rcdata) => self.emit_char(c),
1719
1720 states::AttributeValue(_) => go!(self: push_value c),
1721
1722 _ => panic!(
1723 "state {:?} should not be reachable in process_char_ref",
1724 self.state.get()
1725 ),
1726 }
1727 }
1728 }
1729
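    /// Signal end of input: flush any in-progress character reference, run the
    /// state machine to completion on an empty buffer, process EOF in the
    /// current state, and notify the sink.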
1730 pub fn end(&self) {
1732 let input = BufferQueue::default();
1735 match self.char_ref_tokenizer.take() {
1736 None => (),
1737 Some(mut tokenizer) => {
1738 self.process_char_ref(tokenizer.end_of_file(self, &input));
1739 },
1740 }
1741
1742 self.at_eof.set(true);
1745 assert!(matches!(self.run(&input), TokenizerResult::Done));
1746 assert!(input.is_empty());
1747
1748 loop {
1749 match self.eof_step() {
1750 ProcessResult::Continue => (),
1751 ProcessResult::Suspend => break,
1752 ProcessResult::Script(_) | ProcessResult::EncodingIndicator(_) => unreachable!(),
1753 }
1754 }
1755
1756 self.sink.end();
1757
1758 if self.opts.profile {
1759 self.dump_profile();
1760 }
1761 }
1762
1763 fn dump_profile(&self) {
1764 let mut results: Vec<(states::State, u64)> = self
1765 .state_profile
1766 .borrow()
1767 .iter()
1768 .map(|(s, t)| (*s, *t))
1769 .collect();
1770 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1771
1772 let total: u64 = results
1773 .iter()
1774 .map(|&(_, t)| t)
1775 .fold(0, ::std::ops::Add::add);
1776 println!("\nTokenizer profile, in nanoseconds");
1777 println!(
1778 "\n{:12} total in token sink",
1779 self.time_in_sink.get()
1780 );
1781 println!("\n{total:12} total in tokenizer");
1782
1783 for (k, v) in results.into_iter() {
1784 let pct = 100.0 * (v as f64) / (total as f64);
1785 println!("{v:12} {pct:4.1}% {k:?}");
1786 }
1787 }
1788
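    /// Handle EOF in the current state, emitting or discarding whatever partial
    /// construct (text, tag, comment, doctype) is still pending.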
1789 fn eof_step(&self) -> ProcessResult<Sink::Handle> {
1790 debug!("processing EOF in state {:?}", self.state.get());
1791 match self.state.get() {
1792 states::Data
1793 | states::RawData(Rcdata)
1794 | states::RawData(Rawtext)
1795 | states::RawData(ScriptData)
1796 | states::Plaintext => go!(self: eof),
1797
1798 states::TagName
1799 | states::RawData(ScriptDataEscaped(_))
1800 | states::BeforeAttributeName
1801 | states::AttributeName
1802 | states::AfterAttributeName
1803 | states::AttributeValue(_)
1804 | states::AfterAttributeValueQuoted
1805 | states::SelfClosingStartTag
1806 | states::ScriptDataEscapedDash(_)
1807 | states::ScriptDataEscapedDashDash(_) => {
1808 self.bad_eof_error();
1809 go!(self: to Data)
1810 },
1811
1812 states::BeforeAttributeValue => go!(self: reconsume AttributeValue Unquoted),
1813
1814 states::TagOpen => {
1815 self.bad_eof_error();
1816 self.emit_char('<');
1817 go!(self: to Data);
1818 },
1819
1820 states::EndTagOpen => {
1821 self.bad_eof_error();
1822 self.emit_char('<');
1823 self.emit_char('/');
1824 go!(self: to Data);
1825 },
1826
1827 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
1828 go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1829 },
1830
1831 states::RawLessThanSign(kind) => {
1832 self.emit_char('<');
1833 go!(self: to RawData kind);
1834 },
1835
1836 states::RawEndTagOpen(kind) => {
1837 self.emit_char('<');
1838 self.emit_char('/');
1839 go!(self: to RawData kind);
1840 },
1841
1842 states::RawEndTagName(kind) => {
1843 self.emit_char('<');
1844 self.emit_char('/');
1845 self.emit_temp_buf();
1846 go!(self: to RawData kind)
1847 },
1848
1849 states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
1850
1851 states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),
1852
1853 states::ScriptDataDoubleEscapeEnd => {
1854 go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1855 },
1856
1857 states::CommentStart
1858 | states::CommentStartDash
1859 | states::Comment
1860 | states::CommentEndDash
1861 | states::CommentEnd
1862 | states::CommentEndBang => {
1863 self.bad_eof_error();
1864 go!(self: emit_comment; to Data)
1865 },
1866
1867 states::CommentLessThanSign | states::CommentLessThanSignBang => {
1868 go!(self: reconsume Comment)
1869 },
1870
1871 states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash),
1872
1873 states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd),
1874
1875 states::Doctype | states::BeforeDoctypeName => {
1876 self.bad_eof_error();
1877 go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1878 },
1879
1880 states::DoctypeName
1881 | states::AfterDoctypeName
1882 | states::AfterDoctypeKeyword(_)
1883 | states::BeforeDoctypeIdentifier(_)
1884 | states::DoctypeIdentifierDoubleQuoted(_)
1885 | states::DoctypeIdentifierSingleQuoted(_)
1886 | states::AfterDoctypeIdentifier(_)
1887 | states::BetweenDoctypePublicAndSystemIdentifiers => {
1888 self.bad_eof_error();
1889 go!(self: force_quirks; emit_doctype; to Data)
1890 },
1891
1892 states::BogusDoctype => go!(self: emit_doctype; to Data),
1893
1894 states::BogusComment => go!(self: emit_comment; to Data),
1895
1896 states::MarkupDeclarationOpen => {
1897 self.bad_char_error();
1898 go!(self: to BogusComment)
1899 },
1900
1901 states::CdataSection => {
1902 self.emit_temp_buf();
1903 self.bad_eof_error();
1904 go!(self: to Data)
1905 },
1906
1907 states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),
1908
1909 states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
1910 }
1911 }
1912
1913 fn is_supported_simd_feature_detected() -> bool {
1915 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1916 {
1917 is_x86_feature_detected!("sse2")
1918 }
1919
1920 #[cfg(target_arch = "aarch64")]
1921 {
1922 std::arch::is_aarch64_feature_detected!("neon")
1923 }
1924
1925 #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
1926 false
1927 }
1928
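    /// Bulk Data-state scan: use SIMD to find the next character that needs
    /// special handling ('<', '&', '\r' or '\0'), counting newlines along the
    /// way, then return either that single character (`FromSet`) or the whole
    /// preceding run of plain text (`NotFromSet`).
    ///
    /// Safety: the caller must ensure the required SIMD feature is available
    /// and that `input` is not empty.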
1929 #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
1930 unsafe fn data_state_simd_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
1941 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1942 let (mut i, mut n_newlines) = self.data_state_sse2_fast_path(input);
1943
1944 #[cfg(target_arch = "aarch64")]
1945 let (mut i, mut n_newlines) = self.data_state_neon_fast_path(input);
1946
1947 while let Some(c) = input.as_bytes().get(i) {
1949 if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
1950 break;
1951 }
1952 if *c == b'\n' {
1953 n_newlines += 1;
1954 }
1955
1956 i += 1;
1957 }
1958
1959 let set_result = if i == 0 {
1960 let first_char = input.pop_front_char().unwrap();
1961 debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));
1962
1963 let preprocessed_char = self
1967 .get_preprocessed_char(first_char, &BufferQueue::default())
1968 .unwrap();
1969 SetResult::FromSet(preprocessed_char)
1970 } else {
1971 debug_assert!(
1972 input.len() >= i,
1973 "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1974 i,
1975 input.len()
1976 );
1977 let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1978 input.unsafe_pop_front(i as u32);
1979 SetResult::NotFromSet(consumed_chunk)
1980 };
1981
1982 self.current_line.set(self.current_line.get() + n_newlines);
1983
1984 Some(set_result)
1985 }
1986
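    /// SSE2 helper: compare 16 bytes per iteration against '<', '&', '\r', '\0'
    /// and '\n', returning how many bytes were scanned before hitting a special
    /// character together with the number of newlines seen.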
1987 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1988 #[target_feature(enable = "sse2")]
1989 unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
1997 #[cfg(target_arch = "x86")]
1998 use std::arch::x86::{
1999 __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
2000 _mm_set1_epi8,
2001 };
2002 #[cfg(target_arch = "x86_64")]
2003 use std::arch::x86_64::{
2004 __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
2005 _mm_set1_epi8,
2006 };
2007
2008 debug_assert!(!input.is_empty());
2009
2010 let quote_mask = _mm_set1_epi8('<' as i8);
2011 let escape_mask = _mm_set1_epi8('&' as i8);
2012 let carriage_return_mask = _mm_set1_epi8('\r' as i8);
2013 let zero_mask = _mm_set1_epi8('\0' as i8);
2014 let newline_mask = _mm_set1_epi8('\n' as i8);
2015
2016 let raw_bytes: &[u8] = input.as_bytes();
2017 let start = raw_bytes.as_ptr();
2018
2019 const STRIDE: usize = 16;
2020 let mut i = 0;
2021 let mut n_newlines = 0;
2022 while i + STRIDE <= raw_bytes.len() {
2023 let data = _mm_loadu_si128(start.add(i) as *const __m128i);
2025
2026 let quotes = _mm_cmpeq_epi8(data, quote_mask);
2028 let escapes = _mm_cmpeq_epi8(data, escape_mask);
2029 let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
2030 let zeros = _mm_cmpeq_epi8(data, zero_mask);
2031 let newlines = _mm_cmpeq_epi8(data, newline_mask);
2032
2033 let test_result = _mm_or_si128(
2036 _mm_or_si128(quotes, zeros),
2037 _mm_or_si128(escapes, carriage_returns),
2038 );
2039 let bitmask = _mm_movemask_epi8(test_result);
2040 let newline_mask = _mm_movemask_epi8(newlines);
2041
            if bitmask != 0 {
2043 let position = if cfg!(target_endian = "little") {
2045 bitmask.trailing_zeros() as usize
2046 } else {
2047 bitmask.leading_zeros() as usize
2048 };
2049
2050 n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
2051 i += position;
2052 break;
2053 } else {
2054 n_newlines += newline_mask.count_ones() as u64;
2055 }
2056
2057 i += STRIDE;
2058 }
2059
2060 (i, n_newlines)
2061 }
2062
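    /// NEON equivalent of the SSE2 helper above.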
2063 #[cfg(target_arch = "aarch64")]
2064 #[target_feature(enable = "neon")]
2065 unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
2073 use std::arch::aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8};
2074
2075 debug_assert!(!input.is_empty());
2076
2077 let quote_mask = vdupq_n_u8(b'<');
2078 let escape_mask = vdupq_n_u8(b'&');
2079 let carriage_return_mask = vdupq_n_u8(b'\r');
2080 let zero_mask = vdupq_n_u8(b'\0');
2081 let newline_mask = vdupq_n_u8(b'\n');
2082
2083 let raw_bytes: &[u8] = input.as_bytes();
2084 let start = raw_bytes.as_ptr();
2085
2086 const STRIDE: usize = 16;
2087 let mut i = 0;
2088 let mut n_newlines = 0;
2089 while i + STRIDE <= raw_bytes.len() {
2090 let data = vld1q_u8(start.add(i));
2092
2093 let quotes = vceqq_u8(data, quote_mask);
2095 let escapes = vceqq_u8(data, escape_mask);
2096 let carriage_returns = vceqq_u8(data, carriage_return_mask);
2097 let zeros = vceqq_u8(data, zero_mask);
2098 let newlines = vceqq_u8(data, newline_mask);
2099
2100 let test_result =
2103 vorrq_u8(vorrq_u8(quotes, zeros), vorrq_u8(escapes, carriage_returns));
2104 let bitmask = vmaxvq_u8(test_result);
2105 let newline_mask = vmaxvq_u8(newlines);
2106 if bitmask != 0 {
2107 let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2109 let position = chunk_bytes
2110 .iter()
2111 .position(|&b| matches!(b, b'<' | b'&' | b'\r' | b'\0'))
2112 .unwrap();
2113
2114 n_newlines += chunk_bytes[..position]
2115 .iter()
2116 .filter(|&&b| b == b'\n')
2117 .count() as u64;
2118
2119 i += position;
2120 break;
2121 } else if newline_mask != 0 {
2122 let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2123 n_newlines += chunk_bytes.iter().filter(|&&b| b == b'\n').count() as u64;
2124 }
2125
2126 i += STRIDE;
2127 }
2128
2129 (i, n_newlines)
2130 }
2131}
2132
2133#[cfg(test)]
2134#[allow(non_snake_case)]
2135mod test {
    use super::option_push;
    use crate::tendril::{SliceExt, StrTendril};
2138
2139 use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
2140
2141 use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
2142 use super::interface::{EndTag, StartTag, Tag, TagKind};
2143 use super::interface::{TagToken, Token};
2144
2145 use markup5ever::buffer_queue::BufferQueue;
2146 use std::cell::RefCell;
2147
2148 use crate::LocalName;
2149
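    // A `TokenSink` that records tokens together with the line numbers they
    // were reported on, so the tests below can check line tracking.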
2150 struct LinesMatch {
2154 tokens: RefCell<Vec<Token>>,
2155 current_str: RefCell<StrTendril>,
2156 lines: RefCell<Vec<(Token, u64)>>,
2157 }
2158
2159 impl LinesMatch {
2160 fn new() -> LinesMatch {
2161 LinesMatch {
2162 tokens: RefCell::new(vec![]),
2163 current_str: RefCell::new(StrTendril::new()),
2164 lines: RefCell::new(vec![]),
2165 }
2166 }
2167
2168 fn push(&self, token: Token, line_number: u64) {
2169 self.finish_str();
2170 self.lines.borrow_mut().push((token, line_number));
2171 }
2172
2173 fn finish_str(&self) {
2174 if !self.current_str.borrow().is_empty() {
2175 let s = self.current_str.take();
2176 self.tokens.borrow_mut().push(CharacterTokens(s));
2177 }
2178 }
2179 }
2180
2181 impl TokenSink for LinesMatch {
2182 type Handle = ();
2183
2184 fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
2185 match token {
2186 CharacterTokens(b) => {
2187 self.current_str.borrow_mut().push_slice(&b);
2188 },
2189
2190 NullCharacterToken => {
2191 self.current_str.borrow_mut().push_char('\0');
2192 },
2193
2194 ParseError(_) => {
2195 panic!("unexpected parse error");
2196 },
2197
2198 TagToken(mut t) => {
2199 match t.kind {
2203 EndTag => {
2204 t.self_closing = false;
2205 t.attrs = vec![];
2206 },
2207 _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
2208 }
2209 self.push(TagToken(t), line_number);
2210 },
2211
2212 EOFToken => (),
2213
2214 _ => self.push(token, line_number),
2215 }
2216 TokenSinkResult::Continue
2217 }
2218 }
2219
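    // Run the tokenizer over the given input chunks and collect (token, line)
    // pairs from the sink.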
2220 fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
2223 let sink = LinesMatch::new();
2224 let tok = Tokenizer::new(sink, opts);
2225 let buffer = BufferQueue::default();
2226 for chunk in input.into_iter() {
2227 buffer.push_back(chunk);
2228 let _ = tok.feed(&buffer);
2229 }
2230 tok.end();
2231 tok.sink.lines.take()
2232 }
2233
2234 fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
2236 let name = LocalName::from(&*token);
2237
2238 TagToken(Tag {
2239 kind: tagkind,
2240 name,
2241 self_closing: false,
2242 attrs: vec![],
2243 })
2244 }
2245
2246 #[test]
2247 fn push_to_None_gives_singleton() {
2248 let mut s: Option<StrTendril> = None;
2249 option_push(&mut s, 'x');
2250 assert_eq!(s, Some("x".to_tendril()));
2251 }
2252
2253 #[test]
2254 fn push_to_empty_appends() {
2255 let mut s: Option<StrTendril> = Some(StrTendril::new());
2256 option_push(&mut s, 'x');
2257 assert_eq!(s, Some("x".to_tendril()));
2258 }
2259
2260 #[test]
2261 fn push_to_nonempty_appends() {
2262 let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
2263 option_push(&mut s, 'x');
2264 assert_eq!(s, Some("yx".to_tendril()));
2265 }
2266
2267 #[test]
2268 fn check_lines() {
2269 let opts = TokenizerOpts {
2270 exact_errors: false,
2271 discard_bom: true,
2272 profile: false,
2273 initial_state: None,
2274 last_start_tag_name: None,
2275 };
2276 let vector = vec![
2277 StrTendril::from("<a>\n"),
2278 StrTendril::from("<b>\n"),
2279 StrTendril::from("</b>\n"),
2280 StrTendril::from("</a>\n"),
2281 ];
2282 let expected = vec![
2283 (create_tag(StrTendril::from("a"), StartTag), 1),
2284 (create_tag(StrTendril::from("b"), StartTag), 2),
2285 (create_tag(StrTendril::from("b"), EndTag), 3),
2286 (create_tag(StrTendril::from("a"), EndTag), 4),
2287 ];
2288 let results = tokenize(vector, opts);
2289 assert_eq!(results, expected);
2290 }
2291
2292 #[test]
2293 fn check_lines_with_new_line() {
2294 let opts = TokenizerOpts {
2295 exact_errors: false,
2296 discard_bom: true,
2297 profile: false,
2298 initial_state: None,
2299 last_start_tag_name: None,
2300 };
2301 let vector = vec![
2302 StrTendril::from("<a>\r\n"),
2303 StrTendril::from("<b>\r\n"),
2304 StrTendril::from("</b>\r\n"),
2305 StrTendril::from("</a>\r\n"),
2306 ];
2307 let expected = vec![
2308 (create_tag(StrTendril::from("a"), StartTag), 1),
2309 (create_tag(StrTendril::from("b"), StartTag), 2),
2310 (create_tag(StrTendril::from("b"), EndTag), 3),
2311 (create_tag(StrTendril::from("a"), EndTag), 4),
2312 ];
2313 let results = tokenize(vector, opts);
2314 assert_eq!(results, expected);
2315 }
2316}