1pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15pub use self::interface::{TokenSink, TokenSinkResult};
16
17use self::states::{DoctypeIdKind, Public, System};
18use self::states::{DoubleEscaped, Escaped};
19use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22use self::char_ref::{CharRef, CharRefTokenizer};
23
24use crate::util::str::lower_ascii_letter;
25
26use log::{debug, trace};
27use markup5ever::{ns, small_char_set, TokenizerResult};
28use std::borrow::Cow::{self, Borrowed};
29use std::cell::{Cell, RefCell, RefMut};
30use std::collections::BTreeMap;
31use std::mem;
32
33pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
34use crate::macros::{time, unwrap_or_return};
35use crate::tendril::StrTendril;
36use crate::{Attribute, LocalName, QualName, SmallCharSet};
37
38mod char_ref;
39mod interface;
40pub mod states;
41
/// The result of a single step of the tokenizer state machine, telling
/// `run` how to proceed.
pub enum ProcessResult<Handle> {
    /// Keep stepping; more progress can be made on the current input.
    Continue,
    /// Stop tokenizing for now (more input is needed, or EOF was emitted).
    Suspend,
    /// The sink handed back a node handle (via `TokenSinkResult::Script`)
    /// that the driver must process before tokenization resumes.
    Script(Handle),
    /// The sink signalled a possible change of character encoding; the
    /// tendril carries the indicated encoding name.
    EncodingIndicator(StrTendril),
}
61
62fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
63 match *opt_str {
64 Some(ref mut s) => s.push_char(c),
65 None => *opt_str = Some(StrTendril::from_char(c)),
66 }
67}
68
/// Options controlling the behaviour of the tokenizer.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors exactly, at some performance penalty?
    /// When `false`, error messages are approximate and some bad-character
    /// checks in `get_preprocessed_char` are skipped.
    pub exact_errors: bool,

    /// Discard a U+FEFF BYTE ORDER MARK at the beginning of the stream?
    pub discard_bom: bool,

    /// Keep a record of how much time was spent in each state?
    pub profile: bool,

    /// State to start in, overriding the default; `None` means start in
    /// the `Data` state.
    pub initial_state: Option<states::State>,

    /// The name of the last start tag emitted, if any. Used to recognise
    /// an "appropriate end tag" when tokenizing raw text, e.g. for
    /// fragment parsing.
    pub last_start_tag_name: Option<String>,
}
95
96impl Default for TokenizerOpts {
97 fn default() -> TokenizerOpts {
98 TokenizerOpts {
99 exact_errors: false,
100 discard_bom: true,
101 profile: false,
102 initial_state: None,
103 last_start_tag_name: None,
104 }
105 }
106}
107
/// The HTML tokenizer.
///
/// All mutable state is held in `Cell`/`RefCell` so the tokenizer can be
/// driven through a shared `&self` reference.
pub struct Tokenizer<Sink> {
    /// Options controlling the behaviour of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state, as described in the HTML spec.
    state: Cell<states::State>,

    /// Are we at the end of the input?
    at_eof: Cell<bool>,

    /// Tokenizer for character references, if we are currently in the
    /// middle of one.
    char_ref_tokenizer: RefCell<Option<CharRefTokenizer>>,

    /// The most recently consumed input character; may be reconsumed.
    current_char: Cell<char>,

    /// Should we reconsume the current input character instead of
    /// reading a new one?
    reconsume: Cell<bool>,

    /// Did we just consume a `\r`, translating it to `\n`? If so, the
    /// next character must be ignored when it is `\n`.
    ignore_lf: Cell<bool>,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one at the start of
    /// the stream?
    discard_bom: Cell<bool>,

    /// Kind (start/end) of the tag currently being built.
    current_tag_kind: Cell<TagKind>,

    /// Name of the tag currently being built.
    current_tag_name: RefCell<StrTendril>,

    /// Is the current tag self-closing?
    current_tag_self_closing: Cell<bool>,

    /// Was a duplicate attribute discarded from the current tag?
    current_tag_had_duplicate_attributes: Cell<bool>,

    /// Attributes accumulated for the current tag.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Name of the attribute currently being built.
    current_attr_name: RefCell<StrTendril>,

    /// Value of the attribute currently being built.
    current_attr_value: RefCell<StrTendril>,

    /// Text of the comment currently being built.
    current_comment: RefCell<StrTendril>,

    /// The doctype token currently being built.
    current_doctype: RefCell<Doctype>,

    /// Name of the last start tag emitted, for the "appropriate end
    /// tag" check in raw-text states.
    last_start_tag_name: RefCell<Option<LocalName>>,

    /// The "temporary buffer" described in the spec.
    temp_buf: RefCell<StrTendril>,

    /// Per-state record of time spent (as measured by the `time!`
    /// macro), populated only when profiling is enabled.
    state_profile: RefCell<BTreeMap<states::State, u64>>,

    /// Total time spent inside the token sink (same units as `time!`).
    time_in_sink: Cell<u64>,

    /// Current input line, tracked for error reporting.
    current_line: Cell<u64>,
}
183
impl<Sink: TokenSink> Tokenizer<Sink> {
    /// Create a new tokenizer which feeds tokens to a particular `Sink`.
    ///
    /// `opts.last_start_tag_name` is moved out of the options and
    /// interned as a `LocalName`; `opts.initial_state`, when set,
    /// overrides the default `Data` starting state.
    pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
        let start_tag_name = opts
            .last_start_tag_name
            .take()
            .map(|s| LocalName::from(&*s));
        let state = opts.initial_state.unwrap_or(states::Data);
        let discard_bom = opts.discard_bom;
        Tokenizer {
            opts,
            sink,
            state: Cell::new(state),
            char_ref_tokenizer: RefCell::new(None),
            at_eof: Cell::new(false),
            current_char: Cell::new('\0'),
            reconsume: Cell::new(false),
            ignore_lf: Cell::new(false),
            discard_bom: Cell::new(discard_bom),
            current_tag_kind: Cell::new(StartTag),
            current_tag_name: RefCell::new(StrTendril::new()),
            current_tag_self_closing: Cell::new(false),
            current_tag_had_duplicate_attributes: Cell::new(false),
            current_tag_attrs: RefCell::new(vec![]),
            current_attr_name: RefCell::new(StrTendril::new()),
            current_attr_value: RefCell::new(StrTendril::new()),
            current_comment: RefCell::new(StrTendril::new()),
            current_doctype: RefCell::new(Doctype::default()),
            last_start_tag_name: RefCell::new(start_tag_name),
            temp_buf: RefCell::new(StrTendril::new()),
            state_profile: RefCell::new(BTreeMap::new()),
            time_in_sink: Cell::new(0),
            current_line: Cell::new(1),
        }
    }

    /// Feed an input buffer to the tokenizer, running it until the input
    /// is exhausted (`Done`) or the sink asks the driver to intervene.
    ///
    /// On the fast path this strips a single leading U+FEFF when
    /// `discard_bom` is set.
    /// NOTE(review): `discard_bom` is never reset to `false` here, so a
    /// U+FEFF at the front of a *later* `feed` call would also be
    /// discarded — confirm this is intended.
    pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
        if input.is_empty() {
            return TokenizerResult::Done;
        }

        if self.discard_bom.get() {
            if let Some(c) = input.peek() {
                if c == '\u{feff}' {
                    input.next();
                }
            } else {
                return TokenizerResult::Done;
            }
        };

        self.run(input)
    }

    /// Switch the tokenizer into the PLAINTEXT state (everything that
    /// follows is emitted as character data).
    pub fn set_plaintext_state(&self) {
        self.state.set(states::Plaintext);
    }

    /// Hand one token to the sink, accounting the time spent there when
    /// profiling is enabled.
    fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
        if self.opts.profile {
            let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
            self.time_in_sink.set(self.time_in_sink.get() + dt);
            ret
        } else {
            self.sink.process_token(token, self.current_line.get())
        }
    }

    /// Hand one token to the sink, asserting that the sink does not
    /// request any special handling for it.
    fn process_token_and_continue(&self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }

    /// Apply the spec's input preprocessing to one character:
    /// swallow an `\n` that follows a `\r`, normalize `\r` to `\n`,
    /// bump the line counter, and (when `exact_errors` is set) report
    /// control characters and non-characters as parse errors.
    ///
    /// Returns `None` when the swallowed `\n` was the last available
    /// character.
    fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
        if self.ignore_lf.get() {
            self.ignore_lf.set(false);
            if c == '\n' {
                c = input.next()?;
            }
        }

        if c == '\r' {
            self.ignore_lf.set(true);
            c = '\n';
        }

        if c == '\n' {
            self.current_line.set(self.current_line.get() + 1);
        }

        if self.opts.exact_errors
            && match c as u32 {
                // C0/C1 controls (minus whitespace) and the U+FDD0..U+FDEF
                // non-character block...
                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
                // ...plus the U+xxFFFE / U+xxFFFF non-characters in every plane.
                n if (n & 0xFFFE) == 0xFFFE => true,
                _ => false,
            }
        {
            let msg = format!("Bad character {c}");
            self.emit_error(Cow::Owned(msg));
        }

        trace!("got character {c}");
        self.current_char.set(c);
        Some(c)
    }

    /// Get the next input character, honoring a pending reconsume (in
    /// which case the stored `current_char` is returned without further
    /// preprocessing).
    fn get_char(&self, input: &BufferQueue) -> Option<char> {
        if self.reconsume.get() {
            self.reconsume.set(false);
            Some(self.current_char.get())
        } else {
            input
                .next()
                .and_then(|c| self.get_preprocessed_char(c, input))
        }
    }

    /// Pop either a single character from `set` (`FromSet`) or a whole
    /// run of characters not in `set` (`NotFromSet`).
    ///
    /// Falls back to the one-character `get_char` path whenever per-char
    /// preprocessing might be needed (exact errors, a pending reconsume,
    /// or a pending `\r\n` swallow).
    fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
        if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
            return self.get_char(input).map(FromSet);
        }

        let d = input.pop_except_from(set);
        trace!("got characters {d:?}");
        match d {
            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),

            // NB: no need to preprocess a NotFromSet run, because `\r`,
            // `\n` and `\0` are always in the set.
            _ => d,
        }
    }

    /// Try to match `pat` against the input, comparing bytes with `eq`
    /// (exact or ASCII-case-insensitive).
    ///
    /// Returns `Some(true)` on a match, `Some(false)` on a definite
    /// mismatch (or EOF), and `None` when more input is needed — in that
    /// case the characters examined so far are stashed in `temp_buf` and
    /// pushed back onto the input at the start of the next call.
    fn eat(&self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> {
        if self.ignore_lf.get() {
            self.ignore_lf.set(false);
            if self.peek(input) == Some('\n') {
                self.discard_char(input);
            }
        }

        input.push_front(mem::take(&mut self.temp_buf.borrow_mut()));
        match input.eat(pat, eq) {
            None if self.at_eof.get() => Some(false),
            None => {
                // Not enough input to decide; save it for the next call.
                while let Some(data) = input.next() {
                    self.temp_buf.borrow_mut().push_char(data);
                }
                None
            },
            Some(matched) => Some(matched),
        }
    }

    /// Run the state machine for as long as we can, stepping until the
    /// input is exhausted or the sink demands driver action.
    fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
        if self.opts.profile {
            loop {
                let state = self.state.get();
                let old_sink = self.time_in_sink.get();
                let (run, mut dt) = time!(self.step(input));
                // Don't charge time spent inside the sink to this state.
                dt -= (self.time_in_sink.get() - old_sink);
                let new = match self.state_profile.borrow_mut().get_mut(&state) {
                    Some(x) => {
                        *x += dt;
                        false
                    },
                    None => true,
                };
                if new {
                    // Insert outside the match to avoid a double
                    // mutable borrow of `state_profile`.
                    self.state_profile.borrow_mut().insert(state, dt);
                }
                match run {
                    ProcessResult::Continue => (),
                    ProcessResult::Suspend => break,
                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
                    ProcessResult::EncodingIndicator(encoding) => {
                        return TokenizerResult::EncodingIndicator(encoding)
                    },
                }
            }
        } else {
            loop {
                match self.step(input) {
                    ProcessResult::Continue => (),
                    ProcessResult::Suspend => break,
                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
                    ProcessResult::EncodingIndicator(encoding) => {
                        return TokenizerResult::EncodingIndicator(encoding)
                    },
                }
            }
        }
        TokenizerResult::Done
    }

    /// Report a parse error for the current character; the message is
    /// generic unless `exact_errors` is off, in which case it names the
    /// character and state (cheaper than the spec-exact wording).
    #[inline]
    fn bad_char_error(&self) {
        #[cfg(feature = "trace_tokenizer")]
        trace!(" error");

        let msg = if self.opts.exact_errors {
            Cow::from("Bad character")
        } else {
            let c = self.current_char.get();
            let state = self.state.get();
            Cow::from(format!("Saw {c} in state {state:?}"))
        };
        self.emit_error(msg);
    }

    /// Report a parse error for an unexpected end of input.
    #[inline]
    fn bad_eof_error(&self) {
        #[cfg(feature = "trace_tokenizer")]
        trace!(" error_eof");

        let msg = if self.opts.exact_errors {
            Cow::from("Unexpected EOF")
        } else {
            let state = self.state.get();
            Cow::from(format!("Saw EOF in state {state:?}"))
        };
        self.emit_error(msg);
    }

    /// Emit a single character token (`NullCharacterToken` for `\0`).
    fn emit_char(&self, c: char) {
        #[cfg(feature = "trace_tokenizer")]
        trace!(" emit");

        self.process_token_and_continue(match c {
            '\0' => NullCharacterToken,
            _ => CharacterTokens(StrTendril::from_char(c)),
        });
    }

    /// Emit a run of character tokens in one go.
    fn emit_chars(&self, b: StrTendril) {
        self.process_token_and_continue(CharacterTokens(b));
    }

    /// Finish and emit the tag currently being built, then translate the
    /// sink's response into the next tokenizer state.
    fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
        self.finish_attribute();

        let name = LocalName::from(&**self.current_tag_name.borrow());
        self.current_tag_name.borrow_mut().clear();

        match self.current_tag_kind.get() {
            StartTag => {
                // Remember the name for the "appropriate end tag" check.
                *self.last_start_tag_name.borrow_mut() = Some(name.clone());
            },
            EndTag => {
                // Per the spec, end tags may not carry attributes or be
                // self-closing; both are parse errors.
                if !self.current_tag_attrs.borrow().is_empty() {
                    self.emit_error(Borrowed("Attributes on an end tag"));
                }
                if self.current_tag_self_closing.get() {
                    self.emit_error(Borrowed("Self-closing end tag"));
                }
            },
        }

        let token = TagToken(Tag {
            kind: self.current_tag_kind.get(),
            name,
            self_closing: self.current_tag_self_closing.get(),
            attrs: std::mem::take(&mut self.current_tag_attrs.borrow_mut()),
            had_duplicate_attributes: self.current_tag_had_duplicate_attributes.get(),
        });

        match self.process_token(token) {
            TokenSinkResult::Continue => ProcessResult::Continue,
            TokenSinkResult::Plaintext => {
                self.state.set(states::Plaintext);
                ProcessResult::Continue
            },
            TokenSinkResult::Script(node) => {
                self.state.set(states::Data);
                ProcessResult::Script(node)
            },
            TokenSinkResult::RawData(kind) => {
                self.state.set(states::RawData(kind));
                ProcessResult::Continue
            },
            TokenSinkResult::EncodingIndicator(encoding) => {
                ProcessResult::EncodingIndicator(encoding)
            },
        }
    }

    /// Emit the contents of the temporary buffer as character tokens,
    /// leaving the buffer empty.
    fn emit_temp_buf(&self) {
        #[cfg(feature = "trace_tokenizer")]
        trace!(" emit_temp");

        let buf = mem::take(&mut *self.temp_buf.borrow_mut());
        self.emit_chars(buf);
    }

    /// Empty the temporary buffer without emitting anything.
    fn clear_temp_buf(&self) {
        self.temp_buf.borrow_mut().clear();
    }

    /// Emit the comment currently being built and reset it.
    fn emit_current_comment(&self) {
        let comment = mem::take(&mut *self.current_comment.borrow_mut());
        self.process_token_and_continue(CommentToken(comment));
    }

    /// Reset all state for the tag currently being built.
    fn discard_tag(&self) {
        self.current_tag_name.borrow_mut().clear();
        self.current_tag_self_closing.set(false);
        self.current_tag_had_duplicate_attributes.set(false);
        *self.current_tag_attrs.borrow_mut() = vec![];
    }

    /// Start building a new tag of the given kind, whose name begins
    /// with `c`.
    fn create_tag(&self, kind: TagKind, c: char) {
        self.discard_tag();
        self.current_tag_name.borrow_mut().push_char(c);
        self.current_tag_kind.set(kind);
    }

    /// Is the end tag currently being built an "appropriate end tag"
    /// (i.e. does its name match the last start tag emitted)?
    fn have_appropriate_end_tag(&self) -> bool {
        match self.last_start_tag_name.borrow().as_ref() {
            Some(last) => {
                (self.current_tag_kind.get() == EndTag)
                    && (**self.current_tag_name.borrow() == **last)
            },
            None => false,
        }
    }

    /// Finish any attribute in progress and start a new one whose name
    /// begins with `c`.
    fn create_attribute(&self, c: char) {
        self.finish_attribute();

        self.current_attr_name.borrow_mut().push_char(c);
    }

    /// Finish the attribute in progress: drop it (with a parse error) if
    /// its name duplicates an existing attribute on this tag, otherwise
    /// push it onto the tag's attribute list.
    fn finish_attribute(&self) {
        if self.current_attr_name.borrow().is_empty() {
            return;
        }

        // Check for a duplicate attribute name.
        // FIXME: the spec says we should error as soon as the name is
        // finished, not when the whole attribute is.
        let dup = {
            let name = &*self.current_attr_name.borrow();
            self.current_tag_attrs
                .borrow()
                .iter()
                .any(|a| *a.name.local == **name)
        };

        if dup {
            self.emit_error(Borrowed("Duplicate attribute"));
            self.current_tag_had_duplicate_attributes.set(true);
            self.current_attr_name.borrow_mut().clear();
            self.current_attr_value.borrow_mut().clear();
        } else {
            let name = LocalName::from(&**self.current_attr_name.borrow());
            self.current_attr_name.borrow_mut().clear();
            self.current_tag_attrs.borrow_mut().push(Attribute {
                // The tree builder will adjust the namespace if necessary;
                // at this stage every attribute is in no namespace.
                name: QualName::new(None, ns!(), name),
                value: mem::take(&mut self.current_attr_value.borrow_mut()),
            });
        }
    }

    /// Emit the doctype currently being built, resetting it to default.
    fn emit_current_doctype(&self) {
        let doctype = self.current_doctype.take();
        self.process_token_and_continue(DoctypeToken(doctype));
    }

    /// Borrow the public or system identifier of the current doctype,
    /// selected by `kind`.
    fn doctype_id(&self, kind: DoctypeIdKind) -> RefMut<'_, Option<StrTendril>> {
        let current_doctype = self.current_doctype.borrow_mut();
        match kind {
            Public => RefMut::map(current_doctype, |d| &mut d.public_id),
            System => RefMut::map(current_doctype, |d| &mut d.system_id),
        }
    }

    /// Reset the selected doctype identifier to the empty string
    /// (creating it if absent).
    fn clear_doctype_id(&self, kind: DoctypeIdKind) {
        let mut id = self.doctype_id(kind);
        match *id {
            Some(ref mut s) => s.clear(),
            None => *id = Some(StrTendril::new()),
        }
    }

    /// Begin tokenizing a character reference; whether we are inside an
    /// attribute value is derived from the current state.
    fn start_consuming_character_reference(&self) {
        debug_assert!(
            self.char_ref_tokenizer.borrow().is_none(),
            "Nested character references are impossible"
        );

        let is_in_attribute = matches!(self.state.get(), states::AttributeValue(_));
        *self.char_ref_tokenizer.borrow_mut() = Some(CharRefTokenizer::new(is_in_attribute));
    }

    /// Emit the end-of-file token.
    fn emit_eof(&self) {
        self.process_token_and_continue(EOFToken);
    }

    /// Peek at the next input character without consuming it, honoring
    /// a pending reconsume.
    fn peek(&self, input: &BufferQueue) -> Option<char> {
        if self.reconsume.get() {
            Some(self.current_char.get())
        } else {
            input.peek()
        }
    }

    /// Discard the next input character (or clear the reconsume flag).
    /// Note: performs no preprocessing; callers peek first.
    fn discard_char(&self, input: &BufferQueue) {
        if self.reconsume.get() {
            self.reconsume.set(false);
        } else {
            input.next();
        }
    }

    /// Emit a parse-error token with the given message.
    fn emit_error(&self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
}
// Shorthand for common state machine actions on the tokenizer `$me`,
// invoked from the `go!` macro below. Each rule expands to exactly one
// method call or field mutation.
macro_rules! shorthand (
    ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) );
    ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.borrow_mut().push_char($c) );
    ( $me:ident : discard_tag ) => ( $me.discard_tag() );
    ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) );
    ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.borrow_mut().push_char($c) );
    ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() );
    ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) );
    ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.borrow_mut().push_char($c) );
    ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_char($c) );
    ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
    ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_char($c) );
    ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_slice($c) );
    ( $me:ident : emit_comment ) => ( $me.emit_current_comment() );
    ( $me:ident : clear_comment ) => ( $me.current_comment.borrow_mut().clear() );
    ( $me:ident : create_doctype ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
    ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c) );
    ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) );
    ( $me:ident : force_quirks ) => ( $me.current_doctype.borrow_mut().force_quirks = true);
    ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() );
);
655
656#[cfg(feature = "trace_tokenizer")]
659macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
660 trace!(" {:?}", stringify!($($cmds)*));
661 shorthand!($me : $($cmds)*);
662}));
663
664#[cfg(not(feature = "trace_tokenizer"))]
665macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
666
// The transition macro for the state machine. A `go!` invocation is a
// `;`-separated sequence of `shorthand!` actions, optionally ended by a
// control action (`to`, `reconsume`, `consume_char_ref`, `emit_tag`,
// `eof`) which sets the next state and returns from `step`.
macro_rules! go (
    // Sequencing: peel off one action (of 1-4 token trees, since a
    // `$($cmd:tt)* ;` pattern would be ambiguous) and recurse on the rest.
    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // State transition, with zero, one, or two constructor arguments.
    ( $me:ident : to $s:ident ) => ({ $me.state.set(states::$s); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });

    // Transition that re-delivers the current character in the new state.
    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume.set(true); go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });

    // Hand control to the character-reference sub-tokenizer.
    ( $me:ident : consume_char_ref ) => ({ $me.start_consuming_character_reference(); return ProcessResult::Continue; });

    // Emit the current tag and let the sink's response pick the next
    // state, defaulting to `$s`.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_current_tag();
    });

    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // A sequence ending with no control action falls through to the
    // plain shorthand (no return).
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // Empty sequence: nothing to do.
    ( $me:ident : ) => (());
);
703
// Input-fetching macros: each suspends the current `step` call (returns
// `ProcessResult::Suspend`) when the tokenizer needs more input.

// Consume the next preprocessed character, or suspend.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));

// Peek at the next character without consuming it, or suspend.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));

// Pop one char from `$set` or a run of chars not in it, or suspend.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));

// Try to eat `$pat` ASCII-case-insensitively, or suspend.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));

// Try to eat `$pat` with an exact byte comparison, or suspend.
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
725
726impl<Sink: TokenSink> Tokenizer<Sink> {
727 #[allow(clippy::never_loop)]
731 fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
732 if self.char_ref_tokenizer.borrow().is_some() {
733 return self.step_char_ref_tokenizer(input);
734 }
735
736 trace!("processing in state {:?}", self.state);
737 match self.state.get() {
738 states::Data => loop {
740 let set = small_char_set!('\r' '\0' '&' '<' '\n');
741
742 #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
743 let set_result = if !(self.opts.exact_errors
744 || self.reconsume.get()
745 || self.ignore_lf.get())
746 && Self::is_supported_simd_feature_detected()
747 {
748 let front_buffer = input.peek_front_chunk_mut();
749 let Some(mut front_buffer) = front_buffer else {
750 return ProcessResult::Suspend;
751 };
752
753 let first_char = front_buffer
756 .chars()
757 .next()
758 .expect("Input buffers are never empty");
759
760 if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
761 drop(front_buffer);
762 self.pop_except_from(input, set)
763 } else {
764 let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) };
767
768 if front_buffer.is_empty() {
769 drop(front_buffer);
770 input.pop_front();
771 }
772
773 result
774 }
775 } else {
776 self.pop_except_from(input, set)
777 };
778
779 #[cfg(not(any(
780 target_arch = "x86",
781 target_arch = "x86_64",
782 target_arch = "aarch64"
783 )))]
784 let set_result = self.pop_except_from(input, set);
785
786 let Some(set_result) = set_result else {
787 return ProcessResult::Suspend;
788 };
789 match set_result {
790 FromSet('\0') => {
791 self.bad_char_error();
792 self.emit_char('\0');
793 },
794 FromSet('&') => go!(self: consume_char_ref),
795 FromSet('<') => go!(self: to TagOpen),
796 FromSet(c) => {
797 self.emit_char(c);
798 },
799 NotFromSet(b) => self.emit_chars(b),
800 }
801 },
802
803 states::RawData(Rcdata) => loop {
805 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
806 FromSet('\0') => {
807 self.bad_char_error();
808 self.emit_char('\u{fffd}');
809 },
810 FromSet('&') => go!(self: consume_char_ref),
811 FromSet('<') => go!(self: to RawLessThanSign Rcdata),
812 FromSet(c) => self.emit_char(c),
813 NotFromSet(b) => self.emit_chars(b),
814 }
815 },
816
817 states::RawData(Rawtext) => loop {
819 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
820 FromSet('\0') => {
821 self.bad_char_error();
822 self.emit_char('\u{fffd}');
823 },
824 FromSet('<') => go!(self: to RawLessThanSign Rawtext),
825 FromSet(c) => self.emit_char(c),
826 NotFromSet(b) => self.emit_chars(b),
827 }
828 },
829
830 states::RawData(ScriptData) => loop {
832 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
833 FromSet('\0') => {
834 self.bad_char_error();
835 self.emit_char('\u{fffd}');
836 },
837 FromSet('<') => go!(self: to RawLessThanSign ScriptData),
838 FromSet(c) => self.emit_char(c),
839 NotFromSet(b) => self.emit_chars(b),
840 }
841 },
842
843 states::RawData(ScriptDataEscaped(Escaped)) => loop {
845 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
846 FromSet('\0') => {
847 self.bad_char_error();
848 self.emit_char('\u{fffd}');
849 },
850 FromSet('-') => {
851 self.emit_char('-');
852 go!(self: to ScriptDataEscapedDash Escaped);
853 },
854 FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
855 FromSet(c) => self.emit_char(c),
856 NotFromSet(b) => self.emit_chars(b),
857 }
858 },
859
860 states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
862 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
863 FromSet('\0') => {
864 self.bad_char_error();
865 self.emit_char('\u{fffd}');
866 },
867 FromSet('-') => {
868 self.emit_char('-');
869 go!(self: to ScriptDataEscapedDash DoubleEscaped);
870 },
871 FromSet('<') => {
872 self.emit_char('<');
873 go!(self: to RawLessThanSign ScriptDataEscaped DoubleEscaped)
874 },
875 FromSet(c) => self.emit_char(c),
876 NotFromSet(b) => self.emit_chars(b),
877 }
878 },
879
880 states::Plaintext => loop {
882 match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
883 FromSet('\0') => {
884 self.bad_char_error();
885 self.emit_char('\u{fffd}');
886 },
887 FromSet(c) => self.emit_char(c),
888 NotFromSet(b) => self.emit_chars(b),
889 }
890 },
891
892 states::TagOpen => loop {
894 match get_char!(self, input) {
895 '!' => go!(self: to MarkupDeclarationOpen),
896 '/' => go!(self: to EndTagOpen),
897 '?' => {
898 self.bad_char_error();
899 go!(self: clear_comment; reconsume BogusComment)
900 },
901 c => match lower_ascii_letter(c) {
902 Some(cl) => go!(self: create_tag StartTag cl; to TagName),
903 None => {
904 self.bad_char_error();
905 self.emit_char('<');
906 go!(self: reconsume Data)
907 },
908 },
909 }
910 },
911
912 states::EndTagOpen => loop {
914 match get_char!(self, input) {
915 '>' => {
916 self.bad_char_error();
917 go!(self: to Data)
918 },
919 c => match lower_ascii_letter(c) {
920 Some(cl) => go!(self: create_tag EndTag cl; to TagName),
921 None => {
922 self.bad_char_error();
923 go!(self: clear_comment; reconsume BogusComment)
924 },
925 },
926 }
927 },
928
929 states::TagName => loop {
931 match get_char!(self, input) {
932 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
933 '/' => go!(self: to SelfClosingStartTag),
934 '>' => go!(self: emit_tag Data),
935 '\0' => {
936 self.bad_char_error();
937 go!(self: push_tag '\u{fffd}')
938 },
939 c => go!(self: push_tag (c.to_ascii_lowercase())),
940 }
941 },
942
943 states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
945 match get_char!(self, input) {
946 '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
947 c => match lower_ascii_letter(c) {
948 Some(cl) => {
949 go!(self: clear_temp; push_temp cl);
950 self.emit_char('<');
951 self.emit_char(c);
952 go!(self: to ScriptDataEscapeStart DoubleEscaped);
953 },
954 None => {
955 self.emit_char('<');
956 go!(self: reconsume RawData ScriptDataEscaped Escaped);
957 },
958 },
959 }
960 },
961
962 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
964 match get_char!(self, input) {
965 '/' => {
966 go!(self: clear_temp);
967 self.emit_char('/');
968 go!(self: to ScriptDataDoubleEscapeEnd);
969 },
970 _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
971 }
972 },
973
974 states::RawLessThanSign(kind) => loop {
977 match get_char!(self, input) {
978 '/' => go!(self: clear_temp; to RawEndTagOpen kind),
979 '!' if kind == ScriptData => {
980 self.emit_char('<');
981 self.emit_char('!');
982 go!(self: to ScriptDataEscapeStart Escaped);
983 },
984 _ => {
985 self.emit_char('<');
986 go!(self: reconsume RawData kind);
987 },
988 }
989 },
990
991 states::RawEndTagOpen(kind) => loop {
993 let c = get_char!(self, input);
994 match lower_ascii_letter(c) {
995 Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
996 None => {
997 self.emit_char('<');
998 self.emit_char('/');
999 go!(self: reconsume RawData kind);
1000 },
1001 }
1002 },
1003
1004 states::RawEndTagName(kind) => loop {
1006 let c = get_char!(self, input);
1007 if self.have_appropriate_end_tag() {
1008 match c {
1009 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to BeforeAttributeName),
1010 '/' => go!(self: clear_temp; to SelfClosingStartTag),
1011 '>' => go!(self: clear_temp; emit_tag Data),
1012 _ => (),
1013 }
1014 }
1015
1016 match lower_ascii_letter(c) {
1017 Some(cl) => go!(self: push_tag cl; push_temp c),
1018 None => {
1019 go!(self: discard_tag);
1020 self.emit_char('<');
1021 self.emit_char('/');
1022 self.emit_temp_buf();
1023 go!(self: reconsume RawData kind);
1024 },
1025 }
1026 },
1027
1028 states::ScriptDataEscapeStart(DoubleEscaped) => loop {
1030 let c = get_char!(self, input);
1031 match c {
1032 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1033 let esc = if &**self.temp_buf.borrow() == "script" {
1034 DoubleEscaped
1035 } else {
1036 Escaped
1037 };
1038 self.emit_char(c);
1039 go!(self: to RawData ScriptDataEscaped esc);
1040 },
1041 _ => match lower_ascii_letter(c) {
1042 Some(cl) => {
1043 go!(self: push_temp cl);
1044 self.emit_char(c);
1045 },
1046 None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
1047 },
1048 }
1049 },
1050
1051 states::ScriptDataEscapeStart(Escaped) => loop {
1053 match get_char!(self, input) {
1054 '-' => {
1055 self.emit_char('-');
1056 go!(self: to ScriptDataEscapeStartDash);
1057 },
1058 _ => go!(self: reconsume RawData ScriptData),
1059 }
1060 },
1061
1062 states::ScriptDataEscapeStartDash => loop {
1064 match get_char!(self, input) {
1065 '-' => {
1066 self.emit_char('-');
1067 go!(self: to ScriptDataEscapedDashDash Escaped);
1068 },
1069 _ => go!(self: reconsume RawData ScriptData),
1070 }
1071 },
1072
1073 states::ScriptDataEscapedDash(kind) => loop {
1075 match get_char!(self, input) {
1076 '-' => {
1077 self.emit_char('-');
1078 go!(self: to ScriptDataEscapedDashDash kind);
1079 },
1080 '<' => {
1081 if kind == DoubleEscaped {
1082 self.emit_char('<');
1083 }
1084 go!(self: to RawLessThanSign ScriptDataEscaped kind);
1085 },
1086 '\0' => {
1087 self.bad_char_error();
1088 self.emit_char('\u{fffd}');
1089 go!(self: to RawData ScriptDataEscaped kind)
1090 },
1091 c => {
1092 self.emit_char(c);
1093 go!(self: to RawData ScriptDataEscaped kind);
1094 },
1095 }
1096 },
1097
1098 states::ScriptDataEscapedDashDash(kind) => loop {
1100 match get_char!(self, input) {
1101 '-' => {
1102 self.emit_char('-');
1103 },
1104 '<' => {
1105 if kind == DoubleEscaped {
1106 self.emit_char('<');
1107 }
1108 go!(self: to RawLessThanSign ScriptDataEscaped kind);
1109 },
1110 '>' => {
1111 self.emit_char('>');
1112 go!(self: to RawData ScriptData);
1113 },
1114 '\0' => {
1115 self.bad_char_error();
1116 self.emit_char('\u{fffd}');
1117 go!(self: to RawData ScriptDataEscaped kind)
1118 },
1119 c => {
1120 self.emit_char(c);
1121 go!(self: to RawData ScriptDataEscaped kind);
1122 },
1123 }
1124 },
1125
1126 states::ScriptDataDoubleEscapeEnd => loop {
1128 let c = get_char!(self, input);
1129 match c {
1130 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1131 let esc = if &**self.temp_buf.borrow() == "script" {
1132 Escaped
1133 } else {
1134 DoubleEscaped
1135 };
1136 self.emit_char(c);
1137 go!(self: to RawData ScriptDataEscaped esc);
1138 },
1139 _ => match lower_ascii_letter(c) {
1140 Some(cl) => {
1141 go!(self: push_temp cl);
1142 self.emit_char(c);
1143 },
1144 None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
1145 },
1146 }
1147 },
1148
1149 states::BeforeAttributeName => loop {
1151 match get_char!(self, input) {
1152 '\t' | '\n' | '\x0C' | ' ' => (),
1153 '/' => go!(self: to SelfClosingStartTag),
1154 '>' => go!(self: emit_tag Data),
1155 '\0' => {
1156 self.bad_char_error();
1157 go!(self: create_attr '\u{fffd}'; to AttributeName)
1158 },
1159 c => match lower_ascii_letter(c) {
1160 Some(cl) => go!(self: create_attr cl; to AttributeName),
1161 None => {
1162 if matches!(c, '"' | '\'' | '<' | '=') {
1163 self.bad_char_error();
1164 }
1165
1166 go!(self: create_attr c; to AttributeName);
1167 },
1168 },
1169 }
1170 },
1171
1172 states::AttributeName => loop {
1174 match get_char!(self, input) {
1175 '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
1176 '/' => go!(self: to SelfClosingStartTag),
1177 '=' => go!(self: to BeforeAttributeValue),
1178 '>' => go!(self: emit_tag Data),
1179 '\0' => {
1180 self.bad_char_error();
1181 go!(self: push_name '\u{fffd}')
1182 },
1183 c => match lower_ascii_letter(c) {
1184 Some(cl) => go!(self: push_name cl),
1185 None => {
1186 if matches!(c, '"' | '\'' | '<') {
1187 self.bad_char_error();
1188 }
1189 go!(self: push_name c);
1190 },
1191 },
1192 }
1193 },
1194
1195 states::AfterAttributeName => loop {
1197 match get_char!(self, input) {
1198 '\t' | '\n' | '\x0C' | ' ' => (),
1199 '/' => go!(self: to SelfClosingStartTag),
1200 '=' => go!(self: to BeforeAttributeValue),
1201 '>' => go!(self: emit_tag Data),
1202 '\0' => {
1203 self.bad_char_error();
1204 go!(self: create_attr '\u{fffd}'; to AttributeName)
1205 },
1206 c => match lower_ascii_letter(c) {
1207 Some(cl) => go!(self: create_attr cl; to AttributeName),
1208 None => {
1209 if matches!(c, '"' | '\'' | '<') {
1210 self.bad_char_error();
1211 }
1212
1213 go!(self: create_attr c; to AttributeName);
1214 },
1215 },
1216 }
1217 },
1218
1219 states::BeforeAttributeValue => loop {
1223 match peek!(self, input) {
1224 '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1225 '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1226 '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1227 '>' => {
1228 go!(self: discard_char input);
1229 self.bad_char_error();
1230 go!(self: emit_tag Data)
1231 },
1232 _ => go!(self: to AttributeValue Unquoted),
1233 }
1234 },
1235
1236 states::AttributeValue(DoubleQuoted) => loop {
1238 match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1239 FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1240 FromSet('&') => go!(self: consume_char_ref),
1241 FromSet('\0') => {
1242 self.bad_char_error();
1243 go!(self: push_value '\u{fffd}')
1244 },
1245 FromSet(c) => go!(self: push_value c),
1246 NotFromSet(ref b) => go!(self: append_value b),
1247 }
1248 },
1249
1250 states::AttributeValue(SingleQuoted) => loop {
1252 match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1253 FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1254 FromSet('&') => go!(self: consume_char_ref),
1255 FromSet('\0') => {
1256 self.bad_char_error();
1257 go!(self: push_value '\u{fffd}')
1258 },
1259 FromSet(c) => go!(self: push_value c),
1260 NotFromSet(ref b) => go!(self: append_value b),
1261 }
1262 },
1263
1264 states::AttributeValue(Unquoted) => loop {
1266 match pop_except_from!(
1267 self,
1268 input,
1269 small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1270 ) {
1271 FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1272 go!(self: to BeforeAttributeName)
1273 },
1274 FromSet('&') => go!(self: consume_char_ref),
1275 FromSet('>') => go!(self: emit_tag Data),
1276 FromSet('\0') => {
1277 self.bad_char_error();
1278 go!(self: push_value '\u{fffd}')
1279 },
1280 FromSet(c) => {
1281 if matches!(c, '"' | '\'' | '<' | '=' | '`') {
1282 self.bad_char_error();
1283 }
1284 go!(self: push_value c);
1285 },
1286 NotFromSet(ref b) => go!(self: append_value b),
1287 }
1288 },
1289
1290 states::AfterAttributeValueQuoted => loop {
1292 match get_char!(self, input) {
1293 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1294 '/' => go!(self: to SelfClosingStartTag),
1295 '>' => go!(self: emit_tag Data),
1296 _ => {
1297 self.bad_char_error();
1298 go!(self: reconsume BeforeAttributeName)
1299 },
1300 }
1301 },
1302
1303 states::SelfClosingStartTag => loop {
1305 match get_char!(self, input) {
1306 '>' => {
1307 self.current_tag_self_closing.set(true);
1308 go!(self: emit_tag Data);
1309 },
1310 _ => {
1311 self.bad_char_error();
1312 go!(self: reconsume BeforeAttributeName)
1313 },
1314 }
1315 },
1316
1317 states::CommentStart => loop {
1319 match get_char!(self, input) {
1320 '-' => go!(self: to CommentStartDash),
1321 '\0' => {
1322 self.bad_char_error();
1323 go!(self: push_comment '\u{fffd}'; to Comment)
1324 },
1325 '>' => {
1326 self.bad_char_error();
1327 go!(self: emit_comment; to Data)
1328 },
1329 c => go!(self: push_comment c; to Comment),
1330 }
1331 },
1332
1333 states::CommentStartDash => loop {
1335 match get_char!(self, input) {
1336 '-' => go!(self: to CommentEnd),
1337 '\0' => {
1338 self.bad_char_error();
1339 go!(self: append_comment "-\u{fffd}"; to Comment)
1340 },
1341 '>' => {
1342 self.bad_char_error();
1343 go!(self: emit_comment; to Data)
1344 },
1345 c => go!(self: push_comment '-'; push_comment c; to Comment),
1346 }
1347 },
1348
1349 states::Comment => loop {
1351 match get_char!(self, input) {
1352 c @ '<' => go!(self: push_comment c; to CommentLessThanSign),
1353 '-' => go!(self: to CommentEndDash),
1354 '\0' => {
1355 self.bad_char_error();
1356 go!(self: push_comment '\u{fffd}')
1357 },
1358 c => go!(self: push_comment c),
1359 }
1360 },
1361
1362 states::CommentLessThanSign => loop {
1364 match get_char!(self, input) {
1365 c @ '!' => go!(self: push_comment c; to CommentLessThanSignBang),
1366 c @ '<' => go!(self: push_comment c),
1367 _ => go!(self: reconsume Comment),
1368 }
1369 },
1370
1371 states::CommentLessThanSignBang => loop {
1373 match get_char!(self, input) {
1374 '-' => go!(self: to CommentLessThanSignBangDash),
1375 _ => go!(self: reconsume Comment),
1376 }
1377 },
1378
1379 states::CommentLessThanSignBangDash => loop {
1381 match get_char!(self, input) {
1382 '-' => go!(self: to CommentLessThanSignBangDashDash),
1383 _ => go!(self: reconsume CommentEndDash),
1384 }
1385 },
1386
1387 states::CommentLessThanSignBangDashDash => loop {
1389 match get_char!(self, input) {
1390 '>' => go!(self: reconsume CommentEnd),
1391 _ => {
1392 self.bad_char_error();
1393 go!(self: reconsume CommentEnd)
1394 },
1395 }
1396 },
1397
1398 states::CommentEndDash => loop {
1400 match get_char!(self, input) {
1401 '-' => go!(self: to CommentEnd),
1402 '\0' => {
1403 self.bad_char_error();
1404 go!(self: append_comment "-\u{fffd}"; to Comment)
1405 },
1406 c => go!(self: push_comment '-'; push_comment c; to Comment),
1407 }
1408 },
1409
1410 states::CommentEnd => loop {
1412 match get_char!(self, input) {
1413 '>' => go!(self: emit_comment; to Data),
1414 '!' => go!(self: to CommentEndBang),
1415 '-' => go!(self: push_comment '-'),
1416 _ => go!(self: append_comment "--"; reconsume Comment),
1417 }
1418 },
1419
1420 states::CommentEndBang => loop {
1422 match get_char!(self, input) {
1423 '-' => go!(self: append_comment "--!"; to CommentEndDash),
1424 '>' => {
1425 self.bad_char_error();
1426 go!(self: emit_comment; to Data)
1427 },
1428 '\0' => {
1429 self.bad_char_error();
1430 go!(self: append_comment "--!\u{fffd}"; to Comment)
1431 },
1432 c => go!(self: append_comment "--!"; push_comment c; to Comment),
1433 }
1434 },
1435
1436 states::Doctype => loop {
1438 match get_char!(self, input) {
1439 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1440 '>' => go!(self: reconsume BeforeDoctypeName),
1441 _ => {
1442 self.bad_char_error();
1443 go!(self: reconsume BeforeDoctypeName)
1444 },
1445 }
1446 },
1447
1448 states::BeforeDoctypeName => loop {
1450 match get_char!(self, input) {
1451 '\t' | '\n' | '\x0C' | ' ' => (),
1452 '\0' => {
1453 self.bad_char_error();
1454 go!(self: create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1455 },
1456 '>' => {
1457 self.bad_char_error();
1458 go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1459 },
1460 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1461 to DoctypeName),
1462 }
1463 },
1464
1465 states::DoctypeName => loop {
1467 match get_char!(self, input) {
1468 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1469 '>' => go!(self: emit_doctype; to Data),
1470 '\0' => {
1471 self.bad_char_error();
1472 go!(self: push_doctype_name '\u{fffd}')
1473 },
1474 c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1475 }
1476 },
1477
1478 states::AfterDoctypeName => loop {
1480 if eat!(self, input, "public") {
1481 go!(self: to AfterDoctypeKeyword Public);
1482 } else if eat!(self, input, "system") {
1483 go!(self: to AfterDoctypeKeyword System);
1484 } else {
1485 match get_char!(self, input) {
1486 '\t' | '\n' | '\x0C' | ' ' => (),
1487 '>' => go!(self: emit_doctype; to Data),
1488 _ => {
1489 self.bad_char_error();
1490 go!(self: force_quirks; reconsume BogusDoctype)
1491 },
1492 }
1493 }
1494 },
1495
1496 states::AfterDoctypeKeyword(kind) => loop {
1498 match get_char!(self, input) {
1499 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1500 '"' => {
1501 self.bad_char_error();
1502 go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1503 },
1504 '\'' => {
1505 self.bad_char_error();
1506 go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1507 },
1508 '>' => {
1509 self.bad_char_error();
1510 go!(self: force_quirks; emit_doctype; to Data)
1511 },
1512 _ => {
1513 self.bad_char_error();
1514 go!(self: force_quirks; reconsume BogusDoctype)
1515 },
1516 }
1517 },
1518
1519 states::BeforeDoctypeIdentifier(kind) => loop {
1521 match get_char!(self, input) {
1522 '\t' | '\n' | '\x0C' | ' ' => (),
1523 '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1524 '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1525 '>' => {
1526 self.bad_char_error();
1527 go!(self: force_quirks; emit_doctype; to Data)
1528 },
1529 _ => {
1530 self.bad_char_error();
1531 go!(self: force_quirks; reconsume BogusDoctype)
1532 },
1533 }
1534 },
1535
1536 states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1538 match get_char!(self, input) {
1539 '"' => go!(self: to AfterDoctypeIdentifier kind),
1540 '\0' => {
1541 self.bad_char_error();
1542 go!(self: push_doctype_id kind '\u{fffd}')
1543 },
1544 '>' => {
1545 self.bad_char_error();
1546 go!(self: force_quirks; emit_doctype; to Data)
1547 },
1548 c => go!(self: push_doctype_id kind c),
1549 }
1550 },
1551
1552 states::DoctypeIdentifierSingleQuoted(kind) => loop {
1554 match get_char!(self, input) {
1555 '\'' => go!(self: to AfterDoctypeIdentifier kind),
1556 '\0' => {
1557 self.bad_char_error();
1558 go!(self: push_doctype_id kind '\u{fffd}')
1559 },
1560 '>' => {
1561 self.bad_char_error();
1562 go!(self: force_quirks; emit_doctype; to Data)
1563 },
1564 c => go!(self: push_doctype_id kind c),
1565 }
1566 },
1567
1568 states::AfterDoctypeIdentifier(Public) => loop {
1570 match get_char!(self, input) {
1571 '\t' | '\n' | '\x0C' | ' ' => {
1572 go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1573 },
1574 '>' => go!(self: emit_doctype; to Data),
1575 '"' => {
1576 self.bad_char_error();
1577 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1578 },
1579 '\'' => {
1580 self.bad_char_error();
1581 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1582 },
1583 _ => {
1584 self.bad_char_error();
1585 go!(self: force_quirks; reconsume BogusDoctype)
1586 },
1587 }
1588 },
1589
1590 states::AfterDoctypeIdentifier(System) => loop {
1592 match get_char!(self, input) {
1593 '\t' | '\n' | '\x0C' | ' ' => (),
1594 '>' => go!(self: emit_doctype; to Data),
1595 _ => {
1596 self.bad_char_error();
1597 go!(self: reconsume BogusDoctype)
1598 },
1599 }
1600 },
1601
1602 states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1604 match get_char!(self, input) {
1605 '\t' | '\n' | '\x0C' | ' ' => (),
1606 '>' => go!(self: emit_doctype; to Data),
1607 '"' => {
1608 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1609 },
1610 '\'' => {
1611 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1612 },
1613 _ => {
1614 self.bad_char_error();
1615 go!(self: force_quirks; reconsume BogusDoctype)
1616 },
1617 }
1618 },
1619
1620 states::BogusDoctype => loop {
1622 match get_char!(self, input) {
1623 '>' => go!(self: emit_doctype; to Data),
1624 '\0' => {
1625 self.bad_char_error();
1626 },
1627 _ => (),
1628 }
1629 },
1630
1631 states::BogusComment => loop {
1633 match get_char!(self, input) {
1634 '>' => go!(self: emit_comment; to Data),
1635 '\0' => {
1636 self.bad_char_error();
1637 go!(self: push_comment '\u{fffd}')
1638 },
1639 c => go!(self: push_comment c),
1640 }
1641 },
1642
1643 states::MarkupDeclarationOpen => loop {
1645 if eat_exact!(self, input, "--") {
1646 go!(self: clear_comment; to CommentStart);
1647 } else if eat!(self, input, "doctype") {
1648 go!(self: to Doctype);
1649 } else {
1650 if self
1651 .sink
1652 .adjusted_current_node_present_but_not_in_html_namespace()
1653 && eat_exact!(self, input, "[CDATA[")
1654 {
1655 go!(self: clear_temp; to CdataSection);
1656 }
1657 self.bad_char_error();
1658 go!(self: clear_comment; to BogusComment);
1659 }
1660 },
1661
1662 states::CdataSection => loop {
1664 match get_char!(self, input) {
1665 ']' => go!(self: to CdataSectionBracket),
1666 '\0' => {
1667 self.emit_temp_buf();
1668 self.emit_char('\0');
1669 },
1670 c => go!(self: push_temp c),
1671 }
1672 },
1673
1674 states::CdataSectionBracket => match get_char!(self, input) {
1676 ']' => go!(self: to CdataSectionEnd),
1677 _ => go!(self: push_temp ']'; reconsume CdataSection),
1678 },
1679
1680 states::CdataSectionEnd => loop {
1682 match get_char!(self, input) {
1683 ']' => go!(self: push_temp ']'),
1684 '>' => {
1685 self.emit_temp_buf();
1686 go!(self: to Data);
1687 },
1688 _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1689 }
1690 },
1691 }
1693 }
1694
1695 fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1696 let mut char_ref_tokenizer = self.char_ref_tokenizer.borrow_mut();
1697 let progress = match char_ref_tokenizer.as_mut().unwrap().step(self, input) {
1698 char_ref::Status::Done(char_ref) => {
1699 self.process_char_ref(char_ref);
1700 *char_ref_tokenizer = None;
1701 return ProcessResult::Continue;
1702 },
1703
1704 char_ref::Status::Stuck => ProcessResult::Suspend,
1705 char_ref::Status::Progress => ProcessResult::Continue,
1706 };
1707
1708 progress
1709 }
1710
    /// Dispatch the characters produced by a completed character reference.
    ///
    /// Where the characters go depends on the state the reference was
    /// consumed from: character-data states emit them as character tokens,
    /// attribute-value states append them to the current attribute value.
    /// Any other state is a logic error.
    fn process_char_ref(&self, char_ref: CharRef) {
        let CharRef {
            mut chars,
            mut num_chars,
        } = char_ref;

        // Zero characters: the reference produced nothing, so the literal
        // '&' that introduced it is emitted instead.
        if num_chars == 0 {
            chars[0] = '&';
            num_chars = 1;
        }

        for i in 0..num_chars {
            let c = chars[i as usize];
            match self.state.get() {
                states::Data | states::RawData(states::Rcdata) => self.emit_char(c),

                states::AttributeValue(_) => go!(self: push_value c),

                _ => panic!(
                    "state {:?} should not be reachable in process_char_ref",
                    self.state.get()
                ),
            }
        }
    }
1736
1737 pub fn end(&self) {
1739 let input = BufferQueue::default();
1742 match self.char_ref_tokenizer.take() {
1743 None => (),
1744 Some(mut tokenizer) => {
1745 self.process_char_ref(tokenizer.end_of_file(self, &input));
1746 },
1747 }
1748
1749 self.at_eof.set(true);
1752 assert!(matches!(self.run(&input), TokenizerResult::Done));
1753 assert!(input.is_empty());
1754
1755 loop {
1756 match self.eof_step() {
1757 ProcessResult::Continue => (),
1758 ProcessResult::Suspend => break,
1759 ProcessResult::Script(_) | ProcessResult::EncodingIndicator(_) => unreachable!(),
1760 }
1761 }
1762
1763 self.sink.end();
1764
1765 if self.opts.profile {
1766 self.dump_profile();
1767 }
1768 }
1769
1770 fn dump_profile(&self) {
1771 let mut results: Vec<(states::State, u64)> = self
1772 .state_profile
1773 .borrow()
1774 .iter()
1775 .map(|(s, t)| (*s, *t))
1776 .collect();
1777 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1778
1779 let total: u64 = results
1780 .iter()
1781 .map(|&(_, t)| t)
1782 .fold(0, ::std::ops::Add::add);
1783 println!("\nTokenizer profile, in nanoseconds");
1784 println!(
1785 "\n{:12} total in token sink",
1786 self.time_in_sink.get()
1787 );
1788 println!("\n{total:12} total in tokenizer");
1789
1790 for (k, v) in results.into_iter() {
1791 let pct = 100.0 * (v as f64) / (total as f64);
1792 println!("{v:12} {pct:4.1}% {k:?}");
1793 }
1794 }
1795
    /// Handle end-of-input for the current state.  Each arm mirrors the
    /// EOF behavior of the corresponding tokenizer state; `go!` performs
    /// the same emit/transition actions as in the main step function.
    fn eof_step(&self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state.get());
        match self.state.get() {
            // States where EOF simply ends the token stream.
            states::Data
            | states::RawData(Rcdata)
            | states::RawData(Rawtext)
            | states::RawData(ScriptData)
            | states::Plaintext => go!(self: eof),

            // Mid-tag states: the partial tag is abandoned with an error.
            states::TagName
            | states::RawData(ScriptDataEscaped(_))
            | states::BeforeAttributeName
            | states::AttributeName
            | states::AfterAttributeName
            | states::AttributeValue(_)
            | states::AfterAttributeValueQuoted
            | states::SelfClosingStartTag
            | states::ScriptDataEscapedDash(_)
            | states::ScriptDataEscapedDashDash(_) => {
                self.bad_eof_error();
                go!(self: to Data)
            },

            states::BeforeAttributeValue => go!(self: reconsume AttributeValue Unquoted),

            states::TagOpen => {
                self.bad_eof_error();
                self.emit_char('<');
                go!(self: to Data);
            },

            states::EndTagOpen => {
                self.bad_eof_error();
                self.emit_char('<');
                self.emit_char('/');
                go!(self: to Data);
            },

            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // The pending '<' (and '/') never became a tag; emit them as
            // character data before resuming raw text.
            states::RawLessThanSign(kind) => {
                self.emit_char('<');
                go!(self: to RawData kind);
            },

            states::RawEndTagOpen(kind) => {
                self.emit_char('<');
                self.emit_char('/');
                go!(self: to RawData kind);
            },

            states::RawEndTagName(kind) => {
                self.emit_char('<');
                self.emit_char('/');
                self.emit_temp_buf();
                go!(self: to RawData kind)
            },

            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // Unterminated comments are emitted as-is, with an error.
            states::CommentStart
            | states::CommentStartDash
            | states::Comment
            | states::CommentEndDash
            | states::CommentEnd
            | states::CommentEndBang => {
                self.bad_eof_error();
                go!(self: emit_comment; to Data)
            },

            states::CommentLessThanSign | states::CommentLessThanSignBang => {
                go!(self: reconsume Comment)
            },

            states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash),

            states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd),

            // An unterminated doctype is emitted with force-quirks set.
            states::Doctype | states::BeforeDoctypeName => {
                self.bad_eof_error();
                go!(self: create_doctype; force_quirks; emit_doctype; to Data)
            },

            states::DoctypeName
            | states::AfterDoctypeName
            | states::AfterDoctypeKeyword(_)
            | states::BeforeDoctypeIdentifier(_)
            | states::DoctypeIdentifierDoubleQuoted(_)
            | states::DoctypeIdentifierSingleQuoted(_)
            | states::AfterDoctypeIdentifier(_)
            | states::BetweenDoctypePublicAndSystemIdentifiers => {
                self.bad_eof_error();
                go!(self: force_quirks; emit_doctype; to Data)
            },

            states::BogusDoctype => go!(self: emit_doctype; to Data),

            states::BogusComment => go!(self: emit_comment; to Data),

            states::MarkupDeclarationOpen => {
                self.bad_char_error();
                go!(self: to BogusComment)
            },

            // Whatever CDATA text was buffered is flushed before EOF.
            states::CdataSection => {
                self.emit_temp_buf();
                self.bad_eof_error();
                go!(self: to Data)
            },

            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
1919
    /// Runtime detection of the SIMD feature the data-state fast paths
    /// rely on: SSE2 on x86/x86_64, NEON on aarch64, `false` on any
    /// other architecture.
    fn is_supported_simd_feature_detected() -> bool {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            is_x86_feature_detected!("sse2")
        }

        #[cfg(target_arch = "aarch64")]
        {
            std::arch::is_aarch64_feature_detected!("neon")
        }

        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
        false
    }
1935
    /// SIMD-accelerated scan of the Data state: consume a run of ordinary
    /// characters from `input` in one go, or a single special character
    /// (`<`, `&`, `\r`, `\0`) via the normal preprocessing path.
    ///
    /// Updates `current_line` by the number of newlines consumed.
    ///
    /// # Safety
    /// The required SIMD feature (SSE2 / NEON) must be available on the
    /// running CPU — presumably guaranteed by checking
    /// `is_supported_simd_feature_detected` at the call site (outside this
    /// view); TODO confirm.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
    unsafe fn data_state_simd_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        let (mut i, mut n_newlines) = self.data_state_sse2_fast_path(input);

        #[cfg(target_arch = "aarch64")]
        let (mut i, mut n_newlines) = self.data_state_neon_fast_path(input);

        // Scalar tail: the vector scan stops short of the last <16 bytes,
        // so finish byte-by-byte up to the first special character.
        while let Some(c) = input.as_bytes().get(i) {
            if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
                break;
            }
            if *c == b'\n' {
                n_newlines += 1;
            }

            i += 1;
        }

        let set_result = if i == 0 {
            // The very first byte is special: take it through the normal
            // preprocessing path (e.g. \r -> \n normalization).
            let first_char = input.pop_front_char().unwrap();
            debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));

            let preprocessed_char = self
                .get_preprocessed_char(first_char, &BufferQueue::default())
                .unwrap();
            SetResult::FromSet(preprocessed_char)
        } else {
            // Hand the whole ordinary-character run back as one chunk.
            debug_assert!(
                input.len() >= i,
                "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
                i,
                input.len()
            );
            let consumed_chunk = input.unsafe_subtendril(0, i as u32);
            input.unsafe_pop_front(i as u32);
            SetResult::NotFromSet(consumed_chunk)
        };

        self.current_line.set(self.current_line.get() + n_newlines);

        Some(set_result)
    }
1993
1994 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1995 #[target_feature(enable = "sse2")]
1996 unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
2004 #[cfg(target_arch = "x86")]
2005 use std::arch::x86::{
2006 __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
2007 _mm_set1_epi8,
2008 };
2009 #[cfg(target_arch = "x86_64")]
2010 use std::arch::x86_64::{
2011 __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
2012 _mm_set1_epi8,
2013 };
2014
2015 debug_assert!(!input.is_empty());
2016
2017 let quote_mask = _mm_set1_epi8('<' as i8);
2018 let escape_mask = _mm_set1_epi8('&' as i8);
2019 let carriage_return_mask = _mm_set1_epi8('\r' as i8);
2020 let zero_mask = _mm_set1_epi8('\0' as i8);
2021 let newline_mask = _mm_set1_epi8('\n' as i8);
2022
2023 let raw_bytes: &[u8] = input.as_bytes();
2024 let start = raw_bytes.as_ptr();
2025
2026 const STRIDE: usize = 16;
2027 let mut i = 0;
2028 let mut n_newlines = 0;
2029 while i + STRIDE <= raw_bytes.len() {
2030 let data = _mm_loadu_si128(start.add(i) as *const __m128i);
2032
2033 let quotes = _mm_cmpeq_epi8(data, quote_mask);
2035 let escapes = _mm_cmpeq_epi8(data, escape_mask);
2036 let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
2037 let zeros = _mm_cmpeq_epi8(data, zero_mask);
2038 let newlines = _mm_cmpeq_epi8(data, newline_mask);
2039
2040 let test_result = _mm_or_si128(
2043 _mm_or_si128(quotes, zeros),
2044 _mm_or_si128(escapes, carriage_returns),
2045 );
2046 let bitmask = _mm_movemask_epi8(test_result);
2047 let newline_mask = _mm_movemask_epi8(newlines);
2048
2049 if (bitmask != 0) {
2050 let position = if cfg!(target_endian = "little") {
2052 bitmask.trailing_zeros() as usize
2053 } else {
2054 bitmask.leading_zeros() as usize
2055 };
2056
2057 n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
2058 i += position;
2059 break;
2060 } else {
2061 n_newlines += newline_mask.count_ones() as u64;
2062 }
2063
2064 i += STRIDE;
2065 }
2066
2067 (i, n_newlines)
2068 }
2069
    /// NEON (aarch64) counterpart of `data_state_sse2_fast_path`: scan for
    /// `<`, `&`, `\r` or `\0` sixteen bytes at a time, counting newlines.
    ///
    /// Returns `(i, n_newlines)` where `i` is the byte offset of the first
    /// special character found by whole-vector scanning (or the offset at
    /// which vector scanning stopped), and `n_newlines` is the number of
    /// `\n` bytes strictly before `i`.
    ///
    /// # Safety
    /// The caller must ensure the CPU supports NEON (see
    /// `is_supported_simd_feature_detected`).
    #[cfg(target_arch = "aarch64")]
    #[target_feature(enable = "neon")]
    unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
        use std::arch::aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8};

        debug_assert!(!input.is_empty());

        let quote_mask = vdupq_n_u8(b'<');
        let escape_mask = vdupq_n_u8(b'&');
        let carriage_return_mask = vdupq_n_u8(b'\r');
        let zero_mask = vdupq_n_u8(b'\0');
        let newline_mask = vdupq_n_u8(b'\n');

        let raw_bytes: &[u8] = input.as_bytes();
        let start = raw_bytes.as_ptr();

        const STRIDE: usize = 16;
        let mut i = 0;
        let mut n_newlines = 0;
        while i + STRIDE <= raw_bytes.len() {
            let data = vld1q_u8(start.add(i));

            let quotes = vceqq_u8(data, quote_mask);
            let escapes = vceqq_u8(data, escape_mask);
            let carriage_returns = vceqq_u8(data, carriage_return_mask);
            let zeros = vceqq_u8(data, zero_mask);
            let newlines = vceqq_u8(data, newline_mask);

            // vmaxvq_u8 reduces a comparison vector to a single byte that
            // is non-zero iff any lane matched.
            let test_result =
                vorrq_u8(vorrq_u8(quotes, zeros), vorrq_u8(escapes, carriage_returns));
            let bitmask = vmaxvq_u8(test_result);
            let newline_mask = vmaxvq_u8(newlines);
            if bitmask != 0 {
                // NEON has no movemask equivalent, so locate the first
                // special byte with a scalar scan of this 16-byte chunk.
                let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
                let position = chunk_bytes
                    .iter()
                    .position(|&b| matches!(b, b'<' | b'&' | b'\r' | b'\0'))
                    .unwrap();

                // Only count newlines that precede the special character.
                n_newlines += chunk_bytes[..position]
                    .iter()
                    .filter(|&&b| b == b'\n')
                    .count() as u64;

                i += position;
                break;
            } else if newline_mask != 0 {
                let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
                n_newlines += chunk_bytes.iter().filter(|&&b| b == b'\n').count() as u64;
            }

            i += STRIDE;
        }

        (i, n_newlines)
    }
2138}
2139
2140#[cfg(test)]
2141#[allow(non_snake_case)]
2142mod test {
2143 use super::option_push; use crate::tendril::{SliceExt, StrTendril};
2145
2146 use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
2147
2148 use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
2149 use super::interface::{EndTag, StartTag, Tag, TagKind};
2150 use super::interface::{TagToken, Token};
2151
2152 use markup5ever::buffer_queue::BufferQueue;
2153 use std::cell::RefCell;
2154
2155 use crate::LocalName;
2156
    /// Test sink that records each token together with the line number it
    /// was reported on, coalescing adjacent character tokens.
    struct LinesMatch {
        // Completed character-token runs (flushed from `current_str`).
        tokens: RefCell<Vec<Token>>,
        // Characters buffered since the last non-character token.
        current_str: RefCell<StrTendril>,
        // The recorded (token, line number) pairs under test.
        lines: RefCell<Vec<(Token, u64)>>,
    }
2165
2166 impl LinesMatch {
2167 fn new() -> LinesMatch {
2168 LinesMatch {
2169 tokens: RefCell::new(vec![]),
2170 current_str: RefCell::new(StrTendril::new()),
2171 lines: RefCell::new(vec![]),
2172 }
2173 }
2174
2175 fn push(&self, token: Token, line_number: u64) {
2176 self.finish_str();
2177 self.lines.borrow_mut().push((token, line_number));
2178 }
2179
2180 fn finish_str(&self) {
2181 if !self.current_str.borrow().is_empty() {
2182 let s = self.current_str.take();
2183 self.tokens.borrow_mut().push(CharacterTokens(s));
2184 }
2185 }
2186 }
2187
2188 impl TokenSink for LinesMatch {
2189 type Handle = ();
2190
2191 fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
2192 match token {
2193 CharacterTokens(b) => {
2194 self.current_str.borrow_mut().push_slice(&b);
2195 },
2196
2197 NullCharacterToken => {
2198 self.current_str.borrow_mut().push_char('\0');
2199 },
2200
2201 ParseError(_) => {
2202 panic!("unexpected parse error");
2203 },
2204
2205 TagToken(mut t) => {
2206 match t.kind {
2210 EndTag => {
2211 t.self_closing = false;
2212 t.attrs = vec![];
2213 },
2214 _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
2215 }
2216 self.push(TagToken(t), line_number);
2217 },
2218
2219 EOFToken => (),
2220
2221 _ => self.push(token, line_number),
2222 }
2223 TokenSinkResult::Continue
2224 }
2225 }
2226
2227 fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
2230 let sink = LinesMatch::new();
2231 let tok = Tokenizer::new(sink, opts);
2232 let buffer = BufferQueue::default();
2233 for chunk in input.into_iter() {
2234 buffer.push_back(chunk);
2235 let _ = tok.feed(&buffer);
2236 }
2237 tok.end();
2238 tok.sink.lines.take()
2239 }
2240
2241 fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
2243 let name = LocalName::from(&*token);
2244
2245 TagToken(Tag {
2246 kind: tagkind,
2247 name,
2248 self_closing: false,
2249 attrs: vec![],
2250 had_duplicate_attributes: false,
2251 })
2252 }
2253
2254 #[test]
2255 fn push_to_None_gives_singleton() {
2256 let mut s: Option<StrTendril> = None;
2257 option_push(&mut s, 'x');
2258 assert_eq!(s, Some("x".to_tendril()));
2259 }
2260
2261 #[test]
2262 fn push_to_empty_appends() {
2263 let mut s: Option<StrTendril> = Some(StrTendril::new());
2264 option_push(&mut s, 'x');
2265 assert_eq!(s, Some("x".to_tendril()));
2266 }
2267
2268 #[test]
2269 fn push_to_nonempty_appends() {
2270 let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
2271 option_push(&mut s, 'x');
2272 assert_eq!(s, Some("yx".to_tendril()));
2273 }
2274
2275 #[test]
2276 fn check_lines() {
2277 let opts = TokenizerOpts {
2278 exact_errors: false,
2279 discard_bom: true,
2280 profile: false,
2281 initial_state: None,
2282 last_start_tag_name: None,
2283 };
2284 let vector = vec![
2285 StrTendril::from("<a>\n"),
2286 StrTendril::from("<b>\n"),
2287 StrTendril::from("</b>\n"),
2288 StrTendril::from("</a>\n"),
2289 ];
2290 let expected = vec![
2291 (create_tag(StrTendril::from("a"), StartTag), 1),
2292 (create_tag(StrTendril::from("b"), StartTag), 2),
2293 (create_tag(StrTendril::from("b"), EndTag), 3),
2294 (create_tag(StrTendril::from("a"), EndTag), 4),
2295 ];
2296 let results = tokenize(vector, opts);
2297 assert_eq!(results, expected);
2298 }
2299
2300 #[test]
2301 fn check_lines_with_new_line() {
2302 let opts = TokenizerOpts {
2303 exact_errors: false,
2304 discard_bom: true,
2305 profile: false,
2306 initial_state: None,
2307 last_start_tag_name: None,
2308 };
2309 let vector = vec![
2310 StrTendril::from("<a>\r\n"),
2311 StrTendril::from("<b>\r\n"),
2312 StrTendril::from("</b>\r\n"),
2313 StrTendril::from("</a>\r\n"),
2314 ];
2315 let expected = vec![
2316 (create_tag(StrTendril::from("a"), StartTag), 1),
2317 (create_tag(StrTendril::from("b"), StartTag), 2),
2318 (create_tag(StrTendril::from("b"), EndTag), 3),
2319 (create_tag(StrTendril::from("a"), EndTag), 4),
2320 ];
2321 let results = tokenize(vector, opts);
2322 assert_eq!(results, expected);
2323 }
2324}