1pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15pub use self::interface::{TokenSink, TokenSinkResult};
16
17use self::states::{DoctypeIdKind, Public, System};
18use self::states::{DoubleEscaped, Escaped};
19use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22use self::char_ref::{CharRef, CharRefTokenizer};
23
24use crate::util::str::lower_ascii_letter;
25
26use log::{debug, trace};
27use markup5ever::{ns, small_char_set, TokenizerResult};
28use std::borrow::Cow::{self, Borrowed};
29use std::cell::{Cell, RefCell, RefMut};
30use std::collections::BTreeMap;
31use std::mem;
32
33pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
34use crate::macros::{time, unwrap_or_return};
35use crate::tendril::StrTendril;
36use crate::{Attribute, LocalName, QualName, SmallCharSet};
37
38mod char_ref;
39mod interface;
40pub mod states;
41
pub enum ProcessResult<Handle> {
    /// Keep stepping the state machine.
    Continue,
    /// Stop stepping until more input is available (or EOF handling takes over).
    Suspend,
    /// A script element was completed; the caller must run it before continuing.
    Script(Handle),
}
47
48fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
49 match *opt_str {
50 Some(ref mut s) => s.push_char(c),
51 None => *opt_str = Some(StrTendril::from_char(c)),
52 }
53}
54
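/// Options controlling the behaviour of the tokenizer.
///
/// A minimal sketch of overriding a single option (field meanings are
/// documented on the struct below):
///
/// ```ignore
/// let opts = TokenizerOpts {
///     exact_errors: true,
///     ..Default::default()
/// };
/// ```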
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some performance
    /// cost? Default: false
    pub exact_errors: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one at the beginning of
    /// the stream? Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state? Printed when
    /// `end()` is called. Default: false
    pub profile: bool,

    /// Initial state override; when `None`, tokenizing starts in the Data
    /// state.
    pub initial_state: Option<states::State>,

    /// Seed for the "last start tag name", used by the "appropriate end tag"
    /// check when tokenizing fragments. Default: None
    pub last_start_tag_name: Option<String>,
}
81
82impl Default for TokenizerOpts {
83 fn default() -> TokenizerOpts {
84 TokenizerOpts {
85 exact_errors: false,
86 discard_bom: true,
87 profile: false,
88 initial_state: None,
89 last_start_tag_name: None,
90 }
91 }
92}
93
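/// The HTML tokenizer.
///
/// A minimal usage sketch (the `sink` value stands for some caller-provided
/// [`TokenSink`] implementation; it is not defined in this module):
///
/// ```ignore
/// let tokenizer = Tokenizer::new(sink, TokenizerOpts::default());
/// let input = BufferQueue::default();
/// input.push_back(StrTendril::from("<p>Hello</p>"));
/// while let TokenizerResult::Script(_script) = tokenizer.feed(&input) {
///     // The caller runs the script, then resumes feeding.
/// }
/// tokenizer.end();
/// ```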
pub struct Tokenizer<Sink> {
    /// Options controlling the behaviour of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: Cell<states::State>,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: Cell<bool>,

    /// Tokenizer for character references, if we're tokenizing one at the moment.
    char_ref_tokenizer: RefCell<Option<CharRefTokenizer>>,

    /// Current input character. Just consumed, may be reconsumed.
    current_char: Cell<char>,

    /// Should we reconsume the current input character?
    reconsume: Cell<bool>,

    /// Did we just consume \r, translating it to \n? In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: Cell<bool>,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
    /// beginning of the stream.
    discard_bom: Cell<bool>,

    /// Current tag kind.
    current_tag_kind: Cell<TagKind>,

    /// Current tag name.
    current_tag_name: RefCell<StrTendril>,

    /// Is the current tag self-closing?
    current_tag_self_closing: Cell<bool>,

    /// Current tag attributes.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Current attribute name.
    current_attr_name: RefCell<StrTendril>,

    /// Current attribute value.
    current_attr_value: RefCell<StrTendril>,

    /// Current comment.
    current_comment: RefCell<StrTendril>,

    /// Current doctype token.
    current_doctype: RefCell<Doctype>,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: RefCell<Option<LocalName>>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: RefCell<StrTendril>,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: RefCell<BTreeMap<states::State, u64>>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: Cell<u64>,

    /// Current line number of the input being processed.
    current_line: Cell<u64>,
}
166
167impl<Sink: TokenSink> Tokenizer<Sink> {
168 pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
170 let start_tag_name = opts
171 .last_start_tag_name
172 .take()
173 .map(|s| LocalName::from(&*s));
174 let state = opts.initial_state.unwrap_or(states::Data);
175 let discard_bom = opts.discard_bom;
176 Tokenizer {
177 opts,
178 sink,
179 state: Cell::new(state),
180 char_ref_tokenizer: RefCell::new(None),
181 at_eof: Cell::new(false),
182 current_char: Cell::new('\0'),
183 reconsume: Cell::new(false),
184 ignore_lf: Cell::new(false),
185 discard_bom: Cell::new(discard_bom),
186 current_tag_kind: Cell::new(StartTag),
187 current_tag_name: RefCell::new(StrTendril::new()),
188 current_tag_self_closing: Cell::new(false),
189 current_tag_attrs: RefCell::new(vec![]),
190 current_attr_name: RefCell::new(StrTendril::new()),
191 current_attr_value: RefCell::new(StrTendril::new()),
192 current_comment: RefCell::new(StrTendril::new()),
193 current_doctype: RefCell::new(Doctype::default()),
194 last_start_tag_name: RefCell::new(start_tag_name),
195 temp_buf: RefCell::new(StrTendril::new()),
196 state_profile: RefCell::new(BTreeMap::new()),
197 time_in_sink: Cell::new(0),
198 current_line: Cell::new(1),
199 }
200 }
201
202 pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
204 if input.is_empty() {
205 return TokenizerResult::Done;
206 }
207
208 if self.discard_bom.get() {
209 if let Some(c) = input.peek() {
210 if c == '\u{feff}' {
211 input.next();
212 }
213 } else {
214 return TokenizerResult::Done;
215 }
        }
217
218 self.run(input)
219 }
220
221 pub fn set_plaintext_state(&self) {
222 self.state.set(states::Plaintext);
223 }
224
225 fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
226 if self.opts.profile {
227 let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
228 self.time_in_sink.set(self.time_in_sink.get() + dt);
229 ret
230 } else {
231 self.sink.process_token(token, self.current_line.get())
232 }
233 }
234
235 fn process_token_and_continue(&self, token: Token) {
236 assert!(matches!(
237 self.process_token(token),
238 TokenSinkResult::Continue
239 ));
240 }
241
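    // Preprocess one input character as described in the spec: fold CR and
    // CRLF into LF, bump `current_line` on newlines and, when `exact_errors`
    // is enabled, report control characters and noncharacters.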
242 fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
246 if self.ignore_lf.get() {
247 self.ignore_lf.set(false);
248 if c == '\n' {
249 c = input.next()?;
250 }
251 }
252
253 if c == '\r' {
254 self.ignore_lf.set(true);
255 c = '\n';
256 }
257
258 if c == '\n' {
259 self.current_line.set(self.current_line.get() + 1);
260 }
261
262 if self.opts.exact_errors
263 && match c as u32 {
264 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
265 n if (n & 0xFFFE) == 0xFFFE => true,
266 _ => false,
267 }
268 {
269 let msg = format!("Bad character {c}");
270 self.emit_error(Cow::Owned(msg));
271 }
272
273 trace!("got character {c}");
274 self.current_char.set(c);
275 Some(c)
276 }
277
278 fn get_char(&self, input: &BufferQueue) -> Option<char> {
281 if self.reconsume.get() {
282 self.reconsume.set(false);
283 Some(self.current_char.get())
284 } else {
285 input
286 .next()
287 .and_then(|c| self.get_preprocessed_char(c, input))
288 }
289 }
290
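    // Bulk fast path used by the data-like states: return either a single
    // character from `set` (`FromSet`) or a whole run of characters that are
    // not in `set` (`NotFromSet`), so plain text can be emitted in one chunk.
    // Falls back to `get_char` whenever per-character preprocessing is needed.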
291 fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
292 if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
297 return self.get_char(input).map(FromSet);
298 }
299
300 let d = input.pop_except_from(set);
301 trace!("got characters {d:?}");
302 match d {
303 Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
304
305 _ => d,
309 }
310 }
311
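    // Attempt to match `pat` at the front of the input, comparing bytes with
    // `eq`. Returns Some(matched) once there is enough input to decide, or
    // None when more input is needed; in that case the characters consumed so
    // far are stashed in `temp_buf` and pushed back in front on the next call.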
312 fn eat(&self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> {
317 if self.ignore_lf.get() {
318 self.ignore_lf.set(false);
319 if self.peek(input) == Some('\n') {
320 self.discard_char(input);
321 }
322 }
323
        input.push_front(mem::take(&mut *self.temp_buf.borrow_mut()));
325 match input.eat(pat, eq) {
326 None if self.at_eof.get() => Some(false),
327 None => {
328 while let Some(data) = input.next() {
329 self.temp_buf.borrow_mut().push_char(data);
330 }
331 None
332 },
333 Some(matched) => Some(matched),
334 }
335 }
336
337 fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
339 if self.opts.profile {
340 loop {
341 let state = self.state.get();
342 let old_sink = self.time_in_sink.get();
343 let (run, mut dt) = time!(self.step(input));
                dt -= self.time_in_sink.get() - old_sink;
345 let new = match self.state_profile.borrow_mut().get_mut(&state) {
346 Some(x) => {
347 *x += dt;
348 false
349 },
350 None => true,
351 };
352 if new {
353 self.state_profile.borrow_mut().insert(state, dt);
355 }
356 match run {
357 ProcessResult::Continue => (),
358 ProcessResult::Suspend => break,
359 ProcessResult::Script(node) => return TokenizerResult::Script(node),
360 }
361 }
362 } else {
363 loop {
364 match self.step(input) {
365 ProcessResult::Continue => (),
366 ProcessResult::Suspend => break,
367 ProcessResult::Script(node) => return TokenizerResult::Script(node),
368 }
369 }
370 }
371 TokenizerResult::Done
372 }
373
374 #[inline]
375 fn bad_char_error(&self) {
376 #[cfg(feature = "trace_tokenizer")]
377 trace!(" error");
378
379 let msg = if self.opts.exact_errors {
380 Cow::from("Bad character")
381 } else {
382 let c = self.current_char.get();
383 let state = self.state.get();
384 Cow::from(format!("Saw {c} in state {state:?}"))
385 };
386 self.emit_error(msg);
387 }
388
389 #[inline]
390 fn bad_eof_error(&self) {
391 #[cfg(feature = "trace_tokenizer")]
392 trace!(" error_eof");
393
394 let msg = if self.opts.exact_errors {
395 Cow::from("Unexpected EOF")
396 } else {
397 let state = self.state.get();
398 Cow::from(format!("Saw EOF in state {state:?}"))
399 };
400 self.emit_error(msg);
401 }
402
403 fn emit_char(&self, c: char) {
404 #[cfg(feature = "trace_tokenizer")]
405 trace!(" emit");
406
407 self.process_token_and_continue(match c {
408 '\0' => NullCharacterToken,
409 _ => CharacterTokens(StrTendril::from_char(c)),
410 });
411 }
412
413 fn emit_chars(&self, b: StrTendril) {
415 self.process_token_and_continue(CharacterTokens(b));
416 }
417
418 fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
419 self.finish_attribute();
420
421 let name = LocalName::from(&**self.current_tag_name.borrow());
422 self.current_tag_name.borrow_mut().clear();
423
424 match self.current_tag_kind.get() {
425 StartTag => {
426 *self.last_start_tag_name.borrow_mut() = Some(name.clone());
427 },
428 EndTag => {
429 if !self.current_tag_attrs.borrow().is_empty() {
430 self.emit_error(Borrowed("Attributes on an end tag"));
431 }
432 if self.current_tag_self_closing.get() {
433 self.emit_error(Borrowed("Self-closing end tag"));
434 }
435 },
436 }
437
438 let token = TagToken(Tag {
439 kind: self.current_tag_kind.get(),
440 name,
441 self_closing: self.current_tag_self_closing.get(),
            attrs: std::mem::take(&mut *self.current_tag_attrs.borrow_mut()),
443 });
444
445 match self.process_token(token) {
446 TokenSinkResult::Continue => ProcessResult::Continue,
447 TokenSinkResult::Plaintext => {
448 self.state.set(states::Plaintext);
449 ProcessResult::Continue
450 },
451 TokenSinkResult::Script(node) => {
452 self.state.set(states::Data);
453 ProcessResult::Script(node)
454 },
455 TokenSinkResult::RawData(kind) => {
456 self.state.set(states::RawData(kind));
457 ProcessResult::Continue
458 },
459 }
460 }
461
462 fn emit_temp_buf(&self) {
463 #[cfg(feature = "trace_tokenizer")]
464 trace!(" emit_temp");
465
466 let buf = mem::take(&mut *self.temp_buf.borrow_mut());
468 self.emit_chars(buf);
469 }
470
471 fn clear_temp_buf(&self) {
472 self.temp_buf.borrow_mut().clear();
474 }
475
476 fn emit_current_comment(&self) {
477 let comment = mem::take(&mut *self.current_comment.borrow_mut());
478 self.process_token_and_continue(CommentToken(comment));
479 }
480
481 fn discard_tag(&self) {
482 self.current_tag_name.borrow_mut().clear();
483 self.current_tag_self_closing.set(false);
484 *self.current_tag_attrs.borrow_mut() = vec![];
485 }
486
487 fn create_tag(&self, kind: TagKind, c: char) {
488 self.discard_tag();
489 self.current_tag_name.borrow_mut().push_char(c);
490 self.current_tag_kind.set(kind);
491 }
492
493 fn have_appropriate_end_tag(&self) -> bool {
494 match self.last_start_tag_name.borrow().as_ref() {
495 Some(last) => {
496 (self.current_tag_kind.get() == EndTag)
497 && (**self.current_tag_name.borrow() == **last)
498 },
499 None => false,
500 }
501 }
502
503 fn create_attribute(&self, c: char) {
504 self.finish_attribute();
505
506 self.current_attr_name.borrow_mut().push_char(c);
507 }
508
509 fn finish_attribute(&self) {
510 if self.current_attr_name.borrow().is_empty() {
511 return;
512 }
513
514 let dup = {
517 let name = &*self.current_attr_name.borrow();
518 self.current_tag_attrs
519 .borrow()
520 .iter()
521 .any(|a| *a.name.local == **name)
522 };
523
524 if dup {
525 self.emit_error(Borrowed("Duplicate attribute"));
526 self.current_attr_name.borrow_mut().clear();
527 self.current_attr_value.borrow_mut().clear();
528 } else {
529 let name = LocalName::from(&**self.current_attr_name.borrow());
530 self.current_attr_name.borrow_mut().clear();
531 self.current_tag_attrs.borrow_mut().push(Attribute {
532 name: QualName::new(None, ns!(), name),
                value: mem::take(&mut *self.current_attr_value.borrow_mut()),
536 });
537 }
538 }
539
540 fn emit_current_doctype(&self) {
541 let doctype = self.current_doctype.take();
542 self.process_token_and_continue(DoctypeToken(doctype));
543 }
544
545 fn doctype_id(&self, kind: DoctypeIdKind) -> RefMut<'_, Option<StrTendril>> {
546 let current_doctype = self.current_doctype.borrow_mut();
547 match kind {
548 Public => RefMut::map(current_doctype, |d| &mut d.public_id),
549 System => RefMut::map(current_doctype, |d| &mut d.system_id),
550 }
551 }
552
553 fn clear_doctype_id(&self, kind: DoctypeIdKind) {
554 let mut id = self.doctype_id(kind);
555 match *id {
556 Some(ref mut s) => s.clear(),
557 None => *id = Some(StrTendril::new()),
558 }
559 }
560
561 fn start_consuming_character_reference(&self) {
562 debug_assert!(
563 self.char_ref_tokenizer.borrow().is_none(),
564 "Nested character references are impossible"
565 );
566
567 let is_in_attribute = matches!(self.state.get(), states::AttributeValue(_));
568 *self.char_ref_tokenizer.borrow_mut() = Some(CharRefTokenizer::new(is_in_attribute));
569 }
570
571 fn emit_eof(&self) {
572 self.process_token_and_continue(EOFToken);
573 }
574
575 fn peek(&self, input: &BufferQueue) -> Option<char> {
576 if self.reconsume.get() {
577 Some(self.current_char.get())
578 } else {
579 input.peek()
580 }
581 }
582
583 fn discard_char(&self, input: &BufferQueue) {
584 if self.reconsume.get() {
590 self.reconsume.set(false);
591 } else {
592 input.next();
593 }
594 }
595
596 fn emit_error(&self, error: Cow<'static, str>) {
597 self.process_token_and_continue(ParseError(error));
598 }
599}
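
// Shorthand actions used by the state-machine arms in `step` below: each arm
// maps a short mnemonic (`push_tag`, `emit_comment`, `force_quirks`, ...) to a
// method call or buffer manipulation on the tokenizer.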
600macro_rules! shorthand (
604 ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) );
605 ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.borrow_mut().push_char($c) );
606 ( $me:ident : discard_tag ) => ( $me.discard_tag() );
607 ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) );
608 ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.borrow_mut().push_char($c) );
609 ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() );
610 ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) );
611 ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.borrow_mut().push_char($c) );
612 ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_char($c) );
613 ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
614 ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_char($c) );
615 ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_slice($c) );
616 ( $me:ident : emit_comment ) => ( $me.emit_current_comment() );
617 ( $me:ident : clear_comment ) => ( $me.current_comment.borrow_mut().clear() );
618 ( $me:ident : create_doctype ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
619 ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
620 ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c) );
621 ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) );
622 ( $me:ident : force_quirks ) => ( $me.current_doctype.borrow_mut().force_quirks = true);
623 ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() );
624);
625
626#[cfg(feature = "trace_tokenizer")]
629macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
630 trace!(" {:?}", stringify!($($cmds)*));
631 shorthand!($me : $($cmds)*);
632}));
633
634#[cfg(not(feature = "trace_tokenizer"))]
635macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
636
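// `go!` drives state transitions. It applies zero or more shorthand actions
// and then either switches state (`to`), re-queues the current character
// (`reconsume`), starts a character reference (`consume_char_ref`), emits the
// current tag, or signals EOF. For example,
// `go!(self: create_tag StartTag cl; to TagName)` creates a start tag whose
// first character is `cl`, switches to the TagName state, and returns
// `ProcessResult::Continue` from `step`.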
637macro_rules! go (
639 ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
643 ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
644 ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
645 ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });
646
647 ( $me:ident : to $s:ident ) => ({ $me.state.set(states::$s); return ProcessResult::Continue; });
650 ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue; });
651 ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });
652
653 ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume.set(true); go!($me: to $s); });
654 ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); });
655 ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });
656
657 ( $me:ident : consume_char_ref ) => ({ $me.start_consuming_character_reference(); return ProcessResult::Continue; });
658
659 ( $me:ident : emit_tag $s:ident ) => ({
661 $me.state.set(states::$s);
662 return $me.emit_current_tag();
663 });
664
665 ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });
666
667 ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );
669
670 ( $me:ident : ) => (());
672);
673
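// These wrappers bail out of `step` with `ProcessResult::Suspend` whenever the
// underlying method returns `None`, i.e. when more input is required.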
674macro_rules! get_char ( ($me:expr, $input:expr) => (
677 unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
678));
679
680macro_rules! peek ( ($me:expr, $input:expr) => (
681 unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
682));
683
684macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
685 unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
686));
687
688macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
689 unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
690));
691
692macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
693 unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
694));
695
696impl<Sink: TokenSink> Tokenizer<Sink> {
697 #[allow(clippy::never_loop)]
701 fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
702 if self.char_ref_tokenizer.borrow().is_some() {
703 return self.step_char_ref_tokenizer(input);
704 }
705
        trace!("processing in state {:?}", self.state.get());
707 match self.state.get() {
708 states::Data => loop {
710 let set = small_char_set!('\r' '\0' '&' '<' '\n');
711
712 #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
713 let set_result = if !(self.opts.exact_errors
714 || self.reconsume.get()
715 || self.ignore_lf.get())
716 && Self::is_supported_simd_feature_detected()
717 {
718 let front_buffer = input.peek_front_chunk_mut();
719 let Some(mut front_buffer) = front_buffer else {
720 return ProcessResult::Suspend;
721 };
722
723 let first_char = front_buffer
726 .chars()
727 .next()
728 .expect("Input buffers are never empty");
729
730 if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
731 drop(front_buffer);
732 self.pop_except_from(input, set)
733 } else {
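                        // SAFETY: the branch guard above already verified via
                        // `is_supported_simd_feature_detected` that the required
                        // SIMD instructions are available on this CPU.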
734 let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) };
737
738 if front_buffer.is_empty() {
739 drop(front_buffer);
740 input.pop_front();
741 }
742
743 result
744 }
745 } else {
746 self.pop_except_from(input, set)
747 };
748
749 #[cfg(not(any(
750 target_arch = "x86",
751 target_arch = "x86_64",
752 target_arch = "aarch64"
753 )))]
754 let set_result = self.pop_except_from(input, set);
755
756 let Some(set_result) = set_result else {
757 return ProcessResult::Suspend;
758 };
759 match set_result {
760 FromSet('\0') => {
761 self.bad_char_error();
762 self.emit_char('\0');
763 },
764 FromSet('&') => go!(self: consume_char_ref),
765 FromSet('<') => go!(self: to TagOpen),
766 FromSet(c) => {
767 self.emit_char(c);
768 },
769 NotFromSet(b) => self.emit_chars(b),
770 }
771 },
772
773 states::RawData(Rcdata) => loop {
775 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
776 FromSet('\0') => {
777 self.bad_char_error();
778 self.emit_char('\u{fffd}');
779 },
780 FromSet('&') => go!(self: consume_char_ref),
781 FromSet('<') => go!(self: to RawLessThanSign Rcdata),
782 FromSet(c) => self.emit_char(c),
783 NotFromSet(b) => self.emit_chars(b),
784 }
785 },
786
787 states::RawData(Rawtext) => loop {
789 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
790 FromSet('\0') => {
791 self.bad_char_error();
792 self.emit_char('\u{fffd}');
793 },
794 FromSet('<') => go!(self: to RawLessThanSign Rawtext),
795 FromSet(c) => self.emit_char(c),
796 NotFromSet(b) => self.emit_chars(b),
797 }
798 },
799
800 states::RawData(ScriptData) => loop {
802 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
803 FromSet('\0') => {
804 self.bad_char_error();
805 self.emit_char('\u{fffd}');
806 },
807 FromSet('<') => go!(self: to RawLessThanSign ScriptData),
808 FromSet(c) => self.emit_char(c),
809 NotFromSet(b) => self.emit_chars(b),
810 }
811 },
812
813 states::RawData(ScriptDataEscaped(Escaped)) => loop {
815 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
816 FromSet('\0') => {
817 self.bad_char_error();
818 self.emit_char('\u{fffd}');
819 },
820 FromSet('-') => {
821 self.emit_char('-');
822 go!(self: to ScriptDataEscapedDash Escaped);
823 },
824 FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
825 FromSet(c) => self.emit_char(c),
826 NotFromSet(b) => self.emit_chars(b),
827 }
828 },
829
830 states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
832 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
833 FromSet('\0') => {
834 self.bad_char_error();
835 self.emit_char('\u{fffd}');
836 },
837 FromSet('-') => {
838 self.emit_char('-');
839 go!(self: to ScriptDataEscapedDash DoubleEscaped);
840 },
841 FromSet('<') => {
842 self.emit_char('<');
843 go!(self: to RawLessThanSign ScriptDataEscaped DoubleEscaped)
844 },
845 FromSet(c) => self.emit_char(c),
846 NotFromSet(b) => self.emit_chars(b),
847 }
848 },
849
850 states::Plaintext => loop {
852 match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
853 FromSet('\0') => {
854 self.bad_char_error();
855 self.emit_char('\u{fffd}');
856 },
857 FromSet(c) => self.emit_char(c),
858 NotFromSet(b) => self.emit_chars(b),
859 }
860 },
861
862 states::TagOpen => loop {
864 match get_char!(self, input) {
865 '!' => go!(self: to MarkupDeclarationOpen),
866 '/' => go!(self: to EndTagOpen),
867 '?' => {
868 self.bad_char_error();
869 go!(self: clear_comment; reconsume BogusComment)
870 },
871 c => match lower_ascii_letter(c) {
872 Some(cl) => go!(self: create_tag StartTag cl; to TagName),
873 None => {
874 self.bad_char_error();
875 self.emit_char('<');
876 go!(self: reconsume Data)
877 },
878 },
879 }
880 },
881
882 states::EndTagOpen => loop {
884 match get_char!(self, input) {
885 '>' => {
886 self.bad_char_error();
887 go!(self: to Data)
888 },
889 c => match lower_ascii_letter(c) {
890 Some(cl) => go!(self: create_tag EndTag cl; to TagName),
891 None => {
892 self.bad_char_error();
893 go!(self: clear_comment; reconsume BogusComment)
894 },
895 },
896 }
897 },
898
899 states::TagName => loop {
901 match get_char!(self, input) {
902 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
903 '/' => go!(self: to SelfClosingStartTag),
904 '>' => go!(self: emit_tag Data),
905 '\0' => {
906 self.bad_char_error();
907 go!(self: push_tag '\u{fffd}')
908 },
909 c => go!(self: push_tag (c.to_ascii_lowercase())),
910 }
911 },
912
913 states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
915 match get_char!(self, input) {
916 '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
917 c => match lower_ascii_letter(c) {
918 Some(cl) => {
919 go!(self: clear_temp; push_temp cl);
920 self.emit_char('<');
921 self.emit_char(c);
922 go!(self: to ScriptDataEscapeStart DoubleEscaped);
923 },
924 None => {
925 self.emit_char('<');
926 go!(self: reconsume RawData ScriptDataEscaped Escaped);
927 },
928 },
929 }
930 },
931
932 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
934 match get_char!(self, input) {
935 '/' => {
936 go!(self: clear_temp);
937 self.emit_char('/');
938 go!(self: to ScriptDataDoubleEscapeEnd);
939 },
940 _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
941 }
942 },
943
944 states::RawLessThanSign(kind) => loop {
947 match get_char!(self, input) {
948 '/' => go!(self: clear_temp; to RawEndTagOpen kind),
949 '!' if kind == ScriptData => {
950 self.emit_char('<');
951 self.emit_char('!');
952 go!(self: to ScriptDataEscapeStart Escaped);
953 },
954 _ => {
955 self.emit_char('<');
956 go!(self: reconsume RawData kind);
957 },
958 }
959 },
960
961 states::RawEndTagOpen(kind) => loop {
963 let c = get_char!(self, input);
964 match lower_ascii_letter(c) {
965 Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
966 None => {
967 self.emit_char('<');
968 self.emit_char('/');
969 go!(self: reconsume RawData kind);
970 },
971 }
972 },
973
974 states::RawEndTagName(kind) => loop {
976 let c = get_char!(self, input);
977 if self.have_appropriate_end_tag() {
978 match c {
979 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to BeforeAttributeName),
980 '/' => go!(self: clear_temp; to SelfClosingStartTag),
981 '>' => go!(self: clear_temp; emit_tag Data),
982 _ => (),
983 }
984 }
985
986 match lower_ascii_letter(c) {
987 Some(cl) => go!(self: push_tag cl; push_temp c),
988 None => {
989 go!(self: discard_tag);
990 self.emit_char('<');
991 self.emit_char('/');
992 self.emit_temp_buf();
993 go!(self: reconsume RawData kind);
994 },
995 }
996 },
997
998 states::ScriptDataEscapeStart(DoubleEscaped) => loop {
1000 let c = get_char!(self, input);
1001 match c {
1002 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1003 let esc = if &**self.temp_buf.borrow() == "script" {
1004 DoubleEscaped
1005 } else {
1006 Escaped
1007 };
1008 self.emit_char(c);
1009 go!(self: to RawData ScriptDataEscaped esc);
1010 },
1011 _ => match lower_ascii_letter(c) {
1012 Some(cl) => {
1013 go!(self: push_temp cl);
1014 self.emit_char(c);
1015 },
1016 None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
1017 },
1018 }
1019 },
1020
1021 states::ScriptDataEscapeStart(Escaped) => loop {
1023 match get_char!(self, input) {
1024 '-' => {
1025 self.emit_char('-');
1026 go!(self: to ScriptDataEscapeStartDash);
1027 },
1028 _ => go!(self: reconsume RawData ScriptData),
1029 }
1030 },
1031
1032 states::ScriptDataEscapeStartDash => loop {
1034 match get_char!(self, input) {
1035 '-' => {
1036 self.emit_char('-');
1037 go!(self: to ScriptDataEscapedDashDash Escaped);
1038 },
1039 _ => go!(self: reconsume RawData ScriptData),
1040 }
1041 },
1042
1043 states::ScriptDataEscapedDash(kind) => loop {
1045 match get_char!(self, input) {
1046 '-' => {
1047 self.emit_char('-');
1048 go!(self: to ScriptDataEscapedDashDash kind);
1049 },
1050 '<' => {
1051 if kind == DoubleEscaped {
1052 self.emit_char('<');
1053 }
1054 go!(self: to RawLessThanSign ScriptDataEscaped kind);
1055 },
1056 '\0' => {
1057 self.bad_char_error();
1058 self.emit_char('\u{fffd}');
1059 go!(self: to RawData ScriptDataEscaped kind)
1060 },
1061 c => {
1062 self.emit_char(c);
1063 go!(self: to RawData ScriptDataEscaped kind);
1064 },
1065 }
1066 },
1067
1068 states::ScriptDataEscapedDashDash(kind) => loop {
1070 match get_char!(self, input) {
1071 '-' => {
1072 self.emit_char('-');
1073 },
1074 '<' => {
1075 if kind == DoubleEscaped {
1076 self.emit_char('<');
1077 }
1078 go!(self: to RawLessThanSign ScriptDataEscaped kind);
1079 },
1080 '>' => {
1081 self.emit_char('>');
1082 go!(self: to RawData ScriptData);
1083 },
1084 '\0' => {
1085 self.bad_char_error();
1086 self.emit_char('\u{fffd}');
1087 go!(self: to RawData ScriptDataEscaped kind)
1088 },
1089 c => {
1090 self.emit_char(c);
1091 go!(self: to RawData ScriptDataEscaped kind);
1092 },
1093 }
1094 },
1095
1096 states::ScriptDataDoubleEscapeEnd => loop {
1098 let c = get_char!(self, input);
1099 match c {
1100 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1101 let esc = if &**self.temp_buf.borrow() == "script" {
1102 Escaped
1103 } else {
1104 DoubleEscaped
1105 };
1106 self.emit_char(c);
1107 go!(self: to RawData ScriptDataEscaped esc);
1108 },
1109 _ => match lower_ascii_letter(c) {
1110 Some(cl) => {
1111 go!(self: push_temp cl);
1112 self.emit_char(c);
1113 },
1114 None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
1115 },
1116 }
1117 },
1118
1119 states::BeforeAttributeName => loop {
1121 match get_char!(self, input) {
1122 '\t' | '\n' | '\x0C' | ' ' => (),
1123 '/' => go!(self: to SelfClosingStartTag),
1124 '>' => go!(self: emit_tag Data),
1125 '\0' => {
1126 self.bad_char_error();
1127 go!(self: create_attr '\u{fffd}'; to AttributeName)
1128 },
1129 c => match lower_ascii_letter(c) {
1130 Some(cl) => go!(self: create_attr cl; to AttributeName),
1131 None => {
1132 if matches!(c, '"' | '\'' | '<' | '=') {
1133 self.bad_char_error();
1134 }
1135
1136 go!(self: create_attr c; to AttributeName);
1137 },
1138 },
1139 }
1140 },
1141
1142 states::AttributeName => loop {
1144 match get_char!(self, input) {
1145 '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
1146 '/' => go!(self: to SelfClosingStartTag),
1147 '=' => go!(self: to BeforeAttributeValue),
1148 '>' => go!(self: emit_tag Data),
1149 '\0' => {
1150 self.bad_char_error();
1151 go!(self: push_name '\u{fffd}')
1152 },
1153 c => match lower_ascii_letter(c) {
1154 Some(cl) => go!(self: push_name cl),
1155 None => {
1156 if matches!(c, '"' | '\'' | '<') {
1157 self.bad_char_error();
1158 }
1159 go!(self: push_name c);
1160 },
1161 },
1162 }
1163 },
1164
1165 states::AfterAttributeName => loop {
1167 match get_char!(self, input) {
1168 '\t' | '\n' | '\x0C' | ' ' => (),
1169 '/' => go!(self: to SelfClosingStartTag),
1170 '=' => go!(self: to BeforeAttributeValue),
1171 '>' => go!(self: emit_tag Data),
1172 '\0' => {
1173 self.bad_char_error();
1174 go!(self: create_attr '\u{fffd}'; to AttributeName)
1175 },
1176 c => match lower_ascii_letter(c) {
1177 Some(cl) => go!(self: create_attr cl; to AttributeName),
1178 None => {
1179 if matches!(c, '"' | '\'' | '<') {
1180 self.bad_char_error();
1181 }
1182
1183 go!(self: create_attr c; to AttributeName);
1184 },
1185 },
1186 }
1187 },
1188
1189 states::BeforeAttributeValue => loop {
1193 match peek!(self, input) {
1194 '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1195 '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1196 '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1197 '>' => {
1198 go!(self: discard_char input);
1199 self.bad_char_error();
1200 go!(self: emit_tag Data)
1201 },
1202 _ => go!(self: to AttributeValue Unquoted),
1203 }
1204 },
1205
1206 states::AttributeValue(DoubleQuoted) => loop {
1208 match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1209 FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1210 FromSet('&') => go!(self: consume_char_ref),
1211 FromSet('\0') => {
1212 self.bad_char_error();
1213 go!(self: push_value '\u{fffd}')
1214 },
1215 FromSet(c) => go!(self: push_value c),
1216 NotFromSet(ref b) => go!(self: append_value b),
1217 }
1218 },
1219
1220 states::AttributeValue(SingleQuoted) => loop {
1222 match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1223 FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1224 FromSet('&') => go!(self: consume_char_ref),
1225 FromSet('\0') => {
1226 self.bad_char_error();
1227 go!(self: push_value '\u{fffd}')
1228 },
1229 FromSet(c) => go!(self: push_value c),
1230 NotFromSet(ref b) => go!(self: append_value b),
1231 }
1232 },
1233
1234 states::AttributeValue(Unquoted) => loop {
1236 match pop_except_from!(
1237 self,
1238 input,
1239 small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1240 ) {
1241 FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1242 go!(self: to BeforeAttributeName)
1243 },
1244 FromSet('&') => go!(self: consume_char_ref),
1245 FromSet('>') => go!(self: emit_tag Data),
1246 FromSet('\0') => {
1247 self.bad_char_error();
1248 go!(self: push_value '\u{fffd}')
1249 },
1250 FromSet(c) => {
1251 if matches!(c, '"' | '\'' | '<' | '=' | '`') {
1252 self.bad_char_error();
1253 }
1254 go!(self: push_value c);
1255 },
1256 NotFromSet(ref b) => go!(self: append_value b),
1257 }
1258 },
1259
1260 states::AfterAttributeValueQuoted => loop {
1262 match get_char!(self, input) {
1263 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1264 '/' => go!(self: to SelfClosingStartTag),
1265 '>' => go!(self: emit_tag Data),
1266 _ => {
1267 self.bad_char_error();
1268 go!(self: reconsume BeforeAttributeName)
1269 },
1270 }
1271 },
1272
1273 states::SelfClosingStartTag => loop {
1275 match get_char!(self, input) {
1276 '>' => {
1277 self.current_tag_self_closing.set(true);
1278 go!(self: emit_tag Data);
1279 },
1280 _ => {
1281 self.bad_char_error();
1282 go!(self: reconsume BeforeAttributeName)
1283 },
1284 }
1285 },
1286
1287 states::CommentStart => loop {
1289 match get_char!(self, input) {
1290 '-' => go!(self: to CommentStartDash),
1291 '\0' => {
1292 self.bad_char_error();
1293 go!(self: push_comment '\u{fffd}'; to Comment)
1294 },
1295 '>' => {
1296 self.bad_char_error();
1297 go!(self: emit_comment; to Data)
1298 },
1299 c => go!(self: push_comment c; to Comment),
1300 }
1301 },
1302
1303 states::CommentStartDash => loop {
1305 match get_char!(self, input) {
1306 '-' => go!(self: to CommentEnd),
1307 '\0' => {
1308 self.bad_char_error();
1309 go!(self: append_comment "-\u{fffd}"; to Comment)
1310 },
1311 '>' => {
1312 self.bad_char_error();
1313 go!(self: emit_comment; to Data)
1314 },
1315 c => go!(self: push_comment '-'; push_comment c; to Comment),
1316 }
1317 },
1318
1319 states::Comment => loop {
1321 match get_char!(self, input) {
1322 c @ '<' => go!(self: push_comment c; to CommentLessThanSign),
1323 '-' => go!(self: to CommentEndDash),
1324 '\0' => {
1325 self.bad_char_error();
1326 go!(self: push_comment '\u{fffd}')
1327 },
1328 c => go!(self: push_comment c),
1329 }
1330 },
1331
1332 states::CommentLessThanSign => loop {
1334 match get_char!(self, input) {
1335 c @ '!' => go!(self: push_comment c; to CommentLessThanSignBang),
1336 c @ '<' => go!(self: push_comment c),
1337 _ => go!(self: reconsume Comment),
1338 }
1339 },
1340
1341 states::CommentLessThanSignBang => loop {
1343 match get_char!(self, input) {
1344 '-' => go!(self: to CommentLessThanSignBangDash),
1345 _ => go!(self: reconsume Comment),
1346 }
1347 },
1348
1349 states::CommentLessThanSignBangDash => loop {
1351 match get_char!(self, input) {
1352 '-' => go!(self: to CommentLessThanSignBangDashDash),
1353 _ => go!(self: reconsume CommentEndDash),
1354 }
1355 },
1356
1357 states::CommentLessThanSignBangDashDash => loop {
1359 match get_char!(self, input) {
1360 '>' => go!(self: reconsume CommentEnd),
1361 _ => {
1362 self.bad_char_error();
1363 go!(self: reconsume CommentEnd)
1364 },
1365 }
1366 },
1367
1368 states::CommentEndDash => loop {
1370 match get_char!(self, input) {
1371 '-' => go!(self: to CommentEnd),
1372 '\0' => {
1373 self.bad_char_error();
1374 go!(self: append_comment "-\u{fffd}"; to Comment)
1375 },
1376 c => go!(self: push_comment '-'; push_comment c; to Comment),
1377 }
1378 },
1379
1380 states::CommentEnd => loop {
1382 match get_char!(self, input) {
1383 '>' => go!(self: emit_comment; to Data),
1384 '!' => go!(self: to CommentEndBang),
1385 '-' => go!(self: push_comment '-'),
1386 _ => go!(self: append_comment "--"; reconsume Comment),
1387 }
1388 },
1389
1390 states::CommentEndBang => loop {
1392 match get_char!(self, input) {
1393 '-' => go!(self: append_comment "--!"; to CommentEndDash),
1394 '>' => {
1395 self.bad_char_error();
1396 go!(self: emit_comment; to Data)
1397 },
1398 '\0' => {
1399 self.bad_char_error();
1400 go!(self: append_comment "--!\u{fffd}"; to Comment)
1401 },
1402 c => go!(self: append_comment "--!"; push_comment c; to Comment),
1403 }
1404 },
1405
1406 states::Doctype => loop {
1408 match get_char!(self, input) {
1409 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1410 '>' => go!(self: reconsume BeforeDoctypeName),
1411 _ => {
1412 self.bad_char_error();
1413 go!(self: reconsume BeforeDoctypeName)
1414 },
1415 }
1416 },
1417
1418 states::BeforeDoctypeName => loop {
1420 match get_char!(self, input) {
1421 '\t' | '\n' | '\x0C' | ' ' => (),
1422 '\0' => {
1423 self.bad_char_error();
1424 go!(self: create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1425 },
1426 '>' => {
1427 self.bad_char_error();
1428 go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1429 },
1430 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1431 to DoctypeName),
1432 }
1433 },
1434
1435 states::DoctypeName => loop {
1437 match get_char!(self, input) {
1438 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1439 '>' => go!(self: emit_doctype; to Data),
1440 '\0' => {
1441 self.bad_char_error();
1442 go!(self: push_doctype_name '\u{fffd}')
1443 },
1444 c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1445 }
1446 },
1447
1448 states::AfterDoctypeName => loop {
1450 if eat!(self, input, "public") {
1451 go!(self: to AfterDoctypeKeyword Public);
1452 } else if eat!(self, input, "system") {
1453 go!(self: to AfterDoctypeKeyword System);
1454 } else {
1455 match get_char!(self, input) {
1456 '\t' | '\n' | '\x0C' | ' ' => (),
1457 '>' => go!(self: emit_doctype; to Data),
1458 _ => {
1459 self.bad_char_error();
1460 go!(self: force_quirks; reconsume BogusDoctype)
1461 },
1462 }
1463 }
1464 },
1465
1466 states::AfterDoctypeKeyword(kind) => loop {
1468 match get_char!(self, input) {
1469 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1470 '"' => {
1471 self.bad_char_error();
1472 go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1473 },
1474 '\'' => {
1475 self.bad_char_error();
1476 go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1477 },
1478 '>' => {
1479 self.bad_char_error();
1480 go!(self: force_quirks; emit_doctype; to Data)
1481 },
1482 _ => {
1483 self.bad_char_error();
1484 go!(self: force_quirks; reconsume BogusDoctype)
1485 },
1486 }
1487 },
1488
1489 states::BeforeDoctypeIdentifier(kind) => loop {
1491 match get_char!(self, input) {
1492 '\t' | '\n' | '\x0C' | ' ' => (),
1493 '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1494 '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1495 '>' => {
1496 self.bad_char_error();
1497 go!(self: force_quirks; emit_doctype; to Data)
1498 },
1499 _ => {
1500 self.bad_char_error();
1501 go!(self: force_quirks; reconsume BogusDoctype)
1502 },
1503 }
1504 },
1505
1506 states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1508 match get_char!(self, input) {
1509 '"' => go!(self: to AfterDoctypeIdentifier kind),
1510 '\0' => {
1511 self.bad_char_error();
1512 go!(self: push_doctype_id kind '\u{fffd}')
1513 },
1514 '>' => {
1515 self.bad_char_error();
1516 go!(self: force_quirks; emit_doctype; to Data)
1517 },
1518 c => go!(self: push_doctype_id kind c),
1519 }
1520 },
1521
1522 states::DoctypeIdentifierSingleQuoted(kind) => loop {
1524 match get_char!(self, input) {
1525 '\'' => go!(self: to AfterDoctypeIdentifier kind),
1526 '\0' => {
1527 self.bad_char_error();
1528 go!(self: push_doctype_id kind '\u{fffd}')
1529 },
1530 '>' => {
1531 self.bad_char_error();
1532 go!(self: force_quirks; emit_doctype; to Data)
1533 },
1534 c => go!(self: push_doctype_id kind c),
1535 }
1536 },
1537
1538 states::AfterDoctypeIdentifier(Public) => loop {
1540 match get_char!(self, input) {
1541 '\t' | '\n' | '\x0C' | ' ' => {
1542 go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1543 },
1544 '>' => go!(self: emit_doctype; to Data),
1545 '"' => {
1546 self.bad_char_error();
1547 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1548 },
1549 '\'' => {
1550 self.bad_char_error();
1551 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1552 },
1553 _ => {
1554 self.bad_char_error();
1555 go!(self: force_quirks; reconsume BogusDoctype)
1556 },
1557 }
1558 },
1559
1560 states::AfterDoctypeIdentifier(System) => loop {
1562 match get_char!(self, input) {
1563 '\t' | '\n' | '\x0C' | ' ' => (),
1564 '>' => go!(self: emit_doctype; to Data),
1565 _ => {
1566 self.bad_char_error();
1567 go!(self: reconsume BogusDoctype)
1568 },
1569 }
1570 },
1571
1572 states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1574 match get_char!(self, input) {
1575 '\t' | '\n' | '\x0C' | ' ' => (),
1576 '>' => go!(self: emit_doctype; to Data),
1577 '"' => {
1578 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1579 },
1580 '\'' => {
1581 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1582 },
1583 _ => {
1584 self.bad_char_error();
1585 go!(self: force_quirks; reconsume BogusDoctype)
1586 },
1587 }
1588 },
1589
1590 states::BogusDoctype => loop {
1592 match get_char!(self, input) {
1593 '>' => go!(self: emit_doctype; to Data),
1594 '\0' => {
1595 self.bad_char_error();
1596 },
1597 _ => (),
1598 }
1599 },
1600
1601 states::BogusComment => loop {
1603 match get_char!(self, input) {
1604 '>' => go!(self: emit_comment; to Data),
1605 '\0' => {
1606 self.bad_char_error();
1607 go!(self: push_comment '\u{fffd}')
1608 },
1609 c => go!(self: push_comment c),
1610 }
1611 },
1612
1613 states::MarkupDeclarationOpen => loop {
1615 if eat_exact!(self, input, "--") {
1616 go!(self: clear_comment; to CommentStart);
1617 } else if eat!(self, input, "doctype") {
1618 go!(self: to Doctype);
1619 } else {
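                    // Only honour "[CDATA[" in foreign content. The namespace check
                    // must come first: `eat_exact!` consumes input and may suspend
                    // waiting for more characters.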
1620 if self
1621 .sink
1622 .adjusted_current_node_present_but_not_in_html_namespace()
1623 && eat_exact!(self, input, "[CDATA[")
1624 {
1625 go!(self: clear_temp; to CdataSection);
1626 }
1627 self.bad_char_error();
1628 go!(self: clear_comment; to BogusComment);
1629 }
1630 },
1631
1632 states::CdataSection => loop {
1634 match get_char!(self, input) {
1635 ']' => go!(self: to CdataSectionBracket),
1636 '\0' => {
1637 self.emit_temp_buf();
1638 self.emit_char('\0');
1639 },
1640 c => go!(self: push_temp c),
1641 }
1642 },
1643
1644 states::CdataSectionBracket => match get_char!(self, input) {
1646 ']' => go!(self: to CdataSectionEnd),
1647 _ => go!(self: push_temp ']'; reconsume CdataSection),
1648 },
1649
1650 states::CdataSectionEnd => loop {
1652 match get_char!(self, input) {
1653 ']' => go!(self: push_temp ']'),
1654 '>' => {
1655 self.emit_temp_buf();
1656 go!(self: to Data);
1657 },
1658 _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1659 }
1660 },
1661 }
1663 }
1664
    fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
        let mut char_ref_tokenizer = self.char_ref_tokenizer.borrow_mut();
        match char_ref_tokenizer.as_mut().unwrap().step(self, input) {
            char_ref::Status::Done(char_ref) => {
                self.process_char_ref(char_ref);
                *char_ref_tokenizer = None;
                ProcessResult::Continue
            },
            char_ref::Status::Stuck => ProcessResult::Suspend,
            char_ref::Status::Progress => ProcessResult::Continue,
        }
    }
1680
1681 fn process_char_ref(&self, char_ref: CharRef) {
1682 let CharRef {
1683 mut chars,
1684 mut num_chars,
1685 } = char_ref;
1686
1687 if num_chars == 0 {
1688 chars[0] = '&';
1689 num_chars = 1;
1690 }
1691
1692 for i in 0..num_chars {
1693 let c = chars[i as usize];
1694 match self.state.get() {
1695 states::Data | states::RawData(states::Rcdata) => self.emit_char(c),
1696
1697 states::AttributeValue(_) => go!(self: push_value c),
1698
1699 _ => panic!(
1700 "state {:?} should not be reachable in process_char_ref",
1701 self.state.get()
1702 ),
1703 }
1704 }
1705 }
1706
1707 pub fn end(&self) {
1709 let input = BufferQueue::default();
        if let Some(mut tokenizer) = self.char_ref_tokenizer.take() {
            self.process_char_ref(tokenizer.end_of_file(self, &input));
        }
1718
1719 self.at_eof.set(true);
1722 assert!(matches!(self.run(&input), TokenizerResult::Done));
1723 assert!(input.is_empty());
1724
1725 loop {
1726 match self.eof_step() {
1727 ProcessResult::Continue => (),
1728 ProcessResult::Suspend => break,
1729 ProcessResult::Script(_) => unreachable!(),
1730 }
1731 }
1732
1733 self.sink.end();
1734
1735 if self.opts.profile {
1736 self.dump_profile();
1737 }
1738 }
1739
1740 fn dump_profile(&self) {
1741 let mut results: Vec<(states::State, u64)> = self
1742 .state_profile
1743 .borrow()
1744 .iter()
1745 .map(|(s, t)| (*s, *t))
1746 .collect();
1747 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1748
        let total: u64 = results.iter().map(|&(_, t)| t).sum();
1753 println!("\nTokenizer profile, in nanoseconds");
1754 println!(
1755 "\n{:12} total in token sink",
1756 self.time_in_sink.get()
1757 );
1758 println!("\n{total:12} total in tokenizer");
1759
1760 for (k, v) in results.into_iter() {
1761 let pct = 100.0 * (v as f64) / (total as f64);
1762 println!("{v:12} {pct:4.1}% {k:?}");
1763 }
1764 }
1765
1766 fn eof_step(&self) -> ProcessResult<Sink::Handle> {
1767 debug!("processing EOF in state {:?}", self.state.get());
1768 match self.state.get() {
1769 states::Data
1770 | states::RawData(Rcdata)
1771 | states::RawData(Rawtext)
1772 | states::RawData(ScriptData)
1773 | states::Plaintext => go!(self: eof),
1774
1775 states::TagName
1776 | states::RawData(ScriptDataEscaped(_))
1777 | states::BeforeAttributeName
1778 | states::AttributeName
1779 | states::AfterAttributeName
1780 | states::AttributeValue(_)
1781 | states::AfterAttributeValueQuoted
1782 | states::SelfClosingStartTag
1783 | states::ScriptDataEscapedDash(_)
1784 | states::ScriptDataEscapedDashDash(_) => {
1785 self.bad_eof_error();
1786 go!(self: to Data)
1787 },
1788
1789 states::BeforeAttributeValue => go!(self: reconsume AttributeValue Unquoted),
1790
1791 states::TagOpen => {
1792 self.bad_eof_error();
1793 self.emit_char('<');
1794 go!(self: to Data);
1795 },
1796
1797 states::EndTagOpen => {
1798 self.bad_eof_error();
1799 self.emit_char('<');
1800 self.emit_char('/');
1801 go!(self: to Data);
1802 },
1803
1804 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
1805 go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1806 },
1807
1808 states::RawLessThanSign(kind) => {
1809 self.emit_char('<');
1810 go!(self: to RawData kind);
1811 },
1812
1813 states::RawEndTagOpen(kind) => {
1814 self.emit_char('<');
1815 self.emit_char('/');
1816 go!(self: to RawData kind);
1817 },
1818
1819 states::RawEndTagName(kind) => {
1820 self.emit_char('<');
1821 self.emit_char('/');
1822 self.emit_temp_buf();
1823 go!(self: to RawData kind)
1824 },
1825
1826 states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
1827
1828 states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),
1829
1830 states::ScriptDataDoubleEscapeEnd => {
1831 go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1832 },
1833
1834 states::CommentStart
1835 | states::CommentStartDash
1836 | states::Comment
1837 | states::CommentEndDash
1838 | states::CommentEnd
1839 | states::CommentEndBang => {
1840 self.bad_eof_error();
1841 go!(self: emit_comment; to Data)
1842 },
1843
1844 states::CommentLessThanSign | states::CommentLessThanSignBang => {
1845 go!(self: reconsume Comment)
1846 },
1847
1848 states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash),
1849
1850 states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd),
1851
1852 states::Doctype | states::BeforeDoctypeName => {
1853 self.bad_eof_error();
1854 go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1855 },
1856
1857 states::DoctypeName
1858 | states::AfterDoctypeName
1859 | states::AfterDoctypeKeyword(_)
1860 | states::BeforeDoctypeIdentifier(_)
1861 | states::DoctypeIdentifierDoubleQuoted(_)
1862 | states::DoctypeIdentifierSingleQuoted(_)
1863 | states::AfterDoctypeIdentifier(_)
1864 | states::BetweenDoctypePublicAndSystemIdentifiers => {
1865 self.bad_eof_error();
1866 go!(self: force_quirks; emit_doctype; to Data)
1867 },
1868
1869 states::BogusDoctype => go!(self: emit_doctype; to Data),
1870
1871 states::BogusComment => go!(self: emit_comment; to Data),
1872
1873 states::MarkupDeclarationOpen => {
1874 self.bad_char_error();
1875 go!(self: to BogusComment)
1876 },
1877
1878 states::CdataSection => {
1879 self.emit_temp_buf();
1880 self.bad_eof_error();
1881 go!(self: to Data)
1882 },
1883
1884 states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),
1885
1886 states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
1887 }
1888 }
1889
1890 fn is_supported_simd_feature_detected() -> bool {
1892 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1893 {
1894 is_x86_feature_detected!("sse2")
1895 }
1896
1897 #[cfg(target_arch = "aarch64")]
1898 {
1899 std::arch::is_aarch64_feature_detected!("neon")
1900 }
1901
1902 #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
1903 false
1904 }
1905
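    // SIMD fast path for the Data state: scan ahead, 16 bytes at a time, for
    // the next byte that needs special handling ('<', '&', '\r' or '\0'),
    // counting newlines on the way, and return everything before it as a
    // single `NotFromSet` chunk. Callers must make sure the corresponding
    // SIMD feature (SSE2 or NEON) has been detected first.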
1906 #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
1907 unsafe fn data_state_simd_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
1918 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1919 let (mut i, mut n_newlines) = self.data_state_sse2_fast_path(input);
1920
1921 #[cfg(target_arch = "aarch64")]
1922 let (mut i, mut n_newlines) = self.data_state_neon_fast_path(input);
1923
1924 while let Some(c) = input.as_bytes().get(i) {
1926 if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
1927 break;
1928 }
1929 if *c == b'\n' {
1930 n_newlines += 1;
1931 }
1932
1933 i += 1;
1934 }
1935
1936 let set_result = if i == 0 {
1937 let first_char = input.pop_front_char().unwrap();
1938 debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));
1939
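            // An empty BufferQueue is fine here: the fast path is never taken
            // while `ignore_lf` is set, so `get_preprocessed_char` will not try
            // to pull a follow-up character from it.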
1940 let preprocessed_char = self
1944 .get_preprocessed_char(first_char, &BufferQueue::default())
1945 .unwrap();
1946 SetResult::FromSet(preprocessed_char)
1947 } else {
1948 debug_assert!(
1949 input.len() >= i,
1950 "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1951 i,
1952 input.len()
1953 );
1954 let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1955 input.unsafe_pop_front(i as u32);
1956 SetResult::NotFromSet(consumed_chunk)
1957 };
1958
1959 self.current_line.set(self.current_line.get() + n_newlines);
1960
1961 Some(set_result)
1962 }
1963
1964 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1965 #[target_feature(enable = "sse2")]
1966 unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
1974 #[cfg(target_arch = "x86")]
1975 use std::arch::x86::{
1976 __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1977 _mm_set1_epi8,
1978 };
1979 #[cfg(target_arch = "x86_64")]
1980 use std::arch::x86_64::{
1981 __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1982 _mm_set1_epi8,
1983 };
1984
1985 debug_assert!(!input.is_empty());
1986
1987 let quote_mask = _mm_set1_epi8('<' as i8);
1988 let escape_mask = _mm_set1_epi8('&' as i8);
1989 let carriage_return_mask = _mm_set1_epi8('\r' as i8);
1990 let zero_mask = _mm_set1_epi8('\0' as i8);
1991 let newline_mask = _mm_set1_epi8('\n' as i8);
1992
1993 let raw_bytes: &[u8] = input.as_bytes();
1994 let start = raw_bytes.as_ptr();
1995
1996 const STRIDE: usize = 16;
1997 let mut i = 0;
1998 let mut n_newlines = 0;
1999 while i + STRIDE <= raw_bytes.len() {
2000 let data = _mm_loadu_si128(start.add(i) as *const __m128i);
2002
2003 let quotes = _mm_cmpeq_epi8(data, quote_mask);
2005 let escapes = _mm_cmpeq_epi8(data, escape_mask);
2006 let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
2007 let zeros = _mm_cmpeq_epi8(data, zero_mask);
2008 let newlines = _mm_cmpeq_epi8(data, newline_mask);
2009
2010 let test_result = _mm_or_si128(
2013 _mm_or_si128(quotes, zeros),
2014 _mm_or_si128(escapes, carriage_returns),
2015 );
2016 let bitmask = _mm_movemask_epi8(test_result);
2017 let newline_mask = _mm_movemask_epi8(newlines);
2018
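            // `_mm_movemask_epi8` packs the per-byte comparison results into an
            // i32 whose bit i is set iff byte i matched, so the lowest set bit
            // gives the offset of the first special byte in this 16-byte chunk,
            // and masking everything below it counts the newlines preceding it.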
            if bitmask != 0 {
2020 let position = if cfg!(target_endian = "little") {
2022 bitmask.trailing_zeros() as usize
2023 } else {
2024 bitmask.leading_zeros() as usize
2025 };
2026
2027 n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
2028 i += position;
2029 break;
2030 } else {
2031 n_newlines += newline_mask.count_ones() as u64;
2032 }
2033
2034 i += STRIDE;
2035 }
2036
2037 (i, n_newlines)
2038 }
2039
2040 #[cfg(target_arch = "aarch64")]
2041 #[target_feature(enable = "neon")]
2042 unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
2050 use std::arch::aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8};
2051
2052 debug_assert!(!input.is_empty());
2053
2054 let quote_mask = vdupq_n_u8(b'<');
2055 let escape_mask = vdupq_n_u8(b'&');
2056 let carriage_return_mask = vdupq_n_u8(b'\r');
2057 let zero_mask = vdupq_n_u8(b'\0');
2058 let newline_mask = vdupq_n_u8(b'\n');
2059
2060 let raw_bytes: &[u8] = input.as_bytes();
2061 let start = raw_bytes.as_ptr();
2062
2063 const STRIDE: usize = 16;
2064 let mut i = 0;
2065 let mut n_newlines = 0;
2066 while i + STRIDE <= raw_bytes.len() {
2067 let data = vld1q_u8(start.add(i));
2069
2070 let quotes = vceqq_u8(data, quote_mask);
2072 let escapes = vceqq_u8(data, escape_mask);
2073 let carriage_returns = vceqq_u8(data, carriage_return_mask);
2074 let zeros = vceqq_u8(data, zero_mask);
2075 let newlines = vceqq_u8(data, newline_mask);
2076
2077 let test_result =
2080 vorrq_u8(vorrq_u8(quotes, zeros), vorrq_u8(escapes, carriage_returns));
2081 let bitmask = vmaxvq_u8(test_result);
2082 let newline_mask = vmaxvq_u8(newlines);
2083 if bitmask != 0 {
2084 let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2086 let position = chunk_bytes
2087 .iter()
2088 .position(|&b| matches!(b, b'<' | b'&' | b'\r' | b'\0'))
2089 .unwrap();
2090
2091 n_newlines += chunk_bytes[..position]
2092 .iter()
2093 .filter(|&&b| b == b'\n')
2094 .count() as u64;
2095
2096 i += position;
2097 break;
2098 } else if newline_mask != 0 {
2099 let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2100 n_newlines += chunk_bytes.iter().filter(|&&b| b == b'\n').count() as u64;
2101 }
2102
2103 i += STRIDE;
2104 }
2105
2106 (i, n_newlines)
2107 }
2108}
2109
2110#[cfg(test)]
2111#[allow(non_snake_case)]
2112mod test {
    use super::option_push;
    use crate::tendril::{SliceExt, StrTendril};
2115
2116 use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
2117
2118 use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
2119 use super::interface::{EndTag, StartTag, Tag, TagKind};
2120 use super::interface::{TagToken, Token};
2121
2122 use markup5ever::buffer_queue::BufferQueue;
2123 use std::cell::RefCell;
2124
2125 use crate::LocalName;
2126
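    // Token sink used by the line-number tests below: records every
    // non-character token together with the line it was reported on, and
    // normalizes tags (end tags lose attributes, start-tag attributes are
    // sorted) so comparisons only depend on kind and name.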
2127 struct LinesMatch {
2131 tokens: RefCell<Vec<Token>>,
2132 current_str: RefCell<StrTendril>,
2133 lines: RefCell<Vec<(Token, u64)>>,
2134 }
2135
2136 impl LinesMatch {
2137 fn new() -> LinesMatch {
2138 LinesMatch {
2139 tokens: RefCell::new(vec![]),
2140 current_str: RefCell::new(StrTendril::new()),
2141 lines: RefCell::new(vec![]),
2142 }
2143 }
2144
2145 fn push(&self, token: Token, line_number: u64) {
2146 self.finish_str();
2147 self.lines.borrow_mut().push((token, line_number));
2148 }
2149
2150 fn finish_str(&self) {
2151 if !self.current_str.borrow().is_empty() {
2152 let s = self.current_str.take();
2153 self.tokens.borrow_mut().push(CharacterTokens(s));
2154 }
2155 }
2156 }
2157
2158 impl TokenSink for LinesMatch {
2159 type Handle = ();
2160
2161 fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
2162 match token {
2163 CharacterTokens(b) => {
2164 self.current_str.borrow_mut().push_slice(&b);
2165 },
2166
2167 NullCharacterToken => {
2168 self.current_str.borrow_mut().push_char('\0');
2169 },
2170
2171 ParseError(_) => {
2172 panic!("unexpected parse error");
2173 },
2174
2175 TagToken(mut t) => {
2176 match t.kind {
2180 EndTag => {
2181 t.self_closing = false;
2182 t.attrs = vec![];
2183 },
2184 _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
2185 }
2186 self.push(TagToken(t), line_number);
2187 },
2188
2189 EOFToken => (),
2190
2191 _ => self.push(token, line_number),
2192 }
2193 TokenSinkResult::Continue
2194 }
2195 }
2196
2197 fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
2200 let sink = LinesMatch::new();
2201 let tok = Tokenizer::new(sink, opts);
2202 let buffer = BufferQueue::default();
2203 for chunk in input.into_iter() {
2204 buffer.push_back(chunk);
2205 let _ = tok.feed(&buffer);
2206 }
2207 tok.end();
2208 tok.sink.lines.take()
2209 }
2210
2211 fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
2213 let name = LocalName::from(&*token);
2214
2215 TagToken(Tag {
2216 kind: tagkind,
2217 name,
2218 self_closing: false,
2219 attrs: vec![],
2220 })
2221 }
2222
2223 #[test]
2224 fn push_to_None_gives_singleton() {
2225 let mut s: Option<StrTendril> = None;
2226 option_push(&mut s, 'x');
2227 assert_eq!(s, Some("x".to_tendril()));
2228 }
2229
2230 #[test]
2231 fn push_to_empty_appends() {
2232 let mut s: Option<StrTendril> = Some(StrTendril::new());
2233 option_push(&mut s, 'x');
2234 assert_eq!(s, Some("x".to_tendril()));
2235 }
2236
2237 #[test]
2238 fn push_to_nonempty_appends() {
2239 let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
2240 option_push(&mut s, 'x');
2241 assert_eq!(s, Some("yx".to_tendril()));
2242 }
2243
2244 #[test]
2245 fn check_lines() {
2246 let opts = TokenizerOpts {
2247 exact_errors: false,
2248 discard_bom: true,
2249 profile: false,
2250 initial_state: None,
2251 last_start_tag_name: None,
2252 };
2253 let vector = vec![
2254 StrTendril::from("<a>\n"),
2255 StrTendril::from("<b>\n"),
2256 StrTendril::from("</b>\n"),
2257 StrTendril::from("</a>\n"),
2258 ];
2259 let expected = vec![
2260 (create_tag(StrTendril::from("a"), StartTag), 1),
2261 (create_tag(StrTendril::from("b"), StartTag), 2),
2262 (create_tag(StrTendril::from("b"), EndTag), 3),
2263 (create_tag(StrTendril::from("a"), EndTag), 4),
2264 ];
2265 let results = tokenize(vector, opts);
2266 assert_eq!(results, expected);
2267 }
2268
2269 #[test]
2270 fn check_lines_with_new_line() {
2271 let opts = TokenizerOpts {
2272 exact_errors: false,
2273 discard_bom: true,
2274 profile: false,
2275 initial_state: None,
2276 last_start_tag_name: None,
2277 };
2278 let vector = vec![
2279 StrTendril::from("<a>\r\n"),
2280 StrTendril::from("<b>\r\n"),
2281 StrTendril::from("</b>\r\n"),
2282 StrTendril::from("</a>\r\n"),
2283 ];
2284 let expected = vec![
2285 (create_tag(StrTendril::from("a"), StartTag), 1),
2286 (create_tag(StrTendril::from("b"), StartTag), 2),
2287 (create_tag(StrTendril::from("b"), EndTag), 3),
2288 (create_tag(StrTendril::from("a"), EndTag), 4),
2289 ];
2290 let results = tokenize(vector, opts);
2291 assert_eq!(results, expected);
2292 }
2293}