html5ever/tokenizer/
mod.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! The HTML5 tokenizer.
11
12pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15pub use self::interface::{TokenSink, TokenSinkResult};
16
17use self::states::{DoctypeIdKind, Public, System};
18use self::states::{DoubleEscaped, Escaped};
19use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22use self::char_ref::{CharRef, CharRefTokenizer};
23
24use crate::util::str::lower_ascii_letter;
25
26use log::{debug, trace};
27use markup5ever::{ns, small_char_set, TokenizerResult};
28use std::borrow::Cow::{self, Borrowed};
29use std::cell::{Cell, RefCell, RefMut};
30use std::collections::BTreeMap;
31use std::mem;
32
33pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
34use crate::macros::{time, unwrap_or_return};
35use crate::tendril::StrTendril;
36use crate::{Attribute, LocalName, QualName, SmallCharSet};
37
38mod char_ref;
39mod interface;
40pub mod states;
41
/// The result of invoking the tokenizer once.
///
/// `Handle` is the payload type carried by [`ProcessResult::Script`].
pub enum ProcessResult<Handle> {
    /// The tokenizer should be re-invoked immediately.
    Continue,
    /// The tokenizer has not finished, but it needs to wait for more
    /// input to arrive before it can continue.
    Suspend,
    /// The tokenizer was blocked by a `<script>`.
    ///
    /// This `<script>` needs to be executed before tokenization
    /// can continue, as it might invoke `document.write`.
    Script(Handle),
    /// The tokenizer was blocked because it found a `<meta charset>` tag.
    ///
    /// Such tags may force the user agent to re-parse the document with the new
    /// encoding, but non-conformant implementations can reasonably treat
    /// this as [`Self::Continue`].
    EncodingIndicator(StrTendril),
}
61
62fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
63    match *opt_str {
64        Some(ref mut s) => s.push_char(c),
65        None => *opt_str = Some(StrTendril::from_char(c)),
66    }
67}
68
/// Tokenizer options, with an impl for `Default`.
///
/// The documented default of each field is the value produced by
/// `TokenizerOpts::default()`.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty?  Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream?  Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state?  Printed
    /// when `end()` is called.  Default: false
    pub profile: bool,

    /// Initial state override.  Only the test runner should use
    /// a non-`None` value!  Default: `None`, in which case the tokenizer
    /// starts in the data state.
    pub initial_state: Option<states::State>,

    /// Last start tag.  Only the test runner should use a
    /// non-`None` value!  Default: `None`
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
95
96impl Default for TokenizerOpts {
97    fn default() -> TokenizerOpts {
98        TokenizerOpts {
99            exact_errors: false,
100            discard_bom: true,
101            profile: false,
102            initial_state: None,
103            last_start_tag_name: None,
104        }
105    }
106}
107
/// The HTML tokenizer.
///
/// All mutable state lives in `Cell`/`RefCell` fields, so the tokenizer's
/// methods take `&self`.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: Cell<states::State>,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: Cell<bool>,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: RefCell<Option<CharRefTokenizer>>,

    /// Current input character.  Just consumed, may reconsume.
    current_char: Cell<char>,

    /// Should we reconsume the current input character?
    reconsume: Cell<bool>,

    /// Did we just consume \r, translating it to \n?  In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: Cell<bool>,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one?  Only done at the
    /// beginning of the stream.
    discard_bom: Cell<bool>,

    /// Current tag kind.
    current_tag_kind: Cell<TagKind>,

    /// Current tag name.
    current_tag_name: RefCell<StrTendril>,

    /// Current tag is self-closing?
    current_tag_self_closing: Cell<bool>,

    /// Current tag had duplicate attributes?
    current_tag_had_duplicate_attributes: Cell<bool>,

    /// Current tag attributes.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Current attribute name.  Empty when no attribute is in progress.
    current_attr_name: RefCell<StrTendril>,

    /// Current attribute value.
    current_attr_value: RefCell<StrTendril>,

    /// Current comment.
    current_comment: RefCell<StrTendril>,

    /// Current doctype token.
    current_doctype: RefCell<Doctype>,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: RefCell<Option<LocalName>>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: RefCell<StrTendril>,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: RefCell<BTreeMap<states::State, u64>>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: Cell<u64>,

    /// Current line number; starts at 1 and is incremented for every
    /// (normalized) newline that is consumed.  Passed to the sink with
    /// each token.
    current_line: Cell<u64>,
}
183
184impl<Sink: TokenSink> Tokenizer<Sink> {
185    /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
186    pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
187        let start_tag_name = opts
188            .last_start_tag_name
189            .take()
190            .map(|s| LocalName::from(&*s));
191        let state = opts.initial_state.unwrap_or(states::Data);
192        let discard_bom = opts.discard_bom;
193        Tokenizer {
194            opts,
195            sink,
196            state: Cell::new(state),
197            char_ref_tokenizer: RefCell::new(None),
198            at_eof: Cell::new(false),
199            current_char: Cell::new('\0'),
200            reconsume: Cell::new(false),
201            ignore_lf: Cell::new(false),
202            discard_bom: Cell::new(discard_bom),
203            current_tag_kind: Cell::new(StartTag),
204            current_tag_name: RefCell::new(StrTendril::new()),
205            current_tag_self_closing: Cell::new(false),
206            current_tag_had_duplicate_attributes: Cell::new(false),
207            current_tag_attrs: RefCell::new(vec![]),
208            current_attr_name: RefCell::new(StrTendril::new()),
209            current_attr_value: RefCell::new(StrTendril::new()),
210            current_comment: RefCell::new(StrTendril::new()),
211            current_doctype: RefCell::new(Doctype::default()),
212            last_start_tag_name: RefCell::new(start_tag_name),
213            temp_buf: RefCell::new(StrTendril::new()),
214            state_profile: RefCell::new(BTreeMap::new()),
215            time_in_sink: Cell::new(0),
216            current_line: Cell::new(1),
217        }
218    }
219
220    /// Feed an input string into the tokenizer.
221    pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
222        if input.is_empty() {
223            return TokenizerResult::Done;
224        }
225
226        if self.discard_bom.get() {
227            if let Some(c) = input.peek() {
228                if c == '\u{feff}' {
229                    input.next();
230                }
231            } else {
232                return TokenizerResult::Done;
233            }
234        };
235
236        self.run(input)
237    }
238
    /// Switch the tokenizer's state machine to the `Plaintext` state.
    pub fn set_plaintext_state(&self) {
        self.state.set(states::Plaintext);
    }
242
243    fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
244        if self.opts.profile {
245            let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
246            self.time_in_sink.set(self.time_in_sink.get() + dt);
247            ret
248        } else {
249            self.sink.process_token(token, self.current_line.get())
250        }
251    }
252
    /// Send `token` to the sink for tokens that must not change the
    /// tokenizer's control flow.
    ///
    /// # Panics
    ///
    /// Panics if the sink answers with anything other than
    /// `TokenSinkResult::Continue`.
    fn process_token_and_continue(&self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }
259
260    //§ preprocessing-the-input-stream
261    // Get the next input character, which might be the character
262    // 'c' that we already consumed from the buffers.
263    fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
264        if self.ignore_lf.get() {
265            self.ignore_lf.set(false);
266            if c == '\n' {
267                c = input.next()?;
268            }
269        }
270
271        if c == '\r' {
272            self.ignore_lf.set(true);
273            c = '\n';
274        }
275
276        if c == '\n' {
277            self.current_line.set(self.current_line.get() + 1);
278        }
279
280        if self.opts.exact_errors
281            && match c as u32 {
282                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
283                n if (n & 0xFFFE) == 0xFFFE => true,
284                _ => false,
285            }
286        {
287            let msg = format!("Bad character {c}");
288            self.emit_error(Cow::Owned(msg));
289        }
290
291        trace!("got character {c}");
292        self.current_char.set(c);
293        Some(c)
294    }
295
296    //§ tokenization
297    // Get the next input character, if one is available.
298    fn get_char(&self, input: &BufferQueue) -> Option<char> {
299        if self.reconsume.get() {
300            self.reconsume.set(false);
301            Some(self.current_char.get())
302        } else {
303            input
304                .next()
305                .and_then(|c| self.get_preprocessed_char(c, input))
306        }
307    }
308
309    fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
310        // Bail to the slow path for various corner cases.
311        // This means that `FromSet` can contain characters not in the set!
312        // It shouldn't matter because the fallback `FromSet` case should
313        // always do the same thing as the `NotFromSet` case.
314        if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
315            return self.get_char(input).map(FromSet);
316        }
317
318        let d = input.pop_except_from(set);
319        trace!("got characters {d:?}");
320        match d {
321            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
322
323            // NB: We don't set self.current_char for a run of characters not
324            // in the set.  It shouldn't matter for the codepaths that use
325            // this.
326            _ => d,
327        }
328    }
329
330    // Check if the next characters are an ASCII case-insensitive match.  See
331    // BufferQueue::eat.
332    //
333    // NB: this doesn't set the current input character.
334    fn eat(&self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> {
335        if self.ignore_lf.get() {
336            self.ignore_lf.set(false);
337            if self.peek(input) == Some('\n') {
338                self.discard_char(input);
339            }
340        }
341
342        input.push_front(mem::take(&mut self.temp_buf.borrow_mut()));
343        match input.eat(pat, eq) {
344            None if self.at_eof.get() => Some(false),
345            None => {
346                while let Some(data) = input.next() {
347                    self.temp_buf.borrow_mut().push_char(data);
348                }
349                None
350            },
351            Some(matched) => Some(matched),
352        }
353    }
354
355    /// Run the state machine for as long as we can.
356    fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
357        if self.opts.profile {
358            loop {
359                let state = self.state.get();
360                let old_sink = self.time_in_sink.get();
361                let (run, mut dt) = time!(self.step(input));
362                dt -= (self.time_in_sink.get() - old_sink);
363                let new = match self.state_profile.borrow_mut().get_mut(&state) {
364                    Some(x) => {
365                        *x += dt;
366                        false
367                    },
368                    None => true,
369                };
370                if new {
371                    // do this here because of borrow shenanigans
372                    self.state_profile.borrow_mut().insert(state, dt);
373                }
374                match run {
375                    ProcessResult::Continue => (),
376                    ProcessResult::Suspend => break,
377                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
378                    ProcessResult::EncodingIndicator(encoding) => {
379                        return TokenizerResult::EncodingIndicator(encoding)
380                    },
381                }
382            }
383        } else {
384            loop {
385                match self.step(input) {
386                    ProcessResult::Continue => (),
387                    ProcessResult::Suspend => break,
388                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
389                    ProcessResult::EncodingIndicator(encoding) => {
390                        return TokenizerResult::EncodingIndicator(encoding)
391                    },
392                }
393            }
394        }
395        TokenizerResult::Done
396    }
397
398    #[inline]
399    fn bad_char_error(&self) {
400        #[cfg(feature = "trace_tokenizer")]
401        trace!("  error");
402
403        let msg = if self.opts.exact_errors {
404            Cow::from("Bad character")
405        } else {
406            let c = self.current_char.get();
407            let state = self.state.get();
408            Cow::from(format!("Saw {c} in state {state:?}"))
409        };
410        self.emit_error(msg);
411    }
412
413    #[inline]
414    fn bad_eof_error(&self) {
415        #[cfg(feature = "trace_tokenizer")]
416        trace!("  error_eof");
417
418        let msg = if self.opts.exact_errors {
419            Cow::from("Unexpected EOF")
420        } else {
421            let state = self.state.get();
422            Cow::from(format!("Saw EOF in state {state:?}"))
423        };
424        self.emit_error(msg);
425    }
426
427    fn emit_char(&self, c: char) {
428        #[cfg(feature = "trace_tokenizer")]
429        trace!("  emit");
430
431        self.process_token_and_continue(match c {
432            '\0' => NullCharacterToken,
433            _ => CharacterTokens(StrTendril::from_char(c)),
434        });
435    }
436
    // Emit a run of characters to the sink as a single `CharacterTokens`.
    // The string must not contain '\0'!
    fn emit_chars(&self, b: StrTendril) {
        self.process_token_and_continue(CharacterTokens(b));
    }
441
442    fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
443        self.finish_attribute();
444
445        let name = LocalName::from(&**self.current_tag_name.borrow());
446        self.current_tag_name.borrow_mut().clear();
447
448        match self.current_tag_kind.get() {
449            StartTag => {
450                *self.last_start_tag_name.borrow_mut() = Some(name.clone());
451            },
452            EndTag => {
453                if !self.current_tag_attrs.borrow().is_empty() {
454                    self.emit_error(Borrowed("Attributes on an end tag"));
455                }
456                if self.current_tag_self_closing.get() {
457                    self.emit_error(Borrowed("Self-closing end tag"));
458                }
459            },
460        }
461
462        let token = TagToken(Tag {
463            kind: self.current_tag_kind.get(),
464            name,
465            self_closing: self.current_tag_self_closing.get(),
466            attrs: std::mem::take(&mut self.current_tag_attrs.borrow_mut()),
467            had_duplicate_attributes: self.current_tag_had_duplicate_attributes.get(),
468        });
469
470        match self.process_token(token) {
471            TokenSinkResult::Continue => ProcessResult::Continue,
472            TokenSinkResult::Plaintext => {
473                self.state.set(states::Plaintext);
474                ProcessResult::Continue
475            },
476            TokenSinkResult::Script(node) => {
477                self.state.set(states::Data);
478                ProcessResult::Script(node)
479            },
480            TokenSinkResult::RawData(kind) => {
481                self.state.set(states::RawData(kind));
482                ProcessResult::Continue
483            },
484            TokenSinkResult::EncodingIndicator(encoding) => {
485                ProcessResult::EncodingIndicator(encoding)
486            },
487        }
488    }
489
490    fn emit_temp_buf(&self) {
491        #[cfg(feature = "trace_tokenizer")]
492        trace!("  emit_temp");
493
494        // FIXME: Make sure that clearing on emit is spec-compatible.
495        let buf = mem::take(&mut *self.temp_buf.borrow_mut());
496        self.emit_chars(buf);
497    }
498
    /// Empty the temporary buffer.
    fn clear_temp_buf(&self) {
        // Do this without a new allocation.
        self.temp_buf.borrow_mut().clear();
    }
503
504    fn emit_current_comment(&self) {
505        let comment = mem::take(&mut *self.current_comment.borrow_mut());
506        self.process_token_and_continue(CommentToken(comment));
507    }
508
509    fn discard_tag(&self) {
510        self.current_tag_name.borrow_mut().clear();
511        self.current_tag_self_closing.set(false);
512        self.current_tag_had_duplicate_attributes.set(false);
513        *self.current_tag_attrs.borrow_mut() = vec![];
514    }
515
516    fn create_tag(&self, kind: TagKind, c: char) {
517        self.discard_tag();
518        self.current_tag_name.borrow_mut().push_char(c);
519        self.current_tag_kind.set(kind);
520    }
521
522    fn have_appropriate_end_tag(&self) -> bool {
523        match self.last_start_tag_name.borrow().as_ref() {
524            Some(last) => {
525                (self.current_tag_kind.get() == EndTag)
526                    && (**self.current_tag_name.borrow() == **last)
527            },
528            None => false,
529        }
530    }
531
    /// Start a new attribute whose name begins with `c`.
    fn create_attribute(&self, c: char) {
        // Commit (or drop, if duplicate) any attribute still in progress
        // before its name buffer is reused.
        self.finish_attribute();

        self.current_attr_name.borrow_mut().push_char(c);
    }
537
538    fn finish_attribute(&self) {
539        if self.current_attr_name.borrow().is_empty() {
540            return;
541        }
542
543        // Check for a duplicate attribute.
544        // FIXME: the spec says we should error as soon as the name is finished.
545        let dup = {
546            let name = &*self.current_attr_name.borrow();
547            self.current_tag_attrs
548                .borrow()
549                .iter()
550                .any(|a| *a.name.local == **name)
551        };
552
553        if dup {
554            self.emit_error(Borrowed("Duplicate attribute"));
555            self.current_tag_had_duplicate_attributes.set(true);
556            self.current_attr_name.borrow_mut().clear();
557            self.current_attr_value.borrow_mut().clear();
558        } else {
559            let name = LocalName::from(&**self.current_attr_name.borrow());
560            self.current_attr_name.borrow_mut().clear();
561            self.current_tag_attrs.borrow_mut().push(Attribute {
562                // The tree builder will adjust the namespace if necessary.
563                // This only happens in foreign elements.
564                name: QualName::new(None, ns!(), name),
565                value: mem::take(&mut self.current_attr_value.borrow_mut()),
566            });
567        }
568    }
569
570    fn emit_current_doctype(&self) {
571        let doctype = self.current_doctype.take();
572        self.process_token_and_continue(DoctypeToken(doctype));
573    }
574
575    fn doctype_id(&self, kind: DoctypeIdKind) -> RefMut<'_, Option<StrTendril>> {
576        let current_doctype = self.current_doctype.borrow_mut();
577        match kind {
578            Public => RefMut::map(current_doctype, |d| &mut d.public_id),
579            System => RefMut::map(current_doctype, |d| &mut d.system_id),
580        }
581    }
582
583    fn clear_doctype_id(&self, kind: DoctypeIdKind) {
584        let mut id = self.doctype_id(kind);
585        match *id {
586            Some(ref mut s) => s.clear(),
587            None => *id = Some(StrTendril::new()),
588        }
589    }
590
591    fn start_consuming_character_reference(&self) {
592        debug_assert!(
593            self.char_ref_tokenizer.borrow().is_none(),
594            "Nested character references are impossible"
595        );
596
597        let is_in_attribute = matches!(self.state.get(), states::AttributeValue(_));
598        *self.char_ref_tokenizer.borrow_mut() = Some(CharRefTokenizer::new(is_in_attribute));
599    }
600
    /// Send an `EOFToken` to the sink.
    fn emit_eof(&self) {
        self.process_token_and_continue(EOFToken);
    }
604
605    fn peek(&self, input: &BufferQueue) -> Option<char> {
606        if self.reconsume.get() {
607            Some(self.current_char.get())
608        } else {
609            input.peek()
610        }
611    }
612
613    fn discard_char(&self, input: &BufferQueue) {
614        // peek() deals in un-processed characters (no newline normalization), while get_char()
615        // does.
616        //
617        // since discard_char is supposed to be used in combination with peek(), discard_char must
618        // discard a single raw input character, not a normalized newline.
619        if self.reconsume.get() {
620            self.reconsume.set(false);
621        } else {
622            input.next();
623        }
624    }
625
    /// Report a parse error to the sink as a `ParseError` token.
    fn emit_error(&self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
629}
630//§ END
631
// Shorthand for common state machine behaviors.
//
// Each arm maps one command (`$me : <command> <args...>`) onto a single
// expression on the tokenizer `$me`.  None of these arms returns from the
// enclosing function; commands that do are handled by `go!`.
macro_rules! shorthand (
    // Tag under construction.
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c)                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.borrow_mut().push_char($c)     );
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag()                                   );
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input)                            );
    // Temporary buffer.
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.borrow_mut().push_char($c)             );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf()                                );
    // Attribute under construction.
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c)                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.borrow_mut().push_char($c)    );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.borrow_mut().push_char($c)   );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
    // Comment under construction.
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.borrow_mut().push_char($c)      );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.borrow_mut().push_slice($c)     );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment()                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.borrow_mut().clear()            );
    // Doctype under construction.
    ( $me:ident : create_doctype                   ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c)            );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k)                            );
    ( $me:ident : force_quirks                     ) => ( $me.current_doctype.borrow_mut().force_quirks = true);
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype()                          );
);
655
// Tracing of tokenizer actions.  This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// With the `trace_tokenizer` feature, every shorthand command is logged via
// `trace!` (as its stringified tokens) before it is executed.
#[cfg(feature = "trace_tokenizer")]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    trace!("  {:?}", stringify!($($cmds)*));
    shorthand!($me : $($cmds)*);
}));

// Without the feature, `sh_trace!` simply forwards to `shorthand!`.
#[cfg(not(feature = "trace_tokenizer"))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
666
// A little DSL for sequencing shorthand actions.
//
// `go!(self: a; b; c)` runs the commands left to right.  The commands `to`,
// `reconsume`, `consume_char_ref`, `emit_tag` and `eof` expand to a `return`
// from the enclosing function, which is why they can only appear last.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.
    // Hence one arm per command length (1 to 4 token trees before the `;`).

    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // Switch to state `$s` (optionally with arguments) and ask to be re-invoked.
    ( $me:ident : to $s:ident                    ) => ({ $me.state.set(states::$s); return ProcessResult::Continue;           });
    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue;      });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });

    // Like `to`, but the current character is consumed again in the new state.
    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume.set(true); go!($me: to $s);         });
    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume.set(true); go!($me: to $s $k1);     });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });

    // Start a character reference tokenizer and ask to be re-invoked.
    ( $me:ident : consume_char_ref             ) => ({ $me.start_consuming_character_reference(); return ProcessResult::Continue;         });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_current_tag();
    });

    // Emit the EOF token and stop running the state machine.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // or nothing.
    ( $me:ident : ) => (());
);
703
// This is a macro because it can cause early return
// from the function where it is used.
//
// Evaluates to the next input character; when `get_char` has none to give,
// the enclosing function returns `ProcessResult::Suspend`.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));
709
// Evaluates to the character `peek` reports; when there is none, the
// enclosing function returns `ProcessResult::Suspend`.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));
713
// Evaluates to the `SetResult` from `pop_except_from`; when there is none,
// the enclosing function returns `ProcessResult::Suspend`.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));
717
// ASCII case-insensitive `eat`: evaluates to whether `$pat` matched; when
// that cannot be decided yet, the enclosing function returns
// `ProcessResult::Suspend`.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));
721
// Case-sensitive `eat`: evaluates to whether `$pat` matched exactly; when
// that cannot be decided yet, the enclosing function returns
// `ProcessResult::Suspend`.
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
725
726impl<Sink: TokenSink> Tokenizer<Sink> {
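    // Run the state machine for a while.
    // Returns a `ProcessResult` telling the caller whether we should be
    // immediately re-invoked, must wait for more input, or were blocked by a
    // script or an encoding indicator
    // (this just simplifies control flow vs. break / continue).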
730    #[allow(clippy::never_loop)]
731    fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
732        if self.char_ref_tokenizer.borrow().is_some() {
733            return self.step_char_ref_tokenizer(input);
734        }
735
736        trace!("processing in state {:?}", self.state);
737        match self.state.get() {
738            //§ data-state
739            states::Data => loop {
740                let set = small_char_set!('\r' '\0' '&' '<' '\n');
741
742                #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
743                let set_result = if !(self.opts.exact_errors
744                    || self.reconsume.get()
745                    || self.ignore_lf.get())
746                    && Self::is_supported_simd_feature_detected()
747                {
748                    let front_buffer = input.peek_front_chunk_mut();
749                    let Some(mut front_buffer) = front_buffer else {
750                        return ProcessResult::Suspend;
751                    };
752
753                    // Special case: The fast path is not worth taking if the first character is already in the set,
754                    // which is fairly common
755                    let first_char = front_buffer
756                        .chars()
757                        .next()
758                        .expect("Input buffers are never empty");
759
760                    if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
761                        drop(front_buffer);
762                        self.pop_except_from(input, set)
763                    } else {
764                        // SAFETY:
765                        // This CPU is guaranteed to support SIMD due to the is_supported_simd_feature_detected check above
766                        let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) };
767
768                        if front_buffer.is_empty() {
769                            drop(front_buffer);
770                            input.pop_front();
771                        }
772
773                        result
774                    }
775                } else {
776                    self.pop_except_from(input, set)
777                };
778
779                #[cfg(not(any(
780                    target_arch = "x86",
781                    target_arch = "x86_64",
782                    target_arch = "aarch64"
783                )))]
784                let set_result = self.pop_except_from(input, set);
785
786                let Some(set_result) = set_result else {
787                    return ProcessResult::Suspend;
788                };
789                match set_result {
790                    FromSet('\0') => {
791                        self.bad_char_error();
792                        self.emit_char('\0');
793                    },
794                    FromSet('&') => go!(self: consume_char_ref),
795                    FromSet('<') => go!(self: to TagOpen),
796                    FromSet(c) => {
797                        self.emit_char(c);
798                    },
799                    NotFromSet(b) => self.emit_chars(b),
800                }
801            },
802
803            //§ rcdata-state
               // Handles RCDATA content. `NotFromSet(b)` buffers (presumably the run of input up
               // to the next set member) are emitted in bulk; characters reported as `FromSet`
               // are dispatched one at a time below.
804            states::RawData(Rcdata) => loop {
805                match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
806                    FromSet('\0') => {
                           // NUL is reported as an error and replaced with U+FFFD.
807                        self.bad_char_error();
808                        self.emit_char('\u{fffd}');
809                    },
810                    FromSet('&') => go!(self: consume_char_ref),
811                    FromSet('<') => go!(self: to RawLessThanSign Rcdata),
                       // Any other `FromSet` character (e.g. '\r' or '\n') is emitted as-is.
812                    FromSet(c) => self.emit_char(c),
813                    NotFromSet(b) => self.emit_chars(b),
814                }
815            },
816
817            //§ rawtext-state
               // Handles RAWTEXT content. '&' is not in the character set, so no character
               // reference handling is triggered from this arm; only NUL and '<' are special.
818            states::RawData(Rawtext) => loop {
819                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
820                    FromSet('\0') => {
                           // NUL is reported as an error and replaced with U+FFFD.
821                        self.bad_char_error();
822                        self.emit_char('\u{fffd}');
823                    },
824                    FromSet('<') => go!(self: to RawLessThanSign Rawtext),
825                    FromSet(c) => self.emit_char(c),
826                    NotFromSet(b) => self.emit_chars(b),
827                }
828            },
829
830            //§ script-data-state
               // Handles script data. Same shape as the rawtext arm, except that '<' hands off
               // to `RawLessThanSign ScriptData`.
831            states::RawData(ScriptData) => loop {
832                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
833                    FromSet('\0') => {
                           // NUL is reported as an error and replaced with U+FFFD.
834                        self.bad_char_error();
835                        self.emit_char('\u{fffd}');
836                    },
837                    FromSet('<') => go!(self: to RawLessThanSign ScriptData),
838                    FromSet(c) => self.emit_char(c),
839                    NotFromSet(b) => self.emit_chars(b),
840                }
841            },
842
843            //§ script-data-escaped-state
               // Handles escaped script data. In addition to NUL and '<', a '-' is special: it is
               // emitted and then the tokenizer moves to `ScriptDataEscapedDash Escaped`.
844            states::RawData(ScriptDataEscaped(Escaped)) => loop {
845                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
846                    FromSet('\0') => {
847                        self.bad_char_error();
848                        self.emit_char('\u{fffd}');
849                    },
850                    FromSet('-') => {
851                        self.emit_char('-');
852                        go!(self: to ScriptDataEscapedDash Escaped);
853                    },
                       // The '<' is not emitted here; the less-than-sign arm decides what to emit.
854                    FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
855                    FromSet(c) => self.emit_char(c),
856                    NotFromSet(b) => self.emit_chars(b),
857                }
858            },
859
860            //§ script-data-double-escaped-state
               // Handles double-escaped script data. Same set as the escaped arm, but here the
               // '<' is emitted immediately before moving to the less-than-sign arm.
861            states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
862                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
863                    FromSet('\0') => {
864                        self.bad_char_error();
865                        self.emit_char('\u{fffd}');
866                    },
867                    FromSet('-') => {
868                        self.emit_char('-');
869                        go!(self: to ScriptDataEscapedDash DoubleEscaped);
870                    },
871                    FromSet('<') => {
                           // Emit the '<' up front; the DoubleEscaped less-than-sign arm never
                           // emits it itself.
872                        self.emit_char('<');
873                        go!(self: to RawLessThanSign ScriptDataEscaped DoubleEscaped)
874                    },
875                    FromSet(c) => self.emit_char(c),
876                    NotFromSet(b) => self.emit_chars(b),
877                }
878            },
879
880            //§ plaintext-state
               // Handles plaintext. No arm below transitions to another state: every character
               // is emitted, with NUL reported as an error and replaced by U+FFFD.
881            states::Plaintext => loop {
882                match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
883                    FromSet('\0') => {
884                        self.bad_char_error();
885                        self.emit_char('\u{fffd}');
886                    },
887                    FromSet(c) => self.emit_char(c),
888                    NotFromSet(b) => self.emit_chars(b),
889                }
890            },
891
892            //§ tag-open-state
               // Decides what follows a '<': markup declaration ('!'), end tag ('/'), bogus
               // comment ('?'), start tag (letter), or plain text (anything else).
893            states::TagOpen => loop {
894                match get_char!(self, input) {
895                    '!' => go!(self: to MarkupDeclarationOpen),
896                    '/' => go!(self: to EndTagOpen),
897                    '?' => {
                           // "<?" is an error; the '?' is reconsumed as the start of a bogus comment.
898                        self.bad_char_error();
899                        go!(self: clear_comment; reconsume BogusComment)
900                    },
                       // `lower_ascii_letter` is defined elsewhere; presumably it yields the
                       // lowercased character for ASCII letters and `None` otherwise.
901                    c => match lower_ascii_letter(c) {
902                        Some(cl) => go!(self: create_tag StartTag cl; to TagName),
903                        None => {
                               // Not a tag after all: report the error, emit the literal '<' and
                               // reconsume the current character in the Data state.
904                            self.bad_char_error();
905                            self.emit_char('<');
906                            go!(self: reconsume Data)
907                        },
908                    },
909                }
910            },
911
912            //§ end-tag-open-state
               // Handles the character after "</": a letter starts an end tag, '>' is an error
               // that returns to Data, anything else becomes a bogus comment.
913            states::EndTagOpen => loop {
914                match get_char!(self, input) {
915                    '>' => {
                           // "</>" is reported as an error; nothing is emitted for it.
916                        self.bad_char_error();
917                        go!(self: to Data)
918                    },
919                    c => match lower_ascii_letter(c) {
920                        Some(cl) => go!(self: create_tag EndTag cl; to TagName),
921                        None => {
922                            self.bad_char_error();
923                            go!(self: clear_comment; reconsume BogusComment)
924                        },
925                    },
926                }
927            },
928
929            //§ tag-name-state
               // Accumulates the tag name until whitespace, '/' or '>' ends it.
930            states::TagName => loop {
931                match get_char!(self, input) {
932                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
933                    '/' => go!(self: to SelfClosingStartTag),
934                    '>' => go!(self: emit_tag Data),
935                    '\0' => {
                           // NUL is reported as an error and stored as U+FFFD in the name.
936                        self.bad_char_error();
937                        go!(self: push_tag '\u{fffd}')
938                    },
                       // Every other character is appended after ASCII-lowercasing.
939                    c => go!(self: push_tag (c.to_ascii_lowercase())),
940                }
941            },
942
943            //§ script-data-escaped-less-than-sign-state
               // Handles the character after '<' inside escaped script data. The '<' has not
               // been emitted yet when this arm runs, so the text branches below emit it.
944            states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
945                match get_char!(self, input) {
946                    '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
947                    c => match lower_ascii_letter(c) {
948                        Some(cl) => {
                               // Start collecting a candidate tag name in the temp buffer (lowercased),
                               // while emitting '<' and the character in its original case.
949                            go!(self: clear_temp; push_temp cl);
950                            self.emit_char('<');
951                            self.emit_char(c);
952                            go!(self: to ScriptDataEscapeStart DoubleEscaped);
953                        },
954                        None => {
955                            self.emit_char('<');
956                            go!(self: reconsume RawData ScriptDataEscaped Escaped);
957                        },
958                    },
959                }
960            },
961
962            //§ script-data-double-escaped-less-than-sign-state
               // Handles the character after '<' inside double-escaped script data. The arms that
               // transition here with `DoubleEscaped` have already emitted the '<'.
963            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
964                match get_char!(self, input) {
965                    '/' => {
                           // Reset the temp buffer so the following name can be compared later.
966                        go!(self: clear_temp);
967                        self.emit_char('/');
968                        go!(self: to ScriptDataDoubleEscapeEnd);
969                    },
970                    _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
971                }
972            },
973
974            //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
975            // otherwise
               // Catch-all for every `RawLessThanSign` kind not matched by the two more specific
               // `ScriptDataEscaped` arms that precede it in this match.
976            states::RawLessThanSign(kind) => loop {
977                match get_char!(self, input) {
978                    '/' => go!(self: clear_temp; to RawEndTagOpen kind),
                       // "<!" only has a special meaning for script data: it may open an escape.
979                    '!' if kind == ScriptData => {
980                        self.emit_char('<');
981                        self.emit_char('!');
982                        go!(self: to ScriptDataEscapeStart Escaped);
983                    },
984                    _ => {
                           // Plain text: emit the pending '<' and reconsume the character.
985                        self.emit_char('<');
986                        go!(self: reconsume RawData kind);
987                    },
988                }
989            },
990
991            //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
               // Handles the character after "</" in raw-data content of the given `kind`.
992            states::RawEndTagOpen(kind) => loop {
993                let c = get_char!(self, input);
994                match lower_ascii_letter(c) {
                       // The tag name gets the lowercased letter; the temp buffer keeps the
                       // original character.
995                    Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
996                    None => {
                           // Not an end tag: emit the "</" literally and reconsume the character.
997                        self.emit_char('<');
998                        self.emit_char('/');
999                        go!(self: reconsume RawData kind);
1000                    },
1001                }
1002            },
1003
1004            //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
                // Accumulates a candidate end-tag name inside raw-data content. The tag is only
                // completed when `have_appropriate_end_tag()` holds; otherwise the consumed
                // text is emitted as characters.
1005            states::RawEndTagName(kind) => loop {
1006                let c = get_char!(self, input);
1007                if self.have_appropriate_end_tag() {
                        // NOTE(review): the `go!` transitions below presumably leave this arm, so
                        // only the `_ => ()` case continues to the second match — confirm
                        // against the macro definition.
1008                    match c {
1009                        '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to BeforeAttributeName),
1010                        '/' => go!(self: clear_temp; to SelfClosingStartTag),
1011                        '>' => go!(self: clear_temp; emit_tag Data),
1012                        _ => (),
1013                    }
1014                }
1015
1016                match lower_ascii_letter(c) {
1017                    Some(cl) => go!(self: push_tag cl; push_temp c),
1018                    None => {
                            // Give up on the tag: drop it and emit "</" followed by the temp
                            // buffer, then reconsume the character as raw data.
1019                        go!(self: discard_tag);
1020                        self.emit_char('<');
1021                        self.emit_char('/');
1022                        self.emit_temp_buf();
1023                        go!(self: reconsume RawData kind);
1024                    },
1025                }
1026            },
1027
1028            //§ script-data-double-escape-start-state
                // Collects letters into the temp buffer (emitting each one) until a delimiter
                // decides whether the content becomes double-escaped.
1029            states::ScriptDataEscapeStart(DoubleEscaped) => loop {
1030                let c = get_char!(self, input);
1031                match c {
1032                    '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
                            // Only the exact name "script" enters the double-escaped mode.
1033                        let esc = if &**self.temp_buf.borrow() == "script" {
1034                            DoubleEscaped
1035                        } else {
1036                            Escaped
1037                        };
1038                        self.emit_char(c);
1039                        go!(self: to RawData ScriptDataEscaped esc);
1040                    },
1041                    _ => match lower_ascii_letter(c) {
1042                        Some(cl) => {
                                // Compare on the lowercased form, emit the original character.
1043                            go!(self: push_temp cl);
1044                            self.emit_char(c);
1045                        },
1046                        None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
1047                    },
1048                }
1049            },
1050
1051            //§ script-data-escape-start-state
                // Handles the character after "<!" in script data: a '-' is emitted and continues
                // the escape sequence; anything else is reconsumed as ordinary script data.
1052            states::ScriptDataEscapeStart(Escaped) => loop {
1053                match get_char!(self, input) {
1054                    '-' => {
1055                        self.emit_char('-');
1056                        go!(self: to ScriptDataEscapeStartDash);
1057                    },
1058                    _ => go!(self: reconsume RawData ScriptData),
1059                }
1060            },
1061
1062            //§ script-data-escape-start-dash-state
                // Handles the character after the first '-' of an escape start: a second '-' is
                // emitted and moves to `ScriptDataEscapedDashDash Escaped`; anything else is
                // reconsumed as ordinary script data.
1063            states::ScriptDataEscapeStartDash => loop {
1064                match get_char!(self, input) {
1065                    '-' => {
1066                        self.emit_char('-');
1067                        go!(self: to ScriptDataEscapedDashDash Escaped);
1068                    },
1069                    _ => go!(self: reconsume RawData ScriptData),
1070                }
1071            },
1072
1073            //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
                // Handles the character after a single '-' in (double-)escaped script data; `kind`
                // carries which of the two escape modes is active.
1074            states::ScriptDataEscapedDash(kind) => loop {
1075                match get_char!(self, input) {
1076                    '-' => {
1077                        self.emit_char('-');
1078                        go!(self: to ScriptDataEscapedDashDash kind);
1079                    },
1080                    '<' => {
                            // Only the double-escaped mode emits the '<' before handing off.
1081                        if kind == DoubleEscaped {
1082                            self.emit_char('<');
1083                        }
1084                        go!(self: to RawLessThanSign ScriptDataEscaped kind);
1085                    },
1086                    '\0' => {
1087                        self.bad_char_error();
1088                        self.emit_char('\u{fffd}');
1089                        go!(self: to RawData ScriptDataEscaped kind)
1090                    },
1091                    c => {
1092                        self.emit_char(c);
1093                        go!(self: to RawData ScriptDataEscaped kind);
1094                    },
1095                }
1096            },
1097
1098            //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
                // Handles the character after "--" in (double-)escaped script data.
1099            states::ScriptDataEscapedDashDash(kind) => loop {
1100                match get_char!(self, input) {
1101                    '-' => {
                            // Further dashes are emitted with no state change; the loop reads on.
1102                        self.emit_char('-');
1103                    },
1104                    '<' => {
                            // Only the double-escaped mode emits the '<' before handing off.
1105                        if kind == DoubleEscaped {
1106                            self.emit_char('<');
1107                        }
1108                        go!(self: to RawLessThanSign ScriptDataEscaped kind);
1109                    },
1110                    '>' => {
                            // "-->" ends the escape: back to plain script data for either kind.
1111                        self.emit_char('>');
1112                        go!(self: to RawData ScriptData);
1113                    },
1114                    '\0' => {
1115                        self.bad_char_error();
1116                        self.emit_char('\u{fffd}');
1117                        go!(self: to RawData ScriptDataEscaped kind)
1118                    },
1119                    c => {
1120                        self.emit_char(c);
1121                        go!(self: to RawData ScriptDataEscaped kind);
1122                    },
1123                }
1124            },
1125
1126            //§ script-data-double-escape-end-state
                // Mirror image of the double-escape-start arm: collects letters into the temp
                // buffer until a delimiter decides whether double-escaping ends.
1127            states::ScriptDataDoubleEscapeEnd => loop {
1128                let c = get_char!(self, input);
1129                match c {
1130                    '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
                            // Only the exact name "script" drops back to the single-escaped mode.
1131                        let esc = if &**self.temp_buf.borrow() == "script" {
1132                            Escaped
1133                        } else {
1134                            DoubleEscaped
1135                        };
1136                        self.emit_char(c);
1137                        go!(self: to RawData ScriptDataEscaped esc);
1138                    },
1139                    _ => match lower_ascii_letter(c) {
1140                        Some(cl) => {
                                // Compare on the lowercased form, emit the original character.
1141                            go!(self: push_temp cl);
1142                            self.emit_char(c);
1143                        },
1144                        None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
1145                    },
1146                }
1147            },
1148
1149            //§ before-attribute-name-state
                // Skips whitespace inside a tag and starts a new attribute on the first other
                // character that is not '/' or '>'.
1150            states::BeforeAttributeName => loop {
1151                match get_char!(self, input) {
1152                    '\t' | '\n' | '\x0C' | ' ' => (),
1153                    '/' => go!(self: to SelfClosingStartTag),
1154                    '>' => go!(self: emit_tag Data),
1155                    '\0' => {
1156                        self.bad_char_error();
1157                        go!(self: create_attr '\u{fffd}'; to AttributeName)
1158                    },
1159                    c => match lower_ascii_letter(c) {
1160                        Some(cl) => go!(self: create_attr cl; to AttributeName),
1161                        None => {
                                // These characters are reported as errors but still become the
                                // first character of the attribute name.
1162                            if matches!(c, '"' | '\'' | '<' | '=') {
1163                                self.bad_char_error();
1164                            }
1165
1166                            go!(self: create_attr c; to AttributeName);
1167                        },
1168                    },
1169                }
1170            },
1171
1172            //§ attribute-name-state
                // Accumulates the attribute name until whitespace, '/', '=' or '>' ends it.
1173            states::AttributeName => loop {
1174                match get_char!(self, input) {
1175                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
1176                    '/' => go!(self: to SelfClosingStartTag),
1177                    '=' => go!(self: to BeforeAttributeValue),
1178                    '>' => go!(self: emit_tag Data),
1179                    '\0' => {
1180                        self.bad_char_error();
1181                        go!(self: push_name '\u{fffd}')
1182                    },
1183                    c => match lower_ascii_letter(c) {
1184                        Some(cl) => go!(self: push_name cl),
1185                        None => {
                                // Quotes and '<' are reported as errors but still appended.
1186                            if matches!(c, '"' | '\'' | '<') {
1187                                self.bad_char_error();
1188                            }
1189                            go!(self: push_name c);
1190                        },
1191                    },
1192                }
1193            },
1194
1195            //§ after-attribute-name-state
                // Runs after whitespace following an attribute name: either a value follows
                // ('='), the tag ends, or a new attribute begins.
1196            states::AfterAttributeName => loop {
1197                match get_char!(self, input) {
1198                    '\t' | '\n' | '\x0C' | ' ' => (),
1199                    '/' => go!(self: to SelfClosingStartTag),
1200                    '=' => go!(self: to BeforeAttributeValue),
1201                    '>' => go!(self: emit_tag Data),
1202                    '\0' => {
1203                        self.bad_char_error();
1204                        go!(self: create_attr '\u{fffd}'; to AttributeName)
1205                    },
1206                    c => match lower_ascii_letter(c) {
1207                        Some(cl) => go!(self: create_attr cl; to AttributeName),
1208                        None => {
                                // Quotes and '<' are reported as errors but still start the new
                                // attribute's name.
1209                            if matches!(c, '"' | '\'' | '<') {
1210                                self.bad_char_error();
1211                            }
1212
1213                            go!(self: create_attr c; to AttributeName);
1214                        },
1215                    },
1216                }
1217            },
1218
1219            //§ before-attribute-value-state
1220            // Use peek so we can handle the first attr character along with the rest,
1221            // hopefully in the same zero-copy buffer.
1222            states::BeforeAttributeValue => loop {
1223                match peek!(self, input) {
                        // NOTE(review): '\r' is listed explicitly here, unlike in the `get_char!`
                        // arms — presumably because `peek!` sees the input before newline
                        // preprocessing; confirm against the macro.
1224                    '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1225                    '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1226                    '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1227                    '>' => {
                            // A '>' directly after '=' is an error; the tag is still emitted.
1228                        go!(self: discard_char input);
1229                        self.bad_char_error();
1230                        go!(self: emit_tag Data)
1231                    },
                        // The peeked character is left in the input for the unquoted-value arm.
1232                    _ => go!(self: to AttributeValue Unquoted),
1233                }
1234            },
1235
1236            //§ attribute-value-(double-quoted)-state
                // Accumulates a double-quoted attribute value until the closing '"'.
1237            states::AttributeValue(DoubleQuoted) => loop {
1238                match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1239                    FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1240                    FromSet('&') => go!(self: consume_char_ref),
1241                    FromSet('\0') => {
                            // NUL is reported as an error and stored as U+FFFD in the value.
1242                        self.bad_char_error();
1243                        go!(self: push_value '\u{fffd}')
1244                    },
1245                    FromSet(c) => go!(self: push_value c),
                        // Whole buffers are appended to the value in one step.
1246                    NotFromSet(ref b) => go!(self: append_value b),
1247                }
1248            },
1249
1250            //§ attribute-value-(single-quoted)-state
                // Accumulates a single-quoted attribute value until the closing '\''.
1251            states::AttributeValue(SingleQuoted) => loop {
1252                match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1253                    FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1254                    FromSet('&') => go!(self: consume_char_ref),
1255                    FromSet('\0') => {
                            // NUL is reported as an error and stored as U+FFFD in the value.
1256                        self.bad_char_error();
1257                        go!(self: push_value '\u{fffd}')
1258                    },
1259                    FromSet(c) => go!(self: push_value c),
                        // Whole buffers are appended to the value in one step.
1260                    NotFromSet(ref b) => go!(self: append_value b),
1261                }
1262            },
1263
1264            //§ attribute-value-(unquoted)-state
                // Accumulates an unquoted attribute value until whitespace or '>' ends it.
1265            states::AttributeValue(Unquoted) => loop {
1266                match pop_except_from!(
1267                    self,
1268                    input,
1269                    small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1270                ) {
1271                    FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1272                        go!(self: to BeforeAttributeName)
1273                    },
1274                    FromSet('&') => go!(self: consume_char_ref),
1275                    FromSet('>') => go!(self: emit_tag Data),
1276                    FromSet('\0') => {
1277                        self.bad_char_error();
1278                        go!(self: push_value '\u{fffd}')
1279                    },
1280                    FromSet(c) => {
                            // NOTE(review): none of the characters tested below are members of the
                            // set above, so this check can only fire if `pop_except_from!` can
                            // report out-of-set characters as `FromSet` — confirm. Characters
                            // arriving via `NotFromSet` are appended without this check.
1281                        if matches!(c, '"' | '\'' | '<' | '=' | '`') {
1282                            self.bad_char_error();
1283                        }
1284                        go!(self: push_value c);
1285                    },
1286                    NotFromSet(ref b) => go!(self: append_value b),
1287                }
1288            },
1289
1290            //§ after-attribute-value-(quoted)-state
                // Runs right after the closing quote of an attribute value.
1291            states::AfterAttributeValueQuoted => loop {
1292                match get_char!(self, input) {
1293                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1294                    '/' => go!(self: to SelfClosingStartTag),
1295                    '>' => go!(self: emit_tag Data),
1296                    _ => {
                            // Missing whitespace between attributes: report the error and let
                            // BeforeAttributeName reprocess the character.
1297                        self.bad_char_error();
1298                        go!(self: reconsume BeforeAttributeName)
1299                    },
1300                }
1301            },
1302
1303            //§ self-closing-start-tag-state
                // Handles the character after a '/' inside a tag.
1304            states::SelfClosingStartTag => loop {
1305                match get_char!(self, input) {
1306                    '>' => {
                            // "/>" marks the current tag as self-closing before it is emitted.
1307                        self.current_tag_self_closing.set(true);
1308                        go!(self: emit_tag Data);
1309                    },
1310                    _ => {
                            // A stray '/': report the error and reprocess the character as the
                            // start of the next attribute.
1311                        self.bad_char_error();
1312                        go!(self: reconsume BeforeAttributeName)
1313                    },
1314                }
1315            },
1316
1317            //§ comment-start-state
                // Handles the first character of a comment's content.
1318            states::CommentStart => loop {
1319                match get_char!(self, input) {
1320                    '-' => go!(self: to CommentStartDash),
1321                    '\0' => {
1322                        self.bad_char_error();
1323                        go!(self: push_comment '\u{fffd}'; to Comment)
1324                    },
1325                    '>' => {
                            // An immediate '>' is an error; the comment is emitted as it stands.
1326                        self.bad_char_error();
1327                        go!(self: emit_comment; to Data)
1328                    },
1329                    c => go!(self: push_comment c; to Comment),
1330                }
1331            },
1332
1333            //§ comment-start-dash-state
                // Handles the character after a '-' seen at the very start of a comment. That
                // '-' has not been stored yet, so the text branches below add it.
1334            states::CommentStartDash => loop {
1335                match get_char!(self, input) {
1336                    '-' => go!(self: to CommentEnd),
1337                    '\0' => {
1338                        self.bad_char_error();
1339                        go!(self: append_comment "-\u{fffd}"; to Comment)
1340                    },
1341                    '>' => {
                            // A '>' here is an error; the comment is emitted as it stands.
1342                        self.bad_char_error();
1343                        go!(self: emit_comment; to Data)
1344                    },
1345                    c => go!(self: push_comment '-'; push_comment c; to Comment),
1346                }
1347            },
1348
1349            //§ comment-state
                // Accumulates comment text. '<' is stored and then tracked by a separate state;
                // '-' is held back (not stored here) while CommentEndDash inspects what follows.
1350            states::Comment => loop {
1351                match get_char!(self, input) {
1352                    c @ '<' => go!(self: push_comment c; to CommentLessThanSign),
1353                    '-' => go!(self: to CommentEndDash),
1354                    '\0' => {
1355                        self.bad_char_error();
1356                        go!(self: push_comment '\u{fffd}')
1357                    },
1358                    c => go!(self: push_comment c),
1359                }
1360            },
1361
1362            //§ comment-less-than-sign-state
                // Handles the character after a '<' inside a comment: '!' and further '<' are
                // stored, anything else is reconsumed as ordinary comment text.
1363            states::CommentLessThanSign => loop {
1364                match get_char!(self, input) {
1365                    c @ '!' => go!(self: push_comment c; to CommentLessThanSignBang),
1366                    c @ '<' => go!(self: push_comment c),
1367                    _ => go!(self: reconsume Comment),
1368                }
1369            },
1370
1371            //§ comment-less-than-sign-bang
                // Handles the character after "<!" inside a comment; only '-' continues the
                // sequence, and it is not stored by this arm.
1372            states::CommentLessThanSignBang => loop {
1373                match get_char!(self, input) {
1374                    '-' => go!(self: to CommentLessThanSignBangDash),
1375                    _ => go!(self: reconsume Comment),
1376                }
1377            },
1378
1379            //§ comment-less-than-sign-bang-dash
                // Handles the character after "<!-" inside a comment. On anything but a second
                // '-', the character is reconsumed in CommentEndDash, which accounts for the
                // pending '-'.
1380            states::CommentLessThanSignBangDash => loop {
1381                match get_char!(self, input) {
1382                    '-' => go!(self: to CommentLessThanSignBangDashDash),
1383                    _ => go!(self: reconsume CommentEndDash),
1384                }
1385            },
1386
1387            //§ comment-less-than-sign-bang-dash-dash
                // Handles the character after "<!--" inside a comment. Both branches reconsume
                // the character in CommentEnd; the only difference is that anything other than
                // '>' is first reported as an error.
1388            states::CommentLessThanSignBangDashDash => loop {
1389                match get_char!(self, input) {
1390                    '>' => go!(self: reconsume CommentEnd),
1391                    _ => {
1392                        self.bad_char_error();
1393                        go!(self: reconsume CommentEnd)
1394                    },
1395                }
1396            },
1397
1398            //§ comment-end-dash-state
                // Handles the character after a single '-' in comment text. The '-' was not
                // stored when it was read, so the text branches below add it back.
1399            states::CommentEndDash => loop {
1400                match get_char!(self, input) {
1401                    '-' => go!(self: to CommentEnd),
1402                    '\0' => {
1403                        self.bad_char_error();
1404                        go!(self: append_comment "-\u{fffd}"; to Comment)
1405                    },
1406                    c => go!(self: push_comment '-'; push_comment c; to Comment),
1407                }
1408            },
1409
1410            //§ comment-end-state
                // Handles the character after "--" in comment text; the two dashes are still
                // pending (unstored) on entry.
1411            states::CommentEnd => loop {
1412                match get_char!(self, input) {
1413                    '>' => go!(self: emit_comment; to Data),
1414                    '!' => go!(self: to CommentEndBang),
                        // An extra '-' stores one dash and stays here, keeping two pending.
1415                    '-' => go!(self: push_comment '-'),
                        // Not a terminator: store the pending "--" and reprocess the character.
1416                    _ => go!(self: append_comment "--"; reconsume Comment),
1417                }
1418            },
1419
1420            //§ comment-end-bang-state
                // Handles the character after "--!" in comment text; the "--!" is still pending
                // (unstored) on entry and is added back by every branch except '>'.
1421            states::CommentEndBang => loop {
1422                match get_char!(self, input) {
1423                    '-' => go!(self: append_comment "--!"; to CommentEndDash),
1424                    '>' => {
                            // "--!>" closes the comment but is reported as an error.
1425                        self.bad_char_error();
1426                        go!(self: emit_comment; to Data)
1427                    },
1428                    '\0' => {
1429                        self.bad_char_error();
1430                        go!(self: append_comment "--!\u{fffd}"; to Comment)
1431                    },
1432                    c => go!(self: append_comment "--!"; push_comment c; to Comment),
1433                }
1434            },
1435
1436            //§ doctype-state
                // Handles the first character after the DOCTYPE keyword. Whitespace is consumed;
                // '>' is handed to BeforeDoctypeName silently; anything else is handed over
                // after reporting an error.
1437            states::Doctype => loop {
1438                match get_char!(self, input) {
1439                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1440                    '>' => go!(self: reconsume BeforeDoctypeName),
1441                    _ => {
1442                        self.bad_char_error();
1443                        go!(self: reconsume BeforeDoctypeName)
1444                    },
1445                }
1446            },
1447
1448            //§ before-doctype-name-state
                // Skips whitespace, then creates the doctype token on the first other character.
1449            states::BeforeDoctypeName => loop {
1450                match get_char!(self, input) {
1451                    '\t' | '\n' | '\x0C' | ' ' => (),
1452                    '\0' => {
1453                        self.bad_char_error();
1454                        go!(self: create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1455                    },
1456                    '>' => {
                            // A doctype without a name: error, force quirks, emit immediately.
1457                        self.bad_char_error();
1458                        go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1459                    },
                        // The name is stored ASCII-lowercased.
1460                    c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1461                                  to DoctypeName),
1462                }
1463            },
1464
1465            //§ doctype-name-state
                // Accumulates the doctype name (ASCII-lowercased) until whitespace or '>'.
1466            states::DoctypeName => loop {
1467                match get_char!(self, input) {
1468                    '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1469                    '>' => go!(self: emit_doctype; to Data),
1470                    '\0' => {
                            // NUL is reported as an error and stored as U+FFFD in the name.
1471                        self.bad_char_error();
1472                        go!(self: push_doctype_name '\u{fffd}')
1473                    },
1474                    c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1475                }
1476            },
1477
1478            //§ after-doctype-name-state
                // Looks for the "public" / "system" keywords after the doctype name.
                // NOTE(review): `eat!` is defined outside this excerpt; presumably it consumes the
                // keyword only on a full match (and may suspend on partial input) — confirm.
1479            states::AfterDoctypeName => loop {
1480                if eat!(self, input, "public") {
1481                    go!(self: to AfterDoctypeKeyword Public);
1482                } else if eat!(self, input, "system") {
1483                    go!(self: to AfterDoctypeKeyword System);
1484                } else {
1485                    match get_char!(self, input) {
1486                        '\t' | '\n' | '\x0C' | ' ' => (),
1487                        '>' => go!(self: emit_doctype; to Data),
1488                        _ => {
                                // Anything else is an error: force quirks and let BogusDoctype
                                // reprocess the character.
1489                            self.bad_char_error();
1490                            go!(self: force_quirks; reconsume BogusDoctype)
1491                        },
1492                    }
1493                }
1494            },
1495
1496            //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
1497            states::AfterDoctypeKeyword(kind) => loop {
1498                match get_char!(self, input) {
1499                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1500                    '"' => {
1501                        self.bad_char_error();
1502                        go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1503                    },
1504                    '\'' => {
1505                        self.bad_char_error();
1506                        go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1507                    },
1508                    '>' => {
1509                        self.bad_char_error();
1510                        go!(self: force_quirks; emit_doctype; to Data)
1511                    },
1512                    _ => {
1513                        self.bad_char_error();
1514                        go!(self: force_quirks; reconsume BogusDoctype)
1515                    },
1516                }
1517            },
1518
1519            //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
1520            states::BeforeDoctypeIdentifier(kind) => loop {
1521                match get_char!(self, input) {
1522                    '\t' | '\n' | '\x0C' | ' ' => (),
1523                    '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1524                    '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1525                    '>' => {
1526                        self.bad_char_error();
1527                        go!(self: force_quirks; emit_doctype; to Data)
1528                    },
1529                    _ => {
1530                        self.bad_char_error();
1531                        go!(self: force_quirks; reconsume BogusDoctype)
1532                    },
1533                }
1534            },
1535
1536            //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
1537            states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1538                match get_char!(self, input) {
1539                    '"' => go!(self: to AfterDoctypeIdentifier kind),
1540                    '\0' => {
1541                        self.bad_char_error();
1542                        go!(self: push_doctype_id kind '\u{fffd}')
1543                    },
1544                    '>' => {
1545                        self.bad_char_error();
1546                        go!(self: force_quirks; emit_doctype; to Data)
1547                    },
1548                    c => go!(self: push_doctype_id kind c),
1549                }
1550            },
1551
1552            //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
1553            states::DoctypeIdentifierSingleQuoted(kind) => loop {
1554                match get_char!(self, input) {
1555                    '\'' => go!(self: to AfterDoctypeIdentifier kind),
1556                    '\0' => {
1557                        self.bad_char_error();
1558                        go!(self: push_doctype_id kind '\u{fffd}')
1559                    },
1560                    '>' => {
1561                        self.bad_char_error();
1562                        go!(self: force_quirks; emit_doctype; to Data)
1563                    },
1564                    c => go!(self: push_doctype_id kind c),
1565                }
1566            },
1567
1568            //§ after-doctype-public-identifier-state
1569            states::AfterDoctypeIdentifier(Public) => loop {
1570                match get_char!(self, input) {
1571                    '\t' | '\n' | '\x0C' | ' ' => {
1572                        go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1573                    },
1574                    '>' => go!(self: emit_doctype; to Data),
1575                    '"' => {
1576                        self.bad_char_error();
1577                        go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1578                    },
1579                    '\'' => {
1580                        self.bad_char_error();
1581                        go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1582                    },
1583                    _ => {
1584                        self.bad_char_error();
1585                        go!(self: force_quirks; reconsume BogusDoctype)
1586                    },
1587                }
1588            },
1589
1590            //§ after-doctype-system-identifier-state
1591            states::AfterDoctypeIdentifier(System) => loop {
1592                match get_char!(self, input) {
1593                    '\t' | '\n' | '\x0C' | ' ' => (),
1594                    '>' => go!(self: emit_doctype; to Data),
1595                    _ => {
1596                        self.bad_char_error();
1597                        go!(self: reconsume BogusDoctype)
1598                    },
1599                }
1600            },
1601
1602            //§ between-doctype-public-and-system-identifiers-state
1603            states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1604                match get_char!(self, input) {
1605                    '\t' | '\n' | '\x0C' | ' ' => (),
1606                    '>' => go!(self: emit_doctype; to Data),
1607                    '"' => {
1608                        go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1609                    },
1610                    '\'' => {
1611                        go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1612                    },
1613                    _ => {
1614                        self.bad_char_error();
1615                        go!(self: force_quirks; reconsume BogusDoctype)
1616                    },
1617                }
1618            },
1619
1620            //§ bogus-doctype-state
1621            states::BogusDoctype => loop {
1622                match get_char!(self, input) {
1623                    '>' => go!(self: emit_doctype; to Data),
1624                    '\0' => {
1625                        self.bad_char_error();
1626                    },
1627                    _ => (),
1628                }
1629            },
1630
1631            //§ bogus-comment-state
1632            states::BogusComment => loop {
1633                match get_char!(self, input) {
1634                    '>' => go!(self: emit_comment; to Data),
1635                    '\0' => {
1636                        self.bad_char_error();
1637                        go!(self: push_comment '\u{fffd}')
1638                    },
1639                    c => go!(self: push_comment c),
1640                }
1641            },
1642
1643            //§ markup-declaration-open-state
1644            states::MarkupDeclarationOpen => loop {
1645                if eat_exact!(self, input, "--") {
1646                    go!(self: clear_comment; to CommentStart);
1647                } else if eat!(self, input, "doctype") {
1648                    go!(self: to Doctype);
1649                } else {
1650                    if self
1651                        .sink
1652                        .adjusted_current_node_present_but_not_in_html_namespace()
1653                        && eat_exact!(self, input, "[CDATA[")
1654                    {
1655                        go!(self: clear_temp; to CdataSection);
1656                    }
1657                    self.bad_char_error();
1658                    go!(self: clear_comment; to BogusComment);
1659                }
1660            },
1661
1662            //§ cdata-section-state
1663            states::CdataSection => loop {
1664                match get_char!(self, input) {
1665                    ']' => go!(self: to CdataSectionBracket),
1666                    '\0' => {
1667                        self.emit_temp_buf();
1668                        self.emit_char('\0');
1669                    },
1670                    c => go!(self: push_temp c),
1671                }
1672            },
1673
1674            //§ cdata-section-bracket
1675            states::CdataSectionBracket => match get_char!(self, input) {
1676                ']' => go!(self: to CdataSectionEnd),
1677                _ => go!(self: push_temp ']'; reconsume CdataSection),
1678            },
1679
1680            //§ cdata-section-end
1681            states::CdataSectionEnd => loop {
1682                match get_char!(self, input) {
1683                    ']' => go!(self: push_temp ']'),
1684                    '>' => {
1685                        self.emit_temp_buf();
1686                        go!(self: to Data);
1687                    },
1688                    _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1689                }
1690            },
1691            //§ END
1692        }
1693    }
1694
1695    fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1696        let mut char_ref_tokenizer = self.char_ref_tokenizer.borrow_mut();
1697        let progress = match char_ref_tokenizer.as_mut().unwrap().step(self, input) {
1698            char_ref::Status::Done(char_ref) => {
1699                self.process_char_ref(char_ref);
1700                *char_ref_tokenizer = None;
1701                return ProcessResult::Continue;
1702            },
1703
1704            char_ref::Status::Stuck => ProcessResult::Suspend,
1705            char_ref::Status::Progress => ProcessResult::Continue,
1706        };
1707
1708        progress
1709    }
1710
1711    fn process_char_ref(&self, char_ref: CharRef) {
1712        let CharRef {
1713            mut chars,
1714            mut num_chars,
1715        } = char_ref;
1716
1717        if num_chars == 0 {
1718            chars[0] = '&';
1719            num_chars = 1;
1720        }
1721
1722        for i in 0..num_chars {
1723            let c = chars[i as usize];
1724            match self.state.get() {
1725                states::Data | states::RawData(states::Rcdata) => self.emit_char(c),
1726
1727                states::AttributeValue(_) => go!(self: push_value c),
1728
1729                _ => panic!(
1730                    "state {:?} should not be reachable in process_char_ref",
1731                    self.state.get()
1732                ),
1733            }
1734        }
1735    }
1736
1737    /// Indicate that we have reached the end of the input.
1738    pub fn end(&self) {
1739        // Handle EOF in the char ref sub-tokenizer, if there is one.
1740        // Do this first because it might un-consume stuff.
1741        let input = BufferQueue::default();
1742        match self.char_ref_tokenizer.take() {
1743            None => (),
1744            Some(mut tokenizer) => {
1745                self.process_char_ref(tokenizer.end_of_file(self, &input));
1746            },
1747        }
1748
1749        // Process all remaining buffered input.
1750        // If we're waiting for lookahead, we're not gonna get it.
1751        self.at_eof.set(true);
1752        assert!(matches!(self.run(&input), TokenizerResult::Done));
1753        assert!(input.is_empty());
1754
1755        loop {
1756            match self.eof_step() {
1757                ProcessResult::Continue => (),
1758                ProcessResult::Suspend => break,
1759                ProcessResult::Script(_) | ProcessResult::EncodingIndicator(_) => unreachable!(),
1760            }
1761        }
1762
1763        self.sink.end();
1764
1765        if self.opts.profile {
1766            self.dump_profile();
1767        }
1768    }
1769
1770    fn dump_profile(&self) {
1771        let mut results: Vec<(states::State, u64)> = self
1772            .state_profile
1773            .borrow()
1774            .iter()
1775            .map(|(s, t)| (*s, *t))
1776            .collect();
1777        results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1778
1779        let total: u64 = results
1780            .iter()
1781            .map(|&(_, t)| t)
1782            .fold(0, ::std::ops::Add::add);
1783        println!("\nTokenizer profile, in nanoseconds");
1784        println!(
1785            "\n{:12}         total in token sink",
1786            self.time_in_sink.get()
1787        );
1788        println!("\n{total:12}         total in tokenizer");
1789
1790        for (k, v) in results.into_iter() {
1791            let pct = 100.0 * (v as f64) / (total as f64);
1792            println!("{v:12}  {pct:4.1}%  {k:?}");
1793        }
1794    }
1795
1796    fn eof_step(&self) -> ProcessResult<Sink::Handle> {
1797        debug!("processing EOF in state {:?}", self.state.get());
1798        match self.state.get() {
1799            states::Data
1800            | states::RawData(Rcdata)
1801            | states::RawData(Rawtext)
1802            | states::RawData(ScriptData)
1803            | states::Plaintext => go!(self: eof),
1804
1805            states::TagName
1806            | states::RawData(ScriptDataEscaped(_))
1807            | states::BeforeAttributeName
1808            | states::AttributeName
1809            | states::AfterAttributeName
1810            | states::AttributeValue(_)
1811            | states::AfterAttributeValueQuoted
1812            | states::SelfClosingStartTag
1813            | states::ScriptDataEscapedDash(_)
1814            | states::ScriptDataEscapedDashDash(_) => {
1815                self.bad_eof_error();
1816                go!(self: to Data)
1817            },
1818
1819            states::BeforeAttributeValue => go!(self: reconsume AttributeValue Unquoted),
1820
1821            states::TagOpen => {
1822                self.bad_eof_error();
1823                self.emit_char('<');
1824                go!(self: to Data);
1825            },
1826
1827            states::EndTagOpen => {
1828                self.bad_eof_error();
1829                self.emit_char('<');
1830                self.emit_char('/');
1831                go!(self: to Data);
1832            },
1833
1834            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
1835                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1836            },
1837
1838            states::RawLessThanSign(kind) => {
1839                self.emit_char('<');
1840                go!(self: to RawData kind);
1841            },
1842
1843            states::RawEndTagOpen(kind) => {
1844                self.emit_char('<');
1845                self.emit_char('/');
1846                go!(self: to RawData kind);
1847            },
1848
1849            states::RawEndTagName(kind) => {
1850                self.emit_char('<');
1851                self.emit_char('/');
1852                self.emit_temp_buf();
1853                go!(self: to RawData kind)
1854            },
1855
1856            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
1857
1858            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),
1859
1860            states::ScriptDataDoubleEscapeEnd => {
1861                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1862            },
1863
1864            states::CommentStart
1865            | states::CommentStartDash
1866            | states::Comment
1867            | states::CommentEndDash
1868            | states::CommentEnd
1869            | states::CommentEndBang => {
1870                self.bad_eof_error();
1871                go!(self: emit_comment; to Data)
1872            },
1873
1874            states::CommentLessThanSign | states::CommentLessThanSignBang => {
1875                go!(self: reconsume Comment)
1876            },
1877
1878            states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash),
1879
1880            states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd),
1881
1882            states::Doctype | states::BeforeDoctypeName => {
1883                self.bad_eof_error();
1884                go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1885            },
1886
1887            states::DoctypeName
1888            | states::AfterDoctypeName
1889            | states::AfterDoctypeKeyword(_)
1890            | states::BeforeDoctypeIdentifier(_)
1891            | states::DoctypeIdentifierDoubleQuoted(_)
1892            | states::DoctypeIdentifierSingleQuoted(_)
1893            | states::AfterDoctypeIdentifier(_)
1894            | states::BetweenDoctypePublicAndSystemIdentifiers => {
1895                self.bad_eof_error();
1896                go!(self: force_quirks; emit_doctype; to Data)
1897            },
1898
1899            states::BogusDoctype => go!(self: emit_doctype; to Data),
1900
1901            states::BogusComment => go!(self: emit_comment; to Data),
1902
1903            states::MarkupDeclarationOpen => {
1904                self.bad_char_error();
1905                go!(self: to BogusComment)
1906            },
1907
1908            states::CdataSection => {
1909                self.emit_temp_buf();
1910                self.bad_eof_error();
1911                go!(self: to Data)
1912            },
1913
1914            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),
1915
1916            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
1917        }
1918    }
1919
1920    /// Checks for supported SIMD feature, which is now either SSE2 for x86/x86_64 or NEON for aarch64.
1921    fn is_supported_simd_feature_detected() -> bool {
1922        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1923        {
1924            is_x86_feature_detected!("sse2")
1925        }
1926
1927        #[cfg(target_arch = "aarch64")]
1928        {
1929            std::arch::is_aarch64_feature_detected!("neon")
1930        }
1931
1932        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
1933        false
1934    }
1935
1936    #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
1937    /// Implements the [data state] with SIMD instructions.
1938    /// Calls SSE2- or NEON-specific function for chunks and processes any remaining bytes.
1939    ///
1940    /// The algorithm implemented is the naive SIMD approach described [here].
1941    ///
1942    /// ### SAFETY:
1943    /// Calling this function on a CPU that supports neither SSE2 nor NEON causes undefined behaviour.
1944    ///
1945    /// [data state]: https://html.spec.whatwg.org/#data-state
1946    /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1947    unsafe fn data_state_simd_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
1948        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1949        let (mut i, mut n_newlines) = self.data_state_sse2_fast_path(input);
1950
1951        #[cfg(target_arch = "aarch64")]
1952        let (mut i, mut n_newlines) = self.data_state_neon_fast_path(input);
1953
1954        // Process any remaining bytes (less than STRIDE)
1955        while let Some(c) = input.as_bytes().get(i) {
1956            if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
1957                break;
1958            }
1959            if *c == b'\n' {
1960                n_newlines += 1;
1961            }
1962
1963            i += 1;
1964        }
1965
1966        let set_result = if i == 0 {
1967            let first_char = input.pop_front_char().unwrap();
1968            debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));
1969
1970            // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1971            // Still, it would be nice to not have to do that.
1972            // The same is true for the unwrap call.
1973            let preprocessed_char = self
1974                .get_preprocessed_char(first_char, &BufferQueue::default())
1975                .unwrap();
1976            SetResult::FromSet(preprocessed_char)
1977        } else {
1978            debug_assert!(
1979                input.len() >= i,
1980                "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1981                i,
1982                input.len()
1983            );
1984            let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1985            input.unsafe_pop_front(i as u32);
1986            SetResult::NotFromSet(consumed_chunk)
1987        };
1988
1989        self.current_line.set(self.current_line.get() + n_newlines);
1990
1991        Some(set_result)
1992    }
1993
1994    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1995    #[target_feature(enable = "sse2")]
1996    /// Implements the [data state] with SSE2 instructions for x86/x86_64.
1997    /// Returns a pair of the number of bytes processed and the number of newlines found.
1998    ///
1999    /// ### SAFETY:
2000    /// Calling this function on a CPU that does not support NEON causes undefined behaviour.
2001    ///
2002    /// [data state]: https://html.spec.whatwg.org/#data-state
2003    unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
2004        #[cfg(target_arch = "x86")]
2005        use std::arch::x86::{
2006            __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
2007            _mm_set1_epi8,
2008        };
2009        #[cfg(target_arch = "x86_64")]
2010        use std::arch::x86_64::{
2011            __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
2012            _mm_set1_epi8,
2013        };
2014
2015        debug_assert!(!input.is_empty());
2016
2017        let quote_mask = _mm_set1_epi8('<' as i8);
2018        let escape_mask = _mm_set1_epi8('&' as i8);
2019        let carriage_return_mask = _mm_set1_epi8('\r' as i8);
2020        let zero_mask = _mm_set1_epi8('\0' as i8);
2021        let newline_mask = _mm_set1_epi8('\n' as i8);
2022
2023        let raw_bytes: &[u8] = input.as_bytes();
2024        let start = raw_bytes.as_ptr();
2025
2026        const STRIDE: usize = 16;
2027        let mut i = 0;
2028        let mut n_newlines = 0;
2029        while i + STRIDE <= raw_bytes.len() {
2030            // Load a 16 byte chunk from the input
2031            let data = _mm_loadu_si128(start.add(i) as *const __m128i);
2032
2033            // Compare the chunk against each mask
2034            let quotes = _mm_cmpeq_epi8(data, quote_mask);
2035            let escapes = _mm_cmpeq_epi8(data, escape_mask);
2036            let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
2037            let zeros = _mm_cmpeq_epi8(data, zero_mask);
2038            let newlines = _mm_cmpeq_epi8(data, newline_mask);
2039
2040            // Combine all test results and create a bitmask from them.
2041            // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
2042            let test_result = _mm_or_si128(
2043                _mm_or_si128(quotes, zeros),
2044                _mm_or_si128(escapes, carriage_returns),
2045            );
2046            let bitmask = _mm_movemask_epi8(test_result);
2047            let newline_mask = _mm_movemask_epi8(newlines);
2048
2049            if (bitmask != 0) {
2050                // We have reached one of the characters that cause the state machine to transition
2051                let position = if cfg!(target_endian = "little") {
2052                    bitmask.trailing_zeros() as usize
2053                } else {
2054                    bitmask.leading_zeros() as usize
2055                };
2056
2057                n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
2058                i += position;
2059                break;
2060            } else {
2061                n_newlines += newline_mask.count_ones() as u64;
2062            }
2063
2064            i += STRIDE;
2065        }
2066
2067        (i, n_newlines)
2068    }
2069
2070    #[cfg(target_arch = "aarch64")]
2071    #[target_feature(enable = "neon")]
2072    /// Implements the [data state] with NEON SIMD instructions for AArch64.
2073    /// Returns a pair of the number of bytes processed and the number of newlines found.
2074    ///
2075    /// ### SAFETY:
2076    /// Calling this function on a CPU that does not support NEON causes undefined behaviour.
2077    ///
2078    /// [data state]: https://html.spec.whatwg.org/#data-state
2079    unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
2080        use std::arch::aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8};
2081
2082        debug_assert!(!input.is_empty());
2083
2084        let quote_mask = vdupq_n_u8(b'<');
2085        let escape_mask = vdupq_n_u8(b'&');
2086        let carriage_return_mask = vdupq_n_u8(b'\r');
2087        let zero_mask = vdupq_n_u8(b'\0');
2088        let newline_mask = vdupq_n_u8(b'\n');
2089
2090        let raw_bytes: &[u8] = input.as_bytes();
2091        let start = raw_bytes.as_ptr();
2092
2093        const STRIDE: usize = 16;
2094        let mut i = 0;
2095        let mut n_newlines = 0;
2096        while i + STRIDE <= raw_bytes.len() {
2097            // Load a 16 byte chunk from the input
2098            let data = vld1q_u8(start.add(i));
2099
2100            // Compare the chunk against each mask
2101            let quotes = vceqq_u8(data, quote_mask);
2102            let escapes = vceqq_u8(data, escape_mask);
2103            let carriage_returns = vceqq_u8(data, carriage_return_mask);
2104            let zeros = vceqq_u8(data, zero_mask);
2105            let newlines = vceqq_u8(data, newline_mask);
2106
2107            // Combine all test results and create a bitmask from them.
2108            // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
2109            let test_result =
2110                vorrq_u8(vorrq_u8(quotes, zeros), vorrq_u8(escapes, carriage_returns));
2111            let bitmask = vmaxvq_u8(test_result);
2112            let newline_mask = vmaxvq_u8(newlines);
2113            if bitmask != 0 {
2114                // We have reached one of the characters that cause the state machine to transition
2115                let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2116                let position = chunk_bytes
2117                    .iter()
2118                    .position(|&b| matches!(b, b'<' | b'&' | b'\r' | b'\0'))
2119                    .unwrap();
2120
2121                n_newlines += chunk_bytes[..position]
2122                    .iter()
2123                    .filter(|&&b| b == b'\n')
2124                    .count() as u64;
2125
2126                i += position;
2127                break;
2128            } else if newline_mask != 0 {
2129                let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2130                n_newlines += chunk_bytes.iter().filter(|&&b| b == b'\n').count() as u64;
2131            }
2132
2133            i += STRIDE;
2134        }
2135
2136        (i, n_newlines)
2137    }
2138}
2139
2140#[cfg(test)]
2141#[allow(non_snake_case)]
2142mod test {
2143    use super::option_push; // private items
2144    use crate::tendril::{SliceExt, StrTendril};
2145
2146    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
2147
2148    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
2149    use super::interface::{EndTag, StartTag, Tag, TagKind};
2150    use super::interface::{TagToken, Token};
2151
2152    use markup5ever::buffer_queue::BufferQueue;
2153    use std::cell::RefCell;
2154
2155    use crate::LocalName;
2156
2157    // LinesMatch implements the TokenSink trait. It is used for testing to see
2158    // if current_line is being updated when process_token is called. The lines
2159    // vector is a collection of the line numbers that each token is on.
2160    struct LinesMatch {
2161        tokens: RefCell<Vec<Token>>,
2162        current_str: RefCell<StrTendril>,
2163        lines: RefCell<Vec<(Token, u64)>>,
2164    }
2165
2166    impl LinesMatch {
2167        fn new() -> LinesMatch {
2168            LinesMatch {
2169                tokens: RefCell::new(vec![]),
2170                current_str: RefCell::new(StrTendril::new()),
2171                lines: RefCell::new(vec![]),
2172            }
2173        }
2174
2175        fn push(&self, token: Token, line_number: u64) {
2176            self.finish_str();
2177            self.lines.borrow_mut().push((token, line_number));
2178        }
2179
2180        fn finish_str(&self) {
2181            if !self.current_str.borrow().is_empty() {
2182                let s = self.current_str.take();
2183                self.tokens.borrow_mut().push(CharacterTokens(s));
2184            }
2185        }
2186    }
2187
2188    impl TokenSink for LinesMatch {
2189        type Handle = ();
2190
2191        fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
2192            match token {
2193                CharacterTokens(b) => {
2194                    self.current_str.borrow_mut().push_slice(&b);
2195                },
2196
2197                NullCharacterToken => {
2198                    self.current_str.borrow_mut().push_char('\0');
2199                },
2200
2201                ParseError(_) => {
2202                    panic!("unexpected parse error");
2203                },
2204
2205                TagToken(mut t) => {
2206                    // The spec seems to indicate that one can emit
2207                    // erroneous end tags with attrs, but the test
2208                    // cases don't contain them.
2209                    match t.kind {
2210                        EndTag => {
2211                            t.self_closing = false;
2212                            t.attrs = vec![];
2213                        },
2214                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
2215                    }
2216                    self.push(TagToken(t), line_number);
2217                },
2218
2219                EOFToken => (),
2220
2221                _ => self.push(token, line_number),
2222            }
2223            TokenSinkResult::Continue
2224        }
2225    }
2226
2227    // Take in tokens, process them, and return vector with line
2228    // numbers that each token is on
2229    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
2230        let sink = LinesMatch::new();
2231        let tok = Tokenizer::new(sink, opts);
2232        let buffer = BufferQueue::default();
2233        for chunk in input.into_iter() {
2234            buffer.push_back(chunk);
2235            let _ = tok.feed(&buffer);
2236        }
2237        tok.end();
2238        tok.sink.lines.take()
2239    }
2240
2241    // Create a tag token
2242    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
2243        let name = LocalName::from(&*token);
2244
2245        TagToken(Tag {
2246            kind: tagkind,
2247            name,
2248            self_closing: false,
2249            attrs: vec![],
2250            had_duplicate_attributes: false,
2251        })
2252    }
2253
2254    #[test]
2255    fn push_to_None_gives_singleton() {
2256        let mut s: Option<StrTendril> = None;
2257        option_push(&mut s, 'x');
2258        assert_eq!(s, Some("x".to_tendril()));
2259    }
2260
2261    #[test]
2262    fn push_to_empty_appends() {
2263        let mut s: Option<StrTendril> = Some(StrTendril::new());
2264        option_push(&mut s, 'x');
2265        assert_eq!(s, Some("x".to_tendril()));
2266    }
2267
2268    #[test]
2269    fn push_to_nonempty_appends() {
2270        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
2271        option_push(&mut s, 'x');
2272        assert_eq!(s, Some("yx".to_tendril()));
2273    }
2274
2275    #[test]
2276    fn check_lines() {
2277        let opts = TokenizerOpts {
2278            exact_errors: false,
2279            discard_bom: true,
2280            profile: false,
2281            initial_state: None,
2282            last_start_tag_name: None,
2283        };
2284        let vector = vec![
2285            StrTendril::from("<a>\n"),
2286            StrTendril::from("<b>\n"),
2287            StrTendril::from("</b>\n"),
2288            StrTendril::from("</a>\n"),
2289        ];
2290        let expected = vec![
2291            (create_tag(StrTendril::from("a"), StartTag), 1),
2292            (create_tag(StrTendril::from("b"), StartTag), 2),
2293            (create_tag(StrTendril::from("b"), EndTag), 3),
2294            (create_tag(StrTendril::from("a"), EndTag), 4),
2295        ];
2296        let results = tokenize(vector, opts);
2297        assert_eq!(results, expected);
2298    }
2299
2300    #[test]
2301    fn check_lines_with_new_line() {
2302        let opts = TokenizerOpts {
2303            exact_errors: false,
2304            discard_bom: true,
2305            profile: false,
2306            initial_state: None,
2307            last_start_tag_name: None,
2308        };
2309        let vector = vec![
2310            StrTendril::from("<a>\r\n"),
2311            StrTendril::from("<b>\r\n"),
2312            StrTendril::from("</b>\r\n"),
2313            StrTendril::from("</a>\r\n"),
2314        ];
2315        let expected = vec![
2316            (create_tag(StrTendril::from("a"), StartTag), 1),
2317            (create_tag(StrTendril::from("b"), StartTag), 2),
2318            (create_tag(StrTendril::from("b"), EndTag), 3),
2319            (create_tag(StrTendril::from("a"), EndTag), 4),
2320        ];
2321        let results = tokenize(vector, opts);
2322        assert_eq!(results, expected);
2323    }
2324}