html5ever/tokenizer/
mod.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! The HTML5 tokenizer.
11
12pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15pub use self::interface::{TokenSink, TokenSinkResult};
16
17use self::states::{DoctypeIdKind, Public, System};
18use self::states::{DoubleEscaped, Escaped};
19use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22use self::char_ref::{CharRef, CharRefTokenizer};
23
24use crate::util::str::lower_ascii_letter;
25
26use log::{debug, trace};
27use markup5ever::{ns, small_char_set, TokenizerResult};
28use std::borrow::Cow::{self, Borrowed};
29use std::cell::{Cell, RefCell, RefMut};
30use std::collections::BTreeMap;
31use std::mem;
32
33pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
34use crate::macros::{time, unwrap_or_return};
35use crate::tendril::StrTendril;
36use crate::{Attribute, LocalName, QualName, SmallCharSet};
37
38mod char_ref;
39mod interface;
40pub mod states;
41
/// The result of invoking the tokenizer once.
///
/// `Handle` is the payload type of [`ProcessResult::Script`]; the tokenizer
/// instantiates it with the sink's `Handle` associated type.
pub enum ProcessResult<Handle> {
    /// The tokenizer should be re-invoked immediately.
    Continue,
    /// The tokenizer has not finished, but it needs to wait for more
    /// input to arrive before it can continue.
    Suspend,
    /// The tokenizer was blocked by a `<script>`.
    ///
    /// This `<script>` needs to be executed before tokenization
    /// can continue, as it might invoke `document.write`.
    Script(Handle),
    /// The tokenizer was blocked because it found a `<meta charset>` tag.
    ///
    /// Such tags may force the user agent to re-parse the document with the new
    /// encoding, but non-conformant implementations can reasonably treat
    /// this as [`Self::Continue`].
    EncodingIndicator(StrTendril),
}
61
62fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
63    match *opt_str {
64        Some(ref mut s) => s.push_char(c),
65        None => *opt_str = Some(StrTendril::from_char(c)),
66    }
67}
68
/// Tokenizer options, with an impl for `Default`.
///
/// A value of this type is handed to `Tokenizer::new`, which consumes it.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty?  Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream?  Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state?  Printed
    /// when `end()` is called.  Default: false
    pub profile: bool,

    /// Initial state override.  Only the test runner should use
    /// a non-`None` value!  Default: `None`, in which case the tokenizer
    /// starts in the data state.
    pub initial_state: Option<states::State>,

    /// Last start tag.  Only the test runner should use a
    /// non-`None` value!  Default: `None`
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
95
96impl Default for TokenizerOpts {
97    fn default() -> TokenizerOpts {
98        TokenizerOpts {
99            exact_errors: false,
100            discard_bom: true,
101            profile: false,
102            initial_state: None,
103            last_start_tag_name: None,
104        }
105    }
106}
107
/// The HTML tokenizer.
///
/// All mutable state is kept in `Cell`/`RefCell` fields, so the tokenizer is
/// driven through a shared reference (`feed` takes `&self`).
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: Cell<states::State>,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: Cell<bool>,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: RefCell<Option<CharRefTokenizer>>,

    /// Current input character.  Just consumed, may reconsume.
    current_char: Cell<char>,

    /// Should we reconsume the current input character?
    reconsume: Cell<bool>,

    /// Did we just consume \r, translating it to \n?  In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: Cell<bool>,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one?  Only done at the
    /// beginning of the stream.
    discard_bom: Cell<bool>,

    /// Current tag kind.
    current_tag_kind: Cell<TagKind>,

    /// Current tag name.
    current_tag_name: RefCell<StrTendril>,

    /// Current tag is self-closing?
    current_tag_self_closing: Cell<bool>,

    /// Current tag attributes.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Current attribute name.  Empty when no attribute is pending.
    current_attr_name: RefCell<StrTendril>,

    /// Current attribute value.
    current_attr_value: RefCell<StrTendril>,

    /// Current comment.
    current_comment: RefCell<StrTendril>,

    /// Current doctype token.
    current_doctype: RefCell<Doctype>,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: RefCell<Option<LocalName>>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: RefCell<StrTendril>,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: RefCell<BTreeMap<states::State, u64>>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: Cell<u64>,

    /// Current line number.  Starts at 1, is incremented for every newline
    /// produced by input preprocessing, and is passed to the sink together
    /// with each token.
    current_line: Cell<u64>,
}
180
181impl<Sink: TokenSink> Tokenizer<Sink> {
182    /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
183    pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
184        let start_tag_name = opts
185            .last_start_tag_name
186            .take()
187            .map(|s| LocalName::from(&*s));
188        let state = opts.initial_state.unwrap_or(states::Data);
189        let discard_bom = opts.discard_bom;
190        Tokenizer {
191            opts,
192            sink,
193            state: Cell::new(state),
194            char_ref_tokenizer: RefCell::new(None),
195            at_eof: Cell::new(false),
196            current_char: Cell::new('\0'),
197            reconsume: Cell::new(false),
198            ignore_lf: Cell::new(false),
199            discard_bom: Cell::new(discard_bom),
200            current_tag_kind: Cell::new(StartTag),
201            current_tag_name: RefCell::new(StrTendril::new()),
202            current_tag_self_closing: Cell::new(false),
203            current_tag_attrs: RefCell::new(vec![]),
204            current_attr_name: RefCell::new(StrTendril::new()),
205            current_attr_value: RefCell::new(StrTendril::new()),
206            current_comment: RefCell::new(StrTendril::new()),
207            current_doctype: RefCell::new(Doctype::default()),
208            last_start_tag_name: RefCell::new(start_tag_name),
209            temp_buf: RefCell::new(StrTendril::new()),
210            state_profile: RefCell::new(BTreeMap::new()),
211            time_in_sink: Cell::new(0),
212            current_line: Cell::new(1),
213        }
214    }
215
216    /// Feed an input string into the tokenizer.
217    pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
218        if input.is_empty() {
219            return TokenizerResult::Done;
220        }
221
222        if self.discard_bom.get() {
223            if let Some(c) = input.peek() {
224                if c == '\u{feff}' {
225                    input.next();
226                }
227            } else {
228                return TokenizerResult::Done;
229            }
230        };
231
232        self.run(input)
233    }
234
    /// Switch the tokenizer into the plaintext state.
    ///
    /// The current state is overwritten unconditionally.
    pub fn set_plaintext_state(&self) {
        self.state.set(states::Plaintext);
    }
238
239    fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
240        if self.opts.profile {
241            let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
242            self.time_in_sink.set(self.time_in_sink.get() + dt);
243            ret
244        } else {
245            self.sink.process_token(token, self.current_line.get())
246        }
247    }
248
    // Pass `token` to the sink in a situation where the sink is expected to
    // let tokenization carry on.
    //
    // Panics if the sink answers with anything other than
    // `TokenSinkResult::Continue`.
    fn process_token_and_continue(&self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }
255
256    //§ preprocessing-the-input-stream
257    // Get the next input character, which might be the character
258    // 'c' that we already consumed from the buffers.
259    fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
260        if self.ignore_lf.get() {
261            self.ignore_lf.set(false);
262            if c == '\n' {
263                c = input.next()?;
264            }
265        }
266
267        if c == '\r' {
268            self.ignore_lf.set(true);
269            c = '\n';
270        }
271
272        if c == '\n' {
273            self.current_line.set(self.current_line.get() + 1);
274        }
275
276        if self.opts.exact_errors
277            && match c as u32 {
278                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
279                n if (n & 0xFFFE) == 0xFFFE => true,
280                _ => false,
281            }
282        {
283            let msg = format!("Bad character {c}");
284            self.emit_error(Cow::Owned(msg));
285        }
286
287        trace!("got character {c}");
288        self.current_char.set(c);
289        Some(c)
290    }
291
292    //§ tokenization
293    // Get the next input character, if one is available.
294    fn get_char(&self, input: &BufferQueue) -> Option<char> {
295        if self.reconsume.get() {
296            self.reconsume.set(false);
297            Some(self.current_char.get())
298        } else {
299            input
300                .next()
301                .and_then(|c| self.get_preprocessed_char(c, input))
302        }
303    }
304
305    fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
306        // Bail to the slow path for various corner cases.
307        // This means that `FromSet` can contain characters not in the set!
308        // It shouldn't matter because the fallback `FromSet` case should
309        // always do the same thing as the `NotFromSet` case.
310        if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
311            return self.get_char(input).map(FromSet);
312        }
313
314        let d = input.pop_except_from(set);
315        trace!("got characters {d:?}");
316        match d {
317            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
318
319            // NB: We don't set self.current_char for a run of characters not
320            // in the set.  It shouldn't matter for the codepaths that use
321            // this.
322            _ => d,
323        }
324    }
325
326    // Check if the next characters are an ASCII case-insensitive match.  See
327    // BufferQueue::eat.
328    //
329    // NB: this doesn't set the current input character.
330    fn eat(&self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> {
331        if self.ignore_lf.get() {
332            self.ignore_lf.set(false);
333            if self.peek(input) == Some('\n') {
334                self.discard_char(input);
335            }
336        }
337
338        input.push_front(mem::take(&mut self.temp_buf.borrow_mut()));
339        match input.eat(pat, eq) {
340            None if self.at_eof.get() => Some(false),
341            None => {
342                while let Some(data) = input.next() {
343                    self.temp_buf.borrow_mut().push_char(data);
344                }
345                None
346            },
347            Some(matched) => Some(matched),
348        }
349    }
350
351    /// Run the state machine for as long as we can.
352    fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
353        if self.opts.profile {
354            loop {
355                let state = self.state.get();
356                let old_sink = self.time_in_sink.get();
357                let (run, mut dt) = time!(self.step(input));
358                dt -= (self.time_in_sink.get() - old_sink);
359                let new = match self.state_profile.borrow_mut().get_mut(&state) {
360                    Some(x) => {
361                        *x += dt;
362                        false
363                    },
364                    None => true,
365                };
366                if new {
367                    // do this here because of borrow shenanigans
368                    self.state_profile.borrow_mut().insert(state, dt);
369                }
370                match run {
371                    ProcessResult::Continue => (),
372                    ProcessResult::Suspend => break,
373                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
374                    ProcessResult::EncodingIndicator(encoding) => {
375                        return TokenizerResult::EncodingIndicator(encoding)
376                    },
377                }
378            }
379        } else {
380            loop {
381                match self.step(input) {
382                    ProcessResult::Continue => (),
383                    ProcessResult::Suspend => break,
384                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
385                    ProcessResult::EncodingIndicator(encoding) => {
386                        return TokenizerResult::EncodingIndicator(encoding)
387                    },
388                }
389            }
390        }
391        TokenizerResult::Done
392    }
393
394    #[inline]
395    fn bad_char_error(&self) {
396        #[cfg(feature = "trace_tokenizer")]
397        trace!("  error");
398
399        let msg = if self.opts.exact_errors {
400            Cow::from("Bad character")
401        } else {
402            let c = self.current_char.get();
403            let state = self.state.get();
404            Cow::from(format!("Saw {c} in state {state:?}"))
405        };
406        self.emit_error(msg);
407    }
408
409    #[inline]
410    fn bad_eof_error(&self) {
411        #[cfg(feature = "trace_tokenizer")]
412        trace!("  error_eof");
413
414        let msg = if self.opts.exact_errors {
415            Cow::from("Unexpected EOF")
416        } else {
417            let state = self.state.get();
418            Cow::from(format!("Saw EOF in state {state:?}"))
419        };
420        self.emit_error(msg);
421    }
422
423    fn emit_char(&self, c: char) {
424        #[cfg(feature = "trace_tokenizer")]
425        trace!("  emit");
426
427        self.process_token_and_continue(match c {
428            '\0' => NullCharacterToken,
429            _ => CharacterTokens(StrTendril::from_char(c)),
430        });
431    }
432
    // Emit a run of characters as a single `CharacterTokens` token.
    //
    // The string must not contain '\0'!
    fn emit_chars(&self, b: StrTendril) {
        self.process_token_and_continue(CharacterTokens(b));
    }
437
438    fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
439        self.finish_attribute();
440
441        let name = LocalName::from(&**self.current_tag_name.borrow());
442        self.current_tag_name.borrow_mut().clear();
443
444        match self.current_tag_kind.get() {
445            StartTag => {
446                *self.last_start_tag_name.borrow_mut() = Some(name.clone());
447            },
448            EndTag => {
449                if !self.current_tag_attrs.borrow().is_empty() {
450                    self.emit_error(Borrowed("Attributes on an end tag"));
451                }
452                if self.current_tag_self_closing.get() {
453                    self.emit_error(Borrowed("Self-closing end tag"));
454                }
455            },
456        }
457
458        let token = TagToken(Tag {
459            kind: self.current_tag_kind.get(),
460            name,
461            self_closing: self.current_tag_self_closing.get(),
462            attrs: std::mem::take(&mut self.current_tag_attrs.borrow_mut()),
463        });
464
465        match self.process_token(token) {
466            TokenSinkResult::Continue => ProcessResult::Continue,
467            TokenSinkResult::Plaintext => {
468                self.state.set(states::Plaintext);
469                ProcessResult::Continue
470            },
471            TokenSinkResult::Script(node) => {
472                self.state.set(states::Data);
473                ProcessResult::Script(node)
474            },
475            TokenSinkResult::RawData(kind) => {
476                self.state.set(states::RawData(kind));
477                ProcessResult::Continue
478            },
479            TokenSinkResult::EncodingIndicator(encoding) => {
480                ProcessResult::EncodingIndicator(encoding)
481            },
482        }
483    }
484
485    fn emit_temp_buf(&self) {
486        #[cfg(feature = "trace_tokenizer")]
487        trace!("  emit_temp");
488
489        // FIXME: Make sure that clearing on emit is spec-compatible.
490        let buf = mem::take(&mut *self.temp_buf.borrow_mut());
491        self.emit_chars(buf);
492    }
493
    // Empty the temporary buffer.
    fn clear_temp_buf(&self) {
        // Do this without a new allocation.
        self.temp_buf.borrow_mut().clear();
    }
498
499    fn emit_current_comment(&self) {
500        let comment = mem::take(&mut *self.current_comment.borrow_mut());
501        self.process_token_and_continue(CommentToken(comment));
502    }
503
504    fn discard_tag(&self) {
505        self.current_tag_name.borrow_mut().clear();
506        self.current_tag_self_closing.set(false);
507        *self.current_tag_attrs.borrow_mut() = vec![];
508    }
509
510    fn create_tag(&self, kind: TagKind, c: char) {
511        self.discard_tag();
512        self.current_tag_name.borrow_mut().push_char(c);
513        self.current_tag_kind.set(kind);
514    }
515
516    fn have_appropriate_end_tag(&self) -> bool {
517        match self.last_start_tag_name.borrow().as_ref() {
518            Some(last) => {
519                (self.current_tag_kind.get() == EndTag)
520                    && (**self.current_tag_name.borrow() == **last)
521            },
522            None => false,
523        }
524    }
525
    // Start a new attribute whose name begins with `c`.
    fn create_attribute(&self, c: char) {
        // Commit (or reject as a duplicate) any attribute still pending, which
        // also leaves the name buffer empty.
        self.finish_attribute();

        self.current_attr_name.borrow_mut().push_char(c);
    }
531
532    fn finish_attribute(&self) {
533        if self.current_attr_name.borrow().is_empty() {
534            return;
535        }
536
537        // Check for a duplicate attribute.
538        // FIXME: the spec says we should error as soon as the name is finished.
539        let dup = {
540            let name = &*self.current_attr_name.borrow();
541            self.current_tag_attrs
542                .borrow()
543                .iter()
544                .any(|a| *a.name.local == **name)
545        };
546
547        if dup {
548            self.emit_error(Borrowed("Duplicate attribute"));
549            self.current_attr_name.borrow_mut().clear();
550            self.current_attr_value.borrow_mut().clear();
551        } else {
552            let name = LocalName::from(&**self.current_attr_name.borrow());
553            self.current_attr_name.borrow_mut().clear();
554            self.current_tag_attrs.borrow_mut().push(Attribute {
555                // The tree builder will adjust the namespace if necessary.
556                // This only happens in foreign elements.
557                name: QualName::new(None, ns!(), name),
558                value: mem::take(&mut self.current_attr_value.borrow_mut()),
559            });
560        }
561    }
562
563    fn emit_current_doctype(&self) {
564        let doctype = self.current_doctype.take();
565        self.process_token_and_continue(DoctypeToken(doctype));
566    }
567
568    fn doctype_id(&self, kind: DoctypeIdKind) -> RefMut<'_, Option<StrTendril>> {
569        let current_doctype = self.current_doctype.borrow_mut();
570        match kind {
571            Public => RefMut::map(current_doctype, |d| &mut d.public_id),
572            System => RefMut::map(current_doctype, |d| &mut d.system_id),
573        }
574    }
575
576    fn clear_doctype_id(&self, kind: DoctypeIdKind) {
577        let mut id = self.doctype_id(kind);
578        match *id {
579            Some(ref mut s) => s.clear(),
580            None => *id = Some(StrTendril::new()),
581        }
582    }
583
    // Install a fresh character reference tokenizer.
    //
    // The new tokenizer is told whether the current state is an attribute
    // value state.  In debug builds this asserts that no character reference
    // tokenizer is already active.
    fn start_consuming_character_reference(&self) {
        debug_assert!(
            self.char_ref_tokenizer.borrow().is_none(),
            "Nested character references are impossible"
        );

        let is_in_attribute = matches!(self.state.get(), states::AttributeValue(_));
        *self.char_ref_tokenizer.borrow_mut() = Some(CharRefTokenizer::new(is_in_attribute));
    }
593
    // Emit an `EOFToken`.
    fn emit_eof(&self) {
        self.process_token_and_continue(EOFToken);
    }
597
598    fn peek(&self, input: &BufferQueue) -> Option<char> {
599        if self.reconsume.get() {
600            Some(self.current_char.get())
601        } else {
602            input.peek()
603        }
604    }
605
606    fn discard_char(&self, input: &BufferQueue) {
607        // peek() deals in un-processed characters (no newline normalization), while get_char()
608        // does.
609        //
610        // since discard_char is supposed to be used in combination with peek(), discard_char must
611        // discard a single raw input character, not a normalized newline.
612        if self.reconsume.get() {
613            self.reconsume.set(false);
614        } else {
615            input.next();
616        }
617    }
618
    // Report a parse error to the sink as a `ParseError` token.
    fn emit_error(&self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
622}
623//§ END
624
// Shorthand for common state machine behaviors.
//
// Each rule takes the tokenizer identifier, a colon and one command, and
// expands to the corresponding method call or buffer operation on the
// tokenizer.
macro_rules! shorthand (
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c)                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.borrow_mut().push_char($c)     );
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag()                                   );
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input)                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.borrow_mut().push_char($c)             );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf()                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c)                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.borrow_mut().push_char($c)    );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.borrow_mut().push_char($c)   );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.borrow_mut().push_char($c)      );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.borrow_mut().push_slice($c)     );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment()                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.borrow_mut().clear()            );
    ( $me:ident : create_doctype                   ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c)            );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k)                            );
    ( $me:ident : force_quirks                     ) => ( $me.current_doctype.borrow_mut().force_quirks = true);
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype()                          );
);
648
// Tracing of tokenizer actions.  This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// With the `trace_tokenizer` feature, the command is logged (stringified) via
// `trace!` before being forwarded to `shorthand!`.
#[cfg(feature = "trace_tokenizer")]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    trace!("  {:?}", stringify!($($cmds)*));
    shorthand!($me : $($cmds)*);
}));

// Without the feature, `sh_trace!` is a plain alias for `shorthand!`.
#[cfg(not(feature = "trace_tokenizer"))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
659
// A little DSL for sequencing shorthand actions.
//
// Commands are separated by `;`.  Every command but the last is run through
// `sh_trace!`; the last one may instead be one of the terminal forms below,
// all of which `return` from the enclosing function.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.

    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // `to`: switch to the given state and ask to be re-invoked.
    ( $me:ident : to $s:ident                    ) => ({ $me.state.set(states::$s); return ProcessResult::Continue;           });
    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue;      });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });

    // `reconsume`: like `to`, but flag the current character for reconsumption first.
    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume.set(true); go!($me: to $s);         });
    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume.set(true); go!($me: to $s $k1);     });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });

    ( $me:ident : consume_char_ref             ) => ({ $me.start_consuming_character_reference(); return ProcessResult::Continue;         });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_current_tag();
    });

    // `eof`: emit the EOF token and suspend.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // or nothing.
    ( $me:ident : ) => (());
);
696
// This is a macro because it can cause early return
// from the function where it is used.
//
// Expands to the next character from `$me.get_char($input)`; when that is
// `None`, `ProcessResult::Suspend` is handed to `unwrap_or_return!`.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));
702
// Like `get_char!`, but based on `$me.peek($input)`: the character is looked
// at, not consumed.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));
706
// Like `get_char!`, but based on `$me.pop_except_from($input, $set)`, so the
// value is a `SetResult` rather than a single character.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));
710
// Try to match `$pat` against the upcoming input, ignoring ASCII case.
// Hands `ProcessResult::Suspend` to `unwrap_or_return!` when `eat` cannot
// decide yet.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));
714
// Like `eat!`, but the bytes must match exactly (case-sensitive).
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
718
719impl<Sink: TokenSink> Tokenizer<Sink> {
    // Run the state machine for a while.
    // Return `ProcessResult::Continue` if we should be immediately re-invoked
    // (this just simplifies control flow vs. break / continue).
723    #[allow(clippy::never_loop)]
724    fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
725        if self.char_ref_tokenizer.borrow().is_some() {
726            return self.step_char_ref_tokenizer(input);
727        }
728
729        trace!("processing in state {:?}", self.state);
730        match self.state.get() {
731            //§ data-state
732            states::Data => loop {
733                let set = small_char_set!('\r' '\0' '&' '<' '\n');
734
735                #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
736                let set_result = if !(self.opts.exact_errors
737                    || self.reconsume.get()
738                    || self.ignore_lf.get())
739                    && Self::is_supported_simd_feature_detected()
740                {
741                    let front_buffer = input.peek_front_chunk_mut();
742                    let Some(mut front_buffer) = front_buffer else {
743                        return ProcessResult::Suspend;
744                    };
745
746                    // Special case: The fast path is not worth taking if the first character is already in the set,
747                    // which is fairly common
748                    let first_char = front_buffer
749                        .chars()
750                        .next()
751                        .expect("Input buffers are never empty");
752
753                    if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
754                        drop(front_buffer);
755                        self.pop_except_from(input, set)
756                    } else {
757                        // SAFETY:
758                        // This CPU is guaranteed to support SIMD due to the is_supported_simd_feature_detected check above
759                        let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) };
760
761                        if front_buffer.is_empty() {
762                            drop(front_buffer);
763                            input.pop_front();
764                        }
765
766                        result
767                    }
768                } else {
769                    self.pop_except_from(input, set)
770                };
771
772                #[cfg(not(any(
773                    target_arch = "x86",
774                    target_arch = "x86_64",
775                    target_arch = "aarch64"
776                )))]
777                let set_result = self.pop_except_from(input, set);
778
779                let Some(set_result) = set_result else {
780                    return ProcessResult::Suspend;
781                };
782                match set_result {
783                    FromSet('\0') => {
784                        self.bad_char_error();
785                        self.emit_char('\0');
786                    },
787                    FromSet('&') => go!(self: consume_char_ref),
788                    FromSet('<') => go!(self: to TagOpen),
789                    FromSet(c) => {
790                        self.emit_char(c);
791                    },
792                    NotFromSet(b) => self.emit_chars(b),
793                }
794            },
795
796            //§ rcdata-state
797            states::RawData(Rcdata) => loop {
798                match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
799                    FromSet('\0') => {
800                        self.bad_char_error();
801                        self.emit_char('\u{fffd}');
802                    },
803                    FromSet('&') => go!(self: consume_char_ref),
804                    FromSet('<') => go!(self: to RawLessThanSign Rcdata),
805                    FromSet(c) => self.emit_char(c),
806                    NotFromSet(b) => self.emit_chars(b),
807                }
808            },
809
810            //§ rawtext-state
811            states::RawData(Rawtext) => loop {
812                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
813                    FromSet('\0') => {
814                        self.bad_char_error();
815                        self.emit_char('\u{fffd}');
816                    },
817                    FromSet('<') => go!(self: to RawLessThanSign Rawtext),
818                    FromSet(c) => self.emit_char(c),
819                    NotFromSet(b) => self.emit_chars(b),
820                }
821            },
822
823            //§ script-data-state
824            states::RawData(ScriptData) => loop {
825                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
826                    FromSet('\0') => {
827                        self.bad_char_error();
828                        self.emit_char('\u{fffd}');
829                    },
830                    FromSet('<') => go!(self: to RawLessThanSign ScriptData),
831                    FromSet(c) => self.emit_char(c),
832                    NotFromSet(b) => self.emit_chars(b),
833                }
834            },
835
836            //§ script-data-escaped-state
837            states::RawData(ScriptDataEscaped(Escaped)) => loop {
838                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
839                    FromSet('\0') => {
840                        self.bad_char_error();
841                        self.emit_char('\u{fffd}');
842                    },
843                    FromSet('-') => {
844                        self.emit_char('-');
845                        go!(self: to ScriptDataEscapedDash Escaped);
846                    },
847                    FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
848                    FromSet(c) => self.emit_char(c),
849                    NotFromSet(b) => self.emit_chars(b),
850                }
851            },
852
853            //§ script-data-double-escaped-state
854            states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
855                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
856                    FromSet('\0') => {
857                        self.bad_char_error();
858                        self.emit_char('\u{fffd}');
859                    },
860                    FromSet('-') => {
861                        self.emit_char('-');
862                        go!(self: to ScriptDataEscapedDash DoubleEscaped);
863                    },
864                    FromSet('<') => {
865                        self.emit_char('<');
866                        go!(self: to RawLessThanSign ScriptDataEscaped DoubleEscaped)
867                    },
868                    FromSet(c) => self.emit_char(c),
869                    NotFromSet(b) => self.emit_chars(b),
870                }
871            },
872
873            //§ plaintext-state
874            states::Plaintext => loop {
875                match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
876                    FromSet('\0') => {
877                        self.bad_char_error();
878                        self.emit_char('\u{fffd}');
879                    },
880                    FromSet(c) => self.emit_char(c),
881                    NotFromSet(b) => self.emit_chars(b),
882                }
883            },
884
885            //§ tag-open-state
886            states::TagOpen => loop {
887                match get_char!(self, input) {
888                    '!' => go!(self: to MarkupDeclarationOpen),
889                    '/' => go!(self: to EndTagOpen),
890                    '?' => {
891                        self.bad_char_error();
892                        go!(self: clear_comment; reconsume BogusComment)
893                    },
894                    c => match lower_ascii_letter(c) {
895                        Some(cl) => go!(self: create_tag StartTag cl; to TagName),
896                        None => {
897                            self.bad_char_error();
898                            self.emit_char('<');
899                            go!(self: reconsume Data)
900                        },
901                    },
902                }
903            },
904
905            //§ end-tag-open-state
906            states::EndTagOpen => loop {
907                match get_char!(self, input) {
908                    '>' => {
909                        self.bad_char_error();
910                        go!(self: to Data)
911                    },
912                    c => match lower_ascii_letter(c) {
913                        Some(cl) => go!(self: create_tag EndTag cl; to TagName),
914                        None => {
915                            self.bad_char_error();
916                            go!(self: clear_comment; reconsume BogusComment)
917                        },
918                    },
919                }
920            },
921
922            //§ tag-name-state
923            states::TagName => loop {
924                match get_char!(self, input) {
925                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
926                    '/' => go!(self: to SelfClosingStartTag),
927                    '>' => go!(self: emit_tag Data),
928                    '\0' => {
929                        self.bad_char_error();
930                        go!(self: push_tag '\u{fffd}')
931                    },
932                    c => go!(self: push_tag (c.to_ascii_lowercase())),
933                }
934            },
935
936            //§ script-data-escaped-less-than-sign-state
937            states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
938                match get_char!(self, input) {
939                    '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
940                    c => match lower_ascii_letter(c) {
941                        Some(cl) => {
942                            go!(self: clear_temp; push_temp cl);
943                            self.emit_char('<');
944                            self.emit_char(c);
945                            go!(self: to ScriptDataEscapeStart DoubleEscaped);
946                        },
947                        None => {
948                            self.emit_char('<');
949                            go!(self: reconsume RawData ScriptDataEscaped Escaped);
950                        },
951                    },
952                }
953            },
954
955            //§ script-data-double-escaped-less-than-sign-state
956            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
957                match get_char!(self, input) {
958                    '/' => {
959                        go!(self: clear_temp);
960                        self.emit_char('/');
961                        go!(self: to ScriptDataDoubleEscapeEnd);
962                    },
963                    _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
964                }
965            },
966
967            //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
968            // otherwise
969            states::RawLessThanSign(kind) => loop {
970                match get_char!(self, input) {
971                    '/' => go!(self: clear_temp; to RawEndTagOpen kind),
972                    '!' if kind == ScriptData => {
973                        self.emit_char('<');
974                        self.emit_char('!');
975                        go!(self: to ScriptDataEscapeStart Escaped);
976                    },
977                    _ => {
978                        self.emit_char('<');
979                        go!(self: reconsume RawData kind);
980                    },
981                }
982            },
983
984            //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
985            states::RawEndTagOpen(kind) => loop {
986                let c = get_char!(self, input);
987                match lower_ascii_letter(c) {
988                    Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
989                    None => {
990                        self.emit_char('<');
991                        self.emit_char('/');
992                        go!(self: reconsume RawData kind);
993                    },
994                }
995            },
996
997            //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
998            states::RawEndTagName(kind) => loop {
999                let c = get_char!(self, input);
1000                if self.have_appropriate_end_tag() {
1001                    match c {
1002                        '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to BeforeAttributeName),
1003                        '/' => go!(self: clear_temp; to SelfClosingStartTag),
1004                        '>' => go!(self: clear_temp; emit_tag Data),
1005                        _ => (),
1006                    }
1007                }
1008
1009                match lower_ascii_letter(c) {
1010                    Some(cl) => go!(self: push_tag cl; push_temp c),
1011                    None => {
1012                        go!(self: discard_tag);
1013                        self.emit_char('<');
1014                        self.emit_char('/');
1015                        self.emit_temp_buf();
1016                        go!(self: reconsume RawData kind);
1017                    },
1018                }
1019            },
1020
1021            //§ script-data-double-escape-start-state
1022            states::ScriptDataEscapeStart(DoubleEscaped) => loop {
1023                let c = get_char!(self, input);
1024                match c {
1025                    '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1026                        let esc = if &**self.temp_buf.borrow() == "script" {
1027                            DoubleEscaped
1028                        } else {
1029                            Escaped
1030                        };
1031                        self.emit_char(c);
1032                        go!(self: to RawData ScriptDataEscaped esc);
1033                    },
1034                    _ => match lower_ascii_letter(c) {
1035                        Some(cl) => {
1036                            go!(self: push_temp cl);
1037                            self.emit_char(c);
1038                        },
1039                        None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
1040                    },
1041                }
1042            },
1043
1044            //§ script-data-escape-start-state
1045            states::ScriptDataEscapeStart(Escaped) => loop {
1046                match get_char!(self, input) {
1047                    '-' => {
1048                        self.emit_char('-');
1049                        go!(self: to ScriptDataEscapeStartDash);
1050                    },
1051                    _ => go!(self: reconsume RawData ScriptData),
1052                }
1053            },
1054
1055            //§ script-data-escape-start-dash-state
1056            states::ScriptDataEscapeStartDash => loop {
1057                match get_char!(self, input) {
1058                    '-' => {
1059                        self.emit_char('-');
1060                        go!(self: to ScriptDataEscapedDashDash Escaped);
1061                    },
1062                    _ => go!(self: reconsume RawData ScriptData),
1063                }
1064            },
1065
1066            //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
1067            states::ScriptDataEscapedDash(kind) => loop {
1068                match get_char!(self, input) {
1069                    '-' => {
1070                        self.emit_char('-');
1071                        go!(self: to ScriptDataEscapedDashDash kind);
1072                    },
1073                    '<' => {
1074                        if kind == DoubleEscaped {
1075                            self.emit_char('<');
1076                        }
1077                        go!(self: to RawLessThanSign ScriptDataEscaped kind);
1078                    },
1079                    '\0' => {
1080                        self.bad_char_error();
1081                        self.emit_char('\u{fffd}');
1082                        go!(self: to RawData ScriptDataEscaped kind)
1083                    },
1084                    c => {
1085                        self.emit_char(c);
1086                        go!(self: to RawData ScriptDataEscaped kind);
1087                    },
1088                }
1089            },
1090
1091            //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
1092            states::ScriptDataEscapedDashDash(kind) => loop {
1093                match get_char!(self, input) {
1094                    '-' => {
1095                        self.emit_char('-');
1096                    },
1097                    '<' => {
1098                        if kind == DoubleEscaped {
1099                            self.emit_char('<');
1100                        }
1101                        go!(self: to RawLessThanSign ScriptDataEscaped kind);
1102                    },
1103                    '>' => {
1104                        self.emit_char('>');
1105                        go!(self: to RawData ScriptData);
1106                    },
1107                    '\0' => {
1108                        self.bad_char_error();
1109                        self.emit_char('\u{fffd}');
1110                        go!(self: to RawData ScriptDataEscaped kind)
1111                    },
1112                    c => {
1113                        self.emit_char(c);
1114                        go!(self: to RawData ScriptDataEscaped kind);
1115                    },
1116                }
1117            },
1118
1119            //§ script-data-double-escape-end-state
1120            states::ScriptDataDoubleEscapeEnd => loop {
1121                let c = get_char!(self, input);
1122                match c {
1123                    '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1124                        let esc = if &**self.temp_buf.borrow() == "script" {
1125                            Escaped
1126                        } else {
1127                            DoubleEscaped
1128                        };
1129                        self.emit_char(c);
1130                        go!(self: to RawData ScriptDataEscaped esc);
1131                    },
1132                    _ => match lower_ascii_letter(c) {
1133                        Some(cl) => {
1134                            go!(self: push_temp cl);
1135                            self.emit_char(c);
1136                        },
1137                        None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
1138                    },
1139                }
1140            },
1141
1142            //§ before-attribute-name-state
1143            states::BeforeAttributeName => loop {
1144                match get_char!(self, input) {
1145                    '\t' | '\n' | '\x0C' | ' ' => (),
1146                    '/' => go!(self: to SelfClosingStartTag),
1147                    '>' => go!(self: emit_tag Data),
1148                    '\0' => {
1149                        self.bad_char_error();
1150                        go!(self: create_attr '\u{fffd}'; to AttributeName)
1151                    },
1152                    c => match lower_ascii_letter(c) {
1153                        Some(cl) => go!(self: create_attr cl; to AttributeName),
1154                        None => {
1155                            if matches!(c, '"' | '\'' | '<' | '=') {
1156                                self.bad_char_error();
1157                            }
1158
1159                            go!(self: create_attr c; to AttributeName);
1160                        },
1161                    },
1162                }
1163            },
1164
1165            //§ attribute-name-state
1166            states::AttributeName => loop {
1167                match get_char!(self, input) {
1168                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
1169                    '/' => go!(self: to SelfClosingStartTag),
1170                    '=' => go!(self: to BeforeAttributeValue),
1171                    '>' => go!(self: emit_tag Data),
1172                    '\0' => {
1173                        self.bad_char_error();
1174                        go!(self: push_name '\u{fffd}')
1175                    },
1176                    c => match lower_ascii_letter(c) {
1177                        Some(cl) => go!(self: push_name cl),
1178                        None => {
1179                            if matches!(c, '"' | '\'' | '<') {
1180                                self.bad_char_error();
1181                            }
1182                            go!(self: push_name c);
1183                        },
1184                    },
1185                }
1186            },
1187
1188            //§ after-attribute-name-state
1189            states::AfterAttributeName => loop {
1190                match get_char!(self, input) {
1191                    '\t' | '\n' | '\x0C' | ' ' => (),
1192                    '/' => go!(self: to SelfClosingStartTag),
1193                    '=' => go!(self: to BeforeAttributeValue),
1194                    '>' => go!(self: emit_tag Data),
1195                    '\0' => {
1196                        self.bad_char_error();
1197                        go!(self: create_attr '\u{fffd}'; to AttributeName)
1198                    },
1199                    c => match lower_ascii_letter(c) {
1200                        Some(cl) => go!(self: create_attr cl; to AttributeName),
1201                        None => {
1202                            if matches!(c, '"' | '\'' | '<') {
1203                                self.bad_char_error();
1204                            }
1205
1206                            go!(self: create_attr c; to AttributeName);
1207                        },
1208                    },
1209                }
1210            },
1211
1212            //§ before-attribute-value-state
1213            // Use peek so we can handle the first attr character along with the rest,
1214            // hopefully in the same zero-copy buffer.
1215            states::BeforeAttributeValue => loop {
1216                match peek!(self, input) {
1217                    '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1218                    '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1219                    '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1220                    '>' => {
1221                        go!(self: discard_char input);
1222                        self.bad_char_error();
1223                        go!(self: emit_tag Data)
1224                    },
1225                    _ => go!(self: to AttributeValue Unquoted),
1226                }
1227            },
1228
1229            //§ attribute-value-(double-quoted)-state
1230            states::AttributeValue(DoubleQuoted) => loop {
1231                match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1232                    FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1233                    FromSet('&') => go!(self: consume_char_ref),
1234                    FromSet('\0') => {
1235                        self.bad_char_error();
1236                        go!(self: push_value '\u{fffd}')
1237                    },
1238                    FromSet(c) => go!(self: push_value c),
1239                    NotFromSet(ref b) => go!(self: append_value b),
1240                }
1241            },
1242
1243            //§ attribute-value-(single-quoted)-state
1244            states::AttributeValue(SingleQuoted) => loop {
1245                match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1246                    FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1247                    FromSet('&') => go!(self: consume_char_ref),
1248                    FromSet('\0') => {
1249                        self.bad_char_error();
1250                        go!(self: push_value '\u{fffd}')
1251                    },
1252                    FromSet(c) => go!(self: push_value c),
1253                    NotFromSet(ref b) => go!(self: append_value b),
1254                }
1255            },
1256
1257            //§ attribute-value-(unquoted)-state
1258            states::AttributeValue(Unquoted) => loop {
1259                match pop_except_from!(
1260                    self,
1261                    input,
1262                    small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1263                ) {
1264                    FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1265                        go!(self: to BeforeAttributeName)
1266                    },
1267                    FromSet('&') => go!(self: consume_char_ref),
1268                    FromSet('>') => go!(self: emit_tag Data),
1269                    FromSet('\0') => {
1270                        self.bad_char_error();
1271                        go!(self: push_value '\u{fffd}')
1272                    },
1273                    FromSet(c) => {
1274                        if matches!(c, '"' | '\'' | '<' | '=' | '`') {
1275                            self.bad_char_error();
1276                        }
1277                        go!(self: push_value c);
1278                    },
1279                    NotFromSet(ref b) => go!(self: append_value b),
1280                }
1281            },
1282
1283            //§ after-attribute-value-(quoted)-state
1284            states::AfterAttributeValueQuoted => loop {
1285                match get_char!(self, input) {
1286                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1287                    '/' => go!(self: to SelfClosingStartTag),
1288                    '>' => go!(self: emit_tag Data),
1289                    _ => {
1290                        self.bad_char_error();
1291                        go!(self: reconsume BeforeAttributeName)
1292                    },
1293                }
1294            },
1295
1296            //§ self-closing-start-tag-state
1297            states::SelfClosingStartTag => loop {
1298                match get_char!(self, input) {
1299                    '>' => {
1300                        self.current_tag_self_closing.set(true);
1301                        go!(self: emit_tag Data);
1302                    },
1303                    _ => {
1304                        self.bad_char_error();
1305                        go!(self: reconsume BeforeAttributeName)
1306                    },
1307                }
1308            },
1309
1310            //§ comment-start-state
1311            states::CommentStart => loop {
1312                match get_char!(self, input) {
1313                    '-' => go!(self: to CommentStartDash),
1314                    '\0' => {
1315                        self.bad_char_error();
1316                        go!(self: push_comment '\u{fffd}'; to Comment)
1317                    },
1318                    '>' => {
1319                        self.bad_char_error();
1320                        go!(self: emit_comment; to Data)
1321                    },
1322                    c => go!(self: push_comment c; to Comment),
1323                }
1324            },
1325
1326            //§ comment-start-dash-state
1327            states::CommentStartDash => loop {
1328                match get_char!(self, input) {
1329                    '-' => go!(self: to CommentEnd),
1330                    '\0' => {
1331                        self.bad_char_error();
1332                        go!(self: append_comment "-\u{fffd}"; to Comment)
1333                    },
1334                    '>' => {
1335                        self.bad_char_error();
1336                        go!(self: emit_comment; to Data)
1337                    },
1338                    c => go!(self: push_comment '-'; push_comment c; to Comment),
1339                }
1340            },
1341
1342            //§ comment-state
1343            states::Comment => loop {
1344                match get_char!(self, input) {
1345                    c @ '<' => go!(self: push_comment c; to CommentLessThanSign),
1346                    '-' => go!(self: to CommentEndDash),
1347                    '\0' => {
1348                        self.bad_char_error();
1349                        go!(self: push_comment '\u{fffd}')
1350                    },
1351                    c => go!(self: push_comment c),
1352                }
1353            },
1354
1355            //§ comment-less-than-sign-state
1356            states::CommentLessThanSign => loop {
1357                match get_char!(self, input) {
1358                    c @ '!' => go!(self: push_comment c; to CommentLessThanSignBang),
1359                    c @ '<' => go!(self: push_comment c),
1360                    _ => go!(self: reconsume Comment),
1361                }
1362            },
1363
1364            //§ comment-less-than-sign-bang-state
1365            states::CommentLessThanSignBang => loop {
1366                match get_char!(self, input) {
1367                    '-' => go!(self: to CommentLessThanSignBangDash),
1368                    _ => go!(self: reconsume Comment),
1369                }
1370            },
1371
1372            //§ comment-less-than-sign-bang-dash-state
1373            states::CommentLessThanSignBangDash => loop {
1374                match get_char!(self, input) {
1375                    '-' => go!(self: to CommentLessThanSignBangDashDash),
1376                    _ => go!(self: reconsume CommentEndDash),
1377                }
1378            },
1379
1380            //§ comment-less-than-sign-bang-dash-dash-state
1381            states::CommentLessThanSignBangDashDash => loop {
1382                match get_char!(self, input) {
1383                    '>' => go!(self: reconsume CommentEnd),
1384                    _ => {
1385                        self.bad_char_error();
1386                        go!(self: reconsume CommentEnd)
1387                    },
1388                }
1389            },
1390
1391            //§ comment-end-dash-state
1392            states::CommentEndDash => loop {
1393                match get_char!(self, input) {
1394                    '-' => go!(self: to CommentEnd),
1395                    '\0' => {
1396                        self.bad_char_error();
1397                        go!(self: append_comment "-\u{fffd}"; to Comment)
1398                    },
1399                    c => go!(self: push_comment '-'; push_comment c; to Comment),
1400                }
1401            },
1402
1403            //§ comment-end-state
1404            states::CommentEnd => loop {
1405                match get_char!(self, input) {
1406                    '>' => go!(self: emit_comment; to Data),
1407                    '!' => go!(self: to CommentEndBang),
1408                    '-' => go!(self: push_comment '-'),
1409                    _ => go!(self: append_comment "--"; reconsume Comment),
1410                }
1411            },
1412
1413            //§ comment-end-bang-state
1414            states::CommentEndBang => loop {
1415                match get_char!(self, input) {
1416                    '-' => go!(self: append_comment "--!"; to CommentEndDash),
1417                    '>' => {
1418                        self.bad_char_error();
1419                        go!(self: emit_comment; to Data)
1420                    },
1421                    '\0' => {
1422                        self.bad_char_error();
1423                        go!(self: append_comment "--!\u{fffd}"; to Comment)
1424                    },
1425                    c => go!(self: append_comment "--!"; push_comment c; to Comment),
1426                }
1427            },
1428
1429            //§ doctype-state
1430            states::Doctype => loop {
1431                match get_char!(self, input) {
1432                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1433                    '>' => go!(self: reconsume BeforeDoctypeName),
1434                    _ => {
1435                        self.bad_char_error();
1436                        go!(self: reconsume BeforeDoctypeName)
1437                    },
1438                }
1439            },
1440
1441            //§ before-doctype-name-state
1442            states::BeforeDoctypeName => loop {
1443                match get_char!(self, input) {
1444                    '\t' | '\n' | '\x0C' | ' ' => (),
1445                    '\0' => {
1446                        self.bad_char_error();
1447                        go!(self: create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1448                    },
1449                    '>' => {
1450                        self.bad_char_error();
1451                        go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1452                    },
1453                    c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1454                                  to DoctypeName),
1455                }
1456            },
1457
1458            //§ doctype-name-state
1459            states::DoctypeName => loop {
1460                match get_char!(self, input) {
1461                    '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1462                    '>' => go!(self: emit_doctype; to Data),
1463                    '\0' => {
1464                        self.bad_char_error();
1465                        go!(self: push_doctype_name '\u{fffd}')
1466                    },
1467                    c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1468                }
1469            },
1470
1471            //§ after-doctype-name-state
1472            states::AfterDoctypeName => loop {
1473                if eat!(self, input, "public") {
1474                    go!(self: to AfterDoctypeKeyword Public);
1475                } else if eat!(self, input, "system") {
1476                    go!(self: to AfterDoctypeKeyword System);
1477                } else {
1478                    match get_char!(self, input) {
1479                        '\t' | '\n' | '\x0C' | ' ' => (),
1480                        '>' => go!(self: emit_doctype; to Data),
1481                        _ => {
1482                            self.bad_char_error();
1483                            go!(self: force_quirks; reconsume BogusDoctype)
1484                        },
1485                    }
1486                }
1487            },
1488
1489            //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
1490            states::AfterDoctypeKeyword(kind) => loop {
1491                match get_char!(self, input) {
1492                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1493                    '"' => {
1494                        self.bad_char_error();
1495                        go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1496                    },
1497                    '\'' => {
1498                        self.bad_char_error();
1499                        go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1500                    },
1501                    '>' => {
1502                        self.bad_char_error();
1503                        go!(self: force_quirks; emit_doctype; to Data)
1504                    },
1505                    _ => {
1506                        self.bad_char_error();
1507                        go!(self: force_quirks; reconsume BogusDoctype)
1508                    },
1509                }
1510            },
1511
1512            //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
1513            states::BeforeDoctypeIdentifier(kind) => loop {
1514                match get_char!(self, input) {
1515                    '\t' | '\n' | '\x0C' | ' ' => (),
1516                    '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1517                    '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1518                    '>' => {
1519                        self.bad_char_error();
1520                        go!(self: force_quirks; emit_doctype; to Data)
1521                    },
1522                    _ => {
1523                        self.bad_char_error();
1524                        go!(self: force_quirks; reconsume BogusDoctype)
1525                    },
1526                }
1527            },
1528
1529            //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
1530            states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1531                match get_char!(self, input) {
1532                    '"' => go!(self: to AfterDoctypeIdentifier kind),
1533                    '\0' => {
1534                        self.bad_char_error();
1535                        go!(self: push_doctype_id kind '\u{fffd}')
1536                    },
1537                    '>' => {
1538                        self.bad_char_error();
1539                        go!(self: force_quirks; emit_doctype; to Data)
1540                    },
1541                    c => go!(self: push_doctype_id kind c),
1542                }
1543            },
1544
1545            //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
1546            states::DoctypeIdentifierSingleQuoted(kind) => loop {
1547                match get_char!(self, input) {
1548                    '\'' => go!(self: to AfterDoctypeIdentifier kind),
1549                    '\0' => {
1550                        self.bad_char_error();
1551                        go!(self: push_doctype_id kind '\u{fffd}')
1552                    },
1553                    '>' => {
1554                        self.bad_char_error();
1555                        go!(self: force_quirks; emit_doctype; to Data)
1556                    },
1557                    c => go!(self: push_doctype_id kind c),
1558                }
1559            },
1560
1561            //§ after-doctype-public-identifier-state
1562            states::AfterDoctypeIdentifier(Public) => loop {
1563                match get_char!(self, input) {
1564                    '\t' | '\n' | '\x0C' | ' ' => {
1565                        go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1566                    },
1567                    '>' => go!(self: emit_doctype; to Data),
1568                    '"' => {
1569                        self.bad_char_error();
1570                        go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1571                    },
1572                    '\'' => {
1573                        self.bad_char_error();
1574                        go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1575                    },
1576                    _ => {
1577                        self.bad_char_error();
1578                        go!(self: force_quirks; reconsume BogusDoctype)
1579                    },
1580                }
1581            },
1582
1583            //§ after-doctype-system-identifier-state
1584            states::AfterDoctypeIdentifier(System) => loop {
1585                match get_char!(self, input) {
1586                    '\t' | '\n' | '\x0C' | ' ' => (),
1587                    '>' => go!(self: emit_doctype; to Data),
1588                    _ => {
1589                        self.bad_char_error();
1590                        go!(self: reconsume BogusDoctype)
1591                    },
1592                }
1593            },
1594
1595            //§ between-doctype-public-and-system-identifiers-state
1596            states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1597                match get_char!(self, input) {
1598                    '\t' | '\n' | '\x0C' | ' ' => (),
1599                    '>' => go!(self: emit_doctype; to Data),
1600                    '"' => {
1601                        go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1602                    },
1603                    '\'' => {
1604                        go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1605                    },
1606                    _ => {
1607                        self.bad_char_error();
1608                        go!(self: force_quirks; reconsume BogusDoctype)
1609                    },
1610                }
1611            },
1612
1613            //§ bogus-doctype-state
1614            states::BogusDoctype => loop {
1615                match get_char!(self, input) {
1616                    '>' => go!(self: emit_doctype; to Data),
1617                    '\0' => {
1618                        self.bad_char_error();
1619                    },
1620                    _ => (),
1621                }
1622            },
1623
1624            //§ bogus-comment-state
1625            states::BogusComment => loop {
1626                match get_char!(self, input) {
1627                    '>' => go!(self: emit_comment; to Data),
1628                    '\0' => {
1629                        self.bad_char_error();
1630                        go!(self: push_comment '\u{fffd}')
1631                    },
1632                    c => go!(self: push_comment c),
1633                }
1634            },
1635
1636            //§ markup-declaration-open-state
1637            states::MarkupDeclarationOpen => loop {
1638                if eat_exact!(self, input, "--") {
1639                    go!(self: clear_comment; to CommentStart);
1640                } else if eat!(self, input, "doctype") {
1641                    go!(self: to Doctype);
1642                } else {
1643                    if self
1644                        .sink
1645                        .adjusted_current_node_present_but_not_in_html_namespace()
1646                        && eat_exact!(self, input, "[CDATA[")
1647                    {
1648                        go!(self: clear_temp; to CdataSection);
1649                    }
1650                    self.bad_char_error();
1651                    go!(self: clear_comment; to BogusComment);
1652                }
1653            },
1654
1655            //§ cdata-section-state
1656            states::CdataSection => loop {
1657                match get_char!(self, input) {
1658                    ']' => go!(self: to CdataSectionBracket),
1659                    '\0' => {
1660                        self.emit_temp_buf();
1661                        self.emit_char('\0');
1662                    },
1663                    c => go!(self: push_temp c),
1664                }
1665            },
1666
1667            //§ cdata-section-bracket
1668            states::CdataSectionBracket => match get_char!(self, input) {
1669                ']' => go!(self: to CdataSectionEnd),
1670                _ => go!(self: push_temp ']'; reconsume CdataSection),
1671            },
1672
1673            //§ cdata-section-end
1674            states::CdataSectionEnd => loop {
1675                match get_char!(self, input) {
1676                    ']' => go!(self: push_temp ']'),
1677                    '>' => {
1678                        self.emit_temp_buf();
1679                        go!(self: to Data);
1680                    },
1681                    _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1682                }
1683            },
1684            //§ END
1685        }
1686    }
1687
1688    fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1689        let mut char_ref_tokenizer = self.char_ref_tokenizer.borrow_mut();
1690        let progress = match char_ref_tokenizer.as_mut().unwrap().step(self, input) {
1691            char_ref::Status::Done(char_ref) => {
1692                self.process_char_ref(char_ref);
1693                *char_ref_tokenizer = None;
1694                return ProcessResult::Continue;
1695            },
1696
1697            char_ref::Status::Stuck => ProcessResult::Suspend,
1698            char_ref::Status::Progress => ProcessResult::Continue,
1699        };
1700
1701        progress
1702    }
1703
1704    fn process_char_ref(&self, char_ref: CharRef) {
1705        let CharRef {
1706            mut chars,
1707            mut num_chars,
1708        } = char_ref;
1709
1710        if num_chars == 0 {
1711            chars[0] = '&';
1712            num_chars = 1;
1713        }
1714
1715        for i in 0..num_chars {
1716            let c = chars[i as usize];
1717            match self.state.get() {
1718                states::Data | states::RawData(states::Rcdata) => self.emit_char(c),
1719
1720                states::AttributeValue(_) => go!(self: push_value c),
1721
1722                _ => panic!(
1723                    "state {:?} should not be reachable in process_char_ref",
1724                    self.state.get()
1725                ),
1726            }
1727        }
1728    }
1729
1730    /// Indicate that we have reached the end of the input.
1731    pub fn end(&self) {
1732        // Handle EOF in the char ref sub-tokenizer, if there is one.
1733        // Do this first because it might un-consume stuff.
1734        let input = BufferQueue::default();
1735        match self.char_ref_tokenizer.take() {
1736            None => (),
1737            Some(mut tokenizer) => {
1738                self.process_char_ref(tokenizer.end_of_file(self, &input));
1739            },
1740        }
1741
1742        // Process all remaining buffered input.
1743        // If we're waiting for lookahead, we're not gonna get it.
1744        self.at_eof.set(true);
1745        assert!(matches!(self.run(&input), TokenizerResult::Done));
1746        assert!(input.is_empty());
1747
1748        loop {
1749            match self.eof_step() {
1750                ProcessResult::Continue => (),
1751                ProcessResult::Suspend => break,
1752                ProcessResult::Script(_) | ProcessResult::EncodingIndicator(_) => unreachable!(),
1753            }
1754        }
1755
1756        self.sink.end();
1757
1758        if self.opts.profile {
1759            self.dump_profile();
1760        }
1761    }
1762
1763    fn dump_profile(&self) {
1764        let mut results: Vec<(states::State, u64)> = self
1765            .state_profile
1766            .borrow()
1767            .iter()
1768            .map(|(s, t)| (*s, *t))
1769            .collect();
1770        results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1771
1772        let total: u64 = results
1773            .iter()
1774            .map(|&(_, t)| t)
1775            .fold(0, ::std::ops::Add::add);
1776        println!("\nTokenizer profile, in nanoseconds");
1777        println!(
1778            "\n{:12}         total in token sink",
1779            self.time_in_sink.get()
1780        );
1781        println!("\n{total:12}         total in tokenizer");
1782
1783        for (k, v) in results.into_iter() {
1784            let pct = 100.0 * (v as f64) / (total as f64);
1785            println!("{v:12}  {pct:4.1}%  {k:?}");
1786        }
1787    }
1788
1789    fn eof_step(&self) -> ProcessResult<Sink::Handle> {
1790        debug!("processing EOF in state {:?}", self.state.get());
1791        match self.state.get() {
1792            states::Data
1793            | states::RawData(Rcdata)
1794            | states::RawData(Rawtext)
1795            | states::RawData(ScriptData)
1796            | states::Plaintext => go!(self: eof),
1797
1798            states::TagName
1799            | states::RawData(ScriptDataEscaped(_))
1800            | states::BeforeAttributeName
1801            | states::AttributeName
1802            | states::AfterAttributeName
1803            | states::AttributeValue(_)
1804            | states::AfterAttributeValueQuoted
1805            | states::SelfClosingStartTag
1806            | states::ScriptDataEscapedDash(_)
1807            | states::ScriptDataEscapedDashDash(_) => {
1808                self.bad_eof_error();
1809                go!(self: to Data)
1810            },
1811
1812            states::BeforeAttributeValue => go!(self: reconsume AttributeValue Unquoted),
1813
1814            states::TagOpen => {
1815                self.bad_eof_error();
1816                self.emit_char('<');
1817                go!(self: to Data);
1818            },
1819
1820            states::EndTagOpen => {
1821                self.bad_eof_error();
1822                self.emit_char('<');
1823                self.emit_char('/');
1824                go!(self: to Data);
1825            },
1826
1827            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
1828                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1829            },
1830
1831            states::RawLessThanSign(kind) => {
1832                self.emit_char('<');
1833                go!(self: to RawData kind);
1834            },
1835
1836            states::RawEndTagOpen(kind) => {
1837                self.emit_char('<');
1838                self.emit_char('/');
1839                go!(self: to RawData kind);
1840            },
1841
1842            states::RawEndTagName(kind) => {
1843                self.emit_char('<');
1844                self.emit_char('/');
1845                self.emit_temp_buf();
1846                go!(self: to RawData kind)
1847            },
1848
1849            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
1850
1851            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),
1852
1853            states::ScriptDataDoubleEscapeEnd => {
1854                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1855            },
1856
1857            states::CommentStart
1858            | states::CommentStartDash
1859            | states::Comment
1860            | states::CommentEndDash
1861            | states::CommentEnd
1862            | states::CommentEndBang => {
1863                self.bad_eof_error();
1864                go!(self: emit_comment; to Data)
1865            },
1866
1867            states::CommentLessThanSign | states::CommentLessThanSignBang => {
1868                go!(self: reconsume Comment)
1869            },
1870
1871            states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash),
1872
1873            states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd),
1874
1875            states::Doctype | states::BeforeDoctypeName => {
1876                self.bad_eof_error();
1877                go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1878            },
1879
1880            states::DoctypeName
1881            | states::AfterDoctypeName
1882            | states::AfterDoctypeKeyword(_)
1883            | states::BeforeDoctypeIdentifier(_)
1884            | states::DoctypeIdentifierDoubleQuoted(_)
1885            | states::DoctypeIdentifierSingleQuoted(_)
1886            | states::AfterDoctypeIdentifier(_)
1887            | states::BetweenDoctypePublicAndSystemIdentifiers => {
1888                self.bad_eof_error();
1889                go!(self: force_quirks; emit_doctype; to Data)
1890            },
1891
1892            states::BogusDoctype => go!(self: emit_doctype; to Data),
1893
1894            states::BogusComment => go!(self: emit_comment; to Data),
1895
1896            states::MarkupDeclarationOpen => {
1897                self.bad_char_error();
1898                go!(self: to BogusComment)
1899            },
1900
1901            states::CdataSection => {
1902                self.emit_temp_buf();
1903                self.bad_eof_error();
1904                go!(self: to Data)
1905            },
1906
1907            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),
1908
1909            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
1910        }
1911    }
1912
1913    /// Checks for supported SIMD feature, which is now either SSE2 for x86/x86_64 or NEON for aarch64.
1914    fn is_supported_simd_feature_detected() -> bool {
1915        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1916        {
1917            is_x86_feature_detected!("sse2")
1918        }
1919
1920        #[cfg(target_arch = "aarch64")]
1921        {
1922            std::arch::is_aarch64_feature_detected!("neon")
1923        }
1924
1925        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
1926        false
1927    }
1928
1929    #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
1930    /// Implements the [data state] with SIMD instructions.
1931    /// Calls SSE2- or NEON-specific function for chunks and processes any remaining bytes.
1932    ///
1933    /// The algorithm implemented is the naive SIMD approach described [here].
1934    ///
1935    /// ### SAFETY:
1936    /// Calling this function on a CPU that supports neither SSE2 nor NEON causes undefined behaviour.
1937    ///
1938    /// [data state]: https://html.spec.whatwg.org/#data-state
1939    /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1940    unsafe fn data_state_simd_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
1941        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1942        let (mut i, mut n_newlines) = self.data_state_sse2_fast_path(input);
1943
1944        #[cfg(target_arch = "aarch64")]
1945        let (mut i, mut n_newlines) = self.data_state_neon_fast_path(input);
1946
1947        // Process any remaining bytes (less than STRIDE)
1948        while let Some(c) = input.as_bytes().get(i) {
1949            if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
1950                break;
1951            }
1952            if *c == b'\n' {
1953                n_newlines += 1;
1954            }
1955
1956            i += 1;
1957        }
1958
1959        let set_result = if i == 0 {
1960            let first_char = input.pop_front_char().unwrap();
1961            debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));
1962
1963            // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1964            // Still, it would be nice to not have to do that.
1965            // The same is true for the unwrap call.
1966            let preprocessed_char = self
1967                .get_preprocessed_char(first_char, &BufferQueue::default())
1968                .unwrap();
1969            SetResult::FromSet(preprocessed_char)
1970        } else {
1971            debug_assert!(
1972                input.len() >= i,
1973                "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1974                i,
1975                input.len()
1976            );
1977            let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1978            input.unsafe_pop_front(i as u32);
1979            SetResult::NotFromSet(consumed_chunk)
1980        };
1981
1982        self.current_line.set(self.current_line.get() + n_newlines);
1983
1984        Some(set_result)
1985    }
1986
1987    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1988    #[target_feature(enable = "sse2")]
1989    /// Implements the [data state] with SSE2 instructions for x86/x86_64.
1990    /// Returns a pair of the number of bytes processed and the number of newlines found.
1991    ///
1992    /// ### SAFETY:
1993    /// Calling this function on a CPU that does not support NEON causes undefined behaviour.
1994    ///
1995    /// [data state]: https://html.spec.whatwg.org/#data-state
1996    unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
1997        #[cfg(target_arch = "x86")]
1998        use std::arch::x86::{
1999            __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
2000            _mm_set1_epi8,
2001        };
2002        #[cfg(target_arch = "x86_64")]
2003        use std::arch::x86_64::{
2004            __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
2005            _mm_set1_epi8,
2006        };
2007
2008        debug_assert!(!input.is_empty());
2009
2010        let quote_mask = _mm_set1_epi8('<' as i8);
2011        let escape_mask = _mm_set1_epi8('&' as i8);
2012        let carriage_return_mask = _mm_set1_epi8('\r' as i8);
2013        let zero_mask = _mm_set1_epi8('\0' as i8);
2014        let newline_mask = _mm_set1_epi8('\n' as i8);
2015
2016        let raw_bytes: &[u8] = input.as_bytes();
2017        let start = raw_bytes.as_ptr();
2018
2019        const STRIDE: usize = 16;
2020        let mut i = 0;
2021        let mut n_newlines = 0;
2022        while i + STRIDE <= raw_bytes.len() {
2023            // Load a 16 byte chunk from the input
2024            let data = _mm_loadu_si128(start.add(i) as *const __m128i);
2025
2026            // Compare the chunk against each mask
2027            let quotes = _mm_cmpeq_epi8(data, quote_mask);
2028            let escapes = _mm_cmpeq_epi8(data, escape_mask);
2029            let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
2030            let zeros = _mm_cmpeq_epi8(data, zero_mask);
2031            let newlines = _mm_cmpeq_epi8(data, newline_mask);
2032
2033            // Combine all test results and create a bitmask from them.
2034            // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
2035            let test_result = _mm_or_si128(
2036                _mm_or_si128(quotes, zeros),
2037                _mm_or_si128(escapes, carriage_returns),
2038            );
2039            let bitmask = _mm_movemask_epi8(test_result);
2040            let newline_mask = _mm_movemask_epi8(newlines);
2041
2042            if (bitmask != 0) {
2043                // We have reached one of the characters that cause the state machine to transition
2044                let position = if cfg!(target_endian = "little") {
2045                    bitmask.trailing_zeros() as usize
2046                } else {
2047                    bitmask.leading_zeros() as usize
2048                };
2049
2050                n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
2051                i += position;
2052                break;
2053            } else {
2054                n_newlines += newline_mask.count_ones() as u64;
2055            }
2056
2057            i += STRIDE;
2058        }
2059
2060        (i, n_newlines)
2061    }
2062
2063    #[cfg(target_arch = "aarch64")]
2064    #[target_feature(enable = "neon")]
2065    /// Implements the [data state] with NEON SIMD instructions for AArch64.
2066    /// Returns a pair of the number of bytes processed and the number of newlines found.
2067    ///
2068    /// ### SAFETY:
2069    /// Calling this function on a CPU that does not support NEON causes undefined behaviour.
2070    ///
2071    /// [data state]: https://html.spec.whatwg.org/#data-state
2072    unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
2073        use std::arch::aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8};
2074
2075        debug_assert!(!input.is_empty());
2076
2077        let quote_mask = vdupq_n_u8(b'<');
2078        let escape_mask = vdupq_n_u8(b'&');
2079        let carriage_return_mask = vdupq_n_u8(b'\r');
2080        let zero_mask = vdupq_n_u8(b'\0');
2081        let newline_mask = vdupq_n_u8(b'\n');
2082
2083        let raw_bytes: &[u8] = input.as_bytes();
2084        let start = raw_bytes.as_ptr();
2085
2086        const STRIDE: usize = 16;
2087        let mut i = 0;
2088        let mut n_newlines = 0;
2089        while i + STRIDE <= raw_bytes.len() {
2090            // Load a 16 byte chunk from the input
2091            let data = vld1q_u8(start.add(i));
2092
2093            // Compare the chunk against each mask
2094            let quotes = vceqq_u8(data, quote_mask);
2095            let escapes = vceqq_u8(data, escape_mask);
2096            let carriage_returns = vceqq_u8(data, carriage_return_mask);
2097            let zeros = vceqq_u8(data, zero_mask);
2098            let newlines = vceqq_u8(data, newline_mask);
2099
2100            // Combine all test results and create a bitmask from them.
2101            // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
2102            let test_result =
2103                vorrq_u8(vorrq_u8(quotes, zeros), vorrq_u8(escapes, carriage_returns));
2104            let bitmask = vmaxvq_u8(test_result);
2105            let newline_mask = vmaxvq_u8(newlines);
2106            if bitmask != 0 {
2107                // We have reached one of the characters that cause the state machine to transition
2108                let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2109                let position = chunk_bytes
2110                    .iter()
2111                    .position(|&b| matches!(b, b'<' | b'&' | b'\r' | b'\0'))
2112                    .unwrap();
2113
2114                n_newlines += chunk_bytes[..position]
2115                    .iter()
2116                    .filter(|&&b| b == b'\n')
2117                    .count() as u64;
2118
2119                i += position;
2120                break;
2121            } else if newline_mask != 0 {
2122                let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2123                n_newlines += chunk_bytes.iter().filter(|&&b| b == b'\n').count() as u64;
2124            }
2125
2126            i += STRIDE;
2127        }
2128
2129        (i, n_newlines)
2130    }
2131}
2132
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use crate::tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::cell::RefCell;

    use crate::LocalName;

    /// Token sink used to check that `current_line` is kept up to date:
    /// every non-character token is stored in `lines` together with the line
    /// number that was passed to `process_token` for it.
    struct LinesMatch {
        tokens: RefCell<Vec<Token>>,
        current_str: RefCell<StrTendril>,
        lines: RefCell<Vec<(Token, u64)>>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: RefCell::new(Vec::new()),
                current_str: RefCell::new(StrTendril::new()),
                lines: RefCell::new(Vec::new()),
            }
        }

        /// Flushes pending character data, then records `token` with its line.
        fn push(&self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.borrow_mut().push((token, line_number));
        }

        /// Moves accumulated character data, if any, into `tokens` as a
        /// single `CharacterTokens` entry.
        fn finish_str(&self) {
            if self.current_str.borrow().is_empty() {
                return;
            }
            let pending = self.current_str.take();
            self.tokens.borrow_mut().push(CharacterTokens(pending));
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
            match token {
                ParseError(_) => panic!("unexpected parse error"),

                EOFToken => {},

                CharacterTokens(text) => self.current_str.borrow_mut().push_slice(&text),

                NullCharacterToken => self.current_str.borrow_mut().push_char('\0'),

                TagToken(mut tag) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    if matches!(tag.kind, EndTag) {
                        tag.self_closing = false;
                        tag.attrs = vec![];
                    } else {
                        tag.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name));
                    }
                    self.push(TagToken(tag), line_number);
                },

                other => self.push(other, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    /// Feeds the chunks to a fresh tokenizer one at a time and returns the
    /// recorded `(token, line)` pairs.
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let tok = Tokenizer::new(LinesMatch::new(), opts);
        let buffer = BufferQueue::default();
        for chunk in input {
            buffer.push_back(chunk);
            let _ = tok.feed(&buffer);
        }
        tok.end();
        tok.sink.lines.take()
    }

    /// Builds an attribute-less, non-self-closing tag token.
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        TagToken(Tag {
            kind: tagkind,
            name: LocalName::from(&*token),
            self_closing: false,
            attrs: vec![],
        })
    }

    /// Options shared by the line-number tests.
    fn line_test_opts() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }

    /// Tokenizes `chunks` and asserts that `<a>`, `<b>`, `</b>`, `</a>` are
    /// reported on lines 1 through 4 respectively.
    fn assert_nested_tags_on_consecutive_lines(chunks: &[&str]) {
        let vector: Vec<StrTendril> = chunks.iter().map(|&s| StrTendril::from(s)).collect();
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(vector, line_test_opts());
        assert_eq!(results, expected);
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut target: Option<StrTendril> = None;
        option_push(&mut target, 'x');
        assert_eq!(target, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut target: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut target, 'x');
        assert_eq!(target, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut target: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut target, 'x');
        assert_eq!(target, Some("yx".to_tendril()));
    }

    #[test]
    fn check_lines() {
        assert_nested_tags_on_consecutive_lines(&["<a>\n", "<b>\n", "</b>\n", "</a>\n"]);
    }

    #[test]
    fn check_lines_with_new_line() {
        assert_nested_tags_on_consecutive_lines(&["<a>\r\n", "<b>\r\n", "</b>\r\n", "</a>\r\n"]);
    }
}