html5ever/tokenizer/
mod.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! The HTML5 tokenizer.
11
12pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15pub use self::interface::{TokenSink, TokenSinkResult};
16
17use self::states::{DoctypeIdKind, Public, System};
18use self::states::{DoubleEscaped, Escaped};
19use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22use self::char_ref::{CharRef, CharRefTokenizer};
23
24use crate::util::str::lower_ascii_letter;
25
26use log::{debug, trace};
27use markup5ever::{ns, small_char_set, TokenizerResult};
28use std::borrow::Cow::{self, Borrowed};
29use std::cell::{Cell, RefCell, RefMut};
30use std::collections::BTreeMap;
31use std::mem;
32
33pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
34use crate::macros::{time, unwrap_or_return};
35use crate::tendril::StrTendril;
36use crate::{Attribute, LocalName, QualName, SmallCharSet};
37
38mod char_ref;
39mod interface;
40pub mod states;
41
/// Outcome of a single tokenizer step, telling the driver loop in
/// `Tokenizer::run` what to do next.
pub enum ProcessResult<Handle> {
    /// Keep stepping the state machine.
    Continue,
    /// Stop stepping for now; `run` leaves its loop and reports
    /// `TokenizerResult::Done`.
    Suspend,
    /// Stop stepping and hand the carried handle back to the caller as
    /// `TokenizerResult::Script`.
    Script(Handle),
}
47
48fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
49    match *opt_str {
50        Some(ref mut s) => s.push_char(c),
51        None => *opt_str = Some(StrTendril::from_char(c)),
52    }
53}
54
55/// Tokenizer options, with an impl for `Default`.
56#[derive(Clone)]
57pub struct TokenizerOpts {
58    /// Report all parse errors described in the spec, at some
59    /// performance penalty?  Default: false
60    pub exact_errors: bool,
61
62    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
63    /// of the stream?  Default: true
64    pub discard_bom: bool,
65
66    /// Keep a record of how long we spent in each state?  Printed
67    /// when `end()` is called.  Default: false
68    pub profile: bool,
69
70    /// Initial state override.  Only the test runner should use
71    /// a non-`None` value!
72    pub initial_state: Option<states::State>,
73
74    /// Last start tag.  Only the test runner should use a
75    /// non-`None` value!
76    ///
77    /// FIXME: Can't use Tendril because we want TokenizerOpts
78    /// to be Send.
79    pub last_start_tag_name: Option<String>,
80}
81
82impl Default for TokenizerOpts {
83    fn default() -> TokenizerOpts {
84        TokenizerOpts {
85            exact_errors: false,
86            discard_bom: true,
87            profile: false,
88            initial_state: None,
89            last_start_tag_name: None,
90        }
91    }
92}
93
/// The HTML tokenizer.
///
/// All mutable state lives in `Cell`/`RefCell` fields, so the tokenizer is
/// driven through shared (`&self`) references.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: Cell<states::State>,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: Cell<bool>,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: RefCell<Option<CharRefTokenizer>>,

    /// Current input character.  Just consumed, may reconsume.
    current_char: Cell<char>,

    /// Should we reconsume the current input character?
    reconsume: Cell<bool>,

    /// Did we just consume \r, translating it to \n?  In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: Cell<bool>,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one?  Only done at the
    /// beginning of the stream.
    discard_bom: Cell<bool>,

    /// Current tag kind.
    current_tag_kind: Cell<TagKind>,

    /// Current tag name.
    current_tag_name: RefCell<StrTendril>,

    /// Current tag is self-closing?
    current_tag_self_closing: Cell<bool>,

    /// Current tag attributes.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Current attribute name.  Empty when no attribute is in progress.
    current_attr_name: RefCell<StrTendril>,

    /// Current attribute value.
    current_attr_value: RefCell<StrTendril>,

    /// Current comment.
    current_comment: RefCell<StrTendril>,

    /// Current doctype token.
    current_doctype: RefCell<Doctype>,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: RefCell<Option<LocalName>>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: RefCell<StrTendril>,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: RefCell<BTreeMap<states::State, u64>>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: Cell<u64>,

    /// Current line number: starts at 1, is incremented for every
    /// (normalized) `\n` consumed, and is passed to the sink with each token.
    current_line: Cell<u64>,
}
166
167impl<Sink: TokenSink> Tokenizer<Sink> {
168    /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
169    pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
170        let start_tag_name = opts
171            .last_start_tag_name
172            .take()
173            .map(|s| LocalName::from(&*s));
174        let state = opts.initial_state.unwrap_or(states::Data);
175        let discard_bom = opts.discard_bom;
176        Tokenizer {
177            opts,
178            sink,
179            state: Cell::new(state),
180            char_ref_tokenizer: RefCell::new(None),
181            at_eof: Cell::new(false),
182            current_char: Cell::new('\0'),
183            reconsume: Cell::new(false),
184            ignore_lf: Cell::new(false),
185            discard_bom: Cell::new(discard_bom),
186            current_tag_kind: Cell::new(StartTag),
187            current_tag_name: RefCell::new(StrTendril::new()),
188            current_tag_self_closing: Cell::new(false),
189            current_tag_attrs: RefCell::new(vec![]),
190            current_attr_name: RefCell::new(StrTendril::new()),
191            current_attr_value: RefCell::new(StrTendril::new()),
192            current_comment: RefCell::new(StrTendril::new()),
193            current_doctype: RefCell::new(Doctype::default()),
194            last_start_tag_name: RefCell::new(start_tag_name),
195            temp_buf: RefCell::new(StrTendril::new()),
196            state_profile: RefCell::new(BTreeMap::new()),
197            time_in_sink: Cell::new(0),
198            current_line: Cell::new(1),
199        }
200    }
201
202    /// Feed an input string into the tokenizer.
203    pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
204        if input.is_empty() {
205            return TokenizerResult::Done;
206        }
207
208        if self.discard_bom.get() {
209            if let Some(c) = input.peek() {
210                if c == '\u{feff}' {
211                    input.next();
212                }
213            } else {
214                return TokenizerResult::Done;
215            }
216        };
217
218        self.run(input)
219    }
220
221    pub fn set_plaintext_state(&self) {
222        self.state.set(states::Plaintext);
223    }
224
225    fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
226        if self.opts.profile {
227            let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
228            self.time_in_sink.set(self.time_in_sink.get() + dt);
229            ret
230        } else {
231            self.sink.process_token(token, self.current_line.get())
232        }
233    }
234
235    fn process_token_and_continue(&self, token: Token) {
236        assert!(matches!(
237            self.process_token(token),
238            TokenSinkResult::Continue
239        ));
240    }
241
242    //§ preprocessing-the-input-stream
243    // Get the next input character, which might be the character
244    // 'c' that we already consumed from the buffers.
245    fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
246        if self.ignore_lf.get() {
247            self.ignore_lf.set(false);
248            if c == '\n' {
249                c = input.next()?;
250            }
251        }
252
253        if c == '\r' {
254            self.ignore_lf.set(true);
255            c = '\n';
256        }
257
258        if c == '\n' {
259            self.current_line.set(self.current_line.get() + 1);
260        }
261
262        if self.opts.exact_errors
263            && match c as u32 {
264                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
265                n if (n & 0xFFFE) == 0xFFFE => true,
266                _ => false,
267            }
268        {
269            let msg = format!("Bad character {c}");
270            self.emit_error(Cow::Owned(msg));
271        }
272
273        trace!("got character {c}");
274        self.current_char.set(c);
275        Some(c)
276    }
277
278    //§ tokenization
279    // Get the next input character, if one is available.
280    fn get_char(&self, input: &BufferQueue) -> Option<char> {
281        if self.reconsume.get() {
282            self.reconsume.set(false);
283            Some(self.current_char.get())
284        } else {
285            input
286                .next()
287                .and_then(|c| self.get_preprocessed_char(c, input))
288        }
289    }
290
291    fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
292        // Bail to the slow path for various corner cases.
293        // This means that `FromSet` can contain characters not in the set!
294        // It shouldn't matter because the fallback `FromSet` case should
295        // always do the same thing as the `NotFromSet` case.
296        if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
297            return self.get_char(input).map(FromSet);
298        }
299
300        let d = input.pop_except_from(set);
301        trace!("got characters {d:?}");
302        match d {
303            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
304
305            // NB: We don't set self.current_char for a run of characters not
306            // in the set.  It shouldn't matter for the codepaths that use
307            // this.
308            _ => d,
309        }
310    }
311
312    // Check if the next characters are an ASCII case-insensitive match.  See
313    // BufferQueue::eat.
314    //
315    // NB: this doesn't set the current input character.
316    fn eat(&self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> {
317        if self.ignore_lf.get() {
318            self.ignore_lf.set(false);
319            if self.peek(input) == Some('\n') {
320                self.discard_char(input);
321            }
322        }
323
324        input.push_front(mem::take(&mut self.temp_buf.borrow_mut()));
325        match input.eat(pat, eq) {
326            None if self.at_eof.get() => Some(false),
327            None => {
328                while let Some(data) = input.next() {
329                    self.temp_buf.borrow_mut().push_char(data);
330                }
331                None
332            },
333            Some(matched) => Some(matched),
334        }
335    }
336
337    /// Run the state machine for as long as we can.
338    fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
339        if self.opts.profile {
340            loop {
341                let state = self.state.get();
342                let old_sink = self.time_in_sink.get();
343                let (run, mut dt) = time!(self.step(input));
344                dt -= (self.time_in_sink.get() - old_sink);
345                let new = match self.state_profile.borrow_mut().get_mut(&state) {
346                    Some(x) => {
347                        *x += dt;
348                        false
349                    },
350                    None => true,
351                };
352                if new {
353                    // do this here because of borrow shenanigans
354                    self.state_profile.borrow_mut().insert(state, dt);
355                }
356                match run {
357                    ProcessResult::Continue => (),
358                    ProcessResult::Suspend => break,
359                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
360                }
361            }
362        } else {
363            loop {
364                match self.step(input) {
365                    ProcessResult::Continue => (),
366                    ProcessResult::Suspend => break,
367                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
368                }
369            }
370        }
371        TokenizerResult::Done
372    }
373
374    #[inline]
375    fn bad_char_error(&self) {
376        #[cfg(feature = "trace_tokenizer")]
377        trace!("  error");
378
379        let msg = if self.opts.exact_errors {
380            Cow::from("Bad character")
381        } else {
382            let c = self.current_char.get();
383            let state = self.state.get();
384            Cow::from(format!("Saw {c} in state {state:?}"))
385        };
386        self.emit_error(msg);
387    }
388
389    #[inline]
390    fn bad_eof_error(&self) {
391        #[cfg(feature = "trace_tokenizer")]
392        trace!("  error_eof");
393
394        let msg = if self.opts.exact_errors {
395            Cow::from("Unexpected EOF")
396        } else {
397            let state = self.state.get();
398            Cow::from(format!("Saw EOF in state {state:?}"))
399        };
400        self.emit_error(msg);
401    }
402
403    fn emit_char(&self, c: char) {
404        #[cfg(feature = "trace_tokenizer")]
405        trace!("  emit");
406
407        self.process_token_and_continue(match c {
408            '\0' => NullCharacterToken,
409            _ => CharacterTokens(StrTendril::from_char(c)),
410        });
411    }
412
413    // The string must not contain '\0'!
414    fn emit_chars(&self, b: StrTendril) {
415        self.process_token_and_continue(CharacterTokens(b));
416    }
417
418    fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
419        self.finish_attribute();
420
421        let name = LocalName::from(&**self.current_tag_name.borrow());
422        self.current_tag_name.borrow_mut().clear();
423
424        match self.current_tag_kind.get() {
425            StartTag => {
426                *self.last_start_tag_name.borrow_mut() = Some(name.clone());
427            },
428            EndTag => {
429                if !self.current_tag_attrs.borrow().is_empty() {
430                    self.emit_error(Borrowed("Attributes on an end tag"));
431                }
432                if self.current_tag_self_closing.get() {
433                    self.emit_error(Borrowed("Self-closing end tag"));
434                }
435            },
436        }
437
438        let token = TagToken(Tag {
439            kind: self.current_tag_kind.get(),
440            name,
441            self_closing: self.current_tag_self_closing.get(),
442            attrs: std::mem::take(&mut self.current_tag_attrs.borrow_mut()),
443        });
444
445        match self.process_token(token) {
446            TokenSinkResult::Continue => ProcessResult::Continue,
447            TokenSinkResult::Plaintext => {
448                self.state.set(states::Plaintext);
449                ProcessResult::Continue
450            },
451            TokenSinkResult::Script(node) => {
452                self.state.set(states::Data);
453                ProcessResult::Script(node)
454            },
455            TokenSinkResult::RawData(kind) => {
456                self.state.set(states::RawData(kind));
457                ProcessResult::Continue
458            },
459        }
460    }
461
462    fn emit_temp_buf(&self) {
463        #[cfg(feature = "trace_tokenizer")]
464        trace!("  emit_temp");
465
466        // FIXME: Make sure that clearing on emit is spec-compatible.
467        let buf = mem::take(&mut *self.temp_buf.borrow_mut());
468        self.emit_chars(buf);
469    }
470
471    fn clear_temp_buf(&self) {
472        // Do this without a new allocation.
473        self.temp_buf.borrow_mut().clear();
474    }
475
476    fn emit_current_comment(&self) {
477        let comment = mem::take(&mut *self.current_comment.borrow_mut());
478        self.process_token_and_continue(CommentToken(comment));
479    }
480
481    fn discard_tag(&self) {
482        self.current_tag_name.borrow_mut().clear();
483        self.current_tag_self_closing.set(false);
484        *self.current_tag_attrs.borrow_mut() = vec![];
485    }
486
487    fn create_tag(&self, kind: TagKind, c: char) {
488        self.discard_tag();
489        self.current_tag_name.borrow_mut().push_char(c);
490        self.current_tag_kind.set(kind);
491    }
492
493    fn have_appropriate_end_tag(&self) -> bool {
494        match self.last_start_tag_name.borrow().as_ref() {
495            Some(last) => {
496                (self.current_tag_kind.get() == EndTag)
497                    && (**self.current_tag_name.borrow() == **last)
498            },
499            None => false,
500        }
501    }
502
503    fn create_attribute(&self, c: char) {
504        self.finish_attribute();
505
506        self.current_attr_name.borrow_mut().push_char(c);
507    }
508
509    fn finish_attribute(&self) {
510        if self.current_attr_name.borrow().is_empty() {
511            return;
512        }
513
514        // Check for a duplicate attribute.
515        // FIXME: the spec says we should error as soon as the name is finished.
516        let dup = {
517            let name = &*self.current_attr_name.borrow();
518            self.current_tag_attrs
519                .borrow()
520                .iter()
521                .any(|a| *a.name.local == **name)
522        };
523
524        if dup {
525            self.emit_error(Borrowed("Duplicate attribute"));
526            self.current_attr_name.borrow_mut().clear();
527            self.current_attr_value.borrow_mut().clear();
528        } else {
529            let name = LocalName::from(&**self.current_attr_name.borrow());
530            self.current_attr_name.borrow_mut().clear();
531            self.current_tag_attrs.borrow_mut().push(Attribute {
532                // The tree builder will adjust the namespace if necessary.
533                // This only happens in foreign elements.
534                name: QualName::new(None, ns!(), name),
535                value: mem::take(&mut self.current_attr_value.borrow_mut()),
536            });
537        }
538    }
539
540    fn emit_current_doctype(&self) {
541        let doctype = self.current_doctype.take();
542        self.process_token_and_continue(DoctypeToken(doctype));
543    }
544
545    fn doctype_id(&self, kind: DoctypeIdKind) -> RefMut<'_, Option<StrTendril>> {
546        let current_doctype = self.current_doctype.borrow_mut();
547        match kind {
548            Public => RefMut::map(current_doctype, |d| &mut d.public_id),
549            System => RefMut::map(current_doctype, |d| &mut d.system_id),
550        }
551    }
552
553    fn clear_doctype_id(&self, kind: DoctypeIdKind) {
554        let mut id = self.doctype_id(kind);
555        match *id {
556            Some(ref mut s) => s.clear(),
557            None => *id = Some(StrTendril::new()),
558        }
559    }
560
561    fn start_consuming_character_reference(&self) {
562        debug_assert!(
563            self.char_ref_tokenizer.borrow().is_none(),
564            "Nested character references are impossible"
565        );
566
567        let is_in_attribute = matches!(self.state.get(), states::AttributeValue(_));
568        *self.char_ref_tokenizer.borrow_mut() = Some(CharRefTokenizer::new(is_in_attribute));
569    }
570
571    fn emit_eof(&self) {
572        self.process_token_and_continue(EOFToken);
573    }
574
575    fn peek(&self, input: &BufferQueue) -> Option<char> {
576        if self.reconsume.get() {
577            Some(self.current_char.get())
578        } else {
579            input.peek()
580        }
581    }
582
583    fn discard_char(&self, input: &BufferQueue) {
584        // peek() deals in un-processed characters (no newline normalization), while get_char()
585        // does.
586        //
587        // since discard_char is supposed to be used in combination with peek(), discard_char must
588        // discard a single raw input character, not a normalized newline.
589        if self.reconsume.get() {
590            self.reconsume.set(false);
591        } else {
592            input.next();
593        }
594    }
595
596    fn emit_error(&self, error: Cow<'static, str>) {
597        self.process_token_and_continue(ParseError(error));
598    }
599}
600//§ END
601
// Shorthand for common state machine behaviors.
//
// Invoked as `shorthand!(self: command args...)`.  Each rule maps one
// command name to a single expression operating on the tokenizer `$me`:
// either a call to one of its helper methods or a direct mutation of one of
// its `RefCell` fields.
macro_rules! shorthand (
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c)                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.borrow_mut().push_char($c)     );
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag()                                   );
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input)                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.borrow_mut().push_char($c)             );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf()                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c)                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.borrow_mut().push_char($c)    );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.borrow_mut().push_char($c)   );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.borrow_mut().push_char($c)      );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.borrow_mut().push_slice($c)     );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment()                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.borrow_mut().clear()            );
    ( $me:ident : create_doctype                   ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c)            );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k)                            );
    ( $me:ident : force_quirks                     ) => ( $me.current_doctype.borrow_mut().force_quirks = true);
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype()                          );
);
625
// Tracing of tokenizer actions.  This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// With the `trace_tokenizer` feature enabled, `sh_trace!` logs the stringified
// command at trace level and then runs it through `shorthand!`.
#[cfg(feature = "trace_tokenizer")]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    trace!("  {:?}", stringify!($($cmds)*));
    shorthand!($me : $($cmds)*);
}));

// Without the feature, `sh_trace!` forwards straight to `shorthand!`.
#[cfg(not(feature = "trace_tokenizer"))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
636
// A little DSL for sequencing shorthand actions.
//
// `go!(self: cmd1; cmd2; ...; last)` runs each `;`-separated command through
// `sh_trace!` in order.  The final command may be one of the terminal forms
// below (`to`, `reconsume`, `consume_char_ref`, `emit_tag`, `eof`), all of
// which `return` from the enclosing function.  Rules are tried top to bottom,
// so their order matters.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.
    // Hence one rule per command length, from one up to four token trees.

    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // `to`: switch to the named state (optionally with one or two nested
    // state arguments) and continue stepping.
    ( $me:ident : to $s:ident                    ) => ({ $me.state.set(states::$s); return ProcessResult::Continue;           });
    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue;      });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });

    // `reconsume`: like `to`, but first flags the current input character to
    // be delivered again by the next `get_char`.
    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume.set(true); go!($me: to $s);         });
    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume.set(true); go!($me: to $s $k1);     });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });

    // `consume_char_ref`: install the character reference tokenizer and
    // continue stepping.
    ( $me:ident : consume_char_ref             ) => ({ $me.start_consuming_character_reference(); return ProcessResult::Continue;         });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_current_tag();
    });

    // `eof`: emit the EOF token and suspend.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // or nothing.
    ( $me:ident : ) => (());
);
673
// This is a macro because it can cause early return
// from the function where it is used.
//
// Evaluates to the next preprocessed input character from
// `Tokenizer::get_char`.  When none is available, `unwrap_or_return!` is
// given `ProcessResult::Suspend` as the value to return from the enclosing
// function (presumably on `None`; the macro is defined in `crate::macros`).
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));
679
// Evaluates to the character `Tokenizer::peek` reports without consuming it;
// like `get_char!`, it hands `ProcessResult::Suspend` to `unwrap_or_return!`
// as the early-return value for the enclosing function.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));
683
// Evaluates to the `SetResult` of `Tokenizer::pop_except_from` for the given
// character set; hands `ProcessResult::Suspend` to `unwrap_or_return!` as the
// early-return value for the enclosing function.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));
687
// ASCII case-insensitive lookahead: evaluates to the `bool` produced by
// `Tokenizer::eat` for `$pat`; hands `ProcessResult::Suspend` to
// `unwrap_or_return!` as the early-return value while no decision is
// possible yet.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));
691
// Same as `eat!`, but bytes are compared exactly (`u8::eq`) instead of
// ASCII case-insensitively.
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
695
696impl<Sink: TokenSink> Tokenizer<Sink> {
697    // Run the state machine for a while.
698    // Return true if we should be immediately re-invoked
699    // (this just simplifies control flow vs. break / continue).
700    #[allow(clippy::never_loop)]
701    fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
702        if self.char_ref_tokenizer.borrow().is_some() {
703            return self.step_char_ref_tokenizer(input);
704        }
705
706        trace!("processing in state {:?}", self.state);
707        match self.state.get() {
708            //§ data-state
709            states::Data => loop {
710                let set = small_char_set!('\r' '\0' '&' '<' '\n');
711
712                #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
713                let set_result = if !(self.opts.exact_errors
714                    || self.reconsume.get()
715                    || self.ignore_lf.get())
716                    && Self::is_supported_simd_feature_detected()
717                {
718                    let front_buffer = input.peek_front_chunk_mut();
719                    let Some(mut front_buffer) = front_buffer else {
720                        return ProcessResult::Suspend;
721                    };
722
723                    // Special case: The fast path is not worth taking if the first character is already in the set,
724                    // which is fairly common
725                    let first_char = front_buffer
726                        .chars()
727                        .next()
728                        .expect("Input buffers are never empty");
729
730                    if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
731                        drop(front_buffer);
732                        self.pop_except_from(input, set)
733                    } else {
734                        // SAFETY:
735                        // This CPU is guaranteed to support SIMD due to the is_supported_simd_feature_detected check above
736                        let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) };
737
738                        if front_buffer.is_empty() {
739                            drop(front_buffer);
740                            input.pop_front();
741                        }
742
743                        result
744                    }
745                } else {
746                    self.pop_except_from(input, set)
747                };
748
749                #[cfg(not(any(
750                    target_arch = "x86",
751                    target_arch = "x86_64",
752                    target_arch = "aarch64"
753                )))]
754                let set_result = self.pop_except_from(input, set);
755
756                let Some(set_result) = set_result else {
757                    return ProcessResult::Suspend;
758                };
759                match set_result {
760                    FromSet('\0') => {
761                        self.bad_char_error();
762                        self.emit_char('\0');
763                    },
764                    FromSet('&') => go!(self: consume_char_ref),
765                    FromSet('<') => go!(self: to TagOpen),
766                    FromSet(c) => {
767                        self.emit_char(c);
768                    },
769                    NotFromSet(b) => self.emit_chars(b),
770                }
771            },
772
773            //§ rcdata-state
774            states::RawData(Rcdata) => loop {
775                match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
776                    FromSet('\0') => {
777                        self.bad_char_error();
778                        self.emit_char('\u{fffd}');
779                    },
780                    FromSet('&') => go!(self: consume_char_ref),
781                    FromSet('<') => go!(self: to RawLessThanSign Rcdata),
782                    FromSet(c) => self.emit_char(c),
783                    NotFromSet(b) => self.emit_chars(b),
784                }
785            },
786
787            //§ rawtext-state
788            states::RawData(Rawtext) => loop {
789                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
790                    FromSet('\0') => {
791                        self.bad_char_error();
792                        self.emit_char('\u{fffd}');
793                    },
794                    FromSet('<') => go!(self: to RawLessThanSign Rawtext),
795                    FromSet(c) => self.emit_char(c),
796                    NotFromSet(b) => self.emit_chars(b),
797                }
798            },
799
800            //§ script-data-state
801            states::RawData(ScriptData) => loop {
802                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
803                    FromSet('\0') => {
804                        self.bad_char_error();
805                        self.emit_char('\u{fffd}');
806                    },
807                    FromSet('<') => go!(self: to RawLessThanSign ScriptData),
808                    FromSet(c) => self.emit_char(c),
809                    NotFromSet(b) => self.emit_chars(b),
810                }
811            },
812
813            //§ script-data-escaped-state
814            states::RawData(ScriptDataEscaped(Escaped)) => loop {
815                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
816                    FromSet('\0') => {
817                        self.bad_char_error();
818                        self.emit_char('\u{fffd}');
819                    },
820                    FromSet('-') => {
821                        self.emit_char('-');
822                        go!(self: to ScriptDataEscapedDash Escaped);
823                    },
824                    FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
825                    FromSet(c) => self.emit_char(c),
826                    NotFromSet(b) => self.emit_chars(b),
827                }
828            },
829
830            //§ script-data-double-escaped-state
831            states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
832                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
833                    FromSet('\0') => {
834                        self.bad_char_error();
835                        self.emit_char('\u{fffd}');
836                    },
837                    FromSet('-') => {
838                        self.emit_char('-');
839                        go!(self: to ScriptDataEscapedDash DoubleEscaped);
840                    },
841                    FromSet('<') => {
842                        self.emit_char('<');
843                        go!(self: to RawLessThanSign ScriptDataEscaped DoubleEscaped)
844                    },
845                    FromSet(c) => self.emit_char(c),
846                    NotFromSet(b) => self.emit_chars(b),
847                }
848            },
849
850            //§ plaintext-state
851            states::Plaintext => loop {
852                match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
853                    FromSet('\0') => {
854                        self.bad_char_error();
855                        self.emit_char('\u{fffd}');
856                    },
857                    FromSet(c) => self.emit_char(c),
858                    NotFromSet(b) => self.emit_chars(b),
859                }
860            },
861
862            //§ tag-open-state
863            states::TagOpen => loop {
864                match get_char!(self, input) {
865                    '!' => go!(self: to MarkupDeclarationOpen),
866                    '/' => go!(self: to EndTagOpen),
867                    '?' => {
868                        self.bad_char_error();
869                        go!(self: clear_comment; reconsume BogusComment)
870                    },
871                    c => match lower_ascii_letter(c) {
872                        Some(cl) => go!(self: create_tag StartTag cl; to TagName),
873                        None => {
874                            self.bad_char_error();
875                            self.emit_char('<');
876                            go!(self: reconsume Data)
877                        },
878                    },
879                }
880            },
881
882            //§ end-tag-open-state
883            states::EndTagOpen => loop {
884                match get_char!(self, input) {
885                    '>' => {
886                        self.bad_char_error();
887                        go!(self: to Data)
888                    },
889                    c => match lower_ascii_letter(c) {
890                        Some(cl) => go!(self: create_tag EndTag cl; to TagName),
891                        None => {
892                            self.bad_char_error();
893                            go!(self: clear_comment; reconsume BogusComment)
894                        },
895                    },
896                }
897            },
898
899            //§ tag-name-state
900            states::TagName => loop {
901                match get_char!(self, input) {
902                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
903                    '/' => go!(self: to SelfClosingStartTag),
904                    '>' => go!(self: emit_tag Data),
905                    '\0' => {
906                        self.bad_char_error();
907                        go!(self: push_tag '\u{fffd}')
908                    },
909                    c => go!(self: push_tag (c.to_ascii_lowercase())),
910                }
911            },
912
913            //§ script-data-escaped-less-than-sign-state
914            states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
915                match get_char!(self, input) {
916                    '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
917                    c => match lower_ascii_letter(c) {
918                        Some(cl) => {
919                            go!(self: clear_temp; push_temp cl);
920                            self.emit_char('<');
921                            self.emit_char(c);
922                            go!(self: to ScriptDataEscapeStart DoubleEscaped);
923                        },
924                        None => {
925                            self.emit_char('<');
926                            go!(self: reconsume RawData ScriptDataEscaped Escaped);
927                        },
928                    },
929                }
930            },
931
932            //§ script-data-double-escaped-less-than-sign-state
933            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
934                match get_char!(self, input) {
935                    '/' => {
936                        go!(self: clear_temp);
937                        self.emit_char('/');
938                        go!(self: to ScriptDataDoubleEscapeEnd);
939                    },
940                    _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
941                }
942            },
943
944            //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
945            // otherwise: fallback for the remaining kinds; both ScriptDataEscaped variants are matched by the arms above
946            states::RawLessThanSign(kind) => loop {
947                match get_char!(self, input) {
948                    '/' => go!(self: clear_temp; to RawEndTagOpen kind),
949                    '!' if kind == ScriptData => {
950                        self.emit_char('<');
951                        self.emit_char('!');
952                        go!(self: to ScriptDataEscapeStart Escaped);
953                    },
954                    _ => {
955                        self.emit_char('<');
956                        go!(self: reconsume RawData kind);
957                    },
958                }
959            },
960
961            //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
962            states::RawEndTagOpen(kind) => loop {
963                let c = get_char!(self, input);
964                match lower_ascii_letter(c) {
965                    Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
966                    None => {
967                        self.emit_char('<');
968                        self.emit_char('/');
969                        go!(self: reconsume RawData kind);
970                    },
971                }
972            },
973
974            //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
975            states::RawEndTagName(kind) => loop {
976                let c = get_char!(self, input);
977                if self.have_appropriate_end_tag() {
978                    match c {
979                        '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to BeforeAttributeName),
980                        '/' => go!(self: clear_temp; to SelfClosingStartTag),
981                        '>' => go!(self: clear_temp; emit_tag Data),
982                        _ => (),
983                    }
984                }
985
986                match lower_ascii_letter(c) {
987                    Some(cl) => go!(self: push_tag cl; push_temp c),
988                    None => {
989                        go!(self: discard_tag);
990                        self.emit_char('<');
991                        self.emit_char('/');
992                        self.emit_temp_buf();
993                        go!(self: reconsume RawData kind);
994                    },
995                }
996            },
997
998            //§ script-data-double-escape-start-state
999            states::ScriptDataEscapeStart(DoubleEscaped) => loop {
1000                let c = get_char!(self, input);
1001                match c {
1002                    '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1003                        let esc = if &**self.temp_buf.borrow() == "script" {
1004                            DoubleEscaped
1005                        } else {
1006                            Escaped
1007                        };
1008                        self.emit_char(c);
1009                        go!(self: to RawData ScriptDataEscaped esc);
1010                    },
1011                    _ => match lower_ascii_letter(c) {
1012                        Some(cl) => {
1013                            go!(self: push_temp cl);
1014                            self.emit_char(c);
1015                        },
1016                        None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
1017                    },
1018                }
1019            },
1020
1021            //§ script-data-escape-start-state
1022            states::ScriptDataEscapeStart(Escaped) => loop {
1023                match get_char!(self, input) {
1024                    '-' => {
1025                        self.emit_char('-');
1026                        go!(self: to ScriptDataEscapeStartDash);
1027                    },
1028                    _ => go!(self: reconsume RawData ScriptData),
1029                }
1030            },
1031
1032            //§ script-data-escape-start-dash-state
1033            states::ScriptDataEscapeStartDash => loop {
1034                match get_char!(self, input) {
1035                    '-' => {
1036                        self.emit_char('-');
1037                        go!(self: to ScriptDataEscapedDashDash Escaped);
1038                    },
1039                    _ => go!(self: reconsume RawData ScriptData),
1040                }
1041            },
1042
1043            //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
1044            states::ScriptDataEscapedDash(kind) => loop {
1045                match get_char!(self, input) {
1046                    '-' => {
1047                        self.emit_char('-');
1048                        go!(self: to ScriptDataEscapedDashDash kind);
1049                    },
1050                    '<' => {
1051                        if kind == DoubleEscaped {
1052                            self.emit_char('<');
1053                        }
1054                        go!(self: to RawLessThanSign ScriptDataEscaped kind);
1055                    },
1056                    '\0' => {
1057                        self.bad_char_error();
1058                        self.emit_char('\u{fffd}');
1059                        go!(self: to RawData ScriptDataEscaped kind)
1060                    },
1061                    c => {
1062                        self.emit_char(c);
1063                        go!(self: to RawData ScriptDataEscaped kind);
1064                    },
1065                }
1066            },
1067
1068            //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
1069            states::ScriptDataEscapedDashDash(kind) => loop {
1070                match get_char!(self, input) {
1071                    '-' => {
1072                        self.emit_char('-');
1073                    },
1074                    '<' => {
1075                        if kind == DoubleEscaped {
1076                            self.emit_char('<');
1077                        }
1078                        go!(self: to RawLessThanSign ScriptDataEscaped kind);
1079                    },
1080                    '>' => {
1081                        self.emit_char('>');
1082                        go!(self: to RawData ScriptData);
1083                    },
1084                    '\0' => {
1085                        self.bad_char_error();
1086                        self.emit_char('\u{fffd}');
1087                        go!(self: to RawData ScriptDataEscaped kind)
1088                    },
1089                    c => {
1090                        self.emit_char(c);
1091                        go!(self: to RawData ScriptDataEscaped kind);
1092                    },
1093                }
1094            },
1095
1096            //§ script-data-double-escape-end-state
1097            states::ScriptDataDoubleEscapeEnd => loop {
1098                let c = get_char!(self, input);
1099                match c {
1100                    '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1101                        let esc = if &**self.temp_buf.borrow() == "script" {
1102                            Escaped
1103                        } else {
1104                            DoubleEscaped
1105                        };
1106                        self.emit_char(c);
1107                        go!(self: to RawData ScriptDataEscaped esc);
1108                    },
1109                    _ => match lower_ascii_letter(c) {
1110                        Some(cl) => {
1111                            go!(self: push_temp cl);
1112                            self.emit_char(c);
1113                        },
1114                        None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
1115                    },
1116                }
1117            },
1118
1119            //§ before-attribute-name-state
1120            states::BeforeAttributeName => loop {
1121                match get_char!(self, input) {
1122                    '\t' | '\n' | '\x0C' | ' ' => (),
1123                    '/' => go!(self: to SelfClosingStartTag),
1124                    '>' => go!(self: emit_tag Data),
1125                    '\0' => {
1126                        self.bad_char_error();
1127                        go!(self: create_attr '\u{fffd}'; to AttributeName)
1128                    },
1129                    c => match lower_ascii_letter(c) {
1130                        Some(cl) => go!(self: create_attr cl; to AttributeName),
1131                        None => {
1132                            if matches!(c, '"' | '\'' | '<' | '=') {
1133                                self.bad_char_error();
1134                            }
1135
1136                            go!(self: create_attr c; to AttributeName);
1137                        },
1138                    },
1139                }
1140            },
1141
1142            //§ attribute-name-state
1143            states::AttributeName => loop {
1144                match get_char!(self, input) {
1145                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
1146                    '/' => go!(self: to SelfClosingStartTag),
1147                    '=' => go!(self: to BeforeAttributeValue),
1148                    '>' => go!(self: emit_tag Data),
1149                    '\0' => {
1150                        self.bad_char_error();
1151                        go!(self: push_name '\u{fffd}')
1152                    },
1153                    c => match lower_ascii_letter(c) {
1154                        Some(cl) => go!(self: push_name cl),
1155                        None => {
1156                            if matches!(c, '"' | '\'' | '<') {
1157                                self.bad_char_error();
1158                            }
1159                            go!(self: push_name c);
1160                        },
1161                    },
1162                }
1163            },
1164
1165            //§ after-attribute-name-state
1166            states::AfterAttributeName => loop {
1167                match get_char!(self, input) {
1168                    '\t' | '\n' | '\x0C' | ' ' => (),
1169                    '/' => go!(self: to SelfClosingStartTag),
1170                    '=' => go!(self: to BeforeAttributeValue),
1171                    '>' => go!(self: emit_tag Data),
1172                    '\0' => {
1173                        self.bad_char_error();
1174                        go!(self: create_attr '\u{fffd}'; to AttributeName)
1175                    },
1176                    c => match lower_ascii_letter(c) {
1177                        Some(cl) => go!(self: create_attr cl; to AttributeName),
1178                        None => {
1179                            if matches!(c, '"' | '\'' | '<') {
1180                                self.bad_char_error();
1181                            }
1182
1183                            go!(self: create_attr c; to AttributeName);
1184                        },
1185                    },
1186                }
1187            },
1188
1189            //§ before-attribute-value-state
1190            // Use peek so we can handle the first attr character along with the rest,
1191            // hopefully in the same zero-copy buffer.
1192            states::BeforeAttributeValue => loop {
1193                match peek!(self, input) {
1194                    '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1195                    '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1196                    '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1197                    '>' => {
1198                        go!(self: discard_char input);
1199                        self.bad_char_error();
1200                        go!(self: emit_tag Data)
1201                    },
1202                    _ => go!(self: to AttributeValue Unquoted),
1203                }
1204            },
1205
1206            //§ attribute-value-(double-quoted)-state
1207            states::AttributeValue(DoubleQuoted) => loop {
1208                match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1209                    FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1210                    FromSet('&') => go!(self: consume_char_ref),
1211                    FromSet('\0') => {
1212                        self.bad_char_error();
1213                        go!(self: push_value '\u{fffd}')
1214                    },
1215                    FromSet(c) => go!(self: push_value c),
1216                    NotFromSet(ref b) => go!(self: append_value b),
1217                }
1218            },
1219
1220            //§ attribute-value-(single-quoted)-state
1221            states::AttributeValue(SingleQuoted) => loop {
1222                match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1223                    FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1224                    FromSet('&') => go!(self: consume_char_ref),
1225                    FromSet('\0') => {
1226                        self.bad_char_error();
1227                        go!(self: push_value '\u{fffd}')
1228                    },
1229                    FromSet(c) => go!(self: push_value c),
1230                    NotFromSet(ref b) => go!(self: append_value b),
1231                }
1232            },
1233
1234            //§ attribute-value-(unquoted)-state
1235            states::AttributeValue(Unquoted) => loop {
1236                match pop_except_from!(
1237                    self,
1238                    input,
1239                    small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1240                ) {
1241                    FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1242                        go!(self: to BeforeAttributeName)
1243                    },
1244                    FromSet('&') => go!(self: consume_char_ref),
1245                    FromSet('>') => go!(self: emit_tag Data),
1246                    FromSet('\0') => {
1247                        self.bad_char_error();
1248                        go!(self: push_value '\u{fffd}')
1249                    },
1250                    FromSet(c) => {
1251                        if matches!(c, '"' | '\'' | '<' | '=' | '`') {
1252                            self.bad_char_error();
1253                        }
1254                        go!(self: push_value c);
1255                    },
1256                    NotFromSet(ref b) => go!(self: append_value b),
1257                }
1258            },
1259
1260            //§ after-attribute-value-(quoted)-state
1261            states::AfterAttributeValueQuoted => loop {
1262                match get_char!(self, input) {
1263                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1264                    '/' => go!(self: to SelfClosingStartTag),
1265                    '>' => go!(self: emit_tag Data),
1266                    _ => {
1267                        self.bad_char_error();
1268                        go!(self: reconsume BeforeAttributeName)
1269                    },
1270                }
1271            },
1272
1273            //§ self-closing-start-tag-state
1274            states::SelfClosingStartTag => loop {
1275                match get_char!(self, input) {
1276                    '>' => {
1277                        self.current_tag_self_closing.set(true);
1278                        go!(self: emit_tag Data);
1279                    },
1280                    _ => {
1281                        self.bad_char_error();
1282                        go!(self: reconsume BeforeAttributeName)
1283                    },
1284                }
1285            },
1286
1287            //§ comment-start-state
1288            states::CommentStart => loop {
1289                match get_char!(self, input) {
1290                    '-' => go!(self: to CommentStartDash),
1291                    '\0' => {
1292                        self.bad_char_error();
1293                        go!(self: push_comment '\u{fffd}'; to Comment)
1294                    },
1295                    '>' => {
1296                        self.bad_char_error();
1297                        go!(self: emit_comment; to Data)
1298                    },
1299                    c => go!(self: push_comment c; to Comment),
1300                }
1301            },
1302
1303            //§ comment-start-dash-state
1304            states::CommentStartDash => loop {
1305                match get_char!(self, input) {
1306                    '-' => go!(self: to CommentEnd),
1307                    '\0' => {
1308                        self.bad_char_error();
1309                        go!(self: append_comment "-\u{fffd}"; to Comment)
1310                    },
1311                    '>' => {
1312                        self.bad_char_error();
1313                        go!(self: emit_comment; to Data)
1314                    },
1315                    c => go!(self: push_comment '-'; push_comment c; to Comment),
1316                }
1317            },
1318
1319            //§ comment-state
1320            states::Comment => loop {
1321                match get_char!(self, input) {
1322                    c @ '<' => go!(self: push_comment c; to CommentLessThanSign),
1323                    '-' => go!(self: to CommentEndDash),
1324                    '\0' => {
1325                        self.bad_char_error();
1326                        go!(self: push_comment '\u{fffd}')
1327                    },
1328                    c => go!(self: push_comment c),
1329                }
1330            },
1331
1332            //§ comment-less-than-sign-state
1333            states::CommentLessThanSign => loop {
1334                match get_char!(self, input) {
1335                    c @ '!' => go!(self: push_comment c; to CommentLessThanSignBang),
1336                    c @ '<' => go!(self: push_comment c),
1337                    _ => go!(self: reconsume Comment),
1338                }
1339            },
1340
1341            //§ comment-less-than-sign-bang-state
1342            states::CommentLessThanSignBang => loop {
1343                match get_char!(self, input) {
1344                    '-' => go!(self: to CommentLessThanSignBangDash),
1345                    _ => go!(self: reconsume Comment),
1346                }
1347            },
1348
1349            //§ comment-less-than-sign-bang-dash-state
1350            states::CommentLessThanSignBangDash => loop {
1351                match get_char!(self, input) {
1352                    '-' => go!(self: to CommentLessThanSignBangDashDash),
1353                    _ => go!(self: reconsume CommentEndDash),
1354                }
1355            },
1356
1357            //§ comment-less-than-sign-bang-dash-dash-state
1358            states::CommentLessThanSignBangDashDash => loop {
1359                match get_char!(self, input) {
1360                    '>' => go!(self: reconsume CommentEnd),
1361                    _ => {
1362                        self.bad_char_error();
1363                        go!(self: reconsume CommentEnd)
1364                    },
1365                }
1366            },
1367
1368            //§ comment-end-dash-state
1369            states::CommentEndDash => loop {
1370                match get_char!(self, input) {
1371                    '-' => go!(self: to CommentEnd),
1372                    '\0' => {
1373                        self.bad_char_error();
1374                        go!(self: append_comment "-\u{fffd}"; to Comment)
1375                    },
1376                    c => go!(self: push_comment '-'; push_comment c; to Comment),
1377                }
1378            },
1379
1380            //§ comment-end-state
1381            states::CommentEnd => loop {
1382                match get_char!(self, input) {
1383                    '>' => go!(self: emit_comment; to Data),
1384                    '!' => go!(self: to CommentEndBang),
1385                    '-' => go!(self: push_comment '-'),
1386                    _ => go!(self: append_comment "--"; reconsume Comment),
1387                }
1388            },
1389
1390            //§ comment-end-bang-state
1391            states::CommentEndBang => loop {
1392                match get_char!(self, input) {
1393                    '-' => go!(self: append_comment "--!"; to CommentEndDash),
1394                    '>' => {
1395                        self.bad_char_error();
1396                        go!(self: emit_comment; to Data)
1397                    },
1398                    '\0' => {
1399                        self.bad_char_error();
1400                        go!(self: append_comment "--!\u{fffd}"; to Comment)
1401                    },
1402                    c => go!(self: append_comment "--!"; push_comment c; to Comment),
1403                }
1404            },
1405
1406            //§ doctype-state
1407            states::Doctype => loop {
1408                match get_char!(self, input) {
1409                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1410                    '>' => go!(self: reconsume BeforeDoctypeName),
1411                    _ => {
1412                        self.bad_char_error();
1413                        go!(self: reconsume BeforeDoctypeName)
1414                    },
1415                }
1416            },
1417
1418            //§ before-doctype-name-state
1419            states::BeforeDoctypeName => loop {
1420                match get_char!(self, input) {
1421                    '\t' | '\n' | '\x0C' | ' ' => (),
1422                    '\0' => {
1423                        self.bad_char_error();
1424                        go!(self: create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1425                    },
1426                    '>' => {
1427                        self.bad_char_error();
1428                        go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1429                    },
1430                    c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1431                                  to DoctypeName),
1432                }
1433            },
1434
1435            //§ doctype-name-state
1436            states::DoctypeName => loop {
1437                match get_char!(self, input) {
1438                    '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1439                    '>' => go!(self: emit_doctype; to Data),
1440                    '\0' => {
1441                        self.bad_char_error();
1442                        go!(self: push_doctype_name '\u{fffd}')
1443                    },
1444                    c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1445                }
1446            },
1447
1448            //§ after-doctype-name-state
1449            states::AfterDoctypeName => loop {
1450                if eat!(self, input, "public") {
1451                    go!(self: to AfterDoctypeKeyword Public);
1452                } else if eat!(self, input, "system") {
1453                    go!(self: to AfterDoctypeKeyword System);
1454                } else {
1455                    match get_char!(self, input) {
1456                        '\t' | '\n' | '\x0C' | ' ' => (),
1457                        '>' => go!(self: emit_doctype; to Data),
1458                        _ => {
1459                            self.bad_char_error();
1460                            go!(self: force_quirks; reconsume BogusDoctype)
1461                        },
1462                    }
1463                }
1464            },
1465
1466            //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
1467            states::AfterDoctypeKeyword(kind) => loop {
1468                match get_char!(self, input) {
1469                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1470                    '"' => {
1471                        self.bad_char_error();
1472                        go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1473                    },
1474                    '\'' => {
1475                        self.bad_char_error();
1476                        go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1477                    },
1478                    '>' => {
1479                        self.bad_char_error();
1480                        go!(self: force_quirks; emit_doctype; to Data)
1481                    },
1482                    _ => {
1483                        self.bad_char_error();
1484                        go!(self: force_quirks; reconsume BogusDoctype)
1485                    },
1486                }
1487            },
1488
1489            //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
1490            states::BeforeDoctypeIdentifier(kind) => loop {
1491                match get_char!(self, input) {
1492                    '\t' | '\n' | '\x0C' | ' ' => (),
1493                    '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1494                    '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1495                    '>' => {
1496                        self.bad_char_error();
1497                        go!(self: force_quirks; emit_doctype; to Data)
1498                    },
1499                    _ => {
1500                        self.bad_char_error();
1501                        go!(self: force_quirks; reconsume BogusDoctype)
1502                    },
1503                }
1504            },
1505
1506            //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
1507            states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1508                match get_char!(self, input) {
1509                    '"' => go!(self: to AfterDoctypeIdentifier kind),
1510                    '\0' => {
1511                        self.bad_char_error();
1512                        go!(self: push_doctype_id kind '\u{fffd}')
1513                    },
1514                    '>' => {
1515                        self.bad_char_error();
1516                        go!(self: force_quirks; emit_doctype; to Data)
1517                    },
1518                    c => go!(self: push_doctype_id kind c),
1519                }
1520            },
1521
1522            //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
1523            states::DoctypeIdentifierSingleQuoted(kind) => loop {
1524                match get_char!(self, input) {
1525                    '\'' => go!(self: to AfterDoctypeIdentifier kind),
1526                    '\0' => {
1527                        self.bad_char_error();
1528                        go!(self: push_doctype_id kind '\u{fffd}')
1529                    },
1530                    '>' => {
1531                        self.bad_char_error();
1532                        go!(self: force_quirks; emit_doctype; to Data)
1533                    },
1534                    c => go!(self: push_doctype_id kind c),
1535                }
1536            },
1537
1538            //§ after-doctype-public-identifier-state
1539            states::AfterDoctypeIdentifier(Public) => loop {
1540                match get_char!(self, input) {
1541                    '\t' | '\n' | '\x0C' | ' ' => {
1542                        go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1543                    },
1544                    '>' => go!(self: emit_doctype; to Data),
1545                    '"' => {
1546                        self.bad_char_error();
1547                        go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1548                    },
1549                    '\'' => {
1550                        self.bad_char_error();
1551                        go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1552                    },
1553                    _ => {
1554                        self.bad_char_error();
1555                        go!(self: force_quirks; reconsume BogusDoctype)
1556                    },
1557                }
1558            },
1559
1560            //§ after-doctype-system-identifier-state
1561            states::AfterDoctypeIdentifier(System) => loop {
1562                match get_char!(self, input) {
1563                    '\t' | '\n' | '\x0C' | ' ' => (),
1564                    '>' => go!(self: emit_doctype; to Data),
1565                    _ => {
1566                        self.bad_char_error();
1567                        go!(self: reconsume BogusDoctype)
1568                    },
1569                }
1570            },
1571
1572            //§ between-doctype-public-and-system-identifiers-state
1573            states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1574                match get_char!(self, input) {
1575                    '\t' | '\n' | '\x0C' | ' ' => (),
1576                    '>' => go!(self: emit_doctype; to Data),
1577                    '"' => {
1578                        go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1579                    },
1580                    '\'' => {
1581                        go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1582                    },
1583                    _ => {
1584                        self.bad_char_error();
1585                        go!(self: force_quirks; reconsume BogusDoctype)
1586                    },
1587                }
1588            },
1589
1590            //§ bogus-doctype-state
1591            states::BogusDoctype => loop {
1592                match get_char!(self, input) {
1593                    '>' => go!(self: emit_doctype; to Data),
1594                    '\0' => {
1595                        self.bad_char_error();
1596                    },
1597                    _ => (),
1598                }
1599            },
1600
1601            //§ bogus-comment-state
1602            states::BogusComment => loop {
1603                match get_char!(self, input) {
1604                    '>' => go!(self: emit_comment; to Data),
1605                    '\0' => {
1606                        self.bad_char_error();
1607                        go!(self: push_comment '\u{fffd}')
1608                    },
1609                    c => go!(self: push_comment c),
1610                }
1611            },
1612
1613            //§ markup-declaration-open-state
1614            states::MarkupDeclarationOpen => loop {
1615                if eat_exact!(self, input, "--") {
1616                    go!(self: clear_comment; to CommentStart);
1617                } else if eat!(self, input, "doctype") {
1618                    go!(self: to Doctype);
1619                } else {
1620                    if self
1621                        .sink
1622                        .adjusted_current_node_present_but_not_in_html_namespace()
1623                        && eat_exact!(self, input, "[CDATA[")
1624                    {
1625                        go!(self: clear_temp; to CdataSection);
1626                    }
1627                    self.bad_char_error();
1628                    go!(self: clear_comment; to BogusComment);
1629                }
1630            },
1631
1632            //§ cdata-section-state
1633            states::CdataSection => loop {
1634                match get_char!(self, input) {
1635                    ']' => go!(self: to CdataSectionBracket),
1636                    '\0' => {
1637                        self.emit_temp_buf();
1638                        self.emit_char('\0');
1639                    },
1640                    c => go!(self: push_temp c),
1641                }
1642            },
1643
1644            //§ cdata-section-bracket
1645            states::CdataSectionBracket => match get_char!(self, input) {
1646                ']' => go!(self: to CdataSectionEnd),
1647                _ => go!(self: push_temp ']'; reconsume CdataSection),
1648            },
1649
1650            //§ cdata-section-end
1651            states::CdataSectionEnd => loop {
1652                match get_char!(self, input) {
1653                    ']' => go!(self: push_temp ']'),
1654                    '>' => {
1655                        self.emit_temp_buf();
1656                        go!(self: to Data);
1657                    },
1658                    _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1659                }
1660            },
1661            //§ END
1662        }
1663    }
1664
1665    fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1666        let mut char_ref_tokenizer = self.char_ref_tokenizer.borrow_mut();
1667        let progress = match char_ref_tokenizer.as_mut().unwrap().step(self, input) {
1668            char_ref::Status::Done(char_ref) => {
1669                self.process_char_ref(char_ref);
1670                *char_ref_tokenizer = None;
1671                return ProcessResult::Continue;
1672            },
1673
1674            char_ref::Status::Stuck => ProcessResult::Suspend,
1675            char_ref::Status::Progress => ProcessResult::Continue,
1676        };
1677
1678        progress
1679    }
1680
1681    fn process_char_ref(&self, char_ref: CharRef) {
1682        let CharRef {
1683            mut chars,
1684            mut num_chars,
1685        } = char_ref;
1686
1687        if num_chars == 0 {
1688            chars[0] = '&';
1689            num_chars = 1;
1690        }
1691
1692        for i in 0..num_chars {
1693            let c = chars[i as usize];
1694            match self.state.get() {
1695                states::Data | states::RawData(states::Rcdata) => self.emit_char(c),
1696
1697                states::AttributeValue(_) => go!(self: push_value c),
1698
1699                _ => panic!(
1700                    "state {:?} should not be reachable in process_char_ref",
1701                    self.state.get()
1702                ),
1703            }
1704        }
1705    }
1706
1707    /// Indicate that we have reached the end of the input.
1708    pub fn end(&self) {
1709        // Handle EOF in the char ref sub-tokenizer, if there is one.
1710        // Do this first because it might un-consume stuff.
1711        let input = BufferQueue::default();
1712        match self.char_ref_tokenizer.take() {
1713            None => (),
1714            Some(mut tokenizer) => {
1715                self.process_char_ref(tokenizer.end_of_file(self, &input));
1716            },
1717        }
1718
1719        // Process all remaining buffered input.
1720        // If we're waiting for lookahead, we're not gonna get it.
1721        self.at_eof.set(true);
1722        assert!(matches!(self.run(&input), TokenizerResult::Done));
1723        assert!(input.is_empty());
1724
1725        loop {
1726            match self.eof_step() {
1727                ProcessResult::Continue => (),
1728                ProcessResult::Suspend => break,
1729                ProcessResult::Script(_) => unreachable!(),
1730            }
1731        }
1732
1733        self.sink.end();
1734
1735        if self.opts.profile {
1736            self.dump_profile();
1737        }
1738    }
1739
1740    fn dump_profile(&self) {
1741        let mut results: Vec<(states::State, u64)> = self
1742            .state_profile
1743            .borrow()
1744            .iter()
1745            .map(|(s, t)| (*s, *t))
1746            .collect();
1747        results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1748
1749        let total: u64 = results
1750            .iter()
1751            .map(|&(_, t)| t)
1752            .fold(0, ::std::ops::Add::add);
1753        println!("\nTokenizer profile, in nanoseconds");
1754        println!(
1755            "\n{:12}         total in token sink",
1756            self.time_in_sink.get()
1757        );
1758        println!("\n{total:12}         total in tokenizer");
1759
1760        for (k, v) in results.into_iter() {
1761            let pct = 100.0 * (v as f64) / (total as f64);
1762            println!("{v:12}  {pct:4.1}%  {k:?}");
1763        }
1764    }
1765
1766    fn eof_step(&self) -> ProcessResult<Sink::Handle> {
1767        debug!("processing EOF in state {:?}", self.state.get());
1768        match self.state.get() {
1769            states::Data
1770            | states::RawData(Rcdata)
1771            | states::RawData(Rawtext)
1772            | states::RawData(ScriptData)
1773            | states::Plaintext => go!(self: eof),
1774
1775            states::TagName
1776            | states::RawData(ScriptDataEscaped(_))
1777            | states::BeforeAttributeName
1778            | states::AttributeName
1779            | states::AfterAttributeName
1780            | states::AttributeValue(_)
1781            | states::AfterAttributeValueQuoted
1782            | states::SelfClosingStartTag
1783            | states::ScriptDataEscapedDash(_)
1784            | states::ScriptDataEscapedDashDash(_) => {
1785                self.bad_eof_error();
1786                go!(self: to Data)
1787            },
1788
1789            states::BeforeAttributeValue => go!(self: reconsume AttributeValue Unquoted),
1790
1791            states::TagOpen => {
1792                self.bad_eof_error();
1793                self.emit_char('<');
1794                go!(self: to Data);
1795            },
1796
1797            states::EndTagOpen => {
1798                self.bad_eof_error();
1799                self.emit_char('<');
1800                self.emit_char('/');
1801                go!(self: to Data);
1802            },
1803
1804            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
1805                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1806            },
1807
1808            states::RawLessThanSign(kind) => {
1809                self.emit_char('<');
1810                go!(self: to RawData kind);
1811            },
1812
1813            states::RawEndTagOpen(kind) => {
1814                self.emit_char('<');
1815                self.emit_char('/');
1816                go!(self: to RawData kind);
1817            },
1818
1819            states::RawEndTagName(kind) => {
1820                self.emit_char('<');
1821                self.emit_char('/');
1822                self.emit_temp_buf();
1823                go!(self: to RawData kind)
1824            },
1825
1826            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
1827
1828            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),
1829
1830            states::ScriptDataDoubleEscapeEnd => {
1831                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1832            },
1833
1834            states::CommentStart
1835            | states::CommentStartDash
1836            | states::Comment
1837            | states::CommentEndDash
1838            | states::CommentEnd
1839            | states::CommentEndBang => {
1840                self.bad_eof_error();
1841                go!(self: emit_comment; to Data)
1842            },
1843
1844            states::CommentLessThanSign | states::CommentLessThanSignBang => {
1845                go!(self: reconsume Comment)
1846            },
1847
1848            states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash),
1849
1850            states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd),
1851
1852            states::Doctype | states::BeforeDoctypeName => {
1853                self.bad_eof_error();
1854                go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1855            },
1856
1857            states::DoctypeName
1858            | states::AfterDoctypeName
1859            | states::AfterDoctypeKeyword(_)
1860            | states::BeforeDoctypeIdentifier(_)
1861            | states::DoctypeIdentifierDoubleQuoted(_)
1862            | states::DoctypeIdentifierSingleQuoted(_)
1863            | states::AfterDoctypeIdentifier(_)
1864            | states::BetweenDoctypePublicAndSystemIdentifiers => {
1865                self.bad_eof_error();
1866                go!(self: force_quirks; emit_doctype; to Data)
1867            },
1868
1869            states::BogusDoctype => go!(self: emit_doctype; to Data),
1870
1871            states::BogusComment => go!(self: emit_comment; to Data),
1872
1873            states::MarkupDeclarationOpen => {
1874                self.bad_char_error();
1875                go!(self: to BogusComment)
1876            },
1877
1878            states::CdataSection => {
1879                self.emit_temp_buf();
1880                self.bad_eof_error();
1881                go!(self: to Data)
1882            },
1883
1884            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),
1885
1886            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
1887        }
1888    }
1889
1890    /// Checks for supported SIMD feature, which is now either SSE2 for x86/x86_64 or NEON for aarch64.
1891    fn is_supported_simd_feature_detected() -> bool {
1892        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1893        {
1894            is_x86_feature_detected!("sse2")
1895        }
1896
1897        #[cfg(target_arch = "aarch64")]
1898        {
1899            std::arch::is_aarch64_feature_detected!("neon")
1900        }
1901
1902        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
1903        false
1904    }
1905
1906    #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
1907    /// Implements the [data state] with SIMD instructions.
1908    /// Calls SSE2- or NEON-specific function for chunks and processes any remaining bytes.
1909    ///
1910    /// The algorithm implemented is the naive SIMD approach described [here].
1911    ///
1912    /// ### SAFETY:
1913    /// Calling this function on a CPU that supports neither SSE2 nor NEON causes undefined behaviour.
1914    ///
1915    /// [data state]: https://html.spec.whatwg.org/#data-state
1916    /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1917    unsafe fn data_state_simd_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
1918        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1919        let (mut i, mut n_newlines) = self.data_state_sse2_fast_path(input);
1920
1921        #[cfg(target_arch = "aarch64")]
1922        let (mut i, mut n_newlines) = self.data_state_neon_fast_path(input);
1923
1924        // Process any remaining bytes (less than STRIDE)
1925        while let Some(c) = input.as_bytes().get(i) {
1926            if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
1927                break;
1928            }
1929            if *c == b'\n' {
1930                n_newlines += 1;
1931            }
1932
1933            i += 1;
1934        }
1935
1936        let set_result = if i == 0 {
1937            let first_char = input.pop_front_char().unwrap();
1938            debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));
1939
1940            // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1941            // Still, it would be nice to not have to do that.
1942            // The same is true for the unwrap call.
1943            let preprocessed_char = self
1944                .get_preprocessed_char(first_char, &BufferQueue::default())
1945                .unwrap();
1946            SetResult::FromSet(preprocessed_char)
1947        } else {
1948            debug_assert!(
1949                input.len() >= i,
1950                "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1951                i,
1952                input.len()
1953            );
1954            let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1955            input.unsafe_pop_front(i as u32);
1956            SetResult::NotFromSet(consumed_chunk)
1957        };
1958
1959        self.current_line.set(self.current_line.get() + n_newlines);
1960
1961        Some(set_result)
1962    }
1963
1964    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1965    #[target_feature(enable = "sse2")]
1966    /// Implements the [data state] with SSE2 instructions for x86/x86_64.
1967    /// Returns a pair of the number of bytes processed and the number of newlines found.
1968    ///
1969    /// ### SAFETY:
1970    /// Calling this function on a CPU that does not support NEON causes undefined behaviour.
1971    ///
1972    /// [data state]: https://html.spec.whatwg.org/#data-state
1973    unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
1974        #[cfg(target_arch = "x86")]
1975        use std::arch::x86::{
1976            __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1977            _mm_set1_epi8,
1978        };
1979        #[cfg(target_arch = "x86_64")]
1980        use std::arch::x86_64::{
1981            __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1982            _mm_set1_epi8,
1983        };
1984
1985        debug_assert!(!input.is_empty());
1986
1987        let quote_mask = _mm_set1_epi8('<' as i8);
1988        let escape_mask = _mm_set1_epi8('&' as i8);
1989        let carriage_return_mask = _mm_set1_epi8('\r' as i8);
1990        let zero_mask = _mm_set1_epi8('\0' as i8);
1991        let newline_mask = _mm_set1_epi8('\n' as i8);
1992
1993        let raw_bytes: &[u8] = input.as_bytes();
1994        let start = raw_bytes.as_ptr();
1995
1996        const STRIDE: usize = 16;
1997        let mut i = 0;
1998        let mut n_newlines = 0;
1999        while i + STRIDE <= raw_bytes.len() {
2000            // Load a 16 byte chunk from the input
2001            let data = _mm_loadu_si128(start.add(i) as *const __m128i);
2002
2003            // Compare the chunk against each mask
2004            let quotes = _mm_cmpeq_epi8(data, quote_mask);
2005            let escapes = _mm_cmpeq_epi8(data, escape_mask);
2006            let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
2007            let zeros = _mm_cmpeq_epi8(data, zero_mask);
2008            let newlines = _mm_cmpeq_epi8(data, newline_mask);
2009
2010            // Combine all test results and create a bitmask from them.
2011            // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
2012            let test_result = _mm_or_si128(
2013                _mm_or_si128(quotes, zeros),
2014                _mm_or_si128(escapes, carriage_returns),
2015            );
2016            let bitmask = _mm_movemask_epi8(test_result);
2017            let newline_mask = _mm_movemask_epi8(newlines);
2018
2019            if (bitmask != 0) {
2020                // We have reached one of the characters that cause the state machine to transition
2021                let position = if cfg!(target_endian = "little") {
2022                    bitmask.trailing_zeros() as usize
2023                } else {
2024                    bitmask.leading_zeros() as usize
2025                };
2026
2027                n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
2028                i += position;
2029                break;
2030            } else {
2031                n_newlines += newline_mask.count_ones() as u64;
2032            }
2033
2034            i += STRIDE;
2035        }
2036
2037        (i, n_newlines)
2038    }
2039
2040    #[cfg(target_arch = "aarch64")]
2041    #[target_feature(enable = "neon")]
2042    /// Implements the [data state] with NEON SIMD instructions for AArch64.
2043    /// Returns a pair of the number of bytes processed and the number of newlines found.
2044    ///
2045    /// ### SAFETY:
2046    /// Calling this function on a CPU that does not support NEON causes undefined behaviour.
2047    ///
2048    /// [data state]: https://html.spec.whatwg.org/#data-state
2049    unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
2050        use std::arch::aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8};
2051
2052        debug_assert!(!input.is_empty());
2053
2054        let quote_mask = vdupq_n_u8(b'<');
2055        let escape_mask = vdupq_n_u8(b'&');
2056        let carriage_return_mask = vdupq_n_u8(b'\r');
2057        let zero_mask = vdupq_n_u8(b'\0');
2058        let newline_mask = vdupq_n_u8(b'\n');
2059
2060        let raw_bytes: &[u8] = input.as_bytes();
2061        let start = raw_bytes.as_ptr();
2062
2063        const STRIDE: usize = 16;
2064        let mut i = 0;
2065        let mut n_newlines = 0;
2066        while i + STRIDE <= raw_bytes.len() {
2067            // Load a 16 byte chunk from the input
2068            let data = vld1q_u8(start.add(i));
2069
2070            // Compare the chunk against each mask
2071            let quotes = vceqq_u8(data, quote_mask);
2072            let escapes = vceqq_u8(data, escape_mask);
2073            let carriage_returns = vceqq_u8(data, carriage_return_mask);
2074            let zeros = vceqq_u8(data, zero_mask);
2075            let newlines = vceqq_u8(data, newline_mask);
2076
2077            // Combine all test results and create a bitmask from them.
2078            // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
2079            let test_result =
2080                vorrq_u8(vorrq_u8(quotes, zeros), vorrq_u8(escapes, carriage_returns));
2081            let bitmask = vmaxvq_u8(test_result);
2082            let newline_mask = vmaxvq_u8(newlines);
2083            if bitmask != 0 {
2084                // We have reached one of the characters that cause the state machine to transition
2085                let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2086                let position = chunk_bytes
2087                    .iter()
2088                    .position(|&b| matches!(b, b'<' | b'&' | b'\r' | b'\0'))
2089                    .unwrap();
2090
2091                n_newlines += chunk_bytes[..position]
2092                    .iter()
2093                    .filter(|&&b| b == b'\n')
2094                    .count() as u64;
2095
2096                i += position;
2097                break;
2098            } else if newline_mask != 0 {
2099                let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2100                n_newlines += chunk_bytes.iter().filter(|&&b| b == b'\n').count() as u64;
2101            }
2102
2103            i += STRIDE;
2104        }
2105
2106        (i, n_newlines)
2107    }
2108}
2109
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use crate::tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::cell::RefCell;

    use crate::LocalName;

    /// A `TokenSink` used to check that the tokenizer reports the right line
    /// number with each token.
    ///
    /// Every tag (and any other non-character token except EOF) is stored in
    /// `lines` together with the line number passed to `process_token`.
    /// Character data is accumulated in `current_str` and moved into `tokens`
    /// whenever another token is recorded.
    struct LinesMatch {
        tokens: RefCell<Vec<Token>>,
        current_str: RefCell<StrTendril>,
        lines: RefCell<Vec<(Token, u64)>>,
    }

    impl LinesMatch {
        /// Creates a sink with nothing recorded.
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: RefCell::new(Vec::new()),
                current_str: RefCell::new(StrTendril::new()),
                lines: RefCell::new(Vec::new()),
            }
        }

        /// Records `token` together with the line it was reported on, after
        /// flushing any pending character data.
        fn push(&self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.borrow_mut().push((token, line_number));
        }

        /// Moves accumulated character data, if any, into `tokens`.
        fn finish_str(&self) {
            let pending = self.current_str.take();
            if !pending.is_empty() {
                self.tokens.borrow_mut().push(CharacterTokens(pending));
            }
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
            match token {
                EOFToken => {},

                ParseError(_) => {
                    panic!("unexpected parse error");
                },

                NullCharacterToken => {
                    self.current_str.borrow_mut().push_char('\0');
                },

                CharacterTokens(text) => {
                    self.current_str.borrow_mut().push_slice(&text);
                },

                TagToken(mut tag) => {
                    if matches!(tag.kind, EndTag) {
                        // The spec seems to indicate that one can emit
                        // erroneous end tags with attrs, but the test
                        // cases don't contain them.
                        tag.self_closing = false;
                        tag.attrs = Vec::new();
                    } else {
                        tag.attrs.sort_by(|lhs, rhs| lhs.name.cmp(&rhs.name));
                    }
                    self.push(TagToken(tag), line_number);
                },

                other => self.push(other, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    /// Feeds the chunks to a fresh tokenizer one at a time and returns every
    /// recorded token paired with the line number it was reported on.
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let tok = Tokenizer::new(LinesMatch::new(), opts);
        let buffer = BufferQueue::default();
        for chunk in input {
            buffer.push_back(chunk);
            let _ = tok.feed(&buffer);
        }
        tok.end();
        tok.sink.lines.take()
    }

    /// Builds a tag token of the given kind without attributes.
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        TagToken(Tag {
            kind: tagkind,
            name: LocalName::from(&*token),
            self_closing: false,
            attrs: Vec::new(),
        })
    }

    /// Tokenizer options shared by the line-number tests.
    fn line_test_opts() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }

    /// Converts string slices into the chunk list expected by `tokenize`.
    fn chunks(parts: &[&str]) -> Vec<StrTendril> {
        parts.iter().map(|part| StrTendril::from(*part)).collect()
    }

    /// `<a>`, `<b>`, `</b>`, `</a>` reported on lines 1 through 4.
    fn nested_tags_on_consecutive_lines() -> Vec<(Token, u64)> {
        vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ]
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut target: Option<StrTendril> = None;
        option_push(&mut target, 'x');
        assert_eq!(target, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut target: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut target, 'x');
        assert_eq!(target, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut target: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut target, 'x');
        assert_eq!(target, Some("yx".to_tendril()));
    }

    #[test]
    fn check_lines() {
        // Each chunk ends with a bare line feed.
        let input = chunks(&["<a>\n", "<b>\n", "</b>\n", "</a>\n"]);
        let results = tokenize(input, line_test_opts());
        assert_eq!(results, nested_tags_on_consecutive_lines());
    }

    #[test]
    fn check_lines_with_new_line() {
        // Each chunk ends with a carriage return + line feed pair, which
        // must count as a single line break.
        let input = chunks(&["<a>\r\n", "<b>\r\n", "</b>\r\n", "</a>\r\n"]);
        let results = tokenize(input, line_test_opts());
        assert_eq!(results, nested_tags_on_consecutive_lines());
    }
}
html5ever/tokenizer/mod.rs

html5ever/tokenizer/
mod.rs