// cssparser/tokenizer.rs

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// https://drafts.csswg.org/css-syntax/#tokenization

use self::Token::*;
use crate::cow_rc_str::CowRcStr;
use crate::parser::{ArbitrarySubstitutionFunctions, ParserState};
use std::char;
use std::ops::Range;

#[cfg(not(feature = "dummy_match_byte"))]
use cssparser_macros::match_byte;

#[cfg(feature = "dummy_match_byte")]
macro_rules! match_byte {
    ($value:expr, $($rest:tt)* ) => {
        match $value {
            $(
                $rest
            )+
        }
    };
}

/// One of the pieces the CSS input is broken into.
///
/// Some components use `Cow` in order to borrow from the original input string
/// and avoid allocating/copying when possible.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
    /// A [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
    Ident(CowRcStr<'a>),

    /// A [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
    ///
    /// The value does not include the `@` marker.
    AtKeyword(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
    ///
    /// The value does not include the `#` marker.
    Hash(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
    ///
    /// The value does not include the `#` marker.
    IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.

    /// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
    ///
    /// The value does not include the quotes.
    QuotedString(CowRcStr<'a>),

    /// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
    ///
    /// The value does not include the `url(` `)` markers.  Note that `url( <string-token> )` is represented by a
    /// `Function` token.
    UnquotedUrl(CowRcStr<'a>),

    /// A `<delim-token>`
    Delim(char),

    /// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
    Number {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases, like the `<An+B>` micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,
    },

    /// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
    Percentage {
        /// Whether the number had a `+` or `-` sign.
        has_sign: bool,

        /// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
        unit_value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        /// It is **not** divided by 100.
        int_value: Option<i32>,
    },

    /// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
    Dimension {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases, like the `<An+B>` micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,

        /// The unit, e.g. "px" in `12px`
        unit: CowRcStr<'a>,
    },

    /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
    WhiteSpace(&'a str),

    /// A comment.
    ///
    /// The CSS Syntax spec does not generate tokens for comments,
    /// but we do, because we can (borrowed `&str` makes it cheap).
    ///
    /// The value does not include the `/*` `*/` markers.
    Comment(&'a str),

    /// A `:` `<colon-token>`
    Colon, // :

    /// A `;` `<semicolon-token>`
    Semicolon, // ;

    /// A `,` `<comma-token>`
    Comma, // ,

    /// A `~=` [`<include-match-token>`](https://drafts.csswg.org/css-syntax/#include-match-token-diagram)
    IncludeMatch,

    /// A `|=` [`<dash-match-token>`](https://drafts.csswg.org/css-syntax/#dash-match-token-diagram)
    DashMatch,

    /// A `^=` [`<prefix-match-token>`](https://drafts.csswg.org/css-syntax/#prefix-match-token-diagram)
    PrefixMatch,

    /// A `$=` [`<suffix-match-token>`](https://drafts.csswg.org/css-syntax/#suffix-match-token-diagram)
    SuffixMatch,

    /// A `*=` [`<substring-match-token>`](https://drafts.csswg.org/css-syntax/#substring-match-token-diagram)
    SubstringMatch,

    /// A `<!--` [`<CDO-token>`](https://drafts.csswg.org/css-syntax/#CDO-token-diagram)
    CDO,

    /// A `-->` [`<CDC-token>`](https://drafts.csswg.org/css-syntax/#CDC-token-diagram)
    CDC,

    /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
    ///
    /// The value (name) does not include the `(` marker.
    Function(CowRcStr<'a>),

    /// A `<(-token>`
    ParenthesisBlock,

    /// A `<[-token>`
    SquareBracketBlock,

    /// A `<{-token>`
    CurlyBracketBlock,

    /// A `<bad-url-token>`
    ///
    /// This token always indicates a parse error.
    BadUrl(CowRcStr<'a>),

    /// A `<bad-string-token>`
    ///
    /// This token always indicates a parse error.
    BadString(CowRcStr<'a>),

    /// A `<)-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseParenthesis,

    /// A `<]-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseSquareBracket,

    /// A `<}-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseCurlyBracket,
}

impl Token<'_> {
    /// Return whether this token represents a parse error.
    ///
    /// `BadUrl` and `BadString` are tokenizer-level parse errors.
    ///
    /// `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket` are *unmatched*
    /// and therefore parse errors when returned by one of the `Parser::next*` methods.
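    ///
    /// A small usage sketch (this doc-test assumes the public crate name
    /// `cssparser`, which re-exports `Token`):
    ///
    /// ```
    /// # use cssparser::Token;
    /// assert!(Token::CloseParenthesis.is_parse_error());
    /// assert!(!Token::Colon.is_parse_error());
    /// ```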
    pub fn is_parse_error(&self) -> bool {
        matches!(
            *self,
            BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
        )
    }
}

#[derive(Clone)]
pub struct Tokenizer<'a> {
    input: &'a str,
    /// Counted in bytes, not code points. From 0.
    position: usize,
    /// The position at the start of the current line; but adjusted to
    /// ensure that computing the column will give the result in units
    /// of UTF-16 characters.
    current_line_start_position: usize,
    current_line_number: u32,
    arbitrary_substitution_functions: SeenStatus<'a>,
    source_map_url: Option<&'a str>,
    source_url: Option<&'a str>,
}
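// Worked example of the column bookkeeping above: in the line "é!", the
// letter 'é' is two UTF-8 bytes (0xC3 0xA9) but one UTF-16 unit. Consuming
// its continuation byte bumps `current_line_start_position` from 0 to 1, so
// when `position` reaches the '!' (byte offset 2) the column computes as
// 2 - 1 + 1 = 2, i.e. the correct UTF-16 column.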

#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus<'a> {
    DontCare,
    LookingForThem(ArbitrarySubstitutionFunctions<'a>),
    SeenAtLeastOne,
}

impl<'a> Tokenizer<'a> {
    #[inline]
    pub fn new(input: &'a str) -> Self {
        Tokenizer {
            input,
            position: 0,
            current_line_start_position: 0,
            current_line_number: 0,
            arbitrary_substitution_functions: SeenStatus::DontCare,
            source_map_url: None,
            source_url: None,
        }
    }

    #[inline]
    pub fn look_for_arbitrary_substitution_functions(
        &mut self,
        fns: ArbitrarySubstitutionFunctions<'a>,
    ) {
        self.arbitrary_substitution_functions = SeenStatus::LookingForThem(fns);
    }

    #[inline]
    pub fn seen_arbitrary_substitution_functions(&mut self) -> bool {
        let seen = self.arbitrary_substitution_functions == SeenStatus::SeenAtLeastOne;
        self.arbitrary_substitution_functions = SeenStatus::DontCare;
        seen
    }

    #[inline]
    pub fn see_function(&mut self, name: &str) {
        if let SeenStatus::LookingForThem(fns) = self.arbitrary_substitution_functions {
            if fns.iter().any(|a| name.eq_ignore_ascii_case(a)) {
                self.arbitrary_substitution_functions = SeenStatus::SeenAtLeastOne;
            }
        }
    }

    #[inline]
    pub fn next(&mut self) -> Result<Token<'a>, ()> {
        next_token(self)
    }

    #[inline]
    pub fn position(&self) -> SourcePosition {
        debug_assert!(self.input.is_char_boundary(self.position));
        SourcePosition(self.position)
    }

    #[inline]
    pub fn current_source_location(&self) -> SourceLocation {
        SourceLocation {
            line: self.current_line_number,
            column: (self.position - self.current_line_start_position + 1) as u32,
        }
    }

    #[inline]
    pub fn current_source_map_url(&self) -> Option<&'a str> {
        self.source_map_url
    }

    #[inline]
    pub fn current_source_url(&self) -> Option<&'a str> {
        self.source_url
    }

    #[inline]
    pub fn state(&self) -> ParserState {
        ParserState {
            position: self.position,
            current_line_start_position: self.current_line_start_position,
            current_line_number: self.current_line_number,
            at_start_of: None,
        }
    }

    #[inline]
    pub fn reset(&mut self, state: &ParserState) {
        self.position = state.position;
        self.current_line_start_position = state.current_line_start_position;
        self.current_line_number = state.current_line_number;
    }

    #[inline]
    pub(crate) fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
        self.slice(start_pos..self.position())
    }

    #[inline]
    pub(crate) fn slice(&self, range: Range<SourcePosition>) -> &'a str {
        debug_assert!(self.input.is_char_boundary(range.start.0));
        debug_assert!(self.input.is_char_boundary(range.end.0));
        unsafe { self.input.get_unchecked(range.start.0..range.end.0) }
    }

    pub fn current_source_line(&self) -> &'a str {
        let current = self.position();
        let start = self
            .slice(SourcePosition(0)..current)
            .rfind(['\r', '\n', '\x0C'])
            .map_or(0, |start| start + 1);
        let end = self
            .slice(current..SourcePosition(self.input.len()))
            .find(['\r', '\n', '\x0C'])
            .map_or(self.input.len(), |end| current.0 + end);
        self.slice(SourcePosition(start)..SourcePosition(end))
    }

    #[inline]
    pub fn next_byte(&self) -> Option<u8> {
        if self.is_eof() {
            None
        } else {
            Some(self.input.as_bytes()[self.position])
        }
    }

    // If this returns false, `tokenizer.next_char()` will not panic.
    #[inline]
    fn is_eof(&self) -> bool {
        !self.has_at_least(0)
    }

    // If true, the input has at least `n` bytes left *after* the current one.
    // That is, `tokenizer.byte_at(n)` will not panic.
    #[inline]
    fn has_at_least(&self, n: usize) -> bool {
        self.position + n < self.input.len()
    }

    // Advance over N bytes in the input.  This function can advance
    // over ASCII bytes (excluding newlines), or UTF-8 sequence
    // leaders (excluding leaders for 4-byte sequences).
    #[inline]
    pub fn advance(&mut self, n: usize) {
        if cfg!(debug_assertions) {
            // Each byte must either be an ASCII byte or a sequence
            // leader, but not a 4-byte leader; also newlines are
            // rejected.
            for i in 0..n {
                let b = self.byte_at(i);
                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
            }
        }
        self.position += n
    }

    // Assumes non-EOF
    #[inline]
    fn next_byte_unchecked(&self) -> u8 {
        self.byte_at(0)
    }

    #[inline]
    fn byte_at(&self, offset: usize) -> u8 {
        self.input.as_bytes()[self.position + offset]
    }

    // Advance over a single byte; the byte must be a UTF-8 sequence
    // leader for a 4-byte sequence.
    #[inline]
    fn consume_4byte_intro(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
        // This takes two UTF-16 characters to represent, so we
        // actually have an undercount.
        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        self.position += 1;
    }

    // Advance over a single byte; the byte must be a UTF-8
    // continuation byte.
    #[inline]
    fn consume_continuation_byte(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
        // Continuation bytes contribute to column overcount.  Note
        // that due to the special case for the 4-byte sequence intro,
        // we must use wrapping add here.
        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        self.position += 1;
    }

    // Advance over any kind of byte, excluding newlines.
    #[inline(never)]
    fn consume_known_byte(&mut self, byte: u8) {
        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
        self.position += 1;
        // Continuation bytes contribute to column overcount.
        if byte & 0xF0 == 0xF0 {
            // This takes two UTF-16 characters to represent, so we
            // actually have an undercount.
            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        } else if byte & 0xC0 == 0x80 {
            // Note that due to the special case for the 4-byte
            // sequence intro, we must use wrapping add here.
            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        }
    }

    #[inline]
    fn next_char(&self) -> char {
        unsafe { self.input.get_unchecked(self.position().0..) }
            .chars()
            .next()
            .unwrap()
    }

    // Given that a newline has been seen, advance over the newline
    // and update the state.
    #[inline]
    fn consume_newline(&mut self) {
        let byte = self.next_byte_unchecked();
        debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
        self.position += 1;
        if byte == b'\r' && self.next_byte() == Some(b'\n') {
            self.position += 1;
        }
        self.current_line_start_position = self.position;
        self.current_line_number += 1;
    }

    #[inline]
    fn has_newline_at(&self, offset: usize) -> bool {
        self.position + offset < self.input.len()
            && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
    }

    #[inline]
    fn consume_char(&mut self) -> char {
        let c = self.next_char();
        let len_utf8 = c.len_utf8();
        self.position += len_utf8;
        // Note that due to the special case for the 4-byte sequence
        // intro, we must use wrapping add here.
        self.current_line_start_position = self
            .current_line_start_position
            .wrapping_add(len_utf8 - c.len_utf16());
        c
    }

    #[inline]
    fn starts_with(&self, needle: &[u8]) -> bool {
        self.input.as_bytes()[self.position..].starts_with(needle)
    }

    pub fn skip_whitespace(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                _ => return,
            }
        }
    }

    pub fn skip_cdc_and_cdo(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                b'<' => {
                    if self.starts_with(b"<!--") {
                        self.advance(4)
                    } else {
                        return
                    }
                }
                b'-' => {
                    if self.starts_with(b"-->") {
                        self.advance(3)
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }
}

/// A position from the start of the input, counted in UTF-8 bytes.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);

#[cfg(feature = "malloc_size_of")]
malloc_size_of::malloc_size_of_is_0!(SourcePosition);

impl SourcePosition {
    /// Returns the current byte index in the original input.
    #[inline]
    pub fn byte_index(&self) -> usize {
        self.0
    }
}

/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy, Default)]
pub struct SourceLocation {
    /// The line number, starting at 0 for the first line.
    pub line: u32,

    /// The column number within a line, starting at 1 for the first character of the line.
    /// Column numbers are counted in UTF-16 code units.
    pub column: u32,
}

#[cfg(feature = "malloc_size_of")]
malloc_size_of::malloc_size_of_is_0!(SourceLocation);

fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    if tokenizer.is_eof() {
        return Err(());
    }
    let b = tokenizer.next_byte_unchecked();
    let token = match_byte! { b,
        b' ' | b'\t' => {
            consume_whitespace(tokenizer, false)
        },
        b'\n' | b'\x0C' | b'\r' => consume_whitespace(tokenizer, true),
        b'"' => consume_string(tokenizer, false),
        b'#' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
            else if !tokenizer.is_eof() &&
                matches!(tokenizer.next_byte_unchecked(), b'0'..=b'9' | b'-') {
                // Any other valid case here already resulted in IDHash.
                Hash(consume_name(tokenizer))
            }
            else { Delim('#') }
        },
        b'$' => {
            if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
            else { tokenizer.advance(1); Delim('$') }
        },
        b'\'' => consume_string(tokenizer, true),
        b'(' => { tokenizer.advance(1); ParenthesisBlock },
        b')' => { tokenizer.advance(1); CloseParenthesis },
        b'*' => {
            if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
            else { tokenizer.advance(1); Delim('*') }
        },
        b'+' => {
            if (
                tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit()
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && tokenizer.byte_at(2).is_ascii_digit()
            ) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('+')
            }
        },
        b',' => { tokenizer.advance(1); Comma },
        b'-' => {
            if (
                tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit()
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && tokenizer.byte_at(2).is_ascii_digit()
            ) {
                consume_numeric(tokenizer)
            } else if tokenizer.starts_with(b"-->") {
                tokenizer.advance(3);
                CDC
            } else if is_ident_start(tokenizer) {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('-')
            }
        },
        b'.' => {
            if tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit() {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('.')
            }
        }
        b'/' => {
            if tokenizer.starts_with(b"/*") {
                Comment(consume_comment(tokenizer))
            } else {
                tokenizer.advance(1);
                Delim('/')
            }
        }
        b'0'..=b'9' => consume_numeric(tokenizer),
        b':' => { tokenizer.advance(1); Colon },
        b';' => { tokenizer.advance(1); Semicolon },
        b'<' => {
            if tokenizer.starts_with(b"<!--") {
                tokenizer.advance(4);
                CDO
            } else {
                tokenizer.advance(1);
                Delim('<')
            }
        },
        b'@' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
            else { Delim('@') }
        },
        b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => consume_ident_like(tokenizer),
        b'[' => { tokenizer.advance(1); SquareBracketBlock },
        b'\\' => {
            if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
            else { tokenizer.advance(1); Delim('\\') }
        },
        b']' => { tokenizer.advance(1); CloseSquareBracket },
        b'^' => {
            if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
            else { tokenizer.advance(1); Delim('^') }
        },
        b'{' => { tokenizer.advance(1); CurlyBracketBlock },
        b'|' => {
            if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
            else { tokenizer.advance(1); Delim('|') }
        },
        b'}' => { tokenizer.advance(1); CloseCurlyBracket },
        b'~' => {
            if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
            else { tokenizer.advance(1); Delim('~') }
        },
        _ => {
            if !b.is_ascii() {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(b as char)
            }
        },
    };
    Ok(token)
}

fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
    let start_position = tokenizer.position();
    if newline {
        tokenizer.consume_newline();
    } else {
        tokenizer.advance(1);
    }
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b' ' | b'\t' => {
                tokenizer.advance(1);
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            _ => {
                break
            }
        }
    }
    WhiteSpace(tokenizer.slice_from(start_position))
}

// Check for sourceMappingURL or sourceURL comments and update the
// tokenizer appropriately.
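// For example, a comment written `/*# sourceMappingURL=foo.map */` arrives
// here as "# sourceMappingURL=foo.map " and sets `source_map_url` to
// Some("foo.map"): the split on whitespace keeps only the URL itself.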
fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) {
    let directive = "# sourceMappingURL=";
    let directive_old = "@ sourceMappingURL=";

    // If there is a source map directive, extract the URL.
    if contents.starts_with(directive) || contents.starts_with(directive_old) {
        let contents = &contents[directive.len()..];
        tokenizer.source_map_url = contents.split([' ', '\t', '\x0C', '\r', '\n']).next();
    }

    let directive = "# sourceURL=";
    let directive_old = "@ sourceURL=";

    // If there is a source URL directive, extract the URL.
    if contents.starts_with(directive) || contents.starts_with(directive_old) {
        let contents = &contents[directive.len()..];
        tokenizer.source_url = contents.split([' ', '\t', '\x0C', '\r', '\n']).next()
    }
}

fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
    tokenizer.advance(2); // consume "/*"
    let start_position = tokenizer.position();
    while !tokenizer.is_eof() {
        match_byte! { tokenizer.next_byte_unchecked(),
            b'*' => {
                let end_position = tokenizer.position();
                tokenizer.advance(1);
                if tokenizer.next_byte() == Some(b'/') {
                    tokenizer.advance(1);
                    let contents = tokenizer.slice(start_position..end_position);
                    check_for_source_map(tokenizer, contents);
                    return contents
                }
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }
    let contents = tokenizer.slice_from(start_position);
    check_for_source_map(tokenizer, contents);
    contents
}

fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
    match consume_quoted_string(tokenizer, single_quote) {
        Ok(value) => QuotedString(value),
        Err(value) => BadString(value),
    }
}

/// Return the string consumed so far as `Err` on syntax error
/// (i.e. an unescaped newline).
fn consume_quoted_string<'a>(
    tokenizer: &mut Tokenizer<'a>,
    single_quote: bool,
) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
    tokenizer.advance(1); // Skip the initial quote
                          // start_pos is at code point boundary, after " or '
    let start_pos = tokenizer.position();
    let mut string_bytes;
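    // First pass: borrow directly from the input until an escape or a NUL
    // byte forces a copy into the owned `string_bytes` buffer, which the
    // second loop below appends to.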
    loop {
        if tokenizer.is_eof() {
            return Ok(tokenizer.slice_from(start_pos).into());
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'"' => {
                if !single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\'' => {
                if single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary.
                // * So is the current position (which is before '\\' or '\0').
                //
                // So `string_bytes` is well-formed UTF-8.
                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\n' | b'\r' | b'\x0C' => {
                return Err(tokenizer.slice_from(start_pos).into())
            },
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }

    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'\n' | b'\r' | b'\x0C' => {
                return Err(
                    // string_bytes is well-formed UTF-8, see other comments.
                    unsafe {
                        from_utf8_release_unchecked(string_bytes)
                    }.into()
                );
            }
            b'"' => {
                tokenizer.advance(1);
                if !single_quote {
                    break;
                }
            }
            b'\'' => {
                tokenizer.advance(1);
                if single_quote {
                    break;
                }
            }
            b'\\' => {
                tokenizer.advance(1);
                if !tokenizer.is_eof() {
                    match tokenizer.next_byte_unchecked() {
                        // Escaped newline
                        b'\n' | b'\x0C' | b'\r' => {
                            tokenizer.consume_newline();
                        }
                        // This pushes one well-formed code point
                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                    }
                }
                // else: escaped EOF, do nothing.
                continue;
            }
            b'\0' => {
                tokenizer.advance(1);
                string_bytes.extend("\u{FFFD}".as_bytes());
                continue;
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            },
        }

        // If this byte is part of a multi-byte code point,
        // we’ll end up copying the whole code point before this loop does something else.
        string_bytes.push(b);
    }

    Ok(
        // string_bytes is well-formed UTF-8, see other comments.
        unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
    )
}

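// For example, an identifier can start with "a", "-a", "--", "_", an escape
// such as "\31 ", or any non-ASCII byte, but not with "-1" or a lone "-".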
#[inline]
fn is_ident_start(tokenizer: &Tokenizer) -> bool {
    !tokenizer.is_eof()
        && match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => true,
            b'-' => {
                tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
                    b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
                        true
                    }
                    b'\\' => !tokenizer.has_newline_at(1),
                    b => !b.is_ascii(),
                }
            },
            b'\\' => !tokenizer.has_newline_at(1),
            b => !b.is_ascii(),
        }
}

fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    let value = consume_name(tokenizer);
    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
        tokenizer.advance(1);
        if value.eq_ignore_ascii_case("url") {
            consume_unquoted_url(tokenizer).unwrap_or(Function(value))
        } else {
            tokenizer.see_function(&value);
            Function(value)
        }
    } else {
        Ident(value)
    }
}

fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
    // start_pos is the end of the previous token, therefore at a code point boundary
    let start_pos = tokenizer.position();
    let mut value_bytes;
    loop {
        if tokenizer.is_eof() {
            return tokenizer.slice_from(start_pos).into();
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => tokenizer.advance(1),
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary.
                // * So is the current position (which is before '\\' or '\0').
                //
                // So `value_bytes` is well-formed UTF-8.
                value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _b => {
                return tokenizer.slice_from(start_pos).into();
            }
        }
    }

    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-'  => {
                tokenizer.advance(1);
                value_bytes.push(b)  // ASCII
            }
            b'\\' => {
                if tokenizer.has_newline_at(1) { break }
                tokenizer.advance(1);
                // This pushes one well-formed code point
                consume_escape_and_write(tokenizer, &mut value_bytes)
            }
            b'\0' => {
                tokenizer.advance(1);
                value_bytes.extend("\u{FFFD}".as_bytes());
            },
            b'\x80'..=b'\xBF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.consume_continuation_byte();
                value_bytes.push(b)
            }
            b'\xC0'..=b'\xEF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.advance(1);
                value_bytes.push(b)
            }
            b'\xF0'..=b'\xFF' => {
                tokenizer.consume_4byte_intro();
                value_bytes.push(b)
            }
            _ => {
                // ASCII
                break;
            }
        }
    }
    // value_bytes is well-formed UTF-8, see other comments.
    unsafe { from_utf8_release_unchecked(value_bytes) }.into()
}

fn byte_to_hex_digit(b: u8) -> Option<u32> {
    Some(match_byte! { b,
        b'0' ..= b'9' => b - b'0',
        b'a' ..= b'f' => b - b'a' + 10,
        b'A' ..= b'F' => b - b'A' + 10,
        _ => {
            return None
        }
    } as u32)
}

fn byte_to_decimal_digit(b: u8) -> Option<u32> {
    if b.is_ascii_digit() {
        Some((b - b'0') as u32)
    } else {
        None
    }
}

fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    // Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
    // But this is always called so that there is at least one digit in \d*(\.\d+)?

    // Do all the math in f64 so that large numbers overflow to +/-inf
    // and i32::{MIN, MAX} are within range.

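    // For example, "+1.5e2" yields value == 150.0 with has_sign == true and
    // int_value == None (the '.' and the exponent each make it non-integer),
    // while "-42" yields value == -42.0 with int_value == Some(-42).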
    let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
        b'-' => (true, -1.),
        b'+' => (true, 1.),
        _ => (false, 1.),
    };
    if has_sign {
        tokenizer.advance(1);
    }

    let mut integral_part: f64 = 0.;
    while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
        integral_part = integral_part * 10. + digit as f64;
        tokenizer.advance(1);
        if tokenizer.is_eof() {
            break;
        }
    }

    let mut is_integer = true;

    let mut fractional_part: f64 = 0.;
    if tokenizer.has_at_least(1)
        && tokenizer.next_byte_unchecked() == b'.'
        && tokenizer.byte_at(1).is_ascii_digit()
    {
        is_integer = false;
        tokenizer.advance(1); // Consume '.'
        let mut factor = 0.1;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            fractional_part += digit as f64 * factor;
            factor *= 0.1;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
    }

    let mut value = sign * (integral_part + fractional_part);

    if tokenizer.has_at_least(1)
        && matches!(tokenizer.next_byte_unchecked(), b'e' | b'E')
        && (tokenizer.byte_at(1).is_ascii_digit()
            || (tokenizer.has_at_least(2)
                && matches!(tokenizer.byte_at(1), b'+' | b'-')
                && tokenizer.byte_at(2).is_ascii_digit()))
    {
        is_integer = false;
        tokenizer.advance(1);
        let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
            b'-' => (true, -1.),
            b'+' => (true, 1.),
            _ => (false, 1.),
        };
        if has_sign {
            tokenizer.advance(1);
        }
        let mut exponent: f64 = 0.;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            exponent = exponent * 10. + digit as f64;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
        value *= f64::powf(10., sign * exponent);
    }

    let int_value = if is_integer {
        Some(if value >= i32::MAX as f64 {
            i32::MAX
        } else if value <= i32::MIN as f64 {
            i32::MIN
        } else {
            value as i32
        })
    } else {
        None
    };

    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%' {
        tokenizer.advance(1);
        return Percentage {
            unit_value: (value / 100.) as f32,
            int_value,
            has_sign,
        };
    }
    let value = value as f32;
    if is_ident_start(tokenizer) {
        let unit = consume_name(tokenizer);
        Dimension {
            value,
            int_value,
            has_sign,
            unit,
        }
    } else {
        Number {
            value,
            int_value,
            has_sign,
        }
    }
}

#[inline]
unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
    if cfg!(debug_assertions) {
        String::from_utf8(string_bytes).unwrap()
    } else {
        String::from_utf8_unchecked(string_bytes)
    }
}

fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    // This is only called after "url(", so the current position is a code point boundary.
    let start_position = tokenizer.position;
    let from_start = &tokenizer.input[tokenizer.position..];
    let mut newlines = 0;
    let mut last_newline = 0;
    let mut found_printable_char = false;
    let mut iter = from_start.bytes().enumerate();
    loop {
        let (offset, b) = match iter.next() {
            Some(item) => item,
            None => {
                tokenizer.position = tokenizer.input.len();
                break;
            }
        };
        match_byte! { b,
            b' ' | b'\t' => {},
            b'\n' | b'\x0C' => {
                newlines += 1;
                last_newline = offset;
            }
            b'\r' => {
                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                    newlines += 1;
                    last_newline = offset;
                }
            }
            b'"' | b'\'' => return Err(()),  // Do not advance
            b')' => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset + 1;
                break
            }
            _ => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset;
                found_printable_char = true;
                break
            }
        }
    }

    if newlines > 0 {
        tokenizer.current_line_number += newlines;
        // No need for wrapping_add here, because there's no possible
        // way to wrap.
        tokenizer.current_line_start_position = start_position + last_newline + 1;
    }

    if found_printable_char {
        // This function only consumed ASCII (whitespace) bytes,
        // so the current position is a code point boundary.
        return Ok(consume_unquoted_url_internal(tokenizer));
    } else {
        return Ok(UnquotedUrl("".into()));
    }

    fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
        // This function is only called with start_pos at a code point boundary.
        let start_pos = tokenizer.position();
        let mut string_bytes: Vec<u8>;
        loop {
            if tokenizer.is_eof() {
                return UnquotedUrl(tokenizer.slice_from(start_pos).into());
            }
            match_byte! { tokenizer.next_byte_unchecked(),
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    let value = tokenizer.slice_from(start_pos);
                    return consume_url_end(tokenizer, start_pos, value.into())
                }
                b')' => {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return UnquotedUrl(value.into())
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F'  // non-printable
                    | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos)
                },
                b'\\' | b'\0' => {
                    // * The tokenizer’s input is UTF-8 since it’s `&str`.
                    // * start_pos is at a code point boundary.
                    // * So is the current position (which is before '\\' or '\0').
                    //
                    // So `string_bytes` is well-formed UTF-8.
                    string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                    break
                }
                b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
                b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
                _ => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                }
            }
        }
        while !tokenizer.is_eof() {
            let b = tokenizer.next_byte_unchecked();
            match_byte! { b,
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    // string_bytes is well-formed UTF-8, see other comments.
                    let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
                    return consume_url_end(tokenizer, start_pos, string)
                }
                b')' => {
                    tokenizer.advance(1);
                    break;
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F'  // non-printable
                    | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos);
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if tokenizer.has_newline_at(0) {
                        return consume_bad_url(tokenizer, start_pos)
                    }

                    // This pushes one well-formed code point to string_bytes
                    consume_escape_and_write(tokenizer, &mut string_bytes)
                },
                b'\0' => {
                    tokenizer.advance(1);
                    string_bytes.extend("\u{FFFD}".as_bytes());
                }
                b'\x80'..=b'\xBF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_continuation_byte();
                    string_bytes.push(b);
                }
                b'\xF0'..=b'\xFF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_4byte_intro();
                    string_bytes.push(b);
                }
                // If this byte is part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                b => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                    string_bytes.push(b)
                }
            }
        }
        UnquotedUrl(
            // string_bytes is well-formed UTF-8, see other comments.
            unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
        )
    }

    fn consume_url_end<'a>(
        tokenizer: &mut Tokenizer<'a>,
        start_pos: SourcePosition,
        string: CowRcStr<'a>,
    ) -> Token<'a> {
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    tokenizer.advance(1);
                    break
                }
                b' ' | b'\t' => { tokenizer.advance(1); }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                    return consume_bad_url(tokenizer, start_pos);
                }
            }
        }
        UnquotedUrl(string)
    }

    fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
        // Consume up to the closing )
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    let contents = tokenizer.slice_from(start_pos).into();
                    tokenizer.advance(1);
                    return BadUrl(contents)
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                        tokenizer.advance(1); // Skip an escaped ')' or '\'
                    }
                }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                }
            }
        }
        BadUrl(tokenizer.slice_from(start_pos).into())
    }
}

// (value, number of digits up to 6)
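// For example, with remaining input "26A0 foo" this consumes "26A0" and
// returns (0x26A0, 4), leaving the space for the caller to handle.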
fn consume_hex_digits(tokenizer: &mut Tokenizer<'_>) -> (u32, u32) {
    let mut value = 0;
    let mut digits = 0;
    while digits < 6 && !tokenizer.is_eof() {
        match byte_to_hex_digit(tokenizer.next_byte_unchecked()) {
            Some(digit) => {
                value = value * 16 + digit;
                digits += 1;
                tokenizer.advance(1);
            }
            None => break,
        }
    }
    (value, digits)
}

// Same constraints as consume_escape except it writes into `bytes` the result
// instead of returning it.
fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
    bytes.extend(
        consume_escape(tokenizer)
            .encode_utf8(&mut [0; 4])
            .as_bytes(),
    )
}

// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
// and that the next input character has already been verified
// to not be a newline.
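// For example, after the '\' of "\26 B" this consumes "26 " (the terminating
// space is eaten as part of the escape) and returns '\u{26}', i.e. '&'.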
fn consume_escape(tokenizer: &mut Tokenizer) -> char {
    if tokenizer.is_eof() {
        return '\u{FFFD}';
    } // Escaped EOF
    match_byte! { tokenizer.next_byte_unchecked(),
        b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
            let (c, _) = consume_hex_digits(tokenizer);
            if !tokenizer.is_eof() {
                match_byte! { tokenizer.next_byte_unchecked(),
                    b' ' | b'\t' => {
                        tokenizer.advance(1)
                    }
                    b'\n' | b'\x0C' | b'\r' => {
                        tokenizer.consume_newline();
                    }
                    _ => {}
                }
            }
            static REPLACEMENT_CHAR: char = '\u{FFFD}';
            if c != 0 {
                let c = char::from_u32(c);
                c.unwrap_or(REPLACEMENT_CHAR)
            } else {
                REPLACEMENT_CHAR
            }
        },
        b'\0' => {
            tokenizer.advance(1);
            '\u{FFFD}'
        }
        _ => tokenizer.consume_char(),
    }
}
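
#[cfg(test)]
mod sketch_tests {
    //! A minimal sketch of driving the tokenizer directly; the crate's real
    //! test suite lives elsewhere.

    use super::*;

    #[test]
    fn basic_tokens() {
        let mut tokenizer = Tokenizer::new("12px /* hi */ url( a )");
        assert!(matches!(
            tokenizer.next(),
            Ok(Dimension {
                int_value: Some(12),
                ..
            })
        ));
        assert!(matches!(tokenizer.next(), Ok(WhiteSpace(" "))));
        assert!(matches!(tokenizer.next(), Ok(Comment(" hi "))));
        assert!(matches!(tokenizer.next(), Ok(WhiteSpace(" "))));
        assert!(matches!(tokenizer.next(), Ok(UnquotedUrl(ref url)) if &**url == "a"));
        assert!(tokenizer.next().is_err()); // EOF
    }

    #[test]
    fn utf16_columns() {
        // 'é' is two UTF-8 bytes but one UTF-16 unit, so after tokenizing
        // "aé:" the location is line 0, column 4 (one past the ':').
        let mut tokenizer = Tokenizer::new("aé:");
        while tokenizer.next().is_ok() {}
        assert_eq!(
            tokenizer.current_source_location(),
            SourceLocation { line: 0, column: 4 }
        );
    }
}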