// cssparser/tokenizer.rs
1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5// https://drafts.csswg.org/css-syntax/#tokenization
6
7use self::Token::*;
8use crate::cow_rc_str::CowRcStr;
9use crate::parser::{ArbitrarySubstitutionFunctions, ParserState};
10use std::char;
11use std::ops::Range;
12
13#[cfg(feature = "fast_match_byte")]
14pub use crate::match_byte;
15
16/// One of the pieces the CSS input is broken into.
17///
18/// Some components use `Cow` in order to borrow from the original input string
19/// and avoid allocating/copying when possible.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
    /// A [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
    Ident(CowRcStr<'a>),

    /// A [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
    ///
    /// The value does not include the `@` marker.
    AtKeyword(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
    ///
    /// The value does not include the `#` marker.
    Hash(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
    ///
    /// The value does not include the `#` marker.
    IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.

    /// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
    ///
    /// The value does not include the quotes.
    QuotedString(CowRcStr<'a>),

    /// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
    ///
    /// The value does not include the `url(` `)` markers.  Note that `url( <string-token> )` is represented by a
    /// `Function` token.
    UnquotedUrl(CowRcStr<'a>),

    /// A `<delim-token>`
    Delim(char),

    /// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
    Number {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,
    },

    /// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
    Percentage {
        /// Whether the number had a `+` or `-` sign.
        has_sign: bool,

        /// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
        unit_value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        /// It is **not** divided by 100.
        int_value: Option<i32>,
    },

    /// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
    Dimension {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,

        /// The unit, e.g. "px" in `12px`
        unit: CowRcStr<'a>,
    },

    /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
    WhiteSpace(&'a str),

    /// A comment.
    ///
    /// The CSS Syntax spec does not generate tokens for comments,
    /// but we do, because we can (borrowed &str makes it cheap).
    ///
    /// The value does not include the `/*` `*/` markers.
    Comment(&'a str),

    /// A `:` `<colon-token>`
    Colon, // :

    /// A `;` `<semicolon-token>`
    Semicolon, // ;

    /// A `,` `<comma-token>`
    Comma, // ,

    /// A `~=` [`<include-match-token>`](https://drafts.csswg.org/css-syntax/#include-match-token-diagram)
    IncludeMatch,

    /// A `|=` [`<dash-match-token>`](https://drafts.csswg.org/css-syntax/#dash-match-token-diagram)
    DashMatch,

    /// A `^=` [`<prefix-match-token>`](https://drafts.csswg.org/css-syntax/#prefix-match-token-diagram)
    PrefixMatch,

    /// A `$=` [`<suffix-match-token>`](https://drafts.csswg.org/css-syntax/#suffix-match-token-diagram)
    SuffixMatch,

    /// A `*=` [`<substring-match-token>`](https://drafts.csswg.org/css-syntax/#substring-match-token-diagram)
    SubstringMatch,

    /// A `<!--` [`<CDO-token>`](https://drafts.csswg.org/css-syntax/#CDO-token-diagram)
    CDO,

    /// A `-->` [`<CDC-token>`](https://drafts.csswg.org/css-syntax/#CDC-token-diagram)
    CDC,

    /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
    ///
    /// The value (name) does not include the `(` marker.
    Function(CowRcStr<'a>),

    /// A `<(-token>`
    ParenthesisBlock,

    /// A `<[-token>`
    SquareBracketBlock,

    /// A `<{-token>`
    CurlyBracketBlock,

    /// A `<bad-url-token>`
    ///
    /// This token always indicates a parse error.
    BadUrl(CowRcStr<'a>),

    /// A `<bad-string-token>`
    ///
    /// This token always indicates a parse error.
    BadString(CowRcStr<'a>),

    /// A `<)-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseParenthesis,

    /// A `<]-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseSquareBracket,

    /// A `<}-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseCurlyBracket,
}
181
182impl Token<'_> {
183    /// Return whether this token represents a parse error.
184    ///
185    /// `BadUrl` and `BadString` are tokenizer-level parse errors.
186    ///
187    /// `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket` are *unmatched*
188    /// and therefore parse errors when returned by one of the `Parser::next*` methods.
189    pub fn is_parse_error(&self) -> bool {
190        matches!(
191            *self,
192            BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
193        )
194    }
195}
196
/// Low-level streaming tokenizer over a CSS source string.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    // The full input; all tokens borrow from this string when possible.
    input: &'a str,
    /// Counted in bytes, not code points. From 0.
    position: usize,
    /// The position at the start of the current line; but adjusted to
    /// ensure that computing the column will give the result in units
    /// of UTF-16 characters.
    current_line_start_position: usize,
    // 0-based line number of the current position.
    current_line_number: u32,
    // Tracking state for "arbitrary substitution" function names
    // (see `look_for_arbitrary_substitution_functions`).
    arbitrary_substitution_functions: SeenStatus<'a>,
    // URL from a `sourceMappingURL` comment directive, if seen.
    source_map_url: Option<&'a str>,
    // URL from a `sourceURL` comment directive, if seen.
    source_url: Option<&'a str>,
}
211
// Tracking state for arbitrary-substitution-function names.
#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus<'a> {
    // Not tracking; `seen_arbitrary_substitution_functions` returns false.
    DontCare,
    // Tracking: watch for any of these names in `see_function`.
    LookingForThem(ArbitrarySubstitutionFunctions<'a>),
    // At least one tracked name was seen since tracking started.
    SeenAtLeastOne,
}
218
impl<'a> Tokenizer<'a> {
    /// Create a tokenizer positioned at the start of `input`.
    #[inline]
    pub fn new(input: &'a str) -> Self {
        Tokenizer {
            input,
            position: 0,
            current_line_start_position: 0,
            current_line_number: 0,
            arbitrary_substitution_functions: SeenStatus::DontCare,
            source_map_url: None,
            source_url: None,
        }
    }

    /// Begin tracking the given function names; `see_function` flips the
    /// status to `SeenAtLeastOne` when one of them is encountered.
    #[inline]
    pub fn look_for_arbitrary_substitution_functions(
        &mut self,
        fns: ArbitrarySubstitutionFunctions<'a>,
    ) {
        self.arbitrary_substitution_functions = SeenStatus::LookingForThem(fns);
    }

    /// Whether any tracked function name has been seen.
    /// Resets the tracking state to `DontCare` as a side effect.
    #[inline]
    pub fn seen_arbitrary_substitution_functions(&mut self) -> bool {
        let seen = self.arbitrary_substitution_functions == SeenStatus::SeenAtLeastOne;
        self.arbitrary_substitution_functions = SeenStatus::DontCare;
        seen
    }

    /// Called when a function token named `name` is tokenized; records a
    /// match (ASCII case-insensitive) against the tracked names, if any.
    #[inline]
    pub fn see_function(&mut self, name: &str) {
        if let SeenStatus::LookingForThem(fns) = self.arbitrary_substitution_functions {
            if fns.iter().any(|a| name.eq_ignore_ascii_case(a)) {
                self.arbitrary_substitution_functions = SeenStatus::SeenAtLeastOne;
            }
        }
    }

    /// Return the next token, or `Err(())` at end of input.
    #[inline]
    pub fn next(&mut self) -> Result<Token<'a>, ()> {
        next_token(self)
    }

    /// The current position, in UTF-8 bytes from the start of the input.
    #[inline]
    pub fn position(&self) -> SourcePosition {
        debug_assert!(self.input.is_char_boundary(self.position));
        SourcePosition(self.position)
    }

    /// The line/column location of the current position.
    ///
    /// The column is `position - current_line_start_position + 1`;
    /// `current_line_start_position` is adjusted as multi-byte characters
    /// are consumed so that this subtraction counts UTF-16 code units.
    #[inline]
    pub fn current_source_location(&self) -> SourceLocation {
        SourceLocation {
            line: self.current_line_number,
            column: (self.position - self.current_line_start_position + 1) as u32,
        }
    }

    /// The URL from a `sourceMappingURL` comment directive, if any was seen.
    #[inline]
    pub fn current_source_map_url(&self) -> Option<&'a str> {
        self.source_map_url
    }

    /// The URL from a `sourceURL` comment directive, if any was seen.
    #[inline]
    pub fn current_source_url(&self) -> Option<&'a str> {
        self.source_url
    }

    /// Snapshot the tokenizer's position state for a later `reset`.
    #[inline]
    pub fn state(&self) -> ParserState {
        ParserState {
            position: self.position,
            current_line_start_position: self.current_line_start_position,
            current_line_number: self.current_line_number,
            at_start_of: None,
        }
    }

    /// Restore a position previously captured with `state`.
    #[inline]
    pub fn reset(&mut self, state: &ParserState) {
        self.position = state.position;
        self.current_line_start_position = state.current_line_start_position;
        self.current_line_number = state.current_line_number;
    }

    /// Slice of the input from `start_pos` to the current position.
    #[inline]
    pub(crate) fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
        self.slice(start_pos..self.position())
    }

    /// Slice of the input between the two given positions.
    #[inline]
    pub(crate) fn slice(&self, range: Range<SourcePosition>) -> &'a str {
        debug_assert!(self.input.is_char_boundary(range.start.0));
        debug_assert!(self.input.is_char_boundary(range.end.0));
        // SAFETY: `SourcePosition` values originate from this tokenizer's
        // `position()`, so they are within bounds, and the asserts above
        // check both ends are on char boundaries.
        unsafe { self.input.get_unchecked(range.start.0..range.end.0) }
    }

    /// The full text of the line containing the current position,
    /// not including the line terminator.
    pub fn current_source_line(&self) -> &'a str {
        let current = self.position();
        // Scan backwards for the previous newline (or start of input)...
        let start = self
            .slice(SourcePosition(0)..current)
            .rfind(['\r', '\n', '\x0C'])
            .map_or(0, |start| start + 1);
        // ...and forwards for the next one (or end of input).
        let end = self
            .slice(current..SourcePosition(self.input.len()))
            .find(['\r', '\n', '\x0C'])
            .map_or(self.input.len(), |end| current.0 + end);
        self.slice(SourcePosition(start)..SourcePosition(end))
    }

    /// The byte at the current position, or `None` at end of input.
    #[inline]
    pub fn next_byte(&self) -> Option<u8> {
        if self.is_eof() {
            None
        } else {
            Some(self.input.as_bytes()[self.position])
        }
    }

    // If false, `tokenizer.next_char()` will not panic.
    #[inline]
    fn is_eof(&self) -> bool {
        !self.has_at_least(0)
    }

    // If true, the input has at least `n` bytes left *after* the current one.
    // That is, `tokenizer.char_at(n)` will not panic.
    #[inline]
    fn has_at_least(&self, n: usize) -> bool {
        self.position + n < self.input.len()
    }

    // Advance over N bytes in the input.  This function can advance
    // over ASCII bytes (excluding newlines), or UTF-8 sequence
    // leaders (excluding leaders for 4-byte sequences).
    //
    // Newlines must go through `consume_newline`, and continuation or
    // 4-byte-leader bytes through the dedicated methods below, so the
    // line/column bookkeeping stays correct.
    #[inline]
    pub fn advance(&mut self, n: usize) {
        if cfg!(debug_assertions) {
            // Each byte must either be an ASCII byte or a sequence
            // leader, but not a 4-byte leader; also newlines are
            // rejected.
            for i in 0..n {
                let b = self.byte_at(i);
                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
            }
        }
        self.position += n
    }

    // Assumes non-EOF
    #[inline]
    fn next_byte_unchecked(&self) -> u8 {
        self.byte_at(0)
    }

    // The byte `offset` bytes past the current position.
    // Panics if out of bounds.
    #[inline]
    fn byte_at(&self, offset: usize) -> u8 {
        self.input.as_bytes()[self.position + offset]
    }

    // Advance over a single byte; the byte must be a UTF-8 sequence
    // leader for a 4-byte sequence.
    #[inline]
    fn consume_4byte_intro(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
        // This takes two UTF-16 characters to represent, so we
        // actually have an undercount.
        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        self.position += 1;
    }

    // Advance over a single byte; the byte must be a UTF-8
    // continuation byte.
    #[inline]
    fn consume_continuation_byte(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
        // Continuation bytes contribute to column overcount.  Note
        // that due to the special case for the 4-byte sequence intro,
        // we must use wrapping add here.
        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        self.position += 1;
    }

    // Advance over any kind of byte, excluding newlines.
    #[inline(never)]
    fn consume_known_byte(&mut self, byte: u8) {
        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
        self.position += 1;
        // Continuation bytes contribute to column overcount.
        if byte & 0xF0 == 0xF0 {
            // This takes two UTF-16 characters to represent, so we
            // actually have an undercount.
            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        } else if byte & 0xC0 == 0x80 {
            // Note that due to the special case for the 4-byte
            // sequence intro, we must use wrapping add here.
            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        }
    }

    // The char at the current position. Assumes non-EOF.
    #[inline]
    fn next_char(&self) -> char {
        // SAFETY: `position()` debug-asserts the position is a char
        // boundary, and it is always <= input.len().
        unsafe { self.input.get_unchecked(self.position().0..) }
            .chars()
            .next()
            .unwrap()
    }

    // Given that a newline has been seen, advance over the newline
    // and update the state.
    #[inline]
    fn consume_newline(&mut self) {
        let byte = self.next_byte_unchecked();
        debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
        self.position += 1;
        // A CRLF pair counts as a single newline.
        if byte == b'\r' && self.next_byte() == Some(b'\n') {
            self.position += 1;
        }
        self.current_line_start_position = self.position;
        self.current_line_number += 1;
    }

    // Whether the byte at `offset` past the current position exists and
    // is a newline (CR, LF, or FF).
    #[inline]
    fn has_newline_at(&self, offset: usize) -> bool {
        self.position + offset < self.input.len()
            && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
    }

    // Advance over the char at the current position and return it.
    // Assumes non-EOF.
    #[inline]
    fn consume_char(&mut self) -> char {
        let c = self.next_char();
        let len_utf8 = c.len_utf8();
        self.position += len_utf8;
        // Note that due to the special case for the 4-byte sequence
        // intro, we must use wrapping add here.
        self.current_line_start_position = self
            .current_line_start_position
            .wrapping_add(len_utf8 - c.len_utf16());
        c
    }

    // Whether the input at the current position starts with `needle`.
    #[inline]
    fn starts_with(&self, needle: &[u8]) -> bool {
        self.input.as_bytes()[self.position..].starts_with(needle)
    }

    /// Skip whitespace (including newlines) and comments.
    pub fn skip_whitespace(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                _ => return,
            }
        }
    }

    /// Skip whitespace, comments, and the `<!--` / `-->` markers that may
    /// appear between top-level rules.
    pub fn skip_cdc_and_cdo(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                b'<' => {
                    if self.starts_with(b"<!--") {
                        self.advance(4)
                    } else {
                        return
                    }
                }
                b'-' => {
                    if self.starts_with(b"-->") {
                        self.advance(3)
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }
}
523
/// A position from the start of the input, counted in UTF-8 bytes.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);

#[cfg(feature = "malloc_size_of")]
malloc_size_of::malloc_size_of_is_0!(SourcePosition);

impl SourcePosition {
    /// Returns the current byte index in the original input.
    #[inline]
    pub fn byte_index(&self) -> usize {
        let SourcePosition(byte_index) = *self;
        byte_index
    }
}
538
/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy, Default)]
pub struct SourceLocation {
    /// The line number, starting at 0 for the first line.
    pub line: u32,

    /// The column number within a line, starting at 1 for the first character of the line.
    /// Column numbers are counted in UTF-16 code units.
    pub column: u32,
}

#[cfg(feature = "malloc_size_of")]
malloc_size_of::malloc_size_of_is_0!(SourceLocation);
552
/// Produce a single token starting at the current position, following
/// https://drafts.csswg.org/css-syntax/#consume-token.
///
/// Returns `Err(())` at end of input. Dispatches on the first byte; the
/// catch-all arm treats any non-ASCII leading byte as an ident start.
fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    if tokenizer.is_eof() {
        return Err(());
    }
    let b = tokenizer.next_byte_unchecked();
    let token = match_byte! { b,
        b' ' | b'\t' => {
            consume_whitespace(tokenizer, false)
        },
        b'\n' | b'\x0C' | b'\r' => consume_whitespace(tokenizer, true),
        b'"' => consume_string(tokenizer, false),
        b'#' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
            else if !tokenizer.is_eof() &&
                matches!(tokenizer.next_byte_unchecked(), b'0'..=b'9' | b'-') {
                // Any other valid case here already resulted in IDHash.
                Hash(consume_name(tokenizer))
            }
            else { Delim('#') }
        },
        b'$' => {
            if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
            else { tokenizer.advance(1); Delim('$') }
        },
        b'\'' => consume_string(tokenizer, true),
        b'(' => { tokenizer.advance(1); ParenthesisBlock },
        b')' => { tokenizer.advance(1); CloseParenthesis },
        b'*' => {
            if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
            else { tokenizer.advance(1); Delim('*') }
        },
        b'+' => {
            // `+` starts a number only when followed by a digit,
            // or by `.` and then a digit (e.g. `+.5`).
            if (
                tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit()
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && tokenizer.byte_at(2).is_ascii_digit()
            ) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('+')
            }
        },
        b',' => { tokenizer.advance(1); Comma },
        b'-' => {
            // `-` may start a number (`-5`, `-.5`), the CDC marker
            // (`-->`), or an identifier (`-foo`, `--custom`).
            if (
                tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit()
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && tokenizer.byte_at(2).is_ascii_digit()
            ) {
                consume_numeric(tokenizer)
            } else if tokenizer.starts_with(b"-->") {
                tokenizer.advance(3);
                CDC
            } else if is_ident_start(tokenizer) {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('-')
            }
        },
        b'.' => {
            if tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit() {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('.')
            }
        }
        b'/' => {
            if tokenizer.starts_with(b"/*") {
                Comment(consume_comment(tokenizer))
            } else {
                tokenizer.advance(1);
                Delim('/')
            }
        }
        b'0'..=b'9' => consume_numeric(tokenizer),
        b':' => { tokenizer.advance(1); Colon },
        b';' => { tokenizer.advance(1); Semicolon },
        b'<' => {
            if tokenizer.starts_with(b"<!--") {
                tokenizer.advance(4);
                CDO
            } else {
                tokenizer.advance(1);
                Delim('<')
            }
        },
        b'@' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
            else { Delim('@') }
        },
        b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => consume_ident_like(tokenizer),
        b'[' => { tokenizer.advance(1); SquareBracketBlock },
        b'\\' => {
            // A backslash starts an identifier unless it escapes a newline.
            if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
            else { tokenizer.advance(1); Delim('\\') }
        },
        b']' => { tokenizer.advance(1); CloseSquareBracket },
        b'^' => {
            if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
            else { tokenizer.advance(1); Delim('^') }
        },
        b'{' => { tokenizer.advance(1); CurlyBracketBlock },
        b'|' => {
            if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
            else { tokenizer.advance(1); Delim('|') }
        },
        b'}' => { tokenizer.advance(1); CloseCurlyBracket },
        b'~' => {
            if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
            else { tokenizer.advance(1); Delim('~') }
        },
        _ => {
            if !b.is_ascii() {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(b as char)
            }
        },
    };
    Ok(token)
}
687
/// Consume a run of whitespace starting at the current byte and return
/// a `WhiteSpace` token borrowing the consumed text.
///
/// `newline` indicates whether the first byte is a newline, which must
/// go through `consume_newline` for line/column bookkeeping.
fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
    let start_position = tokenizer.position();
    if newline {
        tokenizer.consume_newline();
    } else {
        tokenizer.advance(1);
    }
    // Keep consuming spaces, tabs, and newlines until a non-whitespace byte.
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b' ' | b'\t' => {
                tokenizer.advance(1);
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            _ => {
                break
            }
        }
    }
    WhiteSpace(tokenizer.slice_from(start_position))
}
711
712// Check for sourceMappingURL or sourceURL comments and update the
713// tokenizer appropriately.
714fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) {
715    let directive = "# sourceMappingURL=";
716    let directive_old = "@ sourceMappingURL=";
717
718    // If there is a source map directive, extract the URL.
719    if contents.starts_with(directive) || contents.starts_with(directive_old) {
720        let contents = &contents[directive.len()..];
721        tokenizer.source_map_url = contents.split([' ', '\t', '\x0C', '\r', '\n']).next();
722    }
723
724    let directive = "# sourceURL=";
725    let directive_old = "@ sourceURL=";
726
727    // If there is a source map directive, extract the URL.
728    if contents.starts_with(directive) || contents.starts_with(directive_old) {
729        let contents = &contents[directive.len()..];
730        tokenizer.source_url = contents.split([' ', '\t', '\x0C', '\r', '\n']).next()
731    }
732}
733
/// Consume a comment, assuming the tokenizer is positioned at `/*`.
///
/// Returns the comment contents without the `/*` `*/` markers. An
/// unterminated comment runs to the end of the input. In both cases the
/// contents are checked for sourceMappingURL/sourceURL directives.
fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
    tokenizer.advance(2); // consume "/*"
    let start_position = tokenizer.position();
    while !tokenizer.is_eof() {
        match_byte! { tokenizer.next_byte_unchecked(),
            b'*' => {
                // Remember where contents end in case this is `*/`.
                let end_position = tokenizer.position();
                tokenizer.advance(1);
                if tokenizer.next_byte() == Some(b'/') {
                    tokenizer.advance(1);
                    let contents = tokenizer.slice(start_position..end_position);
                    check_for_source_map(tokenizer, contents);
                    return contents
                }
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            // Multi-byte characters need the dedicated advance methods
            // to keep the UTF-16 column bookkeeping correct.
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }
    // Unterminated comment: everything up to EOF is the contents.
    let contents = tokenizer.slice_from(start_position);
    check_for_source_map(tokenizer, contents);
    contents
}
764
765fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
766    match consume_quoted_string(tokenizer, single_quote) {
767        Ok(value) => QuotedString(value),
768        Err(value) => BadString(value),
769    }
770}
771
/// Consume the body of a quoted string, assuming the tokenizer is at the
/// opening quote. `single_quote` selects `'` vs `"` as the terminator.
///
/// Returns `Err(partial_string)` on syntax error (i.e. an unescaped
/// newline); the error value carries the string consumed so far.
///
/// Fast path: while no escape or NUL is seen, the result borrows from the
/// input. On the first `\` or `\0`, the bytes so far are copied into
/// `string_bytes` and the slow path builds an owned string.
fn consume_quoted_string<'a>(
    tokenizer: &mut Tokenizer<'a>,
    single_quote: bool,
) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
    tokenizer.advance(1); // Skip the initial quote
                          // start_pos is at code point boundary, after " or '
    let start_pos = tokenizer.position();
    let mut string_bytes;
    // Fast path: borrow directly from the input until we hit an escape,
    // a NUL, a terminator, a newline, or EOF.
    loop {
        if tokenizer.is_eof() {
            // EOF terminates the string without error.
            return Ok(tokenizer.slice_from(start_pos).into());
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'"' => {
                if !single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\'' => {
                if single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0'
                //
                // So `string_bytes` is well-formed UTF-8.
                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\n' | b'\r' | b'\x0C' => {
                // Unescaped newline: syntax error; return what we have.
                return Err(tokenizer.slice_from(start_pos).into())
            },
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }

    // Slow path: keep appending to `string_bytes`, resolving escapes and
    // replacing NUL with U+FFFD as we go.
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'\n' | b'\r' | b'\x0C' => {
                return Err(
                    // string_bytes is well-formed UTF-8, see other comments.
                    unsafe {
                        from_utf8_release_unchecked(string_bytes)
                    }.into()
                );
            }
            b'"' => {
                tokenizer.advance(1);
                if !single_quote {
                    break;
                }
            }
            b'\'' => {
                tokenizer.advance(1);
                if single_quote {
                    break;
                }
            }
            b'\\' => {
                tokenizer.advance(1);
                if !tokenizer.is_eof() {
                    match tokenizer.next_byte_unchecked() {
                        // Escaped newline
                        b'\n' | b'\x0C' | b'\r' => {
                            tokenizer.consume_newline();
                        }
                        // This pushes one well-formed code point
                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                    }
                }
                // else: escaped EOF, do nothing.
                continue;
            }
            b'\0' => {
                tokenizer.advance(1);
                string_bytes.extend("\u{FFFD}".as_bytes());
                continue;
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            },
        }

        // If this byte is part of a multi-byte code point,
        // we’ll end up copying the whole code point before this loop does something else.
        string_bytes.push(b);
    }

    Ok(
        // string_bytes is well-formed UTF-8, see other comments.
        unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
    )
}
884
/// Returns whether the input at the current position would start an
/// [ident sequence](https://drafts.csswg.org/css-syntax/#would-start-an-identifier).
///
/// Pure lookahead: consumes nothing.
#[inline]
fn is_ident_start(tokenizer: &Tokenizer) -> bool {
    !tokenizer.is_eof()
        && match_byte! { tokenizer.next_byte_unchecked(),
            // Name-start code points. NUL is accepted because the tokenizer
            // later replaces U+0000 with U+FFFD, which is a valid name code point.
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => true,
            b'-' => {
                // '-' starts an ident only when followed by a name-start
                // code point, another '-', or a valid (non-newline) escape.
                tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
                    b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
                        true
                    }
                    b'\\' => !tokenizer.has_newline_at(1),
                    // Any non-ASCII byte begins a non-ASCII code point,
                    // which is always a name-start code point.
                    b => !b.is_ascii(),
                }
            },
            // A backslash starts an ident unless it escapes a newline.
            b'\\' => !tokenizer.has_newline_at(1),
            b => !b.is_ascii(),
        }
}
903
904fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
905    let value = consume_name(tokenizer);
906    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
907        tokenizer.advance(1);
908        if value.eq_ignore_ascii_case("url") {
909            consume_unquoted_url(tokenizer).unwrap_or(Function(value))
910        } else {
911            tokenizer.see_function(&value);
912            Function(value)
913        }
914    } else {
915        Ident(value)
916    }
917}
918
/// Consumes a [name](https://drafts.csswg.org/css-syntax/#consume-name)
/// starting at the current position and returns it.
///
/// Fast path: while no escape (`\`) or NUL is seen, nothing is copied and
/// the result borrows directly from the input. On the first `\` or `\0`
/// the bytes so far are copied into `value_bytes` and the slow path below
/// decodes escapes / replaces NUL while appending.
fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
    // start_pos is the end of the previous token, therefore at a code point boundary
    let start_pos = tokenizer.position();
    let mut value_bytes;
    loop {
        if tokenizer.is_eof() {
            return tokenizer.slice_from(start_pos).into();
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => tokenizer.advance(1),
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0'
                //
                // So `value_bytes` is well-formed UTF-8.
                value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            // Multi-byte UTF-8 sequences: advance byte by byte, with the
            // position-tracking variants where the Tokenizer requires them.
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _b => {
                // Any other ASCII byte ends the name; return the borrowed slice.
                return tokenizer.slice_from(start_pos).into();
            }
        }
    }

    // Slow path: build the name in `value_bytes`, decoding escapes and
    // replacing NUL with U+FFFD as required by the spec.
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-'  => {
                tokenizer.advance(1);
                value_bytes.push(b)  // ASCII
            }
            b'\\' => {
                // A backslash before a newline is not part of the name.
                if tokenizer.has_newline_at(1) { break }
                tokenizer.advance(1);
                // This pushes one well-formed code point
                consume_escape_and_write(tokenizer, &mut value_bytes)
            }
            b'\0' => {
                tokenizer.advance(1);
                value_bytes.extend("\u{FFFD}".as_bytes());
            },
            b'\x80'..=b'\xBF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.consume_continuation_byte();
                value_bytes.push(b)
            }
            b'\xC0'..=b'\xEF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.advance(1);
                value_bytes.push(b)
            }
            b'\xF0'..=b'\xFF' => {
                tokenizer.consume_4byte_intro();
                value_bytes.push(b)
            }
            _ => {
                // ASCII
                break;
            }
        }
    }
    // string_bytes is well-formed UTF-8, see other comments.
    unsafe { from_utf8_release_unchecked(value_bytes) }.into()
}
989
/// Converts an ASCII hex digit (`0-9`, `a-f`, `A-F`) to its numeric
/// value (0–15), or returns `None` for any other byte.
fn byte_to_hex_digit(b: u8) -> Option<u32> {
    // `char::from(b)` maps the byte to U+0000..=U+00FF; `to_digit(16)`
    // accepts exactly the ASCII hex digits in that range.
    char::from(b).to_digit(16)
}
1000
/// Converts an ASCII decimal digit (`0-9`) to its numeric value,
/// or returns `None` for any other byte.
fn byte_to_decimal_digit(b: u8) -> Option<u32> {
    char::from(b).to_digit(10)
}
1008
/// Consumes a [numeric token](https://drafts.csswg.org/css-syntax/#consume-numeric-token):
/// a `Number`, `Percentage` (trailing `%`), or `Dimension` (trailing ident).
///
/// Only called when the input is already known to start a number, so at
/// least one digit is present in the `\d*(\.\d+)?` part.
fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    // Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
    // But this is always called so that there is at least one digit in \d*(\.\d+)?

    // Do all the math in f64 so that large numbers overflow to +/-inf
    // and i32::{MIN, MAX} are within range.

    let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
        b'-' => (true, -1.),
        b'+' => (true, 1.),
        _ => (false, 1.),
    };
    if has_sign {
        tokenizer.advance(1);
    }

    let mut integral_part: f64 = 0.;
    while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
        integral_part = integral_part * 10. + digit as f64;
        tokenizer.advance(1);
        if tokenizer.is_eof() {
            break;
        }
    }

    let mut is_integer = true;

    // Fractional part: only consumed when '.' is directly followed by a
    // digit; a bare '.' belongs to the next token.
    let mut fractional_part: f64 = 0.;
    if tokenizer.has_at_least(1)
        && tokenizer.next_byte_unchecked() == b'.'
        && tokenizer.byte_at(1).is_ascii_digit()
    {
        is_integer = false;
        tokenizer.advance(1); // Consume '.'
        let mut factor = 0.1;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            fractional_part += digit as f64 * factor;
            factor *= 0.1;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
    }

    let mut value = sign * (integral_part + fractional_part);

    // Scientific notation: 'e'/'E' counts only when followed by a digit,
    // or by a sign and then a digit; otherwise it starts a dimension unit
    // (e.g. `2em`).
    if tokenizer.has_at_least(1)
        && matches!(tokenizer.next_byte_unchecked(), b'e' | b'E')
        && (tokenizer.byte_at(1).is_ascii_digit()
            || (tokenizer.has_at_least(2)
                && matches!(tokenizer.byte_at(1), b'+' | b'-')
                && tokenizer.byte_at(2).is_ascii_digit()))
    {
        is_integer = false;
        tokenizer.advance(1);
        let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
            b'-' => (true, -1.),
            b'+' => (true, 1.),
            _ => (false, 1.),
        };
        if has_sign {
            tokenizer.advance(1);
        }
        let mut exponent: f64 = 0.;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            exponent = exponent * 10. + digit as f64;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
        value *= f64::powf(10., sign * exponent);
    }

    // Integer values saturate at the i32 range rather than wrapping
    // (+/-inf from the f64 math also lands here).
    let int_value = if is_integer {
        Some(if value >= i32::MAX as f64 {
            i32::MAX
        } else if value <= i32::MIN as f64 {
            i32::MIN
        } else {
            value as i32
        })
    } else {
        None
    };

    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%' {
        tokenizer.advance(1);
        return Percentage {
            unit_value: (value / 100.) as f32,
            int_value,
            has_sign,
        };
    }
    let value = value as f32;
    // A number immediately followed by an ident start is a dimension,
    // e.g. `12px`; otherwise it is a plain number.
    if is_ident_start(tokenizer) {
        let unit = consume_name(tokenizer);
        Dimension {
            value,
            int_value,
            has_sign,
            unit,
        }
    } else {
        Number {
            value,
            int_value,
            has_sign,
        }
    }
}
1121
/// Converts `string_bytes` into a `String`, skipping UTF-8 validation in
/// release builds.
///
/// # Safety
///
/// The caller must guarantee that `string_bytes` is well-formed UTF-8.
/// Debug builds still run the checked conversion and panic on a
/// violation; release builds perform no check at all.
#[inline]
unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
    if !cfg!(debug_assertions) {
        // SAFETY: well-formed UTF-8 is guaranteed by the caller (see above).
        String::from_utf8_unchecked(string_bytes)
    } else {
        String::from_utf8(string_bytes).unwrap()
    }
}
1130
/// Attempts to consume an [unquoted URL](https://drafts.csswg.org/css-syntax/#consume-url-token)
/// after `url(` has been consumed.
///
/// Returns `Err(())` — without advancing — when a quote is found first,
/// so the caller can fall back to parsing `url( <string> )` as a
/// `Function` token. Otherwise returns an `UnquotedUrl` or `BadUrl` token.
fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    // This is only called after "url(", so the current position is a code point boundary.
    let start_position = tokenizer.position;
    let from_start = &tokenizer.input[tokenizer.position..];
    let mut newlines = 0;
    let mut last_newline = 0;
    let mut found_printable_char = false;
    let mut iter = from_start.bytes().enumerate();
    // First pass: skip leading whitespace, counting newlines manually so
    // the line bookkeeping can be fixed up in one step afterwards.
    loop {
        let (offset, b) = match iter.next() {
            Some(item) => item,
            None => {
                tokenizer.position = tokenizer.input.len();
                break;
            }
        };
        match_byte! { b,
            b' ' | b'\t' => {},
            b'\n' | b'\x0C' => {
                newlines += 1;
                last_newline = offset;
            }
            b'\r' => {
                // "\r\n" counts as a single newline; only the '\n' is tallied.
                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                    newlines += 1;
                    last_newline = offset;
                }
            }
            b'"' | b'\'' => return Err(()),  // Do not advance
            b')' => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset + 1;
                break
            }
            _ => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset;
                found_printable_char = true;
                break
            }
        }
    }

    // Apply the newline count from the whitespace skipped above.
    if newlines > 0 {
        tokenizer.current_line_number += newlines;
        // No need for wrapping_add here, because there's no possible
        // way to wrap.
        tokenizer.current_line_start_position = start_position + last_newline + 1;
    }

    if found_printable_char {
        // This function only consumed ASCII (whitespace) bytes,
        // so the current position is a code point boundary.
        return Ok(consume_unquoted_url_internal(tokenizer));
    } else {
        // Only whitespace (and possibly ')') before EOF: empty URL.
        return Ok(UnquotedUrl("".into()));
    }

    /// Consumes the URL body proper, starting at the first printable
    /// character. Borrows from the input until an escape or NUL forces a copy.
    fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
        // This function is only called with start_pos at a code point boundary.
        let start_pos = tokenizer.position();
        let mut string_bytes: Vec<u8>;
        loop {
            if tokenizer.is_eof() {
                return UnquotedUrl(tokenizer.slice_from(start_pos).into());
            }
            match_byte! { tokenizer.next_byte_unchecked(),
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    let value = tokenizer.slice_from(start_pos);
                    return consume_url_end(tokenizer, start_pos, value.into())
                }
                b')' => {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return UnquotedUrl(value.into())
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F'  // non-printable
                    | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos)
                },
                b'\\' | b'\0' => {
                    // * The tokenizer’s input is UTF-8 since it’s `&str`.
                    // * start_pos is at a code point boundary
                    // * so is the current position (which is before '\\' or '\0'
                    //
                    // So `string_bytes` is well-formed UTF-8.
                    string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                    break
                }
                b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
                b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
                _ => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                }
            }
        }
        // Slow path: build the URL in `string_bytes`, decoding escapes
        // and replacing NUL with U+FFFD.
        while !tokenizer.is_eof() {
            let b = tokenizer.next_byte_unchecked();
            match_byte! { b,
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    // string_bytes is well-formed UTF-8, see other comments.
                    let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
                    return consume_url_end(tokenizer, start_pos, string)
                }
                b')' => {
                    tokenizer.advance(1);
                    break;
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F'  // non-printable
                    | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos);
                }
                b'\\' => {
                    tokenizer.advance(1);
                    // An escaped newline inside a URL makes it a bad-url token.
                    if tokenizer.has_newline_at(0) {
                        return consume_bad_url(tokenizer, start_pos)
                    }

                    // This pushes one well-formed code point to string_bytes
                    consume_escape_and_write(tokenizer, &mut string_bytes)
                },
                b'\0' => {
                    tokenizer.advance(1);
                    string_bytes.extend("\u{FFFD}".as_bytes());
                }
                b'\x80'..=b'\xBF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_continuation_byte();
                    string_bytes.push(b);
                }
                b'\xF0'..=b'\xFF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_4byte_intro();
                    string_bytes.push(b);
                }
                // If this byte is part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                b => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                    string_bytes.push(b)
                }
            }
        }
        UnquotedUrl(
            // string_bytes is well-formed UTF-8, see other comments.
            unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
        )
    }

    /// After the URL body, consumes trailing whitespace up to the closing
    /// `)`. Any other character makes the whole token a bad-url.
    fn consume_url_end<'a>(
        tokenizer: &mut Tokenizer<'a>,
        start_pos: SourcePosition,
        string: CowRcStr<'a>,
    ) -> Token<'a> {
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    tokenizer.advance(1);
                    break
                }
                b' ' | b'\t' => { tokenizer.advance(1); }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                    return consume_bad_url(tokenizer, start_pos);
                }
            }
        }
        UnquotedUrl(string)
    }

    /// Recovers from an invalid URL by consuming the
    /// [remnants of a bad url](https://drafts.csswg.org/css-syntax/#consume-remnants-of-bad-url)
    /// up to (and including) the closing `)`, honoring escaped `)` / `\`.
    fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
        // Consume up to the closing )
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    let contents = tokenizer.slice_from(start_pos).into();
                    tokenizer.advance(1);
                    return BadUrl(contents)
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                        tokenizer.advance(1); // Skip an escaped ')' or '\'
                    }
                }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                }
            }
        }
        BadUrl(tokenizer.slice_from(start_pos).into())
    }
}
1338
1339// (value, number of digits up to 6)
1340fn consume_hex_digits(tokenizer: &mut Tokenizer<'_>) -> (u32, u32) {
1341    let mut value = 0;
1342    let mut digits = 0;
1343    while digits < 6 && !tokenizer.is_eof() {
1344        match byte_to_hex_digit(tokenizer.next_byte_unchecked()) {
1345            Some(digit) => {
1346                value = value * 16 + digit;
1347                digits += 1;
1348                tokenizer.advance(1);
1349            }
1350            None => break,
1351        }
1352    }
1353    (value, digits)
1354}
1355
1356// Same constraints as consume_escape except it writes into `bytes` the result
1357// instead of returning it.
1358fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
1359    bytes.extend(
1360        consume_escape(tokenizer)
1361            .encode_utf8(&mut [0; 4])
1362            .as_bytes(),
1363    )
1364}
1365
// Consumes an [escaped code point](https://drafts.csswg.org/css-syntax/#consume-escaped-code-point).
//
// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
// and that the next input character has already been verified
// to not be a newline.
//
// Returns U+FFFD REPLACEMENT CHARACTER for an escaped EOF, an escaped
// NUL, or a hex escape that is zero, a surrogate, or out of Unicode range.
fn consume_escape(tokenizer: &mut Tokenizer) -> char {
    if tokenizer.is_eof() {
        return '\u{FFFD}';
    } // Escaped EOF
    match_byte! { tokenizer.next_byte_unchecked(),
        b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
            // Hex escape: up to six hex digits, then one optional
            // whitespace character is consumed as part of the escape.
            let (c, _) = consume_hex_digits(tokenizer);
            if !tokenizer.is_eof() {
                match_byte! { tokenizer.next_byte_unchecked(),
                    b' ' | b'\t' => {
                        tokenizer.advance(1)
                    }
                    b'\n' | b'\x0C' | b'\r' => {
                        tokenizer.consume_newline();
                    }
                    _ => {}
                }
            }
            static REPLACEMENT_CHAR: char = '\u{FFFD}';
            if c != 0 {
                // char::from_u32 is None for surrogates and values
                // above U+10FFFF; those also become U+FFFD.
                let c = char::from_u32(c);
                c.unwrap_or(REPLACEMENT_CHAR)
            } else {
                REPLACEMENT_CHAR
            }
        },
        b'\0' => {
            tokenizer.advance(1);
            '\u{FFFD}'
        }
        // Any other character escapes to itself.
        _ => tokenizer.consume_char(),
    }
}