// cssparser/tokenizer.rs

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// https://drafts.csswg.org/css-syntax/#tokenization

use self::Token::*;
use crate::cow_rc_str::CowRcStr;
use crate::parser::ParserState;
use std::char;
use std::ops::Range;

#[cfg(not(feature = "dummy_match_byte"))]
use cssparser_macros::match_byte;

#[cfg(feature = "dummy_match_byte")]
macro_rules! match_byte {
    ($value:expr, $($rest:tt)* ) => {
        match $value {
            $(
                $rest
            )+
        }
    };
}

/// One of the pieces the CSS input is broken into.
///
/// Some components use `CowRcStr` in order to borrow from the original input string
/// and avoid allocating/copying when possible.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
    /// An [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
    Ident(CowRcStr<'a>),

    /// An [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
    ///
    /// The value does not include the `@` marker.
    AtKeyword(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
    ///
    /// The value does not include the `#` marker.
    Hash(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
    ///
    /// The value does not include the `#` marker.
    IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.

    /// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
    ///
    /// The value does not include the quotes.
    QuotedString(CowRcStr<'a>),

    /// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
    ///
    /// The value does not include the `url(` `)` markers. Note that `url( <string-token> )` is represented by a
    /// `Function` token.
    UnquotedUrl(CowRcStr<'a>),

    /// A `<delim-token>`
    Delim(char),

    /// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
    Number {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases, like the `<An+B>` micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,
    },

    /// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
    Percentage {
        /// Whether the number had a `+` or `-` sign.
        has_sign: bool,

        /// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
        unit_value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        /// It is **not** divided by 100.
        int_value: Option<i32>,
    },

    /// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
    Dimension {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases, like the `<An+B>` micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,

        /// The unit, e.g. "px" in `12px`
        unit: CowRcStr<'a>,
    },

    /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
    WhiteSpace(&'a str),

    /// A comment.
    ///
    /// The CSS Syntax spec does not generate tokens for comments,
    /// but we do, because we can (borrowed `&str` makes it cheap).
    ///
    /// The value does not include the `/*` `*/` markers.
    Comment(&'a str),

    /// A `:` `<colon-token>`
    Colon, // :

    /// A `;` `<semicolon-token>`
    Semicolon, // ;

    /// A `,` `<comma-token>`
    Comma, // ,

    /// A `~=` [`<include-match-token>`](https://drafts.csswg.org/css-syntax/#include-match-token-diagram)
    IncludeMatch,

    /// A `|=` [`<dash-match-token>`](https://drafts.csswg.org/css-syntax/#dash-match-token-diagram)
    DashMatch,

    /// A `^=` [`<prefix-match-token>`](https://drafts.csswg.org/css-syntax/#prefix-match-token-diagram)
    PrefixMatch,

    /// A `$=` [`<suffix-match-token>`](https://drafts.csswg.org/css-syntax/#suffix-match-token-diagram)
    SuffixMatch,

    /// A `*=` [`<substring-match-token>`](https://drafts.csswg.org/css-syntax/#substring-match-token-diagram)
    SubstringMatch,

    /// A `<!--` [`<CDO-token>`](https://drafts.csswg.org/css-syntax/#CDO-token-diagram)
    CDO,

    /// A `-->` [`<CDC-token>`](https://drafts.csswg.org/css-syntax/#CDC-token-diagram)
    CDC,

    /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
    ///
    /// The value (name) does not include the `(` marker.
    Function(CowRcStr<'a>),

    /// A `<(-token>`
    ParenthesisBlock,

    /// A `<[-token>`
    SquareBracketBlock,

    /// A `<{-token>`
    CurlyBracketBlock,

    /// A `<bad-url-token>`
    ///
    /// This token always indicates a parse error.
    BadUrl(CowRcStr<'a>),

    /// A `<bad-string-token>`
    ///
    /// This token always indicates a parse error.
    BadString(CowRcStr<'a>),

    /// A `<)-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseParenthesis,

    /// A `<]-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseSquareBracket,

    /// A `<}-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseCurlyBracket,
}

impl Token<'_> {
    /// Return whether this token represents a parse error.
    ///
    /// `BadUrl` and `BadString` are tokenizer-level parse errors.
    ///
    /// `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket` are *unmatched*
    /// and therefore parse errors when returned by one of the `Parser::next*` methods.
    pub fn is_parse_error(&self) -> bool {
        matches!(
            *self,
            BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
        )
    }
}
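
// A minimal illustrative sketch (not part of the original file) of how
// `is_parse_error` classifies tokens. `BadString` holds a `CowRcStr`, built
// here from a plain `&str`.
#[cfg(test)]
mod is_parse_error_example {
    use super::*;

    #[test]
    fn classify_tokens() {
        // Unmatched closing delimiters and bad string/url tokens are parse errors...
        assert!(CloseParenthesis.is_parse_error());
        assert!(BadString("broken".into()).is_parse_error());
        // ...while ordinary tokens are not.
        assert!(!Comma.is_parse_error());
        assert!(!Ident("color".into()).is_parse_error());
    }
}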

#[derive(Clone)]
pub struct Tokenizer<'a> {
    input: &'a str,
    /// Counted in bytes, not code points. From 0.
    position: usize,
    /// The position at the start of the current line; but adjusted to
    /// ensure that computing the column will give the result in units
    /// of UTF-16 characters.
    current_line_start_position: usize,
    current_line_number: u32,
    var_or_env_functions: SeenStatus,
    source_map_url: Option<&'a str>,
    source_url: Option<&'a str>,
}

#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus {
    DontCare,
    LookingForThem,
    SeenAtLeastOne,
}

impl<'a> Tokenizer<'a> {
    #[inline]
    pub fn new(input: &str) -> Tokenizer {
        Tokenizer {
            input,
            position: 0,
            current_line_start_position: 0,
            current_line_number: 0,
            var_or_env_functions: SeenStatus::DontCare,
            source_map_url: None,
            source_url: None,
        }
    }

    #[inline]
    pub fn look_for_var_or_env_functions(&mut self) {
        self.var_or_env_functions = SeenStatus::LookingForThem;
    }

    #[inline]
    pub fn seen_var_or_env_functions(&mut self) -> bool {
        let seen = self.var_or_env_functions == SeenStatus::SeenAtLeastOne;
        self.var_or_env_functions = SeenStatus::DontCare;
        seen
    }

    #[inline]
    pub fn see_function(&mut self, name: &str) {
        if self.var_or_env_functions == SeenStatus::LookingForThem
            && (name.eq_ignore_ascii_case("var") || name.eq_ignore_ascii_case("env"))
        {
            self.var_or_env_functions = SeenStatus::SeenAtLeastOne;
        }
    }

    #[inline]
    pub fn next(&mut self) -> Result<Token<'a>, ()> {
        next_token(self)
    }

    #[inline]
    pub fn position(&self) -> SourcePosition {
        debug_assert!(self.input.is_char_boundary(self.position));
        SourcePosition(self.position)
    }

    #[inline]
    pub fn current_source_location(&self) -> SourceLocation {
        SourceLocation {
            line: self.current_line_number,
            column: (self.position - self.current_line_start_position + 1) as u32,
        }
    }

    #[inline]
    pub fn current_source_map_url(&self) -> Option<&'a str> {
        self.source_map_url
    }

    #[inline]
    pub fn current_source_url(&self) -> Option<&'a str> {
        self.source_url
    }

    #[inline]
    pub fn state(&self) -> ParserState {
        ParserState {
            position: self.position,
            current_line_start_position: self.current_line_start_position,
            current_line_number: self.current_line_number,
            at_start_of: None,
        }
    }

    #[inline]
    pub fn reset(&mut self, state: &ParserState) {
        self.position = state.position;
        self.current_line_start_position = state.current_line_start_position;
        self.current_line_number = state.current_line_number;
    }

    #[inline]
    pub(crate) fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
        self.slice(start_pos..self.position())
    }

    #[inline]
    pub(crate) fn slice(&self, range: Range<SourcePosition>) -> &'a str {
        debug_assert!(self.input.is_char_boundary(range.start.0));
        debug_assert!(self.input.is_char_boundary(range.end.0));
        unsafe { self.input.get_unchecked(range.start.0..range.end.0) }
    }

    pub fn current_source_line(&self) -> &'a str {
        let current = self.position();
        let start = self
            .slice(SourcePosition(0)..current)
            .rfind(['\r', '\n', '\x0C'])
            .map_or(0, |start| start + 1);
        let end = self
            .slice(current..SourcePosition(self.input.len()))
            .find(['\r', '\n', '\x0C'])
            .map_or(self.input.len(), |end| current.0 + end);
        self.slice(SourcePosition(start)..SourcePosition(end))
    }

    #[inline]
    pub fn next_byte(&self) -> Option<u8> {
        if self.is_eof() {
            None
        } else {
            Some(self.input.as_bytes()[self.position])
        }
    }

    // If false, `tokenizer.next_char()` will not panic.
    #[inline]
    fn is_eof(&self) -> bool {
        !self.has_at_least(0)
    }

    // If true, the input has at least `n` bytes left *after* the current one.
    // That is, `tokenizer.char_at(n)` will not panic.
    #[inline]
    fn has_at_least(&self, n: usize) -> bool {
        self.position + n < self.input.len()
    }

    // Advance over N bytes in the input.  This function can advance
    // over ASCII bytes (excluding newlines), or UTF-8 sequence
    // leaders (excluding leaders for 4-byte sequences).
    #[inline]
    pub fn advance(&mut self, n: usize) {
        if cfg!(debug_assertions) {
            // Each byte must either be an ASCII byte or a sequence
            // leader, but not a 4-byte leader; also newlines are
            // rejected.
            for i in 0..n {
                let b = self.byte_at(i);
                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
            }
        }
        self.position += n
    }

    // Assumes non-EOF
    #[inline]
    fn next_byte_unchecked(&self) -> u8 {
        self.byte_at(0)
    }

    #[inline]
    fn byte_at(&self, offset: usize) -> u8 {
        self.input.as_bytes()[self.position + offset]
    }

    // Advance over a single byte; the byte must be a UTF-8 sequence
    // leader for a 4-byte sequence.
    #[inline]
    fn consume_4byte_intro(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
        // This takes two UTF-16 characters to represent, so we
        // actually have an undercount.
        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        self.position += 1;
    }

    // Advance over a single byte; the byte must be a UTF-8
    // continuation byte.
    #[inline]
    fn consume_continuation_byte(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
        // Continuation bytes contribute to column overcount.  Note
        // that due to the special case for the 4-byte sequence intro,
        // we must use wrapping add here.
        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        self.position += 1;
    }

    // Advance over any kind of byte, excluding newlines.
    #[inline(never)]
    fn consume_known_byte(&mut self, byte: u8) {
        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
        self.position += 1;
        // Continuation bytes contribute to column overcount.
        if byte & 0xF0 == 0xF0 {
            // This takes two UTF-16 characters to represent, so we
            // actually have an undercount.
            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        } else if byte & 0xC0 == 0x80 {
            // Note that due to the special case for the 4-byte
            // sequence intro, we must use wrapping add here.
            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        }
    }

    #[inline]
    fn next_char(&self) -> char {
        unsafe { self.input.get_unchecked(self.position().0..) }
            .chars()
            .next()
            .unwrap()
    }

    // Given that a newline has been seen, advance over the newline
    // and update the state.
    #[inline]
    fn consume_newline(&mut self) {
        let byte = self.next_byte_unchecked();
        debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
        self.position += 1;
        if byte == b'\r' && self.next_byte() == Some(b'\n') {
            self.position += 1;
        }
        self.current_line_start_position = self.position;
        self.current_line_number += 1;
    }

    #[inline]
    fn has_newline_at(&self, offset: usize) -> bool {
        self.position + offset < self.input.len()
            && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
    }

    #[inline]
    fn consume_char(&mut self) -> char {
        let c = self.next_char();
        let len_utf8 = c.len_utf8();
        self.position += len_utf8;
        // Note that due to the special case for the 4-byte sequence
        // intro, we must use wrapping add here.
        self.current_line_start_position = self
            .current_line_start_position
            .wrapping_add(len_utf8 - c.len_utf16());
        c
    }

    #[inline]
    fn starts_with(&self, needle: &[u8]) -> bool {
        self.input.as_bytes()[self.position..].starts_with(needle)
    }

    pub fn skip_whitespace(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                _ => return,
            }
        }
    }

    pub fn skip_cdc_and_cdo(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                b'<' => {
                    if self.starts_with(b"<!--") {
                        self.advance(4)
                    } else {
                        return
                    }
                }
                b'-' => {
                    if self.starts_with(b"-->") {
                        self.advance(3)
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }
}
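
// Sketch (not part of the original file): driving the tokenizer by hand.
// `Tokenizer` may not be exported from the crate root, so this is written as
// an internal test rather than a doctest; `Err(())` from `next()` here simply
// means end of input.
#[cfg(test)]
mod tokenizer_usage_example {
    use super::*;

    #[test]
    fn tokenize_a_declaration() {
        let mut tokenizer = Tokenizer::new("color: red");
        assert_eq!(tokenizer.next(), Ok(Ident("color".into())));
        assert_eq!(tokenizer.next(), Ok(Colon));
        assert_eq!(tokenizer.next(), Ok(WhiteSpace(" ")));
        assert_eq!(tokenizer.next(), Ok(Ident("red".into())));
        assert_eq!(tokenizer.next(), Err(())); // EOF
    }
}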

/// A position from the start of the input, counted in UTF-8 bytes.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);

#[cfg(feature = "malloc_size_of")]
malloc_size_of::malloc_size_of_is_0!(SourcePosition);

impl SourcePosition {
    /// Returns the current byte index in the original input.
    #[inline]
    pub fn byte_index(&self) -> usize {
        self.0
    }
}

/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy, Default)]
pub struct SourceLocation {
    /// The line number, starting at 0 for the first line.
    pub line: u32,

    /// The column number within a line, starting at 1 for the first character of the line.
    /// Column numbers are counted in UTF-16 code units.
    pub column: u32,
}

#[cfg(feature = "malloc_size_of")]
malloc_size_of::malloc_size_of_is_0!(SourceLocation);
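
// Illustrative sketch (not part of the original file): columns are counted in
// UTF-16 code units, which is what the wrapping adjustments to
// `current_line_start_position` above arrange for. U+1F600 takes four UTF-8
// bytes but two UTF-16 units.
#[cfg(test)]
mod source_location_example {
    use super::*;

    #[test]
    fn columns_count_utf16_units() {
        let mut tokenizer = Tokenizer::new("\u{1F600}a");
        let _ = tokenizer.next(); // a single Ident token covering both characters
        // The emoji occupies columns 1-2 and 'a' column 3, so we stop at column 4.
        assert_eq!(
            tokenizer.current_source_location(),
            SourceLocation { line: 0, column: 4 }
        );
    }
}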

fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    if tokenizer.is_eof() {
        return Err(());
    }
    let b = tokenizer.next_byte_unchecked();
    let token = match_byte! { b,
        b' ' | b'\t' => {
            consume_whitespace(tokenizer, false)
        },
        b'\n' | b'\x0C' | b'\r' => consume_whitespace(tokenizer, true),
        b'"' => consume_string(tokenizer, false),
        b'#' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
            else if !tokenizer.is_eof() &&
                matches!(tokenizer.next_byte_unchecked(), b'0'..=b'9' | b'-') {
                // Any other valid case here already resulted in IDHash.
                Hash(consume_name(tokenizer))
            }
            else { Delim('#') }
        },
        b'$' => {
            if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
            else { tokenizer.advance(1); Delim('$') }
        },
        b'\'' => consume_string(tokenizer, true),
        b'(' => { tokenizer.advance(1); ParenthesisBlock },
        b')' => { tokenizer.advance(1); CloseParenthesis },
        b'*' => {
            if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
            else { tokenizer.advance(1); Delim('*') }
        },
        b'+' => {
            if (
                tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit()
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && tokenizer.byte_at(2).is_ascii_digit()
            ) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('+')
            }
        },
        b',' => { tokenizer.advance(1); Comma },
        b'-' => {
            if (
                tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit()
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && tokenizer.byte_at(2).is_ascii_digit()
            ) {
                consume_numeric(tokenizer)
            } else if tokenizer.starts_with(b"-->") {
                tokenizer.advance(3);
                CDC
            } else if is_ident_start(tokenizer) {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('-')
            }
        },
        b'.' => {
            if tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit() {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('.')
            }
        }
        b'/' => {
            if tokenizer.starts_with(b"/*") {
                Comment(consume_comment(tokenizer))
            } else {
                tokenizer.advance(1);
                Delim('/')
            }
        }
        b'0'..=b'9' => consume_numeric(tokenizer),
        b':' => { tokenizer.advance(1); Colon },
        b';' => { tokenizer.advance(1); Semicolon },
        b'<' => {
            if tokenizer.starts_with(b"<!--") {
                tokenizer.advance(4);
                CDO
            } else {
                tokenizer.advance(1);
                Delim('<')
            }
        },
        b'@' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
            else { Delim('@') }
        },
        b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => consume_ident_like(tokenizer),
        b'[' => { tokenizer.advance(1); SquareBracketBlock },
        b'\\' => {
            if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
            else { tokenizer.advance(1); Delim('\\') }
        },
        b']' => { tokenizer.advance(1); CloseSquareBracket },
        b'^' => {
            if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
            else { tokenizer.advance(1); Delim('^') }
        },
        b'{' => { tokenizer.advance(1); CurlyBracketBlock },
        b'|' => {
            if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
            else { tokenizer.advance(1); Delim('|') }
        },
        b'}' => { tokenizer.advance(1); CloseCurlyBracket },
        b'~' => {
            if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
            else { tokenizer.advance(1); Delim('~') }
        },
        _ => {
            if !b.is_ascii() {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(b as char)
            }
        },
    };
    Ok(token)
}

fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
    let start_position = tokenizer.position();
    if newline {
        tokenizer.consume_newline();
    } else {
        tokenizer.advance(1);
    }
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b' ' | b'\t' => {
                tokenizer.advance(1);
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            _ => {
                break
            }
        }
    }
    WhiteSpace(tokenizer.slice_from(start_position))
}

// Check for sourceMappingURL or sourceURL comments and update the
// tokenizer appropriately.
fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) {
    let directive = "# sourceMappingURL=";
    let directive_old = "@ sourceMappingURL=";

    // If there is a source map directive, extract the URL.
    if contents.starts_with(directive) || contents.starts_with(directive_old) {
        let contents = &contents[directive.len()..];
        tokenizer.source_map_url = contents.split([' ', '\t', '\x0C', '\r', '\n']).next();
    }

    let directive = "# sourceURL=";
    let directive_old = "@ sourceURL=";

    // If there is a source URL directive, extract the URL.
    if contents.starts_with(directive) || contents.starts_with(directive_old) {
        let contents = &contents[directive.len()..];
        tokenizer.source_url = contents.split([' ', '\t', '\x0C', '\r', '\n']).next()
    }
}

fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
    tokenizer.advance(2); // consume "/*"
    let start_position = tokenizer.position();
    while !tokenizer.is_eof() {
        match_byte! { tokenizer.next_byte_unchecked(),
            b'*' => {
                let end_position = tokenizer.position();
                tokenizer.advance(1);
                if tokenizer.next_byte() == Some(b'/') {
                    tokenizer.advance(1);
                    let contents = tokenizer.slice(start_position..end_position);
                    check_for_source_map(tokenizer, contents);
                    return contents
                }
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }
    let contents = tokenizer.slice_from(start_position);
    check_for_source_map(tokenizer, contents);
    contents
}
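
// Sketch (not part of the original file): a sourceMappingURL directive inside
// a comment is extracted as the tokenizer scans past it.
#[cfg(test)]
mod source_map_example {
    use super::*;

    #[test]
    fn source_map_comment() {
        let mut tokenizer = Tokenizer::new("/*# sourceMappingURL=map.json */");
        let _ = tokenizer.next(); // the Comment token
        assert_eq!(tokenizer.current_source_map_url(), Some("map.json"));
    }
}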

fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
    match consume_quoted_string(tokenizer, single_quote) {
        Ok(value) => QuotedString(value),
        Err(value) => BadString(value),
    }
}

/// Return `Err(value)` on syntax error (i.e. an unescaped newline),
/// where `value` holds the string contents up to the offending newline.
fn consume_quoted_string<'a>(
    tokenizer: &mut Tokenizer<'a>,
    single_quote: bool,
) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
    tokenizer.advance(1); // Skip the initial quote
                          // start_pos is at code point boundary, after " or '
    let start_pos = tokenizer.position();
    let mut string_bytes;
    loop {
        if tokenizer.is_eof() {
            return Ok(tokenizer.slice_from(start_pos).into());
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'"' => {
                if !single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\'' => {
                if single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0')
                //
                // So `string_bytes` is well-formed UTF-8.
                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\n' | b'\r' | b'\x0C' => {
                return Err(tokenizer.slice_from(start_pos).into())
            },
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }

    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'\n' | b'\r' | b'\x0C' => {
                return Err(
                    // string_bytes is well-formed UTF-8, see other comments.
                    unsafe {
                        from_utf8_release_unchecked(string_bytes)
                    }.into()
                );
            }
            b'"' => {
                tokenizer.advance(1);
                if !single_quote {
                    break;
                }
            }
            b'\'' => {
                tokenizer.advance(1);
                if single_quote {
                    break;
                }
            }
            b'\\' => {
                tokenizer.advance(1);
                if !tokenizer.is_eof() {
                    match tokenizer.next_byte_unchecked() {
                        // Escaped newline
                        b'\n' | b'\x0C' | b'\r' => {
                            tokenizer.consume_newline();
                        }
                        // This pushes one well-formed code point
                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                    }
                }
                // else: escaped EOF, do nothing.
                continue;
            }
            b'\0' => {
                tokenizer.advance(1);
                string_bytes.extend("\u{FFFD}".as_bytes());
                continue;
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            },
        }

        // If this byte is part of a multi-byte code point,
        // we’ll end up copying the whole code point before this loop does something else.
        string_bytes.push(b);
    }

    Ok(
        // string_bytes is well-formed UTF-8, see other comments.
        unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
    )
}
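
// Sketch (not part of the original file): a well-formed string versus one
// broken by an unescaped newline, which yields `BadString` with the contents
// seen so far.
#[cfg(test)]
mod string_examples {
    use super::*;

    #[test]
    fn good_and_bad_strings() {
        let mut tokenizer = Tokenizer::new("'ok'");
        assert_eq!(tokenizer.next(), Ok(QuotedString("ok".into())));

        let mut tokenizer = Tokenizer::new("'broken\nline'");
        assert_eq!(tokenizer.next(), Ok(BadString("broken".into())));
    }
}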

#[inline]
fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
    !tokenizer.is_eof()
        && match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => true,
            b'-' => {
                tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
                    b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
                        true
                    }
                    b'\\' => !tokenizer.has_newline_at(1),
                    b => !b.is_ascii(),
                }
            },
            b'\\' => !tokenizer.has_newline_at(1),
            b => !b.is_ascii(),
        }
}

fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    let value = consume_name(tokenizer);
    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
        tokenizer.advance(1);
        if value.eq_ignore_ascii_case("url") {
            consume_unquoted_url(tokenizer).unwrap_or(Function(value))
        } else {
            tokenizer.see_function(&value);
            Function(value)
        }
    } else {
        Ident(value)
    }
}
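
// Sketch (not part of the original file): `url(` with an unquoted argument
// becomes `UnquotedUrl`, while any other name followed by `(` becomes
// `Function` (including `url("…")`, whose string argument is tokenized
// separately).
#[cfg(test)]
mod ident_like_examples {
    use super::*;

    #[test]
    fn url_and_function() {
        let mut tokenizer = Tokenizer::new("url(a.png)");
        assert_eq!(tokenizer.next(), Ok(UnquotedUrl("a.png".into())));

        let mut tokenizer = Tokenizer::new("calc(");
        assert_eq!(tokenizer.next(), Ok(Function("calc".into())));
    }
}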

fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
    // start_pos is the end of the previous token, therefore at a code point boundary
    let start_pos = tokenizer.position();
    let mut value_bytes;
    loop {
        if tokenizer.is_eof() {
            return tokenizer.slice_from(start_pos).into();
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => tokenizer.advance(1),
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0')
                //
                // So `value_bytes` is well-formed UTF-8.
                value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _b => {
                return tokenizer.slice_from(start_pos).into();
            }
        }
    }

    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => {
                tokenizer.advance(1);
                value_bytes.push(b)  // ASCII
            }
            b'\\' => {
                if tokenizer.has_newline_at(1) { break }
                tokenizer.advance(1);
                // This pushes one well-formed code point
                consume_escape_and_write(tokenizer, &mut value_bytes)
            }
            b'\0' => {
                tokenizer.advance(1);
                value_bytes.extend("\u{FFFD}".as_bytes());
            },
            b'\x80'..=b'\xBF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.consume_continuation_byte();
                value_bytes.push(b)
            }
            b'\xC0'..=b'\xEF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.advance(1);
                value_bytes.push(b)
            }
            b'\xF0'..=b'\xFF' => {
                tokenizer.consume_4byte_intro();
                value_bytes.push(b)
            }
            _ => {
                // ASCII
                break;
            }
        }
    }
    // value_bytes is well-formed UTF-8, see other comments.
    unsafe { from_utf8_release_unchecked(value_bytes) }.into()
}

fn byte_to_hex_digit(b: u8) -> Option<u32> {
    Some(match_byte! { b,
        b'0' ..= b'9' => b - b'0',
        b'a' ..= b'f' => b - b'a' + 10,
        b'A' ..= b'F' => b - b'A' + 10,
        _ => {
            return None
        }
    } as u32)
}

fn byte_to_decimal_digit(b: u8) -> Option<u32> {
    if b.is_ascii_digit() {
        Some((b - b'0') as u32)
    } else {
        None
    }
}

fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    // Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
    // But this is always called so that there is at least one digit in \d*(\.\d+)?

    // Do all the math in f64 so that large numbers overflow to +/-inf
    // and i32::{MIN, MAX} are within range.

    let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
        b'-' => (true, -1.),
        b'+' => (true, 1.),
        _ => (false, 1.),
    };
    if has_sign {
        tokenizer.advance(1);
    }

    let mut integral_part: f64 = 0.;
    while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
        integral_part = integral_part * 10. + digit as f64;
        tokenizer.advance(1);
        if tokenizer.is_eof() {
            break;
        }
    }

    let mut is_integer = true;

    let mut fractional_part: f64 = 0.;
    if tokenizer.has_at_least(1)
        && tokenizer.next_byte_unchecked() == b'.'
        && tokenizer.byte_at(1).is_ascii_digit()
    {
        is_integer = false;
        tokenizer.advance(1); // Consume '.'
        let mut factor = 0.1;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            fractional_part += digit as f64 * factor;
            factor *= 0.1;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
    }

    let mut value = sign * (integral_part + fractional_part);

    if tokenizer.has_at_least(1)
        && matches!(tokenizer.next_byte_unchecked(), b'e' | b'E')
        && (tokenizer.byte_at(1).is_ascii_digit()
            || (tokenizer.has_at_least(2)
                && matches!(tokenizer.byte_at(1), b'+' | b'-')
                && tokenizer.byte_at(2).is_ascii_digit()))
    {
        is_integer = false;
        tokenizer.advance(1);
        let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
            b'-' => (true, -1.),
            b'+' => (true, 1.),
            _ => (false, 1.),
        };
        if has_sign {
            tokenizer.advance(1);
        }
        let mut exponent: f64 = 0.;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            exponent = exponent * 10. + digit as f64;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
        value *= f64::powf(10., sign * exponent);
    }

    let int_value = if is_integer {
        Some(if value >= i32::MAX as f64 {
            i32::MAX
        } else if value <= i32::MIN as f64 {
            i32::MIN
        } else {
            value as i32
        })
    } else {
        None
    };

    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%' {
        tokenizer.advance(1);
        return Percentage {
            unit_value: (value / 100.) as f32,
            int_value,
            has_sign,
        };
    }
    let value = value as f32;
    if is_ident_start(tokenizer) {
        let unit = consume_name(tokenizer);
        Dimension {
            value,
            int_value,
            has_sign,
            unit,
        }
    } else {
        Number {
            value,
            int_value,
            has_sign,
        }
    }
}
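
// Sketch (not part of the original file): the three numeric token shapes this
// function produces for the grammar described above. All values chosen here
// are exactly representable, so direct float comparison is safe.
#[cfg(test)]
mod numeric_examples {
    use super::*;

    #[test]
    fn number_percentage_dimension() {
        let mut tokenizer = Tokenizer::new("1.5e2 50% -12px");
        assert_eq!(
            tokenizer.next(),
            Ok(Number { has_sign: false, value: 150.0, int_value: None })
        );
        let _ = tokenizer.next(); // whitespace
        assert_eq!(
            tokenizer.next(),
            Ok(Percentage { has_sign: false, unit_value: 0.5, int_value: Some(50) })
        );
        let _ = tokenizer.next(); // whitespace
        assert_eq!(
            tokenizer.next(),
            Ok(Dimension { has_sign: true, value: -12.0, int_value: Some(-12), unit: "px".into() })
        );
    }
}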

#[inline]
unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
    if cfg!(debug_assertions) {
        String::from_utf8(string_bytes).unwrap()
    } else {
        String::from_utf8_unchecked(string_bytes)
    }
}

fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    // This is only called after "url(", so the current position is a code point boundary.
    let start_position = tokenizer.position;
    let from_start = &tokenizer.input[tokenizer.position..];
    let mut newlines = 0;
    let mut last_newline = 0;
    let mut found_printable_char = false;
    let mut iter = from_start.bytes().enumerate();
    loop {
        let (offset, b) = match iter.next() {
            Some(item) => item,
            None => {
                tokenizer.position = tokenizer.input.len();
                break;
            }
        };
        match_byte! { b,
            b' ' | b'\t' => {},
            b'\n' | b'\x0C' => {
                newlines += 1;
                last_newline = offset;
            }
            b'\r' => {
                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                    newlines += 1;
                    last_newline = offset;
                }
            }
            b'"' | b'\'' => return Err(()),  // Do not advance
            b')' => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset + 1;
                break
            }
            _ => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset;
                found_printable_char = true;
                break
            }
        }
    }

    if newlines > 0 {
        tokenizer.current_line_number += newlines;
        // No need for wrapping_add here, because there's no possible
        // way to wrap.
        tokenizer.current_line_start_position = start_position + last_newline + 1;
    }

    if found_printable_char {
        // This function only consumed ASCII (whitespace) bytes,
        // so the current position is a code point boundary.
        return Ok(consume_unquoted_url_internal(tokenizer));
    } else {
        return Ok(UnquotedUrl("".into()));
    }

    fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
        // This function is only called with start_pos at a code point boundary.
        let start_pos = tokenizer.position();
        let mut string_bytes: Vec<u8>;
        loop {
            if tokenizer.is_eof() {
                return UnquotedUrl(tokenizer.slice_from(start_pos).into());
            }
            match_byte! { tokenizer.next_byte_unchecked(),
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    let value = tokenizer.slice_from(start_pos);
                    return consume_url_end(tokenizer, start_pos, value.into())
                }
                b')' => {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return UnquotedUrl(value.into())
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F'  // non-printable
                    | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos)
                },
                b'\\' | b'\0' => {
                    // * The tokenizer’s input is UTF-8 since it’s `&str`.
                    // * start_pos is at a code point boundary
                    // * so is the current position (which is before '\\' or '\0')
                    //
                    // So `string_bytes` is well-formed UTF-8.
                    string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                    break
                }
                b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
                b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
                _ => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                }
            }
        }
        while !tokenizer.is_eof() {
            let b = tokenizer.next_byte_unchecked();
            match_byte! { b,
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    // string_bytes is well-formed UTF-8, see other comments.
                    let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
                    return consume_url_end(tokenizer, start_pos, string)
                }
                b')' => {
                    tokenizer.advance(1);
                    break;
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F'  // non-printable
                    | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos);
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if tokenizer.has_newline_at(0) {
                        return consume_bad_url(tokenizer, start_pos)
                    }

                    // This pushes one well-formed code point to string_bytes
                    consume_escape_and_write(tokenizer, &mut string_bytes)
                },
                b'\0' => {
                    tokenizer.advance(1);
                    string_bytes.extend("\u{FFFD}".as_bytes());
                }
                b'\x80'..=b'\xBF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_continuation_byte();
                    string_bytes.push(b);
                }
                b'\xF0'..=b'\xFF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_4byte_intro();
                    string_bytes.push(b);
                }
                // If this byte is part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                b => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                    string_bytes.push(b)
                }
            }
        }
        UnquotedUrl(
            // string_bytes is well-formed UTF-8, see other comments.
            unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
        )
    }

    fn consume_url_end<'a>(
        tokenizer: &mut Tokenizer<'a>,
        start_pos: SourcePosition,
        string: CowRcStr<'a>,
    ) -> Token<'a> {
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    tokenizer.advance(1);
                    break
                }
                b' ' | b'\t' => { tokenizer.advance(1); }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                    return consume_bad_url(tokenizer, start_pos);
                }
            }
        }
        UnquotedUrl(string)
    }

    fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
        // Consume up to the closing )
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    let contents = tokenizer.slice_from(start_pos).into();
                    tokenizer.advance(1);
                    return BadUrl(contents)
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                        tokenizer.advance(1); // Skip an escaped ')' or '\\'
                    }
                }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                }
            }
        }
        BadUrl(tokenizer.slice_from(start_pos).into())
    }
}

// (value, number of digits up to 6)
fn consume_hex_digits(tokenizer: &mut Tokenizer<'_>) -> (u32, u32) {
    let mut value = 0;
    let mut digits = 0;
    while digits < 6 && !tokenizer.is_eof() {
        match byte_to_hex_digit(tokenizer.next_byte_unchecked()) {
            Some(digit) => {
                value = value * 16 + digit;
                digits += 1;
                tokenizer.advance(1);
            }
            None => break,
        }
    }
    (value, digits)
}

// Same constraints as consume_escape, except it writes the result into
// `bytes` instead of returning it.
fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
    bytes.extend(
        consume_escape(tokenizer)
            .encode_utf8(&mut [0; 4])
            .as_bytes(),
    )
}

// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
// and that the next input character has already been verified
// to not be a newline.
fn consume_escape(tokenizer: &mut Tokenizer) -> char {
    if tokenizer.is_eof() {
        return '\u{FFFD}';
    } // Escaped EOF
    match_byte! { tokenizer.next_byte_unchecked(),
        b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
            let (c, _) = consume_hex_digits(tokenizer);
            if !tokenizer.is_eof() {
                match_byte! { tokenizer.next_byte_unchecked(),
                    b' ' | b'\t' => {
                        tokenizer.advance(1)
                    }
                    b'\n' | b'\x0C' | b'\r' => {
                        tokenizer.consume_newline();
                    }
                    _ => {}
                }
            }
            static REPLACEMENT_CHAR: char = '\u{FFFD}';
            if c != 0 {
                let c = char::from_u32(c);
                c.unwrap_or(REPLACEMENT_CHAR)
            } else {
                REPLACEMENT_CHAR
            }
        },
        b'\0' => {
            tokenizer.advance(1);
            '\u{FFFD}'
        }
        _ => tokenizer.consume_char(),
    }
}
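
// Sketch (not part of the original file): a hex escape is terminated by an
// optional single whitespace character, and the zero code point maps to
// U+FFFD REPLACEMENT CHARACTER.
#[cfg(test)]
mod escape_examples {
    use super::*;

    #[test]
    fn hex_escapes() {
        // `\41 B` escapes to 'A'; the space only terminates the escape.
        let mut tokenizer = Tokenizer::new("\\41 B");
        assert_eq!(tokenizer.next(), Ok(Ident("AB".into())));

        // `\0` escapes to the replacement character.
        let mut tokenizer = Tokenizer::new("\\0");
        assert_eq!(tokenizer.next(), Ok(Ident("\u{FFFD}".into())));
    }
}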