1use self::Token::*;
8use crate::cow_rc_str::CowRcStr;
9use crate::parser::{ArbitrarySubstitutionFunctions, ParserState};
10use std::char;
11use std::ops::Range;
12
13#[cfg(feature = "fast_match_byte")]
14pub use crate::match_byte;
15
/// One token produced by the tokenizer.
///
/// Tokens that carry a value store either a borrowed slice of the input
/// or (when escapes had to be decoded) an owned string, via `CowRcStr`.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
    /// An identifier.
    Ident(CowRcStr<'a>),

    /// `@name`; the value does not include the `@`.
    AtKeyword(CowRcStr<'a>),

    /// `#name` whose name is not a valid identifier (it starts with a
    /// digit or `-`; see the `#` arm of `next_token`). Excludes the `#`.
    Hash(CowRcStr<'a>),

    /// `#name` whose name is a valid identifier. Excludes the `#`.
    IDHash(CowRcStr<'a>),

    /// A quoted string; the value excludes the quotes and has escapes decoded.
    QuotedString(CowRcStr<'a>),

    /// `url(...)` with an unquoted value; excludes `url(` and `)`.
    UnquotedUrl(CowRcStr<'a>),

    /// Any single code point not matched by another token.
    Delim(char),

    /// A numeric literal.
    Number {
        /// Whether the literal was written with an explicit `+` or `-` sign.
        has_sign: bool,

        /// The numeric value.
        value: f32,

        /// The value as an integer when the literal had neither a
        /// fractional part nor an exponent, saturated to the `i32` range.
        int_value: Option<i32>,
    },

    /// A percentage literal (a number followed by `%`).
    Percentage {
        /// Whether the literal was written with an explicit `+` or `-` sign.
        has_sign: bool,

        /// The written number divided by 100, so `50%` is `0.5`.
        unit_value: f32,

        /// The written number (before dividing by 100) as an integer, when
        /// it was one; saturated to the `i32` range.
        int_value: Option<i32>,
    },

    /// A number immediately followed by a unit identifier, e.g. `12px`.
    Dimension {
        /// Whether the literal was written with an explicit `+` or `-` sign.
        has_sign: bool,

        /// The numeric value.
        value: f32,

        /// The value as an integer when the literal had neither a
        /// fractional part nor an exponent, saturated to the `i32` range.
        int_value: Option<i32>,

        /// The unit, e.g. `px` in `12px`.
        unit: CowRcStr<'a>,
    },

    /// A run of whitespace, kept as the original input slice.
    WhiteSpace(&'a str),

    /// `/* ... */`; the value excludes the delimiters.
    Comment(&'a str),

    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `,`
    Comma,
    /// `~=`
    IncludeMatch,

    /// `|=`
    DashMatch,

    /// `^=`
    PrefixMatch,

    /// `$=`
    SuffixMatch,

    /// `*=`
    SubstringMatch,

    /// `<!--`
    CDO,

    /// `-->`
    CDC,

    /// `name(`; the value excludes the `(`.
    Function(CowRcStr<'a>),

    /// `(`
    ParenthesisBlock,

    /// `[`
    SquareBracketBlock,

    /// `{`
    CurlyBracketBlock,

    /// A `url(...)` that could not be tokenized as a URL; a parse error.
    /// Carries the consumed contents.
    BadUrl(CowRcStr<'a>),

    /// A string cut short by an unescaped newline; a parse error.
    /// Carries the contents consumed so far.
    BadString(CowRcStr<'a>),

    /// `)` with no matching `(`.
    CloseParenthesis,

    /// `]` with no matching `[`.
    CloseSquareBracket,

    /// `}` with no matching `{`.
    CloseCurlyBracket,
}
181
182impl Token<'_> {
183 pub fn is_parse_error(&self) -> bool {
190 matches!(
191 *self,
192 BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
193 )
194 }
195}
196
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The full text being tokenized.
    input: &'a str,
    /// Byte offset of the current position within `input`.
    position: usize,
    /// Byte offset where the current line starts — deliberately skewed by
    /// `consume_continuation_byte` / `consume_4byte_intro` / `consume_char`
    /// so that `position - current_line_start_position` counts UTF-16 code
    /// units rather than bytes.
    current_line_start_position: usize,
    /// Zero-based number of the current line.
    current_line_number: u32,
    /// Tracking state for arbitrary substitution functions; see
    /// `look_for_arbitrary_substitution_functions`.
    arbitrary_substitution_functions: SeenStatus<'a>,
    /// URL from a `sourceMappingURL` comment directive, if one was seen.
    source_map_url: Option<&'a str>,
    /// URL from a `sourceURL` comment directive, if one was seen.
    source_url: Option<&'a str>,
}
211
/// Tracking state for arbitrary substitution functions.
#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus<'a> {
    /// Not tracking.
    DontCare,
    /// Tracking: watching for any function token with one of these names.
    LookingForThem(ArbitrarySubstitutionFunctions<'a>),
    /// At least one tracked function name has been seen.
    SeenAtLeastOne,
}
218
impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer positioned at the start of `input`.
    #[inline]
    pub fn new(input: &'a str) -> Self {
        Tokenizer {
            input,
            position: 0,
            current_line_start_position: 0,
            current_line_number: 0,
            arbitrary_substitution_functions: SeenStatus::DontCare,
            source_map_url: None,
            source_url: None,
        }
    }

    /// Starts watching for `Function` tokens whose name is one of `fns`;
    /// query the result with `seen_arbitrary_substitution_functions`.
    #[inline]
    pub fn look_for_arbitrary_substitution_functions(
        &mut self,
        fns: ArbitrarySubstitutionFunctions<'a>,
    ) {
        self.arbitrary_substitution_functions = SeenStatus::LookingForThem(fns);
    }

    /// Returns whether any watched function name was seen since tracking
    /// began, and stops tracking.
    #[inline]
    pub fn seen_arbitrary_substitution_functions(&mut self) -> bool {
        let seen = self.arbitrary_substitution_functions == SeenStatus::SeenAtLeastOne;
        self.arbitrary_substitution_functions = SeenStatus::DontCare;
        seen
    }

    /// Records that a function token named `name` was produced; flips the
    /// tracker if the name matches any watched name, ASCII
    /// case-insensitively.
    #[inline]
    pub fn see_function(&mut self, name: &str) {
        if let SeenStatus::LookingForThem(fns) = self.arbitrary_substitution_functions {
            if fns.iter().any(|a| name.eq_ignore_ascii_case(a)) {
                self.arbitrary_substitution_functions = SeenStatus::SeenAtLeastOne;
            }
        }
    }

    /// Returns the next token, or `Err(())` at end of input.
    #[inline]
    pub fn next(&mut self) -> Result<Token<'a>, ()> {
        next_token(self)
    }

    /// The current position as a byte offset into the input.
    #[inline]
    pub fn position(&self) -> SourcePosition {
        debug_assert!(self.input.is_char_boundary(self.position));
        SourcePosition(self.position)
    }

    /// The line/column of the current position. The column is 1-based
    /// and — thanks to the skew applied by `consume_continuation_byte`,
    /// `consume_4byte_intro` and `consume_char` — counts UTF-16 code units.
    #[inline]
    pub fn current_source_location(&self) -> SourceLocation {
        SourceLocation {
            line: self.current_line_number,
            column: (self.position - self.current_line_start_position + 1) as u32,
        }
    }

    /// URL of a `sourceMappingURL` comment directive, if one has been seen.
    #[inline]
    pub fn current_source_map_url(&self) -> Option<&'a str> {
        self.source_map_url
    }

    /// URL of a `sourceURL` comment directive, if one has been seen.
    #[inline]
    pub fn current_source_url(&self) -> Option<&'a str> {
        self.source_url
    }

    /// Snapshots the tokenizer's state for later restoration via `reset`.
    #[inline]
    pub fn state(&self) -> ParserState {
        ParserState {
            position: self.position,
            current_line_start_position: self.current_line_start_position,
            current_line_number: self.current_line_number,
            at_start_of: None,
        }
    }

    /// Restores a state previously captured with `state`.
    #[inline]
    pub fn reset(&mut self, state: &ParserState) {
        self.position = state.position;
        self.current_line_start_position = state.current_line_start_position;
        self.current_line_number = state.current_line_number;
    }

    /// Input slice from `start_pos` to the current position.
    #[inline]
    pub(crate) fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
        self.slice(start_pos..self.position())
    }

    /// Input slice for the given position range.
    #[inline]
    pub(crate) fn slice(&self, range: Range<SourcePosition>) -> &'a str {
        debug_assert!(self.input.is_char_boundary(range.start.0));
        debug_assert!(self.input.is_char_boundary(range.end.0));
        // SAFETY: positions only ever originate from this tokenizer, which
        // keeps them on char boundaries (debug-asserted above).
        unsafe { self.input.get_unchecked(range.start.0..range.end.0) }
    }

    /// The full text of the line containing the current position, without
    /// its terminating newline.
    pub fn current_source_line(&self) -> &'a str {
        let current = self.position();
        // Search backwards for the previous newline (or start of input)...
        let start = self
            .slice(SourcePosition(0)..current)
            .rfind(['\r', '\n', '\x0C'])
            .map_or(0, |start| start + 1);
        // ...and forwards for the next one (or end of input).
        let end = self
            .slice(current..SourcePosition(self.input.len()))
            .find(['\r', '\n', '\x0C'])
            .map_or(self.input.len(), |end| current.0 + end);
        self.slice(SourcePosition(start)..SourcePosition(end))
    }

    /// The byte at the current position, or `None` at end of input.
    #[inline]
    pub fn next_byte(&self) -> Option<u8> {
        if self.is_eof() {
            None
        } else {
            Some(self.input.as_bytes()[self.position])
        }
    }

    /// Whether the current position is at the end of the input.
    #[inline]
    fn is_eof(&self) -> bool {
        !self.has_at_least(0)
    }

    /// Whether at least `n + 1` bytes remain after the current position.
    #[inline]
    fn has_at_least(&self, n: usize) -> bool {
        self.position + n < self.input.len()
    }

    /// Advances by `n` bytes. Only bytes needing no line/column
    /// bookkeeping may be skipped this way: in debug builds each skipped
    /// byte is asserted to be neither a newline, a UTF-8 continuation
    /// byte, nor the lead byte of a 4-byte sequence.
    #[inline]
    pub fn advance(&mut self, n: usize) {
        if cfg!(debug_assertions) {
            for i in 0..n {
                let b = self.byte_at(i);
                // ASCII, or a 2/3-byte UTF-8 lead byte — neither affects
                // the column skew.
                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
            }
        }
        self.position += n
    }

    /// The byte at the current position; the caller must have checked
    /// that the tokenizer is not at end of input.
    #[inline]
    fn next_byte_unchecked(&self) -> u8 {
        self.byte_at(0)
    }

    /// The byte `offset` bytes past the current position; panics if out
    /// of bounds.
    #[inline]
    fn byte_at(&self, offset: usize) -> u8 {
        self.input.as_bytes()[self.position + offset]
    }

    /// Advances over the lead byte of a 4-byte UTF-8 sequence. The line
    /// start is nudged back by one so that, combined with the +1 from each
    /// of the three continuation bytes, the whole character widens the
    /// column by 2 — its length in UTF-16 code units.
    #[inline]
    fn consume_4byte_intro(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        self.position += 1;
    }

    /// Advances over a UTF-8 continuation byte, moving the line start
    /// forward by one so the byte does not widen the reported column.
    #[inline]
    fn consume_continuation_byte(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        self.position += 1;
    }

    /// Advances over `byte` (already read, known not to be a newline),
    /// applying the same column skew as the two methods above.
    /// Kept out of line: this is the slow path of several hot loops.
    #[inline(never)]
    fn consume_known_byte(&mut self, byte: u8) {
        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
        self.position += 1;
        if byte & 0xF0 == 0xF0 {
            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        } else if byte & 0xC0 == 0x80 {
            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        }
    }

    /// The character at the current position; must not be at end of input.
    #[inline]
    fn next_char(&self) -> char {
        // SAFETY: `position()` debug-asserts we are on a char boundary.
        unsafe { self.input.get_unchecked(self.position().0..) }
            .chars()
            .next()
            .unwrap()
    }

    /// Consumes one newline, treating CR LF as a single newline, and
    /// resets the line-start bookkeeping for the new line.
    #[inline]
    fn consume_newline(&mut self) {
        let byte = self.next_byte_unchecked();
        debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
        self.position += 1;
        if byte == b'\r' && self.next_byte() == Some(b'\n') {
            self.position += 1;
        }
        self.current_line_start_position = self.position;
        self.current_line_number += 1;
    }

    /// Whether the byte `offset` past the current position exists and is
    /// a newline byte.
    #[inline]
    fn has_newline_at(&self, offset: usize) -> bool {
        self.position + offset < self.input.len()
            && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
    }

    /// Consumes and returns one character, skewing the line start so the
    /// column advances by the character's UTF-16 length.
    #[inline]
    fn consume_char(&mut self) -> char {
        let c = self.next_char();
        let len_utf8 = c.len_utf8();
        self.position += len_utf8;
        self.current_line_start_position = self
            .current_line_start_position
            .wrapping_add(len_utf8 - c.len_utf16());
        c
    }

    /// Whether the remaining input starts with `needle`.
    #[inline]
    fn starts_with(&self, needle: &[u8]) -> bool {
        self.input.as_bytes()[self.position..].starts_with(needle)
    }

    /// Skips whitespace and comments without producing tokens.
    pub fn skip_whitespace(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                _ => return,
            }
        }
    }

    /// Skips whitespace, comments, and CDO (`<!--`) / CDC (`-->`) markers
    /// without producing tokens.
    pub fn skip_cdc_and_cdo(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                b'<' => {
                    if self.starts_with(b"<!--") {
                        self.advance(4)
                    } else {
                        return
                    }
                }
                b'-' => {
                    if self.starts_with(b"-->") {
                        self.advance(3)
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }
}
523
/// A position within the input, as a byte offset.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);
527
528#[cfg(feature = "malloc_size_of")]
529malloc_size_of::malloc_size_of_is_0!(SourcePosition);
530
impl SourcePosition {
    /// The position as a byte index into the original input.
    #[inline]
    pub fn byte_index(&self) -> usize {
        self.0
    }
}
538
/// A line/column location within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy, Default)]
pub struct SourceLocation {
    /// The line number, starting at 0 for the first line.
    pub line: u32,

    /// The column number within the line, starting at 1 for the first
    /// character; counted in UTF-16 code units (see
    /// `Tokenizer::current_source_location`).
    pub column: u32,
}
549
550#[cfg(feature = "malloc_size_of")]
551malloc_size_of::malloc_size_of_is_0!(SourceLocation);
552
/// Reads the next token, or returns `Err(())` at end of input.
///
/// Dispatches on the first byte of the remaining input; compound tokens
/// (`$=`, `-->`, signed numbers, …) are detected with bounded lookahead
/// before any byte is consumed.
fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    if tokenizer.is_eof() {
        return Err(());
    }
    let b = tokenizer.next_byte_unchecked();
    let token = match_byte! { b,
        b' ' | b'\t' => {
            consume_whitespace(tokenizer, false)
        },
        b'\n' | b'\x0C' | b'\r' => consume_whitespace(tokenizer, true),
        b'"' => consume_string(tokenizer, false),
        b'#' => {
            tokenizer.advance(1);
            // `#` + identifier → IDHash; `#` + digit or `-` → plain Hash;
            // anything else → a lone delimiter.
            if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
            else if !tokenizer.is_eof() &&
                matches!(tokenizer.next_byte_unchecked(), b'0'..=b'9' | b'-') {
                Hash(consume_name(tokenizer))
            }
            else { Delim('#') }
        },
        b'$' => {
            if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
            else { tokenizer.advance(1); Delim('$') }
        },
        b'\'' => consume_string(tokenizer, true),
        b'(' => { tokenizer.advance(1); ParenthesisBlock },
        b')' => { tokenizer.advance(1); CloseParenthesis },
        b'*' => {
            if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
            else { tokenizer.advance(1); Delim('*') }
        },
        b'+' => {
            // A sign starts a number only when followed by a digit or by
            // `.` and a digit.
            if (
                tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit()
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && tokenizer.byte_at(2).is_ascii_digit()
            ) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('+')
            }
        },
        b',' => { tokenizer.advance(1); Comma },
        b'-' => {
            // In priority order: a signed number, the CDC marker `-->`,
            // an identifier starting with `-`, or a lone delimiter.
            if (
                tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit()
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && tokenizer.byte_at(2).is_ascii_digit()
            ) {
                consume_numeric(tokenizer)
            } else if tokenizer.starts_with(b"-->") {
                tokenizer.advance(3);
                CDC
            } else if is_ident_start(tokenizer) {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('-')
            }
        },
        b'.' => {
            // `.` + digit starts a fractional number.
            if tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit() {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('.')
            }
        }
        b'/' => {
            if tokenizer.starts_with(b"/*") {
                Comment(consume_comment(tokenizer))
            } else {
                tokenizer.advance(1);
                Delim('/')
            }
        }
        b'0'..=b'9' => consume_numeric(tokenizer),
        b':' => { tokenizer.advance(1); Colon },
        b';' => { tokenizer.advance(1); Semicolon },
        b'<' => {
            // `<!--` is the CDO marker.
            if tokenizer.starts_with(b"<!--") {
                tokenizer.advance(4);
                CDO
            } else {
                tokenizer.advance(1);
                Delim('<')
            }
        },
        b'@' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
            else { Delim('@') }
        },
        // NUL is a valid identifier start here; it becomes U+FFFD inside
        // `consume_name`.
        b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => consume_ident_like(tokenizer),
        b'[' => { tokenizer.advance(1); SquareBracketBlock },
        b'\\' => {
            // An escape starts an identifier unless it precedes a newline.
            if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
            else { tokenizer.advance(1); Delim('\\') }
        },
        b']' => { tokenizer.advance(1); CloseSquareBracket },
        b'^' => {
            if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
            else { tokenizer.advance(1); Delim('^') }
        },
        b'{' => { tokenizer.advance(1); CurlyBracketBlock },
        b'|' => {
            if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
            else { tokenizer.advance(1); Delim('|') }
        },
        b'}' => { tokenizer.advance(1); CloseCurlyBracket },
        b'~' => {
            if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
            else { tokenizer.advance(1); Delim('~') }
        },
        _ => {
            // Non-ASCII starts an identifier; any other ASCII byte is a
            // lone delimiter.
            if !b.is_ascii() {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(b as char)
            }
        },
    };
    Ok(token)
}
687
/// Consumes a run of whitespace and returns it as a `WhiteSpace` token
/// borrowing the original input slice. `newline` says whether the first
/// byte is a newline (which needs line bookkeeping) rather than a space
/// or tab.
fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
    let start_position = tokenizer.position();
    if newline {
        tokenizer.consume_newline();
    } else {
        tokenizer.advance(1);
    }
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b' ' | b'\t' => {
                tokenizer.advance(1);
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            _ => {
                break
            }
        }
    }
    WhiteSpace(tokenizer.slice_from(start_position))
}
711
712fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) {
715 let directive = "# sourceMappingURL=";
716 let directive_old = "@ sourceMappingURL=";
717
718 if contents.starts_with(directive) || contents.starts_with(directive_old) {
720 let contents = &contents[directive.len()..];
721 tokenizer.source_map_url = contents.split([' ', '\t', '\x0C', '\r', '\n']).next();
722 }
723
724 let directive = "# sourceURL=";
725 let directive_old = "@ sourceURL=";
726
727 if contents.starts_with(directive) || contents.starts_with(directive_old) {
729 let contents = &contents[directive.len()..];
730 tokenizer.source_url = contents.split([' ', '\t', '\x0C', '\r', '\n']).next()
731 }
732}
733
/// Consumes a `/* ... */` comment; the tokenizer is positioned at the
/// `/*`. Returns the contents (excluding the delimiters); an unterminated
/// comment runs to end of input. The contents are also scanned for
/// source-map directives.
fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
    tokenizer.advance(2); // Skip the `/*`.
    let start_position = tokenizer.position();
    while !tokenizer.is_eof() {
        match_byte! { tokenizer.next_byte_unchecked(),
            b'*' => {
                let end_position = tokenizer.position();
                tokenizer.advance(1);
                // `*/` terminates the comment.
                if tokenizer.next_byte() == Some(b'/') {
                    tokenizer.advance(1);
                    let contents = tokenizer.slice(start_position..end_position);
                    check_for_source_map(tokenizer, contents);
                    return contents
                }
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            // Multi-byte characters need the column bookkeeping helpers.
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                tokenizer.advance(1);
            }
        }
    }
    // Unterminated comment: everything to EOF is the contents.
    let contents = tokenizer.slice_from(start_position);
    check_for_source_map(tokenizer, contents);
    contents
}
764
765fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
766 match consume_quoted_string(tokenizer, single_quote) {
767 Ok(value) => QuotedString(value),
768 Err(value) => BadString(value),
769 }
770}
771
/// Tokenizes a quoted string; the tokenizer is positioned at the opening
/// quote. Returns `Ok(contents)` for a well-formed string, or
/// `Err(contents_so_far)` when an unescaped newline terminates it (a bad
/// string).
///
/// The first loop borrows straight from the input; only once an escape or
/// NUL requires rewriting does it fall through to a copying loop.
fn consume_quoted_string<'a>(
    tokenizer: &mut Tokenizer<'a>,
    single_quote: bool,
) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
    tokenizer.advance(1); // Skip the initial quote.
    let start_pos = tokenizer.position();
    let mut string_bytes;
    loop {
        if tokenizer.is_eof() {
            // EOF implicitly closes the string.
            return Ok(tokenizer.slice_from(start_pos).into());
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'"' => {
                // Closes the string only if it was `"`-delimited.
                if !single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\'' => {
                // Closes the string only if it was `'`-delimited.
                if single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\\' | b'\0' => {
                // Escapes / NUL need rewriting: copy what we have so far
                // and continue in the slow loop below.
                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\n' | b'\r' | b'\x0C' => {
                // Unescaped newline: bad string.
                return Err(tokenizer.slice_from(start_pos).into())
            },
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                tokenizer.advance(1);
            }
        }
    }

    // Slow path: accumulate into `string_bytes`, decoding escapes and
    // replacing NUL with U+FFFD. Arms that do not `continue`/`break`/
    // `return` fall through to the `push(b)` below.
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'\n' | b'\r' | b'\x0C' => {
                return Err(
                    unsafe {
                        from_utf8_release_unchecked(string_bytes)
                    }.into()
                );
            }
            b'"' => {
                tokenizer.advance(1);
                if !single_quote {
                    break;
                }
            }
            b'\'' => {
                tokenizer.advance(1);
                if single_quote {
                    break;
                }
            }
            b'\\' => {
                tokenizer.advance(1);
                if !tokenizer.is_eof() {
                    match tokenizer.next_byte_unchecked() {
                        // An escaped newline is removed entirely.
                        b'\n' | b'\x0C' | b'\r' => {
                            tokenizer.consume_newline();
                        }
                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                    }
                }
                continue;
            }
            b'\0' => {
                tokenizer.advance(1);
                string_bytes.extend("\u{FFFD}".as_bytes());
                continue;
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                tokenizer.advance(1);
            },
        }

        string_bytes.push(b);
    }

    // SAFETY: only whole UTF-8 sequences from the input or UTF-8 produced
    // by the escape decoder were pushed.
    Ok(
        unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
    )
}
884
/// Whether the input at the current position starts an identifier: an
/// ASCII letter, `_`, NUL (replaced by U+FFFD later), a non-ASCII byte,
/// an escape (`\` not followed by a newline), or `-` followed by any of
/// those or another `-`.
#[inline]
fn is_ident_start(tokenizer: &Tokenizer) -> bool {
    !tokenizer.is_eof()
        && match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => true,
            b'-' => {
                // `-` starts an identifier only via its second byte.
                tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
                    b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
                        true
                    }
                    b'\\' => !tokenizer.has_newline_at(1),
                    b => !b.is_ascii(),
                }
            },
            b'\\' => !tokenizer.has_newline_at(1),
            b => !b.is_ascii(),
        }
}
903
904fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
905 let value = consume_name(tokenizer);
906 if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
907 tokenizer.advance(1);
908 if value.eq_ignore_ascii_case("url") {
909 consume_unquoted_url(tokenizer).unwrap_or(Function(value))
910 } else {
911 tokenizer.see_function(&value);
912 Function(value)
913 }
914 } else {
915 Ident(value)
916 }
917}
918
/// Consumes an identifier (name) at the current position and returns it.
/// The first loop borrows directly from the input; escapes and NULs fall
/// through to a copying loop that decodes them.
fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
    let start_pos = tokenizer.position();
    let mut value_bytes;
    loop {
        if tokenizer.is_eof() {
            return tokenizer.slice_from(start_pos).into();
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => tokenizer.advance(1),
            b'\\' | b'\0' => {
                // An escape or NUL requires rewriting the name: copy what
                // matched so far and continue in the slow loop below.
                value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            // Multi-byte characters: keep the column bookkeeping straight.
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _b => {
                // Any other ASCII byte ends the name.
                return tokenizer.slice_from(start_pos).into();
            }
        }
    }

    // Slow path: accumulate into `value_bytes`, decoding escapes and
    // replacing NUL with U+FFFD.
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => {
                tokenizer.advance(1);
                value_bytes.push(b)
            }
            b'\\' => {
                // `\` followed by a newline ends the name.
                if tokenizer.has_newline_at(1) { break }
                tokenizer.advance(1);
                consume_escape_and_write(tokenizer, &mut value_bytes)
            }
            b'\0' => {
                tokenizer.advance(1);
                value_bytes.extend("\u{FFFD}".as_bytes());
            },
            b'\x80'..=b'\xBF' => {
                tokenizer.consume_continuation_byte();
                value_bytes.push(b)
            }
            b'\xC0'..=b'\xEF' => {
                tokenizer.advance(1);
                value_bytes.push(b)
            }
            b'\xF0'..=b'\xFF' => {
                tokenizer.consume_4byte_intro();
                value_bytes.push(b)
            }
            _ => {
                // Non-name ASCII byte: the name ends here.
                break;
            }
        }
    }
    // SAFETY: only whole UTF-8 sequences from the input or UTF-8 produced
    // by the escape decoder were pushed.
    unsafe { from_utf8_release_unchecked(value_bytes) }.into()
}
989
/// Maps an ASCII hex-digit byte (`0-9`, `a-f`, `A-F`) to its numeric
/// value; any other byte yields `None`.
fn byte_to_hex_digit(b: u8) -> Option<u32> {
    // `char::to_digit` recognizes exactly the ASCII digits and letters in
    // the radix, and `b as char` maps bytes >= 0x80 to Latin-1 code points
    // that are never hex digits — so this matches a hand-written byte match.
    (b as char).to_digit(16)
}
1000
/// Maps an ASCII decimal-digit byte (`0-9`) to its numeric value; any
/// other byte yields `None`.
fn byte_to_decimal_digit(b: u8) -> Option<u32> {
    // Equivalent to checking `is_ascii_digit` and subtracting b'0'.
    (b as char).to_digit(10)
}
1008
/// Consumes a numeric token: a `Number`, a `Percentage` (trailing `%`),
/// or a `Dimension` (trailing identifier unit). The caller has already
/// verified via lookahead that the current position starts a number.
fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    // Grammar (validated by the caller's lookahead):
    //   [-+]? [0-9]* ('.' [0-9]+)? ([eE] [-+]? [0-9]+)?
    // Arithmetic is done in f64 and narrowed to f32 at the end.
    let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
        b'-' => (true, -1.),
        b'+' => (true, 1.),
        _ => (false, 1.),
    };
    if has_sign {
        tokenizer.advance(1);
    }

    let mut integral_part: f64 = 0.;
    while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
        integral_part = integral_part * 10. + digit as f64;
        tokenizer.advance(1);
        if tokenizer.is_eof() {
            break;
        }
    }

    // Becomes false as soon as a fractional part or exponent is seen.
    let mut is_integer = true;

    let mut fractional_part: f64 = 0.;
    if tokenizer.has_at_least(1)
        && tokenizer.next_byte_unchecked() == b'.'
        && tokenizer.byte_at(1).is_ascii_digit()
    {
        is_integer = false;
        tokenizer.advance(1); // Consume '.'
        let mut factor = 0.1;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            fractional_part += digit as f64 * factor;
            factor *= 0.1;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
    }

    let mut value = sign * (integral_part + fractional_part);

    // Scientific notation: `e`/`E` followed by an optionally-signed integer.
    if tokenizer.has_at_least(1)
        && matches!(tokenizer.next_byte_unchecked(), b'e' | b'E')
        && (tokenizer.byte_at(1).is_ascii_digit()
            || (tokenizer.has_at_least(2)
                && matches!(tokenizer.byte_at(1), b'+' | b'-')
                && tokenizer.byte_at(2).is_ascii_digit()))
    {
        is_integer = false;
        tokenizer.advance(1);
        let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
            b'-' => (true, -1.),
            b'+' => (true, 1.),
            _ => (false, 1.),
        };
        if has_sign {
            tokenizer.advance(1);
        }
        let mut exponent: f64 = 0.;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            exponent = exponent * 10. + digit as f64;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
        value *= f64::powf(10., sign * exponent);
    }

    // Integer literals also carry an i32, saturated at the i32 range.
    let int_value = if is_integer {
        Some(if value >= i32::MAX as f64 {
            i32::MAX
        } else if value <= i32::MIN as f64 {
            i32::MIN
        } else {
            value as i32
        })
    } else {
        None
    };

    // A trailing `%` makes this a percentage.
    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%' {
        tokenizer.advance(1);
        return Percentage {
            unit_value: (value / 100.) as f32,
            int_value,
            has_sign,
        };
    }
    let value = value as f32;
    // A trailing identifier makes this a dimension, e.g. `12px`.
    if is_ident_start(tokenizer) {
        let unit = consume_name(tokenizer);
        Dimension {
            value,
            int_value,
            has_sign,
            unit,
        }
    } else {
        Number {
            value,
            int_value,
            has_sign,
        }
    }
}
1121
/// Converts bytes the tokenizer produced — always valid UTF-8 — into a
/// `String`. Debug builds actually validate (panicking on a tokenizer
/// bug); release builds skip the check.
///
/// # Safety
///
/// `string_bytes` must be valid UTF-8.
#[inline]
unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
    // Resolved at compile time rather than branching on `cfg!` at runtime.
    #[cfg(debug_assertions)]
    return String::from_utf8(string_bytes).unwrap();
    #[cfg(not(debug_assertions))]
    return String::from_utf8_unchecked(string_bytes);
}
1130
/// Tokenizes the contents of `url(` after the opening parenthesis, for
/// unquoted URLs. Returns `Err(())` when a quote is found first, meaning
/// the argument is a quoted string: the caller then emits a `Function`
/// token and lets the parser consume the string normally.
fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    // Fast scan over the whitespace between `url(` and the value, done
    // directly on the byte slice; line bookkeeping for any skipped
    // newlines is patched up once at the end.
    let start_position = tokenizer.position;
    let from_start = &tokenizer.input[tokenizer.position..];
    let mut newlines = 0;
    let mut last_newline = 0;
    let mut found_printable_char = false;
    let mut iter = from_start.bytes().enumerate();
    loop {
        let (offset, b) = match iter.next() {
            Some(item) => item,
            None => {
                // EOF before any content: empty URL.
                tokenizer.position = tokenizer.input.len();
                break;
            }
        };
        match_byte! { b,
            b' ' | b'\t' => {},
            b'\n' | b'\x0C' => {
                newlines += 1;
                last_newline = offset;
            }
            b'\r' => {
                // CR LF counts as one newline, recorded at the LF.
                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                    newlines += 1;
                    last_newline = offset;
                }
            }
            b'"' | b'\'' => return Err(()), // The URL argument is quoted.
            b')' => {
                // Empty `url()`: consume the closing parenthesis too.
                tokenizer.position += offset + 1;
                break
            }
            _ => {
                // First byte of the URL value; leave it unconsumed for
                // the inner tokenizer below.
                tokenizer.position += offset;
                found_printable_char = true;
                break
            }
        }
    }

    // Apply line bookkeeping for the newlines skipped above.
    if newlines > 0 {
        tokenizer.current_line_number += newlines;
        tokenizer.current_line_start_position = start_position + last_newline + 1;
    }

    if found_printable_char {
        // Non-empty value: tokenize it properly.
        return Ok(consume_unquoted_url_internal(tokenizer));
    } else {
        return Ok(UnquotedUrl("".into()));
    }

    /// Tokenizes the non-empty body of an unquoted URL, starting at its
    /// first character. Borrows from the input until an escape or NUL
    /// forces a copy into an owned buffer.
    fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
        let start_pos = tokenizer.position();
        let mut string_bytes: Vec<u8>;
        loop {
            if tokenizer.is_eof() {
                // EOF implicitly closes the URL.
                return UnquotedUrl(tokenizer.slice_from(start_pos).into());
            }
            match_byte! { tokenizer.next_byte_unchecked(),
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    // Whitespace inside a URL may only be followed by `)`.
                    let value = tokenizer.slice_from(start_pos);
                    return consume_url_end(tokenizer, start_pos, value.into())
                }
                b')' => {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return UnquotedUrl(value.into())
                }
                // Control characters, quotes and `(` are invalid in an
                // unquoted URL.
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos)
                },
                b'\\' | b'\0' => {
                    // Escapes / NUL need rewriting: copy what we have and
                    // continue in the slow loop below.
                    string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                    break
                }
                b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
                b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
                _ => {
                    tokenizer.advance(1);
                }
            }
        }
        // Slow path: same states as above, accumulating into `string_bytes`.
        while !tokenizer.is_eof() {
            let b = tokenizer.next_byte_unchecked();
            match_byte! { b,
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
                    return consume_url_end(tokenizer, start_pos, string)
                }
                b')' => {
                    tokenizer.advance(1);
                    break;
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos);
                }
                b'\\' => {
                    tokenizer.advance(1);
                    // An escaped newline is not allowed in a URL.
                    if tokenizer.has_newline_at(0) {
                        return consume_bad_url(tokenizer, start_pos)
                    }

                    consume_escape_and_write(tokenizer, &mut string_bytes)
                },
                b'\0' => {
                    tokenizer.advance(1);
                    string_bytes.extend("\u{FFFD}".as_bytes());
                }
                b'\x80'..=b'\xBF' => {
                    tokenizer.consume_continuation_byte();
                    string_bytes.push(b);
                }
                b'\xF0'..=b'\xFF' => {
                    tokenizer.consume_4byte_intro();
                    string_bytes.push(b);
                }
                b => {
                    tokenizer.advance(1);
                    string_bytes.push(b)
                }
            }
        }
        UnquotedUrl(
            unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
        )
    }

    /// After whitespace inside a URL: only further whitespace and then
    /// `)` keep the URL valid; anything else makes it a bad URL.
    fn consume_url_end<'a>(
        tokenizer: &mut Tokenizer<'a>,
        start_pos: SourcePosition,
        string: CowRcStr<'a>,
    ) -> Token<'a> {
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    tokenizer.advance(1);
                    break
                }
                b' ' | b'\t' => { tokenizer.advance(1); }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                    return consume_bad_url(tokenizer, start_pos);
                }
            }
        }
        UnquotedUrl(string)
    }

    /// Error recovery: skip to the closing `)` (honoring `\)` escapes) or
    /// EOF, returning everything consumed as a `BadUrl` token.
    fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    let contents = tokenizer.slice_from(start_pos).into();
                    tokenizer.advance(1);
                    return BadUrl(contents)
                }
                b'\\' => {
                    tokenizer.advance(1);
                    // An escaped `)` or `\` does not terminate the bad URL.
                    if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                        tokenizer.advance(1);
                    }
                }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                }
            }
        }
        BadUrl(tokenizer.slice_from(start_pos).into())
    }
}
1338
1339fn consume_hex_digits(tokenizer: &mut Tokenizer<'_>) -> (u32, u32) {
1341 let mut value = 0;
1342 let mut digits = 0;
1343 while digits < 6 && !tokenizer.is_eof() {
1344 match byte_to_hex_digit(tokenizer.next_byte_unchecked()) {
1345 Some(digit) => {
1346 value = value * 16 + digit;
1347 digits += 1;
1348 tokenizer.advance(1);
1349 }
1350 None => break,
1351 }
1352 }
1353 (value, digits)
1354}
1355
1356fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
1359 bytes.extend(
1360 consume_escape(tokenizer)
1361 .encode_utf8(&mut [0; 4])
1362 .as_bytes(),
1363 )
1364}
1365
/// Consumes an escape sequence; the `\` itself has already been consumed.
/// A hex escape reads up to six hex digits plus one optional trailing
/// whitespace character. NUL, surrogates, and out-of-range code points
/// all collapse to U+FFFD, as does an escape at end of input.
fn consume_escape(tokenizer: &mut Tokenizer) -> char {
    if tokenizer.is_eof() {
        return '\u{FFFD}'; // Escape at EOF.
    }
    match_byte! { tokenizer.next_byte_unchecked(),
        b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
            let (c, _) = consume_hex_digits(tokenizer);
            // One whitespace character after a hex escape belongs to it.
            if !tokenizer.is_eof() {
                match_byte! { tokenizer.next_byte_unchecked(),
                    b' ' | b'\t' => {
                        tokenizer.advance(1)
                    }
                    b'\n' | b'\x0C' | b'\r' => {
                        tokenizer.consume_newline();
                    }
                    _ => {}
                }
            }
            static REPLACEMENT_CHAR: char = '\u{FFFD}';
            if c != 0 {
                // `from_u32` returns None for surrogates and values above
                // U+10FFFF.
                let c = char::from_u32(c);
                c.unwrap_or(REPLACEMENT_CHAR)
            } else {
                REPLACEMENT_CHAR
            }
        },
        b'\0' => {
            tokenizer.advance(1);
            '\u{FFFD}'
        }
        _ => tokenizer.consume_char(),
    }
}