urlpattern/
parser.rs

1// Copyright 2018-2021 the Deno authors. All rights reserved. MIT license.
2
3use crate::error::ParserError;
4use crate::tokenizer::Token;
5use crate::tokenizer::TokenType;
6use crate::Error;
7
// Ref: https://wicg.github.io/urlpattern/#full-wildcard-regexp-value
/// Regexp source that stands for a full wildcard. The parser substitutes it
/// when a regexp-or-wildcard token is an asterisk, and compares regexp values
/// against it to classify a part as a full wildcard.
pub const FULL_WILDCARD_REGEXP_VALUE: &str = ".*";
10
/// Selects the regular-expression dialect that generated patterns target.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RegexSyntax {
  /// Emit regexes in rust-regex syntax. This is the default.
  Rust,
  /// Emit regexes in ECMAScript syntax. Intended to be used together with
  /// [crate::quirks::component_regex].
  ///
  /// NOTE: when this syntax is selected, the regex syntax is NOT validated
  /// during parsing.
  EcmaScript,
}

// Ref: https://wicg.github.io/urlpattern/#options-header
/// Per-component settings that steer pattern parsing and regexp generation.
#[derive(Debug, Clone)]
pub struct Options {
  /// Code point that separates segments, if the component has one.
  pub delimiter_code_point: Option<char>,
  /// Code point that is treated as an implicit group prefix.
  pub prefix_code_point: String, // TODO: It must contain one ASCII code point or the empty string. maybe Option<char>?
  /// Regex dialect to generate.
  pub regex_syntax: RegexSyntax,
  /// Whether matching should ignore case.
  pub ignore_case: bool,
}

impl Default for Options {
  // Ref: https://wicg.github.io/urlpattern/#default-options
  /// Options with no delimiter, an empty prefix, rust-regex syntax and
  /// case-sensitive matching.
  #[inline]
  fn default() -> Self {
    Self {
      delimiter_code_point: None,
      prefix_code_point: String::new(),
      regex_syntax: RegexSyntax::Rust,
      ignore_case: false,
    }
  }
}

impl Options {
  // Ref: https://wicg.github.io/urlpattern/#hostname-options
  /// Options for hostnames: the defaults, with `.` as the delimiter.
  #[inline]
  pub fn hostname() -> Self {
    Self {
      delimiter_code_point: Some('.'),
      ..Self::default()
    }
  }

  // Ref: https://wicg.github.io/urlpattern/#pathname-options
  /// Options for pathnames: the defaults, with `/` as both the delimiter and
  /// the prefix.
  #[inline]
  pub fn pathname() -> Self {
    Self {
      delimiter_code_point: Some('/'),
      prefix_code_point: "/".to_owned(),
      ..Self::default()
    }
  }

  // Ref: https://wicg.github.io/urlpattern/#escape-a-regexp-string
  /// Returns `input` with a backslash inserted before every regexp
  /// metacharacter.
  ///
  /// `/` is only escaped when the syntax is [`RegexSyntax::EcmaScript`]. This
  /// deviates from the spec because the rust regexp crate does not handle
  /// `\/` as a valid escape sequence.
  ///
  /// # Panics
  ///
  /// Panics if `input` is not ASCII.
  pub fn escape_regexp_string(&self, input: &str) -> String {
    assert!(input.is_ascii());
    let escape_slash = self.regex_syntax == RegexSyntax::EcmaScript;
    let mut escaped = String::new();
    for c in input.chars() {
      let needs_backslash = match c {
        '.' | '+' | '*' | '?' | '^' | '$' | '{' | '}' | '(' | ')' | '['
        | ']' | '|' | '\\' => true,
        '/' => escape_slash,
        _ => false,
      };
      if needs_backslash {
        escaped.push('\\');
      }
      escaped.push(c);
    }
    escaped
  }

  // Ref: https://wicg.github.io/urlpattern/#generate-a-segment-wildcard-regexp
  /// Returns the regexp source that lazily matches one segment: one or more
  /// characters other than the delimiter, or one or more of any character
  /// when there is no delimiter.
  #[inline]
  pub fn generate_segment_wildcard_regexp(&self) -> String {
    // NOTE: this is a deliberate deviation from the spec. In rust-regex, you
    // can not have a negative character class without specifying any
    // characters.
    match self.delimiter_code_point {
      Some(delimiter) => {
        let mut utf8 = [0; 4];
        let escaped =
          self.escape_regexp_string(delimiter.encode_utf8(&mut utf8));
        format!("[^{}]+?", escaped)
      }
      None => String::from(".+?"),
    }
  }
}
117
// Ref: https://wicg.github.io/urlpattern/#part-type
/// The kind of matcher a [`Part`] carries.
#[derive(Debug, Eq, PartialEq)]
pub enum PartType {
  /// Literal text.
  FixedText,
  /// A custom regular expression.
  Regexp,
  /// The segment wildcard.
  SegmentWildcard,
  /// The full wildcard.
  FullWildcard,
}

// Ref: https://wicg.github.io/urlpattern/#part-modifier
/// The modifier attached to a [`Part`].
#[derive(Debug, Eq, PartialEq)]
pub enum PartModifier {
  /// No modifier.
  None,
  /// Written as `?`.
  Optional,
  /// Written as `*`.
  ZeroOrMore,
  /// Written as `+`.
  OneOrMore,
}

impl std::fmt::Display for PartModifier {
  // Ref: https://wicg.github.io/urlpattern/#convert-a-modifier-to-a-string
  /// Writes the modifier's pattern-syntax character, or nothing for
  /// [`PartModifier::None`].
  fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    let symbol = match self {
      PartModifier::OneOrMore => "+",
      PartModifier::ZeroOrMore => "*",
      PartModifier::Optional => "?",
      PartModifier::None => "",
    };
    f.write_str(symbol)
  }
}

// Ref: https://wicg.github.io/urlpattern/#part
/// One element of a parsed pattern.
#[derive(Debug)]
pub struct Part {
  /// Which kind of matcher this part is.
  pub kind: PartType,
  /// The part's text or regexp source.
  pub value: String,
  /// Modifier applied to the part.
  pub modifier: PartModifier,
  /// Group name.
  pub name: String,
  /// Text stored as the part's prefix.
  pub prefix: String,
  /// Text stored as the part's suffix.
  pub suffix: String,
}

impl Part {
  /// Builds a part from `kind`, `value` and `modifier`, leaving `name`,
  /// `prefix` and `suffix` empty.
  fn new(kind: PartType, value: String, modifier: PartModifier) -> Self {
    let empty = String::new;
    Part {
      kind,
      value,
      modifier,
      name: empty(),
      prefix: empty(),
      suffix: empty(),
    }
  }
}
171
172// Ref: https://wicg.github.io/urlpattern/#pattern-parser
173struct PatternParser<F>
174where
175  F: Fn(&str) -> Result<String, Error>,
176{
177  token_list: Vec<Token>,
178  encoding_callback: F,
179  segment_wildcard_regexp: String,
180  part_list: Vec<Part>,
181  pending_fixed_value: String,
182  index: usize,
183  next_numeric_name: usize,
184}
185
186impl<F> PatternParser<F>
187where
188  F: Fn(&str) -> Result<String, Error>,
189{
190  // Ref: https://wicg.github.io/urlpattern/#try-to-consume-a-token
191  fn try_consume_token(&mut self, kind: TokenType) -> Option<Token> {
192    assert!(self.index < self.token_list.len());
193    let next_token = self.token_list[self.index].clone();
194    if next_token.kind != kind {
195      None
196    } else {
197      self.index += 1;
198      Some(next_token)
199    }
200  }
201
202  // Ref: https://wicg.github.io/urlpattern/#try-to-consume-a-regexp-or-wildcard-token
203  #[inline]
204  fn try_consume_regexp_or_wildcard_token(
205    &mut self,
206    name_token_is_none: bool,
207  ) -> Option<Token> {
208    let token = self.try_consume_token(TokenType::Regexp);
209    if name_token_is_none && token.is_none() {
210      self.try_consume_token(TokenType::Asterisk)
211    } else {
212      token
213    }
214  }
215
216  // Ref: https://wicg.github.io/urlpattern/#try-to-consume-a-modifier-token
217  #[inline]
218  fn try_consume_modifier_token(&mut self) -> Option<Token> {
219    self
220      .try_consume_token(TokenType::OtherModifier)
221      .or_else(|| self.try_consume_token(TokenType::Asterisk))
222  }
223
224  // Ref: https://wicg.github.io/urlpattern/#maybe-add-a-part-from-the-pending-fixed-value
225  #[inline]
226  fn maybe_add_part_from_pending_fixed_value(&mut self) -> Result<(), Error> {
227    if self.pending_fixed_value.is_empty() {
228      return Ok(());
229    }
230    let encoded_value = (self.encoding_callback)(&self.pending_fixed_value)?;
231    self.pending_fixed_value = String::new();
232    self.part_list.push(Part::new(
233      PartType::FixedText,
234      encoded_value,
235      PartModifier::None,
236    ));
237
238    Ok(())
239  }
240
241  // Ref: https://wicg.github.io/urlpattern/#add-a-part
242  fn add_part(
243    &mut self,
244    prefix: &str,
245    name_token: Option<Token>,
246    regexp_or_wildcard_token: Option<Token>,
247    suffix: &str,
248    modifier_token: Option<Token>,
249  ) -> Result<(), Error> {
250    let mut modifier = PartModifier::None;
251    if let Some(modifier_token) = modifier_token {
252      modifier = match modifier_token.value.as_ref() {
253        "?" => PartModifier::Optional,
254        "*" => PartModifier::ZeroOrMore,
255        "+" => PartModifier::OneOrMore,
256        _ => unreachable!(),
257      };
258    }
259    if name_token.is_none()
260      && regexp_or_wildcard_token.is_none()
261      && modifier == PartModifier::None
262    {
263      self.pending_fixed_value.push_str(prefix);
264      return Ok(());
265    }
266    self.maybe_add_part_from_pending_fixed_value()?;
267    if name_token.is_none() && regexp_or_wildcard_token.is_none() {
268      assert!(suffix.is_empty());
269      if prefix.is_empty() {
270        return Ok(());
271      }
272      let encoded_value = (self.encoding_callback)(prefix)?;
273      self.part_list.push(Part::new(
274        PartType::FixedText,
275        encoded_value,
276        modifier,
277      ));
278      return Ok(());
279    }
280
281    let mut regexp_value = match &regexp_or_wildcard_token {
282      None => self.segment_wildcard_regexp.to_owned(),
283      Some(regexp_or_wildcard_token) => {
284        if regexp_or_wildcard_token.kind == TokenType::Asterisk {
285          FULL_WILDCARD_REGEXP_VALUE.to_string()
286        } else {
287          regexp_or_wildcard_token.value.to_owned()
288        }
289      }
290    };
291
292    let mut kind = PartType::Regexp;
293    if regexp_value == self.segment_wildcard_regexp {
294      kind = PartType::SegmentWildcard;
295      regexp_value = String::new();
296    } else if regexp_value == FULL_WILDCARD_REGEXP_VALUE {
297      kind = PartType::FullWildcard;
298      regexp_value = String::new();
299    }
300
301    let mut name = String::new();
302    if let Some(name_token) = name_token {
303      name = name_token.value;
304    } else if regexp_or_wildcard_token.is_some() {
305      name = self.next_numeric_name.to_string();
306      self.next_numeric_name += 1;
307    }
308    if self.is_duplicate_name(&name) {
309      return Err(Error::Parser(ParserError::DuplicateName(name)));
310    }
311    let encoded_prefix = (self.encoding_callback)(prefix)?;
312    let encoded_suffix = (self.encoding_callback)(suffix)?;
313    self.part_list.push(Part {
314      kind,
315      value: regexp_value,
316      modifier,
317      name,
318      prefix: encoded_prefix,
319      suffix: encoded_suffix,
320    });
321
322    Ok(())
323  }
324
325  // Ref: https://wicg.github.io/urlpattern/#is-a-duplicate-name
326  fn is_duplicate_name(&self, name: &str) -> bool {
327    self.part_list.iter().any(|p| p.name == name)
328  }
329
330  // Ref: https://wicg.github.io/urlpattern/#consume-text
331  fn consume_text(&mut self) -> String {
332    let mut result = String::new();
333    loop {
334      let mut token = self.try_consume_token(TokenType::Char);
335      if token.is_none() {
336        token = self.try_consume_token(TokenType::EscapedChar);
337      }
338      if token.is_none() {
339        break;
340      }
341      result.push_str(&token.unwrap().value);
342    }
343    result
344  }
345
346  // Ref: https://wicg.github.io/urlpattern/#consume-a-required-token
347  #[inline]
348  fn consume_required_token(
349    &mut self,
350    kind: TokenType,
351  ) -> Result<Token, Error> {
352    self.try_consume_token(kind.clone()).ok_or_else(|| {
353      Error::Parser(ParserError::ExpectedToken(
354        kind,
355        self.token_list[self.index].kind.clone(),
356        self.token_list[self.index].value.clone(),
357      ))
358    })
359  }
360}
361
362// Ref: https://wicg.github.io/urlpattern/#parse-a-pattern-string
363pub fn parse_pattern_string<F>(
364  input: &str,
365  options: &Options,
366  encoding_callback: F,
367) -> Result<Vec<Part>, Error>
368where
369  F: Fn(&str) -> Result<String, Error>,
370{
371  let token_list = crate::tokenizer::tokenize(
372    input,
373    crate::tokenizer::TokenizePolicy::Strict,
374  )?;
375
376  let mut parser = PatternParser {
377    token_list,
378    encoding_callback,
379    segment_wildcard_regexp: options.generate_segment_wildcard_regexp(),
380    part_list: vec![],
381    pending_fixed_value: String::new(),
382    index: 0,
383    next_numeric_name: 0,
384  };
385
386  while parser.index < parser.token_list.len() {
387    let char_token = parser.try_consume_token(TokenType::Char);
388    let mut name_token = parser.try_consume_token(TokenType::Name);
389    let mut regexp_or_wildcard_token =
390      parser.try_consume_regexp_or_wildcard_token(name_token.is_none());
391    if name_token.is_some() || regexp_or_wildcard_token.is_some() {
392      let mut prefix = String::new();
393      if let Some(char_token) = char_token {
394        char_token.value.clone_into(&mut prefix);
395      }
396      if !prefix.is_empty() && prefix != options.prefix_code_point {
397        parser.pending_fixed_value.push_str(&prefix);
398        prefix = String::new();
399      }
400      parser.maybe_add_part_from_pending_fixed_value()?;
401      let modifier_token = parser.try_consume_modifier_token();
402      parser.add_part(
403        &prefix,
404        name_token,
405        regexp_or_wildcard_token,
406        "",
407        modifier_token,
408      )?;
409      continue;
410    }
411    let mut fixed_token = char_token;
412    if fixed_token.is_none() {
413      fixed_token = parser.try_consume_token(TokenType::EscapedChar);
414    }
415    if let Some(fixed_token) = fixed_token {
416      parser.pending_fixed_value.push_str(&fixed_token.value);
417      continue;
418    }
419    let open_token = parser.try_consume_token(TokenType::Open);
420    if open_token.is_some() {
421      let prefix = parser.consume_text();
422      name_token = parser.try_consume_token(TokenType::Name);
423      regexp_or_wildcard_token =
424        parser.try_consume_regexp_or_wildcard_token(name_token.is_none());
425      let suffix = parser.consume_text();
426      parser.consume_required_token(TokenType::Close)?;
427      let modifier_token = parser.try_consume_modifier_token();
428      parser.add_part(
429        &prefix,
430        name_token,
431        regexp_or_wildcard_token,
432        &suffix,
433        modifier_token,
434      )?;
435      continue;
436    }
437    parser.maybe_add_part_from_pending_fixed_value()?;
438    parser.consume_required_token(TokenType::End)?;
439  }
440
441  Ok(parser.part_list)
442}