urlpattern/tokenizer.rs

// Copyright 2018-2021 the Deno authors. All rights reserved. MIT license.

use crate::error::TokenizerError;
use crate::Error;

// Ref: https://wicg.github.io/urlpattern/#tokens
// Ref: https://wicg.github.io/urlpattern/#tokenizing

// Ref: https://wicg.github.io/urlpattern/#token-type
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum TokenType {
  Open,
  Close,
  Regexp,
  Name,
  Char,
  EscapedChar,
  OtherModifier,
  Asterisk,
  End,
  InvalidChar,
}
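
// For example, the pattern "/:id(\d+)?" tokenizes into Char "/", Name "id",
// Regexp "\d+", OtherModifier "?", and a trailing empty End token.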

// Ref: https://wicg.github.io/urlpattern/#token
#[derive(Debug, Clone)]
pub struct Token {
  pub kind: TokenType,
  pub index: usize,
  pub value: String,
}

// Ref: https://wicg.github.io/urlpattern/#tokenize-policy
#[derive(Debug, Eq, PartialEq)]
pub enum TokenizePolicy {
  Strict,
  Lenient,
}
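
// Note: under `Strict`, a tokenizing error aborts with `Error::Tokenizer`;
// under `Lenient`, the offending span is emitted as an `InvalidChar` token
// instead (see `process_tokenizing_error` below).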

// Ref: https://wicg.github.io/urlpattern/#tokenizer
struct Tokenizer {
  input: Vec<char>,
  policy: TokenizePolicy,
  token_list: Vec<Token>,
  index: usize,             // position where the token currently being scanned starts
  next_index: usize,        // index of the next code point to consume
  code_point: Option<char>, // TODO: get rid of Option
}

impl Tokenizer {
  // Ref: https://wicg.github.io/urlpattern/#get-the-next-code-point
  #[inline]
  fn get_next_codepoint(&mut self) {
    self.code_point = Some(self.input[self.next_index]);
    self.next_index += 1;
  }

  // Ref: https://wicg.github.io/urlpattern/#add-a-token-with-default-position-and-length
  #[inline]
  fn add_token_with_default_pos_and_len(&mut self, kind: TokenType) {
    self.add_token_with_default_len(kind, self.next_index, self.index);
  }

  // Ref: https://wicg.github.io/urlpattern/#add-a-token-with-default-length
  #[inline]
  fn add_token_with_default_len(
    &mut self,
    kind: TokenType,
    next_pos: usize,
    value_pos: usize,
  ) {
    self.add_token(kind, next_pos, value_pos, next_pos - value_pos);
  }

  // Ref: https://wicg.github.io/urlpattern/#add-a-token
  #[inline]
  fn add_token(
    &mut self,
    kind: TokenType,
    next_pos: usize,
    value_pos: usize,
    value_len: usize,
  ) {
    let range = value_pos..(value_pos + value_len);
    let value = self.input[range].iter().collect::<String>();
    self.token_list.push(Token {
      kind,
      index: self.index,
      value,
    });
    self.index = next_pos;
  }

  // Ref: https://wicg.github.io/urlpattern/#process-a-tokenizing-error
  fn process_tokenizing_error(
    &mut self,
    next_pos: usize,
    value_pos: usize,
    error: TokenizerError,
  ) -> Result<(), Error> {
    if self.policy == TokenizePolicy::Strict {
      Err(Error::Tokenizer(error, value_pos))
    } else {
      self.add_token_with_default_len(
        TokenType::InvalidChar,
        next_pos,
        value_pos,
      );
      Ok(())
    }
  }

  // Ref: https://wicg.github.io/urlpattern/#seek-and-get-the-next-code-point
  #[inline]
  fn seek_and_get_next_codepoint(&mut self, index: usize) {
    self.next_index = index;
    self.get_next_codepoint();
  }
}
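
// Worked example of the bookkeeping above: tokenizing "/:id", the ':' is
// consumed at index 1 and the name scan stops at position 4, so
// add_token_with_default_len(Name, 4, 2) records a Name token with value
// "id" and index 1, then advances `index` to 4.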

// Ref: https://wicg.github.io/urlpattern/#tokenize
pub fn tokenize(
  input: &str,
  policy: TokenizePolicy,
) -> Result<Vec<Token>, Error> {
  let mut tokenizer = Tokenizer {
    input: input.chars().collect::<Vec<char>>(),
    policy,
    token_list: vec![],
    index: 0,
    next_index: 0,
    code_point: None,
  };

  while tokenizer.index < tokenizer.input.len() {
    tokenizer.seek_and_get_next_codepoint(tokenizer.index);

    if tokenizer.code_point == Some('*') {
      tokenizer.add_token_with_default_pos_and_len(TokenType::Asterisk);
      continue;
    }
    if matches!(tokenizer.code_point, Some('+') | Some('?')) {
      tokenizer.add_token_with_default_pos_and_len(TokenType::OtherModifier);
      continue;
    }
    if tokenizer.code_point == Some('\\') {
      if tokenizer.index == (tokenizer.input.len() - 1) {
        tokenizer.process_tokenizing_error(
          tokenizer.next_index,
          tokenizer.index,
          TokenizerError::IncompleteEscapeCode,
        )?;
        continue;
      }
      let escaped_index = tokenizer.next_index;
      tokenizer.get_next_codepoint();
      tokenizer.add_token_with_default_len(
        TokenType::EscapedChar,
        tokenizer.next_index,
        escaped_index,
      );
      continue;
    }
    if tokenizer.code_point == Some('{') {
      tokenizer.add_token_with_default_pos_and_len(TokenType::Open);
      continue;
    }
    if tokenizer.code_point == Some('}') {
      tokenizer.add_token_with_default_pos_and_len(TokenType::Close);
      continue;
    }
    if tokenizer.code_point == Some(':') {
      let mut name_pos = tokenizer.next_index;
      let name_start = name_pos;
      while name_pos < tokenizer.input.len() {
        tokenizer.seek_and_get_next_codepoint(name_pos);
        let first_code_point = name_pos == name_start;
        let valid_codepoint = is_valid_name_codepoint(
          tokenizer.code_point.unwrap(),
          first_code_point,
        );
        if !valid_codepoint {
          break;
        }
        name_pos = tokenizer.next_index;
      }
      if name_pos <= name_start {
        tokenizer.process_tokenizing_error(
          name_start,
          tokenizer.index,
          TokenizerError::InvalidName,
        )?;
        continue;
      }
      tokenizer.add_token_with_default_len(
        TokenType::Name,
        name_pos,
        name_start,
      );
      continue;
    }

    if tokenizer.code_point == Some('(') {
      let mut depth = 1;
      let mut regexp_pos = tokenizer.next_index;
      let regexp_start = regexp_pos;
      let mut error = false;
      // TODO: input code point length
      while regexp_pos < tokenizer.input.len() {
        tokenizer.seek_and_get_next_codepoint(regexp_pos);
        if !tokenizer.code_point.unwrap().is_ascii()
          || (regexp_pos == regexp_start && tokenizer.code_point == Some('?'))
        {
          tokenizer.process_tokenizing_error(
            regexp_start,
            tokenizer.index,
            TokenizerError::InvalidRegex(
              "must not start with ?, and may only contain ascii",
            ),
          )?;
          error = true;
          break;
        }
        if tokenizer.code_point == Some('\\') {
          if regexp_pos == (tokenizer.input.len() - 1) {
            tokenizer.process_tokenizing_error(
              regexp_start,
              tokenizer.index,
              TokenizerError::IncompleteEscapeCode,
            )?;
            error = true;
            break;
          }
          tokenizer.get_next_codepoint();
          if !tokenizer.code_point.unwrap().is_ascii() {
            tokenizer.process_tokenizing_error(
              regexp_start,
              tokenizer.index,
              TokenizerError::InvalidRegex("non ascii character was escaped"),
            )?;
            error = true;
            break;
          }
          regexp_pos = tokenizer.next_index;
          continue;
        }
        if tokenizer.code_point == Some(')') {
          depth -= 1;
          if depth == 0 {
            regexp_pos = tokenizer.next_index;
            break;
          }
        } else if tokenizer.code_point == Some('(') {
          depth += 1;
          if regexp_pos == (tokenizer.input.len() - 1) {
            tokenizer.process_tokenizing_error(
              regexp_start,
              tokenizer.index,
              TokenizerError::InvalidRegex("nested groups not closed"),
            )?;
            error = true;
            break;
          }
          // Peek one code point ahead: nested groups must be non-capturing,
          // i.e. start with "(?". Restore next_index afterwards so the
          // peeked code point is consumed by the normal loop step.
          let temp_pos = tokenizer.next_index;
          tokenizer.get_next_codepoint();
          if tokenizer.code_point != Some('?') {
            tokenizer.process_tokenizing_error(
              regexp_start,
              tokenizer.index,
              TokenizerError::InvalidRegex("nested groups must start with ?"),
            )?;
            error = true;
            break;
          }
          tokenizer.next_index = temp_pos;
        }
        regexp_pos = tokenizer.next_index;
      }
      if error {
        continue;
      }
      if depth != 0 {
        tokenizer.process_tokenizing_error(
          regexp_start,
          tokenizer.index,
          TokenizerError::InvalidRegex("missing closing )"),
        )?;
        continue;
      }
      // regexp_pos now points one past the closing ')', so subtract 1 to
      // exclude it from the token value.
      let regexp_len = regexp_pos - regexp_start - 1;
      if regexp_len == 0 {
        tokenizer.process_tokenizing_error(
          regexp_start,
          tokenizer.index,
          TokenizerError::InvalidRegex("length must be > 0"),
        )?;
        continue;
      }
      tokenizer.add_token(
        TokenType::Regexp,
        regexp_pos,
        regexp_start,
        regexp_len,
      );
      continue;
    }

    tokenizer.add_token_with_default_pos_and_len(TokenType::Char);
  }

  tokenizer.add_token_with_default_len(
    TokenType::End,
    tokenizer.index,
    tokenizer.index,
  );
  Ok(tokenizer.token_list)
}

// Ref: https://wicg.github.io/urlpattern/#is-a-valid-name-code-point
#[inline]
pub(crate) fn is_valid_name_codepoint(code_point: char, first: bool) -> bool {
  if first {
    unic_ucd_ident::is_id_start(code_point) || matches!(code_point, '$' | '_')
  } else {
    unic_ucd_ident::is_id_continue(code_point)
      || matches!(code_point, '$' | '\u{200C}' | '\u{200D}')
  }
}
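
// A minimal test sketch (not part of the upstream file) exercising
// `tokenize` on a small pattern; the expected kinds and values below
// follow from tracing the loop above.
#[cfg(test)]
mod tests {
  use super::*;

  #[test]
  fn tokenizes_name_and_regexp() {
    let tokens = tokenize("/:id(\\d+)", TokenizePolicy::Strict).unwrap();
    let kinds: Vec<TokenType> =
      tokens.iter().map(|t| t.kind.clone()).collect();
    assert_eq!(
      kinds,
      vec![
        TokenType::Char,   // "/"
        TokenType::Name,   // "id"
        TokenType::Regexp, // "\d+"
        TokenType::End,
      ]
    );
    assert_eq!(tokens[1].value, "id");
    assert_eq!(tokens[2].value, "\\d+");
  }

  #[test]
  fn lenient_policy_yields_invalid_char_token() {
    // A lone "(" never closes, so strict mode errors while lenient mode
    // produces an InvalidChar token instead.
    assert!(tokenize("(", TokenizePolicy::Strict).is_err());
    let tokens = tokenize("(", TokenizePolicy::Lenient).unwrap();
    assert_eq!(tokens[0].kind, TokenType::InvalidChar);
  }
}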