use crate::error::TokenizerError;
use crate::Error;
5
/// The kind of a token produced by the tokenizer.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum TokenType {
  Open,
  Close,
  Regexp,
  Name,
  Char,
  EscapedChar,
  OtherModifier,
  Asterisk,
  End,
  InvalidChar,
}
23
/// A single token: its kind, the index in the input where it starts, and its
/// string value.
#[derive(Debug, Clone)]
pub struct Token {
  pub kind: TokenType,
  pub index: usize,
  pub value: String,
}
31
/// Whether tokenizing errors are fatal (`Strict`) or are recorded as
/// `InvalidChar` tokens and skipped (`Lenient`).
#[derive(Debug, Eq, PartialEq)]
pub enum TokenizePolicy {
  Strict,
  Lenient,
}
38
struct Tokenizer {
  input: Vec<char>,
  policy: TokenizePolicy,
  token_list: Vec<Token>,
  /// Position of the code point currently being processed.
  index: usize,
  /// Position of the next code point to read.
  next_index: usize,
  code_point: Option<char>,
}
48
impl Tokenizer {
  /// Read the code point at `next_index` and advance `next_index` by one.
  #[inline]
  fn get_next_codepoint(&mut self) {
    self.code_point = Some(self.input[self.next_index]);
    self.next_index += 1;
  }

  /// Add a token whose value starts at the current `index` and ends at
  /// `next_index`.
  #[inline]
  fn add_token_with_default_pos_and_len(&mut self, kind: TokenType) {
    self.add_token_with_default_len(kind, self.next_index, self.index);
  }

  /// Add a token whose value spans from `value_pos` up to `next_pos`.
  #[inline]
  fn add_token_with_default_len(
    &mut self,
    kind: TokenType,
    next_pos: usize,
    value_pos: usize,
  ) {
    self.add_token(kind, next_pos, value_pos, next_pos - value_pos);
  }

  /// Add a token with an explicit value position and length, then advance
  /// `index` to `next_pos`.
  #[inline]
  fn add_token(
    &mut self,
    kind: TokenType,
    next_pos: usize,
    value_pos: usize,
    value_len: usize,
  ) {
    let range = value_pos..(value_pos + value_len);
    let value = self.input[range].iter().collect::<String>();
    self.token_list.push(Token {
      kind,
      index: self.index,
      value,
    });
    self.index = next_pos;
  }

  /// In strict mode, fail with the given error; in lenient mode, record an
  /// `InvalidChar` token instead and continue.
  fn process_tokenizing_error(
    &mut self,
    next_pos: usize,
    value_pos: usize,
    error: TokenizerError,
  ) -> Result<(), Error> {
    if self.policy == TokenizePolicy::Strict {
      Err(Error::Tokenizer(error, value_pos))
    } else {
      self.add_token_with_default_len(
        TokenType::InvalidChar,
        next_pos,
        value_pos,
      );
      Ok(())
    }
  }

  /// Seek to `index` and read the code point at that position.
  #[inline]
  fn seek_and_get_next_codepoint(&mut self, index: usize) {
    self.next_index = index;
    self.get_next_codepoint();
  }
}
119
/// Tokenize a pattern string into a list of tokens, always terminated by an
/// `End` token. In `Strict` mode invalid input returns an error; in
/// `Lenient` mode it is recorded as `InvalidChar` tokens.
pub fn tokenize(
  input: &str,
  policy: TokenizePolicy,
) -> Result<Vec<Token>, Error> {
  let mut tokenizer = Tokenizer {
    input: input.chars().collect::<Vec<char>>(),
    policy,
    token_list: vec![],
    index: 0,
    next_index: 0,
    code_point: None,
  };

  while tokenizer.index < tokenizer.input.len() {
    tokenizer.seek_and_get_next_codepoint(tokenizer.index);

    if tokenizer.code_point == Some('*') {
      tokenizer.add_token_with_default_pos_and_len(TokenType::Asterisk);
      continue;
    }
    if matches!(tokenizer.code_point, Some('+') | Some('?')) {
      tokenizer.add_token_with_default_pos_and_len(TokenType::OtherModifier);
      continue;
    }
    if tokenizer.code_point == Some('\\') {
      // A trailing backslash has nothing left to escape.
      if tokenizer.index == (tokenizer.input.len() - 1) {
        tokenizer.process_tokenizing_error(
          tokenizer.next_index,
          tokenizer.index,
          TokenizerError::IncompleteEscapeCode,
        )?;
        continue;
      }
      let escaped_index = tokenizer.next_index;
      tokenizer.get_next_codepoint();
      tokenizer.add_token_with_default_len(
        TokenType::EscapedChar,
        tokenizer.next_index,
        escaped_index,
      );
      continue;
    }
    if tokenizer.code_point == Some('{') {
      tokenizer.add_token_with_default_pos_and_len(TokenType::Open);
      continue;
    }
    if tokenizer.code_point == Some('}') {
      tokenizer.add_token_with_default_pos_and_len(TokenType::Close);
      continue;
    }
    if tokenizer.code_point == Some(':') {
      // Consume a named group: the name runs until the first code point
      // that is not a valid name code point.
      let mut name_pos = tokenizer.next_index;
      let name_start = name_pos;
      while name_pos < tokenizer.input.len() {
        tokenizer.seek_and_get_next_codepoint(name_pos);
        let first_code_point = name_pos == name_start;
        let valid_codepoint = is_valid_name_codepoint(
          tokenizer.code_point.unwrap(),
          first_code_point,
        );
        if !valid_codepoint {
          break;
        }
        name_pos = tokenizer.next_index;
      }
      if name_pos <= name_start {
        tokenizer.process_tokenizing_error(
          name_start,
          tokenizer.index,
          TokenizerError::InvalidName,
        )?;
        continue;
      }
      tokenizer.add_token_with_default_len(
        TokenType::Name,
        name_pos,
        name_start,
      );
      continue;
    }

    if tokenizer.code_point == Some('(') {
      // Consume a regexp group, tracking parenthesis depth so that nested
      // non-capturing groups are allowed.
      let mut depth = 1;
      let mut regexp_pos = tokenizer.next_index;
      let regexp_start = regexp_pos;
      let mut error = false;
      while regexp_pos < tokenizer.input.len() {
        tokenizer.seek_and_get_next_codepoint(regexp_pos);
        if !tokenizer.code_point.unwrap().is_ascii()
          || (regexp_pos == regexp_start && tokenizer.code_point == Some('?'))
        {
          tokenizer.process_tokenizing_error(
            regexp_start,
            tokenizer.index,
            TokenizerError::InvalidRegex(
              "must not start with ?, and may only contain ascii",
            ),
          )?;
          error = true;
          break;
        }
        if tokenizer.code_point == Some('\\') {
          if regexp_pos == (tokenizer.input.len() - 1) {
            tokenizer.process_tokenizing_error(
              regexp_start,
              tokenizer.index,
              TokenizerError::IncompleteEscapeCode,
            )?;
            error = true;
            break;
          }
          tokenizer.get_next_codepoint();
          if !tokenizer.code_point.unwrap().is_ascii() {
            tokenizer.process_tokenizing_error(
              regexp_start,
              tokenizer.index,
              TokenizerError::InvalidRegex("non ascii character was escaped"),
            )?;
            error = true;
            break;
          }
          regexp_pos = tokenizer.next_index;
          continue;
        }
        if tokenizer.code_point == Some(')') {
          depth -= 1;
          if depth == 0 {
            regexp_pos = tokenizer.next_index;
            break;
          }
        } else if tokenizer.code_point == Some('(') {
          depth += 1;
          if regexp_pos == (tokenizer.input.len() - 1) {
            tokenizer.process_tokenizing_error(
              regexp_start,
              tokenizer.index,
              TokenizerError::InvalidRegex("nested groups not closed"),
            )?;
            error = true;
            break;
          }
          // Peek at the next code point: nested groups must be
          // non-capturing, i.e. start with `?`.
          let temp_pos = tokenizer.next_index;
          tokenizer.get_next_codepoint();
          if tokenizer.code_point != Some('?') {
            tokenizer.process_tokenizing_error(
              regexp_start,
              tokenizer.index,
              TokenizerError::InvalidRegex("nested groups must start with ?"),
            )?;
            error = true;
            break;
          }
          tokenizer.next_index = temp_pos;
        }
        regexp_pos = tokenizer.next_index;
      }
      if error {
        continue;
      }
      if depth != 0 {
        tokenizer.process_tokenizing_error(
          regexp_start,
          tokenizer.index,
          TokenizerError::InvalidRegex("missing closing )"),
        )?;
        continue;
      }
      // Exclude the closing `)` from the token value.
      let regexp_len = regexp_pos - regexp_start - 1;
      if regexp_len == 0 {
        tokenizer.process_tokenizing_error(
          regexp_start,
          tokenizer.index,
          TokenizerError::InvalidRegex("length must be > 0"),
        )?;
        continue;
      }
      tokenizer.add_token(
        TokenType::Regexp,
        regexp_pos,
        regexp_start,
        regexp_len,
      );
      continue;
    }

    // Anything else is an ordinary character.
    tokenizer.add_token_with_default_pos_and_len(TokenType::Char);
  }

  tokenizer.add_token_with_default_len(
    TokenType::End,
    tokenizer.index,
    tokenizer.index,
  );
  Ok(tokenizer.token_list)
}
317
/// Returns whether `code_point` is valid in a group name. The first code
/// point must be an identifier-start character, `$`, or `_`; subsequent code
/// points may be identifier-continue characters, `$`, ZWNJ, or ZWJ.
#[inline]
pub(crate) fn is_valid_name_codepoint(code_point: char, first: bool) -> bool {
  if first {
    unic_ucd_ident::is_id_start(code_point) || matches!(code_point, '$' | '_')
  } else {
    unic_ucd_ident::is_id_continue(code_point)
      || matches!(code_point, '$' | '\u{200C}' | '\u{200D}')
  }
}
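
// A minimal usage sketch of `tokenize` (the test below is an illustration,
// with a hypothetical test name): tokenizing ":foo/(\d+)" in strict mode
// should yield a Name token, a Char token for '/', a Regexp token whose
// value excludes the surrounding parentheses, and a trailing End token.
#[cfg(test)]
mod tests {
  use super::*;

  #[test]
  fn tokenizes_name_char_and_regexp() {
    let tokens = tokenize(":foo/(\\d+)", TokenizePolicy::Strict).unwrap();
    let kinds: Vec<TokenType> =
      tokens.iter().map(|t| t.kind.clone()).collect();
    assert_eq!(
      kinds,
      vec![
        TokenType::Name,
        TokenType::Char,
        TokenType::Regexp,
        TokenType::End,
      ]
    );
    assert_eq!(tokens[0].value, "foo");
    assert_eq!(tokens[2].value, "\\d+");
  }
}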