1use crate::{is_valid_continuation, is_valid_start};
6
/// Reasons why tokenizing an expression can fail.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum Error {
    /// A `$` was not followed by a valid name.
    InvalidVariableReference,
    /// A name was required (for example after `prefix:`) but the input did not contain one.
    InvalidNCName,
    /// A name appeared in operator position but is not one of the known operator names.
    ExpectedOperator,
    /// A string literal was opened but its closing quote is missing.
    UnterminatedStringLiteral,
    /// A character that cannot start any token was encountered.
    IllegalCharacter,
}
16
/// A name with an optional prefix, such as `local`, `prefix:local`, `*` or `*:local`.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub(crate) struct CNameToken<'a> {
    /// The part in front of the `:`, or `None` when the name has no prefix.
    pub(crate) prefix: Option<&'a str>,
    /// The part after the `:`, or the whole name when there is no prefix.
    pub(crate) local_name: &'a str,
}
22
/// The binary operators recognized by the tokenizer.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub(crate) enum OperatorToken {
    /// `and`
    And,
    /// `or`
    Or,
    /// `*` in operator position
    Multiply,
    /// `mod`
    Modulo,
    /// `div`
    Divide,
    /// `+`
    Add,
    /// `-`
    Subtract,
    /// `<`
    LessThan,
    /// `<=`
    LessThanOrEqual,
    /// `>`
    GreaterThan,
    /// `>=`
    GreaterThanOrEqual,
    /// `=`
    Equal,
    /// `!=`
    NotEqual,
}
39
/// A literal value that appears directly in the expression.
#[derive(Debug, Clone, Copy, PartialEq)]
pub(crate) enum LiteralToken<'a> {
    /// A numeric literal that was parsed as an integer, such as `12` or `42.`.
    Integer(i64),
    /// A numeric literal that was parsed as a floating point number, such as `13.5` or `.42`.
    Decimal(f64),
    /// The contents of a quoted string, without the surrounding quotes.
    String(&'a str),
}
46
47#[derive(Clone, Copy, Debug, PartialEq)]
48pub(crate) enum Token<'a> {
49 VariableReference(&'a str),
50 CName(CNameToken<'a>),
51 Operator(OperatorToken),
52 Literal(LiteralToken<'a>),
53 AxisIdentifier(&'a str),
55 ParentNode,
57 SelfNode,
59 Parent,
61 Ancestor,
63 FunctionCall(&'a str),
65 OpeningParenthesis,
67 ClosingParenthesis,
69 OpeningBracket,
71 ClosingBracket,
73 Comma,
75 AtSign,
77 ProcessingInstructionTest,
79 CommentTest,
81 NodeTest,
83 TextTest,
85 Union,
87}
88
/// Cursor over the part of the input that has not been turned into tokens yet.
struct Tokenizer<'a> {
    /// The unconsumed tail of the input; consuming a token shortens it from the front.
    remaining: &'a str,
}
92
93impl<'a> Tokenizer<'a> {
94 fn consume_ncname(&mut self, allow_wildcard: bool) -> Result<&'a str, Error> {
96 if allow_wildcard && self.remaining.starts_with('*') {
97 self.remaining = &self.remaining[1..];
98 return Ok("*");
99 }
100
101 let mut chars = self.remaining.char_indices();
102
103 if !chars
104 .next()
105 .is_some_and(|(_, character)| is_valid_start(character) && character != ':')
106 {
107 return Err(Error::InvalidNCName);
108 }
109
110 let name_end = chars
111 .find(|(_, character)| !is_valid_continuation(*character) || *character == ':')
112 .map(|(index, _)| index)
113 .unwrap_or(self.remaining.len());
114
115 let (ncname, remaining) = self.remaining.split_at(name_end);
116 self.remaining = remaining;
117 Ok(ncname)
118 }
119
120 fn consume_single_token(&mut self, expect_operator_token: bool) -> Result<Token<'a>, Error> {
125 if self.remaining.starts_with('$') {
126 self.remaining = &self.remaining[1..];
127 let variable_name = self
128 .consume_ncname(false)
129 .map_err(|_| Error::InvalidVariableReference)?;
130 return Ok(Token::VariableReference(variable_name));
131 }
132
133 if let Ok(ncname) = self.consume_ncname(true) {
134 if expect_operator_token {
135 return match_operator_name(ncname).map(Token::Operator);
136 }
137
138 if self.remaining.starts_with(':') {
139 self.remaining = &self.remaining[1..];
140 if self.remaining.starts_with(':') {
141 self.remaining = &self.remaining[1..];
143 return Ok(Token::AxisIdentifier(ncname));
144 }
145
146 return Ok(Token::CName(CNameToken {
148 prefix: Some(ncname),
149 local_name: self.consume_ncname(true)?,
150 }));
151 } else if self.remaining.starts_with('(') {
152 self.remaining = &self.remaining[1..];
153 let token = match ncname {
154 "processing-instruction" => Token::ProcessingInstructionTest,
155 "node" => Token::NodeTest,
156 "text" => Token::TextTest,
157 "comment" => Token::CommentTest,
158 _ => Token::FunctionCall(ncname),
159 };
160 return Ok(token);
161 } else {
162 return Ok(Token::CName(CNameToken {
163 prefix: None,
164 local_name: ncname,
165 }));
166 }
167 }
168
169 match self
170 .remaining
171 .chars()
172 .next()
173 .expect("consume_single_token called with empty input")
174 {
175 '0'..='9' => {
176 let number = self.consume_numeric_literal();
177 Ok(Token::Literal(number))
178 },
179 '\'' | '"' => {
180 let string = self.consume_string_literal()?;
181 Ok(Token::Literal(LiteralToken::String(string)))
182 },
183 '.' => {
184 match self.remaining.chars().nth(1) {
188 Some('0'..='9') => Ok(Token::Literal(self.consume_numeric_literal())),
189 Some('.') => {
190 self.remaining = &self.remaining[2..];
191 Ok(Token::ParentNode)
192 },
193 _ => {
194 self.remaining = &self.remaining[1..];
195 Ok(Token::SelfNode)
196 },
197 }
198 },
199 '/' => {
200 if self.remaining.chars().nth(1).is_some_and(|c| c == '/') {
201 self.remaining = &self.remaining[2..];
202 Ok(Token::Ancestor)
203 } else {
204 self.remaining = &self.remaining[1..];
205 Ok(Token::Parent)
206 }
207 },
208 '-' => {
209 self.remaining = &self.remaining[1..];
210 Ok(Token::Operator(OperatorToken::Subtract))
211 },
212 '(' => {
213 self.remaining = &self.remaining[1..];
214 Ok(Token::OpeningParenthesis)
215 },
216 ')' => {
217 self.remaining = &self.remaining[1..];
218 Ok(Token::ClosingParenthesis)
219 },
220 '[' => {
221 self.remaining = &self.remaining[1..];
222 Ok(Token::OpeningBracket)
223 },
224 ']' => {
225 self.remaining = &self.remaining[1..];
226 Ok(Token::ClosingBracket)
227 },
228 ',' => {
229 self.remaining = &self.remaining[1..];
230 Ok(Token::Comma)
231 },
232 '@' => {
233 self.remaining = &self.remaining[1..];
234 Ok(Token::AtSign)
235 },
236 '<' => {
237 self.remaining = &self.remaining[1..];
238 if self.remaining.starts_with('=') {
239 self.remaining = &self.remaining[1..];
240 Ok(Token::Operator(OperatorToken::LessThanOrEqual))
241 } else {
242 Ok(Token::Operator(OperatorToken::LessThan))
243 }
244 },
245 '>' => {
246 self.remaining = &self.remaining[1..];
247 if self.remaining.starts_with('=') {
248 self.remaining = &self.remaining[1..];
249 Ok(Token::Operator(OperatorToken::GreaterThanOrEqual))
250 } else {
251 Ok(Token::Operator(OperatorToken::GreaterThan))
252 }
253 },
254 '!' => {
255 if self.remaining.starts_with("!=") {
256 self.remaining = &self.remaining[2..];
257 Ok(Token::Operator(OperatorToken::NotEqual))
258 } else {
259 Err(Error::IllegalCharacter)
260 }
261 },
262 '=' => {
263 self.remaining = &self.remaining[1..];
264 Ok(Token::Operator(OperatorToken::Equal))
265 },
266 '|' => {
267 self.remaining = &self.remaining[1..];
268 Ok(Token::Union)
269 },
270 '+' => {
271 self.remaining = &self.remaining[1..];
272 Ok(Token::Operator(OperatorToken::Add))
273 },
274 other => {
275 log::debug!("Illegal character: {other:?}");
276 Err(Error::IllegalCharacter)
277 },
278 }
279 }
280
281 fn consume_string_literal(&mut self) -> Result<&'a str, Error> {
282 let quote_character = self.remaining.chars().next().unwrap();
283 debug_assert!(quote_character == '\'' || quote_character == '"');
284 let Some((literal, remaining)) = self.remaining[1..].split_once(quote_character) else {
285 return Err(Error::UnterminatedStringLiteral);
286 };
287 self.remaining = remaining;
288 Ok(literal)
289 }
290
291 fn consume_numeric_literal(&mut self) -> LiteralToken<'a> {
293 let mut has_period = false;
294 let mut end = self.remaining.len();
295 for (index, c) in self.remaining.char_indices() {
296 let is_first_period = !has_period && c == '.';
297 if !c.is_ascii_digit() && !is_first_period {
298 end = index;
299 break;
300 }
301
302 has_period |= c == '.';
303 }
304
305 let (mut number, remaining) = self.remaining.split_at(end);
306 debug_assert!(
307 !(number.is_empty() || number == "."),
308 "Why did we even try to parse this as a literal",
309 );
310 self.remaining = remaining;
311
312 let mut is_integer_literal = !has_period;
315 if let Some(integer_literal) = number.strip_suffix('.') {
316 number = integer_literal;
317 is_integer_literal = true;
318 };
319
320 if is_integer_literal {
323 let value = number
324 .parse()
325 .inspect_err(|error| {
326 log::warn!(
327 "Failed to parse numeric literal ({number:?}) that looked valid: {error:?}"
328 )
329 })
330 .unwrap_or(i64::MAX);
331 LiteralToken::Integer(value)
332 } else {
333 let value = number
334 .parse()
335 .inspect_err(|error| {
336 log::warn!(
337 "Failed to parse numeric literal ({number:?}) that looked valid: {error:?}"
338 )
339 })
340 .unwrap_or(f64::NAN);
341 LiteralToken::Decimal(value)
342 }
343 }
344
345 fn skip_whitespace(&mut self) {
346 self.remaining = self
347 .remaining
348 .trim_start_matches(|c: char| c.is_ascii_whitespace());
349 }
350}
351
352fn match_operator_name(operator_name: &str) -> Result<OperatorToken, Error> {
353 let operator = match operator_name {
354 "and" => OperatorToken::And,
355 "or" => OperatorToken::Or,
356 "mod" => OperatorToken::Modulo,
357 "div" => OperatorToken::Divide,
358 "*" => OperatorToken::Multiply,
359 _ => {
360 log::debug!("Expected Operator, found {operator_name:?}");
361 return Err(Error::ExpectedOperator);
362 },
363 };
364
365 Ok(operator)
366}
367
368impl OperatorToken {
369 pub(crate) fn precedence(&self) -> impl Ord {
371 match self {
372 Self::Or => 0,
373 Self::And => 1,
374 Self::Equal | Self::NotEqual => 2,
375 Self::LessThan |
376 Self::LessThanOrEqual |
377 Self::GreaterThan |
378 Self::GreaterThanOrEqual => 3,
379 Self::Add | Self::Subtract => 4,
380 Self::Multiply | Self::Divide | Self::Modulo => 5,
381 }
382 }
383}
384
385impl<'a> Token<'a> {
386 pub(crate) fn is_start_of_location_step(&self) -> bool {
387 matches!(
388 self,
389 Self::AxisIdentifier(_) |
390 Self::AtSign |
391 Self::ParentNode |
392 Self::SelfNode |
393 Self::CName(_) |
394 Self::CommentTest |
395 Self::NodeTest |
396 Self::ProcessingInstructionTest |
397 Self::TextTest
398 )
399 }
400
401 fn followed_by_operator(&self) -> bool {
403 matches!(
404 self,
405 Self::Literal(_) |
406 Self::CName(_) |
407 Self::VariableReference(_) |
408 Self::ParentNode |
409 Self::SelfNode |
410 Self::ClosingBracket |
411 Self::ClosingParenthesis
412 )
413 }
414}
415
416pub(crate) fn tokenize(input: &str) -> Result<Vec<Token<'_>>, Error> {
417 let mut tokenizer = Tokenizer { remaining: input };
418 let mut tokens: Vec<Token> = vec![];
419
420 let mut expect_operator_token = false;
425
426 tokenizer.skip_whitespace();
427 while !tokenizer.remaining.is_empty() {
428 let token = tokenizer.consume_single_token(expect_operator_token)?;
429 tokens.push(token);
430 expect_operator_token = token.followed_by_operator();
431 tokenizer.skip_whitespace();
432 }
433
434 Ok(tokens)
435}
436
#[cfg(test)]
mod tests {
    use super::*;

    /// Reads one token from `input` (not expecting an operator) and returns it together
    /// with whatever input is left afterwards.
    fn lex_token(input: &str) -> (Result<Token<'_>, Error>, &str) {
        let mut tokenizer = Tokenizer { remaining: input };
        let token = tokenizer.consume_single_token(false);
        (token, tokenizer.remaining)
    }

    /// Reads one numeric literal from `input` and returns it together with whatever
    /// input is left afterwards.
    fn lex_number(input: &str) -> (LiteralToken<'_>, &str) {
        let mut tokenizer = Tokenizer { remaining: input };
        let literal = tokenizer.consume_numeric_literal();
        (literal, tokenizer.remaining)
    }

    #[test]
    fn parse_name_without_prefix() {
        let expected = Token::CName(CNameToken {
            prefix: None,
            local_name: "foo",
        });
        let (token, rest) = lex_token("foo");
        assert_eq!(token, Ok(expected));
        assert!(rest.is_empty());
    }

    #[test]
    fn parse_name_with_prefix() {
        let expected = Token::CName(CNameToken {
            prefix: Some("foo"),
            local_name: "bar",
        });
        let (token, rest) = lex_token("foo:bar");
        assert_eq!(token, Ok(expected));
        assert!(rest.is_empty());
    }

    #[test]
    fn parse_name_with_wildcard_prefix() {
        let expected = Token::CName(CNameToken {
            prefix: Some("*"),
            local_name: "bar",
        });
        let (token, rest) = lex_token("*:bar");
        assert_eq!(token, Ok(expected));
        assert!(rest.is_empty());
    }

    #[test]
    fn parse_name_with_wildcard_local_name() {
        let expected = Token::CName(CNameToken {
            prefix: None,
            local_name: "*",
        });
        let (token, rest) = lex_token("*");
        assert_eq!(token, Ok(expected));
        assert!(rest.is_empty());
    }

    #[test]
    fn parse_variable_reference() {
        let (token, rest) = lex_token("$servo");
        assert_eq!(token, Ok(Token::VariableReference("servo")));
        assert!(rest.is_empty());
    }

    #[test]
    fn parse_floating_point_literal() {
        let (literal, rest) = lex_number("13.5");
        assert_eq!(literal, LiteralToken::Decimal(13.5));
        assert!(rest.is_empty());
    }

    #[test]
    fn parse_floating_point_literal_without_leading_digit() {
        let (literal, rest) = lex_number(".42");
        assert_eq!(literal, LiteralToken::Decimal(0.42));
        assert!(rest.is_empty());
    }

    #[test]
    fn parse_floating_point_literal_that_can_be_optimized_to_integer_literal() {
        let (literal, rest) = lex_number("42.");
        assert_eq!(literal, LiteralToken::Integer(42));
        assert!(rest.is_empty());
    }

    #[test]
    fn parse_integer_literal() {
        let (literal, rest) = lex_number("12");
        assert_eq!(literal, LiteralToken::Integer(12));
        assert!(rest.is_empty());
    }

    #[test]
    fn parse_function_name() {
        let (token, rest) = lex_token("foo(");
        assert_eq!(token, Ok(Token::FunctionCall("foo")));
        assert!(rest.is_empty());
    }

    #[test]
    fn parse_axis_identifier() {
        let (token, rest) = lex_token("foo::");
        assert_eq!(token, Ok(Token::AxisIdentifier("foo")));
        assert!(rest.is_empty());
    }
}