1use crate::{is_valid_continuation, is_valid_start};
6
/// Reasons why an XPath expression could not be split into tokens.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Error {
    /// A `$` was not followed by a valid variable name.
    InvalidVariableReference,
    /// A name was required but the input did not start with a valid NCName.
    InvalidNCName,
    /// A name appeared where only an operator name (`and`, `or`, `mod`, `div`)
    /// or `*` is allowed.
    ExpectedOperator,
    /// A string literal was opened but its closing quote is missing.
    UnterminatedStringLiteral,
    /// A character that cannot start any token was found (this includes a `!`
    /// that is not part of `!=`).
    IllegalCharacter,
}

impl std::fmt::Display for Error {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let description = match self {
            Self::InvalidVariableReference => "invalid variable reference",
            Self::InvalidNCName => "invalid NCName",
            Self::ExpectedOperator => "expected an operator",
            Self::UnterminatedStringLiteral => "unterminated string literal",
            Self::IllegalCharacter => "illegal character",
        };
        formatter.write_str(description)
    }
}

// Lets callers use `?` into `Box<dyn std::error::Error>` and similar sinks.
impl std::error::Error for Error {}
16
/// A name token, optionally qualified with a prefix (`prefix:local_name`).
///
/// Either part may be the wildcard `"*"`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub(crate) struct CNameToken<'a> {
    /// The part before the colon, or `None` if the name has no prefix.
    pub(crate) prefix: Option<&'a str>,
    /// The part after the colon, or the whole name if there is no prefix.
    pub(crate) local_name: &'a str,
}
22
/// The binary operators the tokenizer can recognize.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub(crate) enum OperatorToken {
    /// `and`
    And,
    /// `or`
    Or,
    /// `*` in operator position
    Multiply,
    /// `mod`
    Modulo,
    /// `div`
    Divide,
    /// `+`
    Add,
    /// `-`
    Subtract,
    /// `<`
    LessThan,
    /// `<=`
    LessThanOrEqual,
    /// `>`
    GreaterThan,
    /// `>=`
    GreaterThanOrEqual,
    /// `=`
    Equal,
    /// `!=`
    NotEqual,
}
39
/// A literal value that appears directly in the expression.
#[derive(Copy, Clone, PartialEq, Debug)]
pub(crate) enum LiteralToken<'a> {
    /// A numeric literal without fractional digits, such as `12` or `42.`.
    Integer(i64),
    /// A numeric literal with fractional digits, such as `13.5` or `.42`.
    Decimal(f64),
    /// The text between the quotes of a string literal (quotes excluded).
    String(&'a str),
}
46
47#[derive(Clone, Copy, Debug, PartialEq)]
48pub(crate) enum Token<'a> {
49    VariableReference(&'a str),
50    CName(CNameToken<'a>),
51    Operator(OperatorToken),
52    Literal(LiteralToken<'a>),
53    AxisIdentifier(&'a str),
55    ParentNode,
57    SelfNode,
59    Parent,
61    Ancestor,
63    FunctionCall(&'a str),
65    OpeningParenthesis,
67    ClosingParenthesis,
69    OpeningBracket,
71    ClosingBracket,
73    Comma,
75    AtSign,
77    ProcessingInstructionTest,
79    CommentTest,
81    NodeTest,
83    TextTest,
85    Union,
87}
88
89struct Tokenizer<'a> {
90    remaining: &'a str,
91}
92
93impl<'a> Tokenizer<'a> {
94    fn consume_ncname(&mut self, allow_wildcard: bool) -> Result<&'a str, Error> {
96        if allow_wildcard && self.remaining.starts_with('*') {
97            self.remaining = &self.remaining[1..];
98            return Ok("*");
99        }
100
101        let mut chars = self.remaining.char_indices();
102
103        if !chars
104            .next()
105            .is_some_and(|(_, character)| is_valid_start(character) && character != ':')
106        {
107            return Err(Error::InvalidNCName);
108        }
109
110        let name_end = chars
111            .find(|(_, character)| !is_valid_continuation(*character) || *character == ':')
112            .map(|(index, _)| index)
113            .unwrap_or(self.remaining.len());
114
115        let (ncname, remaining) = self.remaining.split_at(name_end);
116        self.remaining = remaining;
117        Ok(ncname)
118    }
119
120    fn consume_single_token(&mut self, expect_operator_token: bool) -> Result<Token<'a>, Error> {
125        if self.remaining.starts_with('$') {
126            self.remaining = &self.remaining[1..];
127            let variable_name = self
128                .consume_ncname(false)
129                .map_err(|_| Error::InvalidVariableReference)?;
130            return Ok(Token::VariableReference(variable_name));
131        }
132
133        if let Ok(ncname) = self.consume_ncname(true) {
134            if expect_operator_token {
135                return match_operator_name(ncname).map(Token::Operator);
136            }
137
138            if self.remaining.starts_with(':') {
139                self.remaining = &self.remaining[1..];
140                if self.remaining.starts_with(':') {
141                    self.remaining = &self.remaining[1..];
143                    return Ok(Token::AxisIdentifier(ncname));
144                }
145
146                return Ok(Token::CName(CNameToken {
148                    prefix: Some(ncname),
149                    local_name: self.consume_ncname(true)?,
150                }));
151            } else if self.remaining.starts_with('(') {
152                self.remaining = &self.remaining[1..];
153                let token = match ncname {
154                    "processing-instruction" => Token::ProcessingInstructionTest,
155                    "node" => Token::NodeTest,
156                    "text" => Token::TextTest,
157                    "comment" => Token::CommentTest,
158                    _ => Token::FunctionCall(ncname),
159                };
160                return Ok(token);
161            } else {
162                return Ok(Token::CName(CNameToken {
163                    prefix: None,
164                    local_name: ncname,
165                }));
166            }
167        }
168
169        match self
170            .remaining
171            .chars()
172            .next()
173            .expect("consume_single_token called with empty input")
174        {
175            '0'..='9' => {
176                let number = self.consume_numeric_literal();
177                Ok(Token::Literal(number))
178            },
179            '\'' | '"' => {
180                let string = self.consume_string_literal()?;
181                Ok(Token::Literal(LiteralToken::String(string)))
182            },
183            '.' => {
184                match self.remaining.chars().nth(1) {
188                    Some('0'..='9') => Ok(Token::Literal(self.consume_numeric_literal())),
189                    Some('.') => {
190                        self.remaining = &self.remaining[2..];
191                        Ok(Token::ParentNode)
192                    },
193                    _ => {
194                        self.remaining = &self.remaining[1..];
195                        Ok(Token::SelfNode)
196                    },
197                }
198            },
199            '/' => {
200                if self.remaining.chars().nth(1).is_some_and(|c| c == '/') {
201                    self.remaining = &self.remaining[2..];
202                    Ok(Token::Ancestor)
203                } else {
204                    self.remaining = &self.remaining[1..];
205                    Ok(Token::Parent)
206                }
207            },
208            '-' => {
209                self.remaining = &self.remaining[1..];
210                Ok(Token::Operator(OperatorToken::Subtract))
211            },
212            '(' => {
213                self.remaining = &self.remaining[1..];
214                Ok(Token::OpeningParenthesis)
215            },
216            ')' => {
217                self.remaining = &self.remaining[1..];
218                Ok(Token::ClosingParenthesis)
219            },
220            '[' => {
221                self.remaining = &self.remaining[1..];
222                Ok(Token::OpeningBracket)
223            },
224            ']' => {
225                self.remaining = &self.remaining[1..];
226                Ok(Token::ClosingBracket)
227            },
228            ',' => {
229                self.remaining = &self.remaining[1..];
230                Ok(Token::Comma)
231            },
232            '@' => {
233                self.remaining = &self.remaining[1..];
234                Ok(Token::AtSign)
235            },
236            '<' => {
237                self.remaining = &self.remaining[1..];
238                if self.remaining.starts_with('=') {
239                    self.remaining = &self.remaining[1..];
240                    Ok(Token::Operator(OperatorToken::LessThanOrEqual))
241                } else {
242                    Ok(Token::Operator(OperatorToken::LessThan))
243                }
244            },
245            '>' => {
246                self.remaining = &self.remaining[1..];
247                if self.remaining.starts_with('=') {
248                    self.remaining = &self.remaining[1..];
249                    Ok(Token::Operator(OperatorToken::GreaterThanOrEqual))
250                } else {
251                    Ok(Token::Operator(OperatorToken::GreaterThan))
252                }
253            },
254            '!' => {
255                if self.remaining.starts_with("!=") {
256                    self.remaining = &self.remaining[2..];
257                    Ok(Token::Operator(OperatorToken::NotEqual))
258                } else {
259                    Err(Error::IllegalCharacter)
260                }
261            },
262            '=' => {
263                self.remaining = &self.remaining[1..];
264                Ok(Token::Operator(OperatorToken::Equal))
265            },
266            '|' => {
267                self.remaining = &self.remaining[1..];
268                Ok(Token::Union)
269            },
270            '+' => {
271                self.remaining = &self.remaining[1..];
272                Ok(Token::Operator(OperatorToken::Add))
273            },
274            other => {
275                log::debug!("Illegal character: {other:?}");
276                Err(Error::IllegalCharacter)
277            },
278        }
279    }
280
281    fn consume_string_literal(&mut self) -> Result<&'a str, Error> {
282        let quote_character = self.remaining.chars().next().unwrap();
283        debug_assert!(quote_character == '\'' || quote_character == '"');
284        let Some((literal, remaining)) = self.remaining[1..].split_once(quote_character) else {
285            return Err(Error::UnterminatedStringLiteral);
286        };
287        self.remaining = remaining;
288        Ok(literal)
289    }
290
291    fn consume_numeric_literal(&mut self) -> LiteralToken<'a> {
293        let mut has_period = false;
294        let mut end = self.remaining.len();
295        for (index, c) in self.remaining.char_indices() {
296            let is_first_period = !has_period && c == '.';
297            if !c.is_ascii_digit() && !is_first_period {
298                end = index;
299                break;
300            }
301
302            has_period |= c == '.';
303        }
304
305        let (mut number, remaining) = self.remaining.split_at(end);
306        debug_assert!(
307            !(number.is_empty() || number == "."),
308            "Why did we even try to parse this as a literal",
309        );
310        self.remaining = remaining;
311
312        let mut is_integer_literal = !has_period;
315        if let Some(integer_literal) = number.strip_suffix('.') {
316            number = integer_literal;
317            is_integer_literal = true;
318        };
319
320        if is_integer_literal {
323            let value = number
324                .parse()
325                .inspect_err(|error| {
326                    log::warn!(
327                        "Failed to parse numeric literal ({number:?}) that looked valid: {error:?}"
328                    )
329                })
330                .unwrap_or(i64::MAX);
331            LiteralToken::Integer(value)
332        } else {
333            let value = number
334                .parse()
335                .inspect_err(|error| {
336                    log::warn!(
337                        "Failed to parse numeric literal ({number:?}) that looked valid: {error:?}"
338                    )
339                })
340                .unwrap_or(f64::NAN);
341            LiteralToken::Decimal(value)
342        }
343    }
344
345    fn skip_whitespace(&mut self) {
346        self.remaining = self
347            .remaining
348            .trim_start_matches(|c: char| c.is_ascii_whitespace());
349    }
350}
351
352fn match_operator_name(operator_name: &str) -> Result<OperatorToken, Error> {
353    let operator = match operator_name {
354        "and" => OperatorToken::And,
355        "or" => OperatorToken::Or,
356        "mod" => OperatorToken::Modulo,
357        "div" => OperatorToken::Divide,
358        "*" => OperatorToken::Multiply,
359        _ => {
360            log::debug!("Expected Operator, found {operator_name:?}");
361            return Err(Error::ExpectedOperator);
362        },
363    };
364
365    Ok(operator)
366}
367
368impl OperatorToken {
369    pub(crate) fn precedence(&self) -> impl Ord {
371        match self {
372            Self::Or => 0,
373            Self::And => 1,
374            Self::Equal | Self::NotEqual => 2,
375            Self::LessThan |
376            Self::LessThanOrEqual |
377            Self::GreaterThan |
378            Self::GreaterThanOrEqual => 3,
379            Self::Add | Self::Subtract => 4,
380            Self::Multiply | Self::Divide | Self::Modulo => 5,
381        }
382    }
383}
384
385impl<'a> Token<'a> {
386    pub(crate) fn is_start_of_location_step(&self) -> bool {
387        matches!(
388            self,
389            Self::AxisIdentifier(_) |
390                Self::AtSign |
391                Self::ParentNode |
392                Self::SelfNode |
393                Self::CName(_) |
394                Self::CommentTest |
395                Self::NodeTest |
396                Self::ProcessingInstructionTest |
397                Self::TextTest
398        )
399    }
400
401    fn followed_by_operator(&self) -> bool {
403        matches!(
404            self,
405            Self::Literal(_) |
406                Self::CName(_) |
407                Self::VariableReference(_) |
408                Self::ParentNode |
409                Self::SelfNode |
410                Self::ClosingBracket |
411                Self::ClosingParenthesis
412        )
413    }
414}
415
416pub(crate) fn tokenize(input: &str) -> Result<Vec<Token<'_>>, Error> {
417    let mut tokenizer = Tokenizer { remaining: input };
418    let mut tokens: Vec<Token> = vec![];
419
420    let mut expect_operator_token = false;
425
426    tokenizer.skip_whitespace();
427    while !tokenizer.remaining.is_empty() {
428        let token = tokenizer.consume_single_token(expect_operator_token)?;
429        tokens.push(token);
430        expect_operator_token = token.followed_by_operator();
431        tokenizer.skip_whitespace();
432    }
433
434    Ok(tokens)
435}
436
#[cfg(test)]
mod tests {
    use super::*;

    /// Reads one token (not in operator position) from `input` and checks that
    /// the whole input was consumed by it.
    fn single_token(input: &str) -> Result<Token<'_>, Error> {
        let mut tokenizer = Tokenizer { remaining: input };
        let token = tokenizer.consume_single_token(false);
        assert!(tokenizer.remaining.is_empty());
        token
    }

    /// Reads one numeric literal from `input` and checks that the whole input
    /// was consumed by it.
    fn numeric_literal(input: &str) -> LiteralToken<'_> {
        let mut tokenizer = Tokenizer { remaining: input };
        let literal = tokenizer.consume_numeric_literal();
        assert!(tokenizer.remaining.is_empty());
        literal
    }

    /// Builds the name token that is expected for the given parts.
    fn name<'a>(prefix: Option<&'a str>, local_name: &'a str) -> Token<'a> {
        Token::CName(CNameToken { prefix, local_name })
    }

    #[test]
    fn parse_name_without_prefix() {
        assert_eq!(single_token("foo"), Ok(name(None, "foo")));
    }

    #[test]
    fn parse_name_with_prefix() {
        assert_eq!(single_token("foo:bar"), Ok(name(Some("foo"), "bar")));
    }

    #[test]
    fn parse_name_with_wildcard_prefix() {
        assert_eq!(single_token("*:bar"), Ok(name(Some("*"), "bar")));
    }

    #[test]
    fn parse_name_with_wildcard_local_name() {
        assert_eq!(single_token("*"), Ok(name(None, "*")));
    }

    #[test]
    fn parse_variable_reference() {
        assert_eq!(
            single_token("$servo"),
            Ok(Token::VariableReference("servo"))
        );
    }

    #[test]
    fn parse_floating_point_literal() {
        assert_eq!(numeric_literal("13.5"), LiteralToken::Decimal(13.5));
    }

    #[test]
    fn parse_floating_point_literal_without_leading_digit() {
        assert_eq!(numeric_literal(".42"), LiteralToken::Decimal(0.42));
    }

    #[test]
    fn parse_floating_point_literal_that_can_be_optimized_to_integer_literal() {
        assert_eq!(numeric_literal("42."), LiteralToken::Integer(42));
    }

    #[test]
    fn parse_integer_literal() {
        assert_eq!(numeric_literal("12"), LiteralToken::Integer(12));
    }

    #[test]
    fn parse_function_name() {
        assert_eq!(single_token("foo("), Ok(Token::FunctionCall("foo")));
    }

    #[test]
    fn parse_axis_identifier() {
        assert_eq!(single_token("foo::"), Ok(Token::AxisIdentifier("foo")));
    }
}