// sea_query/token.rs

1#![allow(dead_code)]
2use std::fmt::Write;
3use std::iter::Iterator;
4
/// Tokenizer for processing SQL.
///
/// Walks the input one `char` at a time while tracking the current byte
/// offset, so every token can be returned as a zero-copy slice of `input`.
#[derive(Debug)]
pub struct Tokenizer<'a> {
    // The full input string; all emitted tokens borrow from it.
    input: &'a str,
    // Char iterator over `input`, positioned one character past `c`.
    chars: std::str::Chars<'a>,
    // Character currently under the cursor; `None` once input is exhausted.
    c: Option<char>,
    // Byte offset of `c` within `input`.
    p: usize,
}
13
/// A single lexical unit produced by [`Tokenizer`].
///
/// Each variant borrows its text directly from the input, so concatenating
/// a token stream in order reproduces the original string (the tests assert
/// this round-trip for every case).
#[derive(Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum Token<'a> {
    /// A quoted string, delimiters included (e.g. `` `name` ``, `'a'`, `"a"`, `[a]`).
    Quoted(&'a str),
    /// A bare word: letters/digits, plus `_` or `$` after the first character.
    Unquoted(&'a str),
    /// A run of whitespace (space, tab, `\r`, `\n`).
    Space(&'a str),
    /// A single character that is neither whitespace nor alphanumeric.
    Punctuation(&'a str),
}
22
impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer with the cursor on the first character of `string`.
    pub fn new(string: &'a str) -> Self {
        let mut chars = string.chars();
        let c = chars.next();
        Self {
            input: string,
            chars,
            c,
            p: 0,
        }
    }

    /// Consumes the tokenizer, returning it as an iterator of tokens.
    // NOTE(review): takes `self` by value, so this is really an `into_iter`;
    // kept as `iter` to preserve the public interface.
    pub fn iter(self) -> impl Iterator<Item = Token<'a>> {
        self
    }

    /// Returns the character under the cursor.
    ///
    /// Panics if the input is exhausted; all callers guard with [`Self::end`].
    fn get(&self) -> char {
        self.c.unwrap()
    }

    /// Advances the cursor by one character, keeping the byte offset in sync.
    fn inc(&mut self) {
        let c = self.get();
        self.c = self.chars.next();
        // Offset advances by the UTF-8 width of the consumed char, not by 1.
        self.p += c.len_utf8();
    }

    /// True once all input has been consumed.
    fn end(&self) -> bool {
        self.c.is_none()
    }

    /// Byte offset just past character `c`, assuming `c` sits at offset `p`.
    fn p_c(&self, c: char) -> usize {
        self.p + c.len_utf8()
    }

    /// Consumes a maximal run of whitespace; `None` if the cursor is not on
    /// whitespace.
    fn space(&mut self) -> Option<Token<'a>> {
        // `a..b` brackets the token's byte range within `input`.
        let a = self.p;
        let mut b = a;

        while !self.end() {
            let c = self.get();
            if Self::is_space(c) {
                b = self.p_c(c);
            } else {
                break;
            }
            self.inc();
        }

        if a != b {
            Some(Token::Space(&self.input[a..b]))
        } else {
            None
        }
    }

    /// Consumes a bare word; `None` if the cursor is not on an alphanumeric
    /// character.
    ///
    /// The first character must be alphanumeric; subsequent characters may
    /// also be `_` or `$` (see [`Self::is_identifier`]).
    fn unquoted(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        let mut first = true;
        while !self.end() {
            let c = self.get();
            if Self::is_alphanumeric(c) {
                b = self.p_c(c);
                first = false;
                self.inc();
            } else if !first && Self::is_identifier(c) {
                b = self.p_c(c);
                self.inc();
            } else {
                break;
            }
        }

        if a != b {
            Some(Token::Unquoted(&self.input[a..b]))
        } else {
            None
        }
    }

    /// Consumes a quoted string, delimiters included; `None` if the cursor is
    /// not on an opening delimiter (`` ` ``, `[`, `'` or `"`).
    ///
    /// Inside the string a backslash escapes the following character, and a
    /// doubled closing delimiter (e.g. `''` or `` `` ``) is treated as an
    /// escaped delimiter rather than the end of the string. An unterminated
    /// string is still returned as `Quoted` up to the end of input.
    fn quoted(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        let mut first = true;
        // `escape` is true when the previous char was an unescaped backslash.
        let mut escape = false;
        // Opening delimiter; ' ' is a placeholder until one is seen.
        let mut start = ' ';
        while !self.end() {
            let c = self.get();
            if first && Self::is_string_delimiter_start(c) {
                b = self.p_c(c);
                first = false;
                start = c;
                self.inc();
            } else if !first && !escape && Self::is_string_delimiter_end_for(start, c) {
                b = self.p_c(c);
                self.inc();
                if self.end() {
                    break;
                }
                // Lookahead: a doubled delimiter is an escape, not the end.
                if !Self::is_string_escape_for(start, self.get()) {
                    break;
                } else {
                    // Delimiters are ASCII, so `c` has the same byte width as
                    // the escape character just inspected; extend past it.
                    b = self.p_c(c);
                    self.inc();
                }
            } else if !first {
                // Two backslashes in a row cancel out.
                escape = !escape && Self::is_escape_char(c);
                b = self.p_c(c);
                self.inc();
            } else {
                break;
            }
        }
        if a != b {
            Some(Token::Quoted(&self.input[a..b]))
        } else {
            None
        }
    }

    /// unquote a quoted string
    ///
    /// Consumes the tokenizer, which is expected to start on a quoted string.
    /// Strips the outer delimiters and collapses a doubled delimiter to one;
    /// backslash escape sequences are preserved verbatim (`"a\"b"` unquotes
    /// to `a\"b`, while `"a""b"` unquotes to `a"b`).
    fn unquote(mut self) -> String {
        let mut string = String::new();
        let mut first = true;
        let mut escape = false;
        let mut start = ' ';
        // Same state machine as `quoted`, but the opening/closing delimiters
        // are skipped and only the (un-doubled) contents are written out.
        while !self.end() {
            let c = self.get();
            if first && Self::is_string_delimiter_start(c) {
                first = false;
                start = c;
                self.inc();
            } else if !first && !escape && Self::is_string_delimiter_end_for(start, c) {
                self.inc();
                if self.end() {
                    break;
                }
                if !Self::is_string_escape_for(start, self.get()) {
                    break;
                } else {
                    // Doubled delimiter: emit it once.
                    string.write_char(c).unwrap();
                    self.inc();
                }
            } else if !first {
                escape = !escape && Self::is_escape_char(c);
                string.write_char(c).unwrap();
                self.inc();
            } else {
                break;
            }
        }
        string
    }

    /// Consumes exactly one punctuation character; `None` at end of input or
    /// when the cursor is on whitespace/alphanumerics (which the other
    /// scanners handle).
    fn punctuation(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        if !self.end() {
            let c = self.get();
            if !Self::is_space(c) && !Self::is_alphanumeric(c) {
                b = self.p_c(c);
                self.inc();
            }
        }

        if a != b {
            Some(Token::Punctuation(&self.input[a..b]))
        } else {
            None
        }
    }

    /// Whitespace recognized by [`Self::space`].
    fn is_space(c: char) -> bool {
        matches!(c, ' ' | '\t' | '\r' | '\n')
    }

    /// Extra identifier characters allowed after the first char of a word.
    fn is_identifier(c: char) -> bool {
        matches!(c, '_' | '$')
    }

    /// Unicode letters plus ASCII digits (so e.g. CJK identifiers tokenize
    /// as `Unquoted`).
    fn is_alphanumeric(c: char) -> bool {
        c.is_alphabetic() || c.is_ascii_digit()
    }

    /// Characters that open a quoted string.
    fn is_string_delimiter_start(c: char) -> bool {
        matches!(c, '`' | '[' | '\'' | '"')
    }

    /// Whether `c` forms a doubled-delimiter escape for the given opening
    /// delimiter. Brackets have no doubled form, hence the `false` arm.
    fn is_string_escape_for(start: char, c: char) -> bool {
        match start {
            '`' => c == '`',
            '\'' => c == '\'',
            '"' => c == '"',
            _ => false,
        }
    }

    /// The closing delimiter that matches opening delimiter `start`.
    fn is_string_delimiter_end_for(start: char, c: char) -> bool {
        match start {
            '`' => c == '`',
            '[' => c == ']',
            '\'' => c == '\'',
            '"' => c == '"',
            _ => false,
        }
    }

    /// Backslash, the in-string escape character.
    fn is_escape_char(c: char) -> bool {
        c == '\\'
    }
}
237
238impl<'a> Iterator for Tokenizer<'a> {
239    type Item = Token<'a>;
240
241    fn next(&mut self) -> Option<Self::Item> {
242        if let Some(space) = self.space() {
243            return Some(space);
244        }
245        if let Some(unquoted) = self.unquoted() {
246            return Some(unquoted);
247        }
248        if let Some(quoted) = self.quoted() {
249            return Some(quoted);
250        }
251        if let Some(punctuation) = self.punctuation() {
252            return Some(punctuation);
253        }
254        None
255    }
256}
257
258impl Token<'_> {
259    pub fn is_quoted(&self) -> bool {
260        matches!(self, Self::Quoted(_))
261    }
262
263    pub fn is_unquoted(&self) -> bool {
264        matches!(self, Self::Unquoted(_))
265    }
266
267    pub fn is_space(&self) -> bool {
268        matches!(self, Self::Space(_))
269    }
270
271    pub fn is_punctuation(&self) -> bool {
272        matches!(self, Self::Punctuation(_))
273    }
274
275    pub fn as_str(&self) -> &str {
276        match self {
277            Self::Quoted(string) => string,
278            Self::Unquoted(string) => string,
279            Self::Space(string) => string,
280            Self::Punctuation(string) => string,
281        }
282    }
283
284    pub fn unquote(&self) -> Option<String> {
285        if self.is_quoted() {
286            let tokenizer = Tokenizer::new(self.as_str());
287            Some(tokenizer.unquote())
288        } else {
289            None
290        }
291    }
292}
293
294impl std::fmt::Display for Token<'_> {
295    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
296        f.write_str(self.as_str())
297    }
298}
299
// Each tokenizer test also asserts the round-trip property: concatenating
// the tokens' `as_str()` values reproduces the original input verbatim.
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input yields no tokens.
    #[test]
    fn test_0() {
        let tokenizer = Tokenizer::new("");
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![]);
    }

    // Basic SELECT with a backtick-quoted identifier.
    #[test]
    fn test_1() {
        let string = "SELECT * FROM `character`";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("SELECT"),
                Token::Space(" "),
                Token::Punctuation("*"),
                Token::Space(" "),
                Token::Unquoted("FROM"),
                Token::Space(" "),
                Token::Quoted("`character`"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // `?` placeholder tokenizes as punctuation.
    #[test]
    fn test_2() {
        let string = "SELECT * FROM `character` WHERE id = ?";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("SELECT"),
                Token::Space(" "),
                Token::Punctuation("*"),
                Token::Space(" "),
                Token::Unquoted("FROM"),
                Token::Space(" "),
                Token::Quoted("`character`"),
                Token::Space(" "),
                Token::Unquoted("WHERE"),
                Token::Space(" "),
                Token::Unquoted("id"),
                Token::Space(" "),
                Token::Punctuation("="),
                Token::Space(" "),
                Token::Punctuation("?"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // A `?` inside quotes stays part of the quoted token.
    #[test]
    fn test_3() {
        let string = r#"? = "?" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("?"),
                Token::Space(" "),
                Token::Punctuation("="),
                Token::Space(" "),
                Token::Quoted(r#""?""#),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Backslash-escaped delimiter does not terminate the quoted string.
    #[test]
    fn test_4() {
        let string = r#""a\"bc""#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("\"a\\\"bc\"")]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Letters and digits merge into one unquoted token.
    #[test]
    fn test_5() {
        let string = "abc123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Unquoted(string)]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Numbers split at punctuation; no decimal-literal handling.
    #[test]
    fn test_6() {
        let string = "2.3*4";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("2"),
                Token::Punctuation("."),
                Token::Unquoted("3"),
                Token::Punctuation("*"),
                Token::Unquoted("4"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Double backslash cancels the escape, so the next quote closes the string.
    #[test]
    fn test_7() {
        let string = r#""a\\" B"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Quoted("\"a\\\\\""),
                Token::Space(" "),
                Token::Unquoted("B"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // A double quote inside backticks is ordinary content.
    #[test]
    fn test_8() {
        let string = r#"`a"b` "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("`a\"b`"), Token::Space(" ")]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Square-bracket (MSSQL-style) quoting.
    #[test]
    fn test_9() {
        let string = r"[ab] ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("[ab]"), Token::Space(" ")]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // A double quote inside single quotes is ordinary content.
    #[test]
    fn test_10() {
        let string = r#" 'a"b' "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("'a\"b'"),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Doubled backtick acts as an escaped delimiter.
    #[test]
    fn test_11() {
        let string = r" `a``b` ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("`a``b`"),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Doubled single quote acts as an escaped delimiter.
    #[test]
    fn test_12() {
        let string = r" 'a''b' ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("'a''b'"),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Each punctuation character is its own token.
    #[test]
    fn test_13() {
        let string = r"(?)";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("("),
                Token::Punctuation("?"),
                Token::Punctuation(")"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // A leading `$` is punctuation (it cannot start an unquoted word).
    #[test]
    fn test_14() {
        let string = r"($1 = $2)";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("("),
                Token::Punctuation("$"),
                Token::Unquoted("1"),
                Token::Space(" "),
                Token::Punctuation("="),
                Token::Space(" "),
                Token::Punctuation("$"),
                Token::Unquoted("2"),
                Token::Punctuation(")"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Whitespace inside quotes is part of the quoted token.
    #[test]
    fn test_15() {
        let string = r#" "Hello World" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("\"Hello World\""),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // `_` and `$` are allowed after the first character of a word.
    #[test]
    fn test_16() {
        let string = "abc_$123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Unquoted(string)]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Leading `$` splits off as punctuation; the rest merges into one word.
    #[test]
    fn test_17() {
        let string = "$abc$123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![Token::Punctuation("$"), Token::Unquoted("abc$123"),]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Leading `_` is also punctuation, like `$`.
    #[test]
    fn test_18() {
        let string = "_$abc_123$";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("_"),
                Token::Punctuation("$"),
                Token::Unquoted("abc_123$"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // unquote() keeps backslash escape sequences verbatim.
    #[test]
    fn test_19() {
        let string = r#""a\"bc""#;
        let tokenizer = Tokenizer::new(string);
        assert_eq!(tokenizer.unquote(), "a\\\"bc".to_owned());
    }

    // unquote() collapses a doubled delimiter to a single one.
    #[test]
    fn test_20() {
        let string = r#""a""bc""#;
        let tokenizer = Tokenizer::new(string);
        assert_eq!(tokenizer.unquote(), "a\"bc".to_owned());
    }

    // Token::unquote preserves `\n` as the two characters backslash + n.
    #[test]
    fn test_21() {
        assert_eq!(
            Token::Quoted("'a\\nb'").unquote().unwrap(),
            "a\\nb".to_owned()
        );
    }

    // A backslash escape before a non-delimiter stays inside the token.
    #[test]
    fn test_22() {
        let string = r#" "Hello\nWorld" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("\"Hello\\nWorld\""),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Braces are plain punctuation outside quotes, ordinary content inside.
    #[test]
    fn test_23() {
        let string = "{ab} '{cd}'";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("{"),
                Token::Unquoted("ab"),
                Token::Punctuation("}"),
                Token::Space(" "),
                Token::Quoted("'{cd}'"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Multi-byte (CJK) characters tokenize correctly via byte-offset tracking.
    #[test]
    fn test_24() {
        let string = r#"新"老虎","#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("新"),
                Token::Quoted("\"老虎\""),
                Token::Punctuation(","),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Mixed identifiers and punctuation inside braces.
    #[test]
    fn test_25() {
        let string = r#"{a.1:2}"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("{"),
                Token::Unquoted("a"),
                Token::Punctuation("."),
                Token::Unquoted("1"),
                Token::Punctuation(":"),
                Token::Unquoted("2"),
                Token::Punctuation("}"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    // Consecutive dots each become their own punctuation token.
    #[test]
    fn test_26() {
        let string = r#"{..(a.1:2)}"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("{"),
                Token::Punctuation("."),
                Token::Punctuation("."),
                Token::Punctuation("("),
                Token::Unquoted("a"),
                Token::Punctuation("."),
                Token::Unquoted("1"),
                Token::Punctuation(":"),
                Token::Unquoted("2"),
                Token::Punctuation(")"),
                Token::Punctuation("}"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }
}
771}