sea_query_derive/raw_sql/token.rs

#![allow(dead_code)]
use std::fmt::Write;
use std::iter::Iterator;

/// Tokenizer for processing SQL.
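///
/// Tokens borrow from the input string, and concatenating them in order
/// reproduces the input exactly. Illustrative sketch of usage, mirroring
/// `test_1` below:
///
/// ```ignore
/// let tokens: Vec<Token> = Tokenizer::new("SELECT * FROM `character`").iter().collect();
/// assert_eq!(tokens[0], Token::Unquoted("SELECT"));
/// assert_eq!(tokens[6], Token::Quoted("`character`"));
/// ```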
#[derive(Debug)]
pub struct Tokenizer<'a> {
    input: &'a str,
    chars: std::str::Chars<'a>,
    /// The current character, or `None` once the input is exhausted.
    c: Option<char>,
    /// Byte offset of the current character within `input`.
    p: usize,
}

#[derive(Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum Token<'a> {
    /// A string wrapped in backticks, brackets, single quotes or double quotes.
    Quoted(&'a str),
    /// An identifier or number.
    Unquoted(&'a str),
    /// A run of whitespace.
    Space(&'a str),
    /// A single character that is neither whitespace nor alphanumeric.
    Punctuation(&'a str),
}

impl<'a> Tokenizer<'a> {
    pub fn new(string: &'a str) -> Self {
        let mut chars = string.chars();
        let c = chars.next();
        Self {
            input: string,
            chars,
            c,
            p: 0,
        }
    }

    pub fn iter(self) -> impl Iterator<Item = Token<'a>> {
        self
    }

    /// The current character; panics if the input is exhausted.
    fn get(&self) -> char {
        self.c.unwrap()
    }

    /// Advance to the next character and update the byte offset.
    fn inc(&mut self) {
        let c = self.get();
        self.c = self.chars.next();
        self.p += c.len_utf8();
    }

    /// Whether the whole input has been consumed.
    fn end(&self) -> bool {
        self.c.is_none()
    }

    /// Byte offset just past the character `c` at the current position.
    fn p_c(&self, c: char) -> usize {
        self.p + c.len_utf8()
    }

    /// Consume a run of whitespace characters.
    fn space(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        while !self.end() {
            let c = self.get();
            if Self::is_space(c) {
                b = self.p_c(c);
            } else {
                break;
            }
            self.inc();
        }

        if a != b {
            Some(Token::Space(&self.input[a..b]))
        } else {
            None
        }
    }

    /// Consume an identifier or number: alphanumerics, plus `_` or `$` after
    /// the first character.
    fn unquoted(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        let mut first = true;
        while !self.end() {
            let c = self.get();
            if Self::is_alphanumeric(c) {
                b = self.p_c(c);
                first = false;
                self.inc();
            } else if !first && Self::is_identifier(c) {
                b = self.p_c(c);
                self.inc();
            } else {
                break;
            }
        }

        if a != b {
            Some(Token::Unquoted(&self.input[a..b]))
        } else {
            None
        }
    }

    /// Consume a quoted string, keeping its delimiters.
    fn quoted(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        let mut first = true;
        let mut escape = false;
        let mut start = ' ';
        while !self.end() {
            let c = self.get();
            if first && Self::is_string_delimiter_start(c) {
                b = self.p_c(c);
                first = false;
                start = c;
                self.inc();
            } else if !first && !escape && Self::is_string_delimiter_end_for(start, c) {
                b = self.p_c(c);
                self.inc();
                if self.end() {
                    break;
                }
                // A doubled delimiter (e.g. '' or "") escapes the delimiter
                // instead of ending the string; consume it and keep scanning.
                if !Self::is_string_escape_for(start, self.get()) {
                    break;
                } else {
                    b = self.p_c(c);
                    self.inc();
                }
            } else if !first {
                escape = !escape && Self::is_escape_char(c);
                b = self.p_c(c);
                self.inc();
            } else {
                break;
            }
        }
        if a != b {
            Some(Token::Quoted(&self.input[a..b]))
        } else {
            None
        }
    }

    /// Unquote a quoted string.
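    ///
    /// Collapses doubled delimiters into a single character while leaving
    /// backslash escapes as they are; illustrative sketch, mirroring
    /// `test_20` and `test_19` below:
    ///
    /// ```ignore
    /// assert_eq!(Tokenizer::new(r#""a""bc""#).unquote(), "a\"bc");
    /// assert_eq!(Tokenizer::new(r#""a\"bc""#).unquote(), "a\\\"bc");
    /// ```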
    fn unquote(mut self) -> String {
        let mut string = String::new();
        let mut first = true;
        let mut escape = false;
        let mut start = ' ';
        while !self.end() {
            let c = self.get();
            if first && Self::is_string_delimiter_start(c) {
                first = false;
                start = c;
                self.inc();
            } else if !first && !escape && Self::is_string_delimiter_end_for(start, c) {
                self.inc();
                if self.end() {
                    break;
                }
                if !Self::is_string_escape_for(start, self.get()) {
                    break;
                } else {
                    write!(string, "{c}").unwrap();
                    self.inc();
                }
            } else if !first {
                escape = !escape && Self::is_escape_char(c);
                write!(string, "{c}").unwrap();
                self.inc();
            } else {
                break;
            }
        }
        string
    }

    fn punctuation(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        if !self.end() {
            let c = self.get();
            if !Self::is_space(c) && !Self::is_alphanumeric(c) {
                b = self.p_c(c);
                self.inc();
            }
        }

        if a != b {
            Some(Token::Punctuation(&self.input[a..b]))
        } else {
            None
        }
    }

    fn is_space(c: char) -> bool {
        matches!(c, ' ' | '\t' | '\r' | '\n')
    }

    fn is_identifier(c: char) -> bool {
        matches!(c, '_' | '$')
    }

    /// Unicode alphabetic or ASCII digit.
    fn is_alphanumeric(c: char) -> bool {
        c.is_alphabetic() || c.is_ascii_digit()
    }

    fn is_string_delimiter_start(c: char) -> bool {
        matches!(c, '`' | '[' | '\'' | '"')
    }

    /// Whether `c` doubles the delimiter opened by `start`, escaping it.
    fn is_string_escape_for(start: char, c: char) -> bool {
        match start {
            '`' => c == '`',
            '\'' => c == '\'',
            '"' => c == '"',
            _ => false,
        }
    }

    fn is_string_delimiter_end_for(start: char, c: char) -> bool {
        match start {
            '`' => c == '`',
            '[' => c == ']',
            '\'' => c == '\'',
            '"' => c == '"',
            _ => false,
        }
    }

    fn is_escape_char(c: char) -> bool {
        c == '\\'
    }
}

impl<'a> Iterator for Tokenizer<'a> {
    type Item = Token<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        // Try token kinds in order: whitespace, identifiers/numbers,
        // quoted strings, then single punctuation characters.
        if let Some(space) = self.space() {
            return Some(space);
        }
        if let Some(unquoted) = self.unquoted() {
            return Some(unquoted);
        }
        if let Some(quoted) = self.quoted() {
            return Some(quoted);
        }
        if let Some(punctuation) = self.punctuation() {
            return Some(punctuation);
        }
        None
    }
}

impl Token<'_> {
    pub fn is_quoted(&self) -> bool {
        matches!(self, Self::Quoted(_))
    }

    pub fn is_unquoted(&self) -> bool {
        matches!(self, Self::Unquoted(_))
    }

    pub fn is_space(&self) -> bool {
        matches!(self, Self::Space(_))
    }

    pub fn is_punctuation(&self) -> bool {
        matches!(self, Self::Punctuation(_))
    }

    /// The token's text exactly as it appeared in the input.
    pub fn as_str(&self) -> &str {
        match self {
            Self::Quoted(string) => string,
            Self::Unquoted(string) => string,
            Self::Space(string) => string,
            Self::Punctuation(string) => string,
        }
    }

    /// Unquote the token's contents, or `None` if it is not a quoted token.
    pub fn unquote(&self) -> Option<String> {
        if self.is_quoted() {
            let tokenizer = Tokenizer::new(self.as_str());
            Some(tokenizer.unquote())
        } else {
            None
        }
    }
}

impl std::fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{}",
            match self {
                Token::Unquoted(string) => string,
                Token::Space(string) => string,
                Token::Quoted(string) => string,
                Token::Punctuation(string) => string,
            }
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_0() {
        let tokenizer = Tokenizer::new("");
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![]);
    }

    #[test]
    fn test_1() {
        let string = "SELECT * FROM `character`";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("SELECT"),
                Token::Space(" "),
                Token::Punctuation("*"),
                Token::Space(" "),
                Token::Unquoted("FROM"),
                Token::Space(" "),
                Token::Quoted("`character`"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_2() {
        let string = "SELECT * FROM `character` WHERE id = ?";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("SELECT"),
                Token::Space(" "),
                Token::Punctuation("*"),
                Token::Space(" "),
                Token::Unquoted("FROM"),
                Token::Space(" "),
                Token::Quoted("`character`"),
                Token::Space(" "),
                Token::Unquoted("WHERE"),
                Token::Space(" "),
                Token::Unquoted("id"),
                Token::Space(" "),
                Token::Punctuation("="),
                Token::Space(" "),
                Token::Punctuation("?"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_3() {
        let string = r#"? = "?" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("?"),
                Token::Space(" "),
                Token::Punctuation("="),
                Token::Space(" "),
                Token::Quoted(r#""?""#),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_4() {
        let string = r#""a\"bc""#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("\"a\\\"bc\"")]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_5() {
        let string = "abc123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Unquoted(string)]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_6() {
        let string = "2.3*4";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("2"),
                Token::Punctuation("."),
                Token::Unquoted("3"),
                Token::Punctuation("*"),
                Token::Unquoted("4"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_7() {
        let string = r#""a\\" B"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Quoted("\"a\\\\\""),
                Token::Space(" "),
                Token::Unquoted("B"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_8() {
        let string = r#"`a"b` "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("`a\"b`"), Token::Space(" ")]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_9() {
        let string = r"[ab] ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("[ab]"), Token::Space(" ")]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_10() {
        let string = r#" 'a"b' "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("'a\"b'"),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_11() {
        let string = r" `a``b` ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("`a``b`"),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_12() {
        let string = r" 'a''b' ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("'a''b'"),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_13() {
        let string = r"(?)";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("("),
                Token::Punctuation("?"),
                Token::Punctuation(")"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_14() {
        let string = r"($1 = $2)";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("("),
                Token::Punctuation("$"),
                Token::Unquoted("1"),
                Token::Space(" "),
                Token::Punctuation("="),
                Token::Space(" "),
                Token::Punctuation("$"),
                Token::Unquoted("2"),
                Token::Punctuation(")"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_15() {
        let string = r#" "Hello World" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("\"Hello World\""),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_16() {
        let string = "abc_$123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Unquoted(string)]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_17() {
        let string = "$abc$123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![Token::Punctuation("$"), Token::Unquoted("abc$123"),]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_18() {
        let string = "_$abc_123$";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("_"),
                Token::Punctuation("$"),
                Token::Unquoted("abc_123$"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_19() {
        let string = r#""a\"bc""#;
        let tokenizer = Tokenizer::new(string);
        assert_eq!(tokenizer.unquote(), "a\\\"bc".to_owned());
    }

    #[test]
    fn test_20() {
        let string = r#""a""bc""#;
        let tokenizer = Tokenizer::new(string);
        assert_eq!(tokenizer.unquote(), "a\"bc".to_owned());
    }

    #[test]
    fn test_21() {
        assert_eq!(
            Token::Quoted("'a\\nb'").unquote().unwrap(),
            "a\\nb".to_owned()
        );
    }

    #[test]
    fn test_22() {
        let string = r#" "Hello\nWorld" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("\"Hello\\nWorld\""),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_23() {
        let string = "{ab} '{cd}'";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("{"),
                Token::Unquoted("ab"),
                Token::Punctuation("}"),
                Token::Space(" "),
                Token::Quoted("'{cd}'"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_24() {
        let string = r#"新"老虎","#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("新"),
                Token::Quoted("\"老虎\""),
                Token::Punctuation(","),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_25() {
        let string = r#"{a.1:2}"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("{"),
                Token::Unquoted("a"),
                Token::Punctuation("."),
                Token::Unquoted("1"),
                Token::Punctuation(":"),
                Token::Unquoted("2"),
                Token::Punctuation("}"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_26() {
        let string = r#"{..(a.1:2)}"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("{"),
                Token::Punctuation("."),
                Token::Punctuation("."),
                Token::Punctuation("("),
                Token::Unquoted("a"),
                Token::Punctuation("."),
                Token::Unquoted("1"),
                Token::Punctuation(":"),
                Token::Unquoted("2"),
                Token::Punctuation(")"),
                Token::Punctuation("}"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }
}