#![allow(dead_code)]
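//! A small, lossless tokenizer: the input is split into [`Token`]s such that
//! concatenating every token's `as_str()` reproduces the original string
//! (see the round-trip assertions in the tests at the bottom of this file).
//!
//! Illustrative usage; this is only a sketch, since the `use` path depends on
//! where this module sits in the crate, so the block is not compiled:
//!
//! ```ignore
//! let tokens: Vec<Token> = Tokenizer::new("SELECT 1").iter().collect();
//! assert_eq!(
//!     tokens,
//!     vec![
//!         Token::Unquoted("SELECT"),
//!         Token::Space(" "),
//!         Token::Unquoted("1"),
//!     ]
//! );
//! ```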
use std::fmt::Write;
use std::iter::Iterator;

#[derive(Debug)]
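/// Streaming tokenizer over an input string.
///
/// `c` is the current character (`None` at end of input) and `p` is the byte
/// offset of that character within `input`; every token borrows a slice of
/// `input`.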
pub struct Tokenizer<'a> {
    input: &'a str,
    chars: std::str::Chars<'a>,
    c: Option<char>,
    p: usize,
}

#[derive(Debug, PartialEq, Eq)]
#[non_exhaustive]
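/// A single lexical unit produced by [`Tokenizer`].
///
/// Each variant borrows the exact input slice it was read from; `Quoted`
/// keeps its surrounding quote characters.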
pub enum Token<'a> {
    Quoted(&'a str),
    Unquoted(&'a str),
    Space(&'a str),
    Punctuation(&'a str),
}

impl<'a> Tokenizer<'a> {
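    /// Creates a tokenizer positioned at the start of `string`.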
    pub fn new(string: &'a str) -> Self {
        let mut chars = string.chars();
        let c = chars.next();
        Self {
            input: string,
            chars,
            c,
            p: 0,
        }
    }

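    /// Consumes the tokenizer and returns it as a token iterator
    /// (`Tokenizer` itself implements [`Iterator`]).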
    pub fn iter(self) -> impl Iterator<Item = Token<'a>> {
        self
    }

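    /// Current character; callers must check `end()` first, as this panics
    /// at end of input.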
    fn get(&self) -> char {
        self.c.unwrap()
    }

    fn inc(&mut self) {
        let c = self.get();
        self.c = self.chars.next();
        self.p += c.len_utf8();
    }

    fn end(&self) -> bool {
        self.c.is_none()
    }

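    /// Byte offset just past character `c`, assuming `c` sits at `self.p`.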
    fn p_c(&self, c: char) -> usize {
        self.p + c.len_utf8()
    }

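    /// Scans a run of whitespace (space, tab, CR, LF) and returns it as a
    /// `Space` token, if any.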
    fn space(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        while !self.end() {
            let c = self.get();
            if Self::is_space(c) {
                b = self.p_c(c);
            } else {
                break;
            }
            self.inc();
        }

        if a != b {
            Some(Token::Space(&self.input[a..b]))
        } else {
            None
        }
    }

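    /// Scans an identifier-like word: it must start with a letter or ASCII
    /// digit and may continue with letters, digits, `_` or `$`.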
    fn unquoted(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        let mut first = true;
        while !self.end() {
            let c = self.get();
            if Self::is_alphanumeric(c) {
                b = self.p_c(c);
                first = false;
                self.inc();
            } else if !first && Self::is_identifier(c) {
                b = self.p_c(c);
                self.inc();
            } else {
                break;
            }
        }

        if a != b {
            Some(Token::Unquoted(&self.input[a..b]))
        } else {
            None
        }
    }

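    /// Scans a quoted string opened by one of `` ` ``, `[`, `'` or `"`.
    ///
    /// Backslash escapes and doubled closing delimiters (e.g. `''` inside a
    /// `'...'` string) are handled; the returned slice keeps delimiters and
    /// escape characters verbatim.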
    fn quoted(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        let mut first = true;
        let mut escape = false;
        let mut start = ' ';
        while !self.end() {
            let c = self.get();
            if first && Self::is_string_delimiter_start(c) {
                b = self.p_c(c);
                first = false;
                start = c;
                self.inc();
            } else if !first && !escape && Self::is_string_delimiter_end_for(start, c) {
                b = self.p_c(c);
                self.inc();
                if self.end() {
                    break;
                }
                if !Self::is_string_escape_for(start, self.get()) {
                    break;
                } else {
                    b = self.p_c(c);
                    self.inc();
                }
            } else if !first {
                escape = !escape && Self::is_escape_char(c);
                b = self.p_c(c);
                self.inc();
            } else {
                break;
            }
        }
        if a != b {
            Some(Token::Quoted(&self.input[a..b]))
        } else {
            None
        }
    }

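    /// Reads the whole input as a single quoted string and returns its
    /// contents: the surrounding delimiters are dropped and doubled closing
    /// delimiters are collapsed to one, while backslash escapes are kept
    /// as-is.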
    fn unquote(mut self) -> String {
        let mut string = String::new();
        let mut first = true;
        let mut escape = false;
        let mut start = ' ';
        while !self.end() {
            let c = self.get();
            if first && Self::is_string_delimiter_start(c) {
                first = false;
                start = c;
                self.inc();
            } else if !first && !escape && Self::is_string_delimiter_end_for(start, c) {
                self.inc();
                if self.end() {
                    break;
                }
                if !Self::is_string_escape_for(start, self.get()) {
                    break;
                } else {
                    string.write_char(c).unwrap();
                    self.inc();
                }
            } else if !first {
                escape = !escape && Self::is_escape_char(c);
                string.write_char(c).unwrap();
                self.inc();
            } else {
                break;
            }
        }
        string
    }

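    /// Scans a single character that is neither whitespace nor a letter or
    /// digit and returns it as a `Punctuation` token.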
    fn punctuation(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        if !self.end() {
            let c = self.get();
            if !Self::is_space(c) && !Self::is_alphanumeric(c) {
                b = self.p_c(c);
                self.inc();
            }
        }

        if a != b {
            Some(Token::Punctuation(&self.input[a..b]))
        } else {
            None
        }
    }

    fn is_space(c: char) -> bool {
        matches!(c, ' ' | '\t' | '\r' | '\n')
    }

    fn is_identifier(c: char) -> bool {
        matches!(c, '_' | '$')
    }

    fn is_alphanumeric(c: char) -> bool {
        c.is_alphabetic() || c.is_ascii_digit()
    }

    fn is_string_delimiter_start(c: char) -> bool {
        matches!(c, '`' | '[' | '\'' | '"')
    }

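    /// Whether `c`, seen right after a closing delimiter, doubles that
    /// delimiter and therefore continues the string (applies to `` ` ``, `'`
    /// and `"`; bracket-quoted strings have no doubling escape).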
    fn is_string_escape_for(start: char, c: char) -> bool {
        match start {
            '`' => c == '`',
            '\'' => c == '\'',
            '"' => c == '"',
            _ => false,
        }
    }

    fn is_string_delimiter_end_for(start: char, c: char) -> bool {
        match start {
            '`' => c == '`',
            '[' => c == ']',
            '\'' => c == '\'',
            '"' => c == '"',
            _ => false,
        }
    }

    fn is_escape_char(c: char) -> bool {
        c == '\\'
    }
}

impl<'a> Iterator for Tokenizer<'a> {
    type Item = Token<'a>;

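    /// Tries each scanner in a fixed order: whitespace, unquoted word,
    /// quoted string, then a single punctuation character.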
    fn next(&mut self) -> Option<Self::Item> {
        if let Some(space) = self.space() {
            return Some(space);
        }
        if let Some(unquoted) = self.unquoted() {
            return Some(unquoted);
        }
        if let Some(quoted) = self.quoted() {
            return Some(quoted);
        }
        if let Some(punctuation) = self.punctuation() {
            return Some(punctuation);
        }
        None
    }
}

impl Token<'_> {
    pub fn is_quoted(&self) -> bool {
        matches!(self, Self::Quoted(_))
    }

    pub fn is_unquoted(&self) -> bool {
        matches!(self, Self::Unquoted(_))
    }

    pub fn is_space(&self) -> bool {
        matches!(self, Self::Space(_))
    }

    pub fn is_punctuation(&self) -> bool {
        matches!(self, Self::Punctuation(_))
    }

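    /// The exact input slice this token was read from (quotes included for
    /// `Quoted` tokens).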
    pub fn as_str(&self) -> &str {
        match self {
            Self::Quoted(string) => string,
            Self::Unquoted(string) => string,
            Self::Space(string) => string,
            Self::Punctuation(string) => string,
        }
    }

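    /// For a `Quoted` token, returns its contents with the delimiters removed
    /// and doubled-delimiter escapes collapsed; `None` for other tokens.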
    pub fn unquote(&self) -> Option<String> {
        if self.is_quoted() {
            let tokenizer = Tokenizer::new(self.as_str());
            Some(tokenizer.unquote())
        } else {
            None
        }
    }
}

impl std::fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_0() {
        let tokenizer = Tokenizer::new("");
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![]);
    }

    #[test]
    fn test_1() {
        let string = "SELECT * FROM `character`";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("SELECT"),
                Token::Space(" "),
                Token::Punctuation("*"),
                Token::Space(" "),
                Token::Unquoted("FROM"),
                Token::Space(" "),
                Token::Quoted("`character`"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_2() {
        let string = "SELECT * FROM `character` WHERE id = ?";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("SELECT"),
                Token::Space(" "),
                Token::Punctuation("*"),
                Token::Space(" "),
                Token::Unquoted("FROM"),
                Token::Space(" "),
                Token::Quoted("`character`"),
                Token::Space(" "),
                Token::Unquoted("WHERE"),
                Token::Space(" "),
                Token::Unquoted("id"),
                Token::Space(" "),
                Token::Punctuation("="),
                Token::Space(" "),
                Token::Punctuation("?"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_3() {
        let string = r#"? = "?" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("?"),
                Token::Space(" "),
                Token::Punctuation("="),
                Token::Space(" "),
                Token::Quoted(r#""?""#),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_4() {
        let string = r#""a\"bc""#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("\"a\\\"bc\"")]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_5() {
        let string = "abc123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Unquoted(string)]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_6() {
        let string = "2.3*4";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("2"),
                Token::Punctuation("."),
                Token::Unquoted("3"),
                Token::Punctuation("*"),
                Token::Unquoted("4"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_7() {
        let string = r#""a\\" B"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Quoted("\"a\\\\\""),
                Token::Space(" "),
                Token::Unquoted("B"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_8() {
        let string = r#"`a"b` "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("`a\"b`"), Token::Space(" ")]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_9() {
        let string = r"[ab] ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("[ab]"), Token::Space(" ")]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_10() {
        let string = r#" 'a"b' "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("'a\"b'"),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_11() {
        let string = r" `a``b` ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("`a``b`"),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_12() {
        let string = r" 'a''b' ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("'a''b'"),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_13() {
        let string = r"(?)";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("("),
                Token::Punctuation("?"),
                Token::Punctuation(")"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_14() {
        let string = r"($1 = $2)";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("("),
                Token::Punctuation("$"),
                Token::Unquoted("1"),
                Token::Space(" "),
                Token::Punctuation("="),
                Token::Space(" "),
                Token::Punctuation("$"),
                Token::Unquoted("2"),
                Token::Punctuation(")"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_15() {
        let string = r#" "Hello World" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("\"Hello World\""),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_16() {
        let string = "abc_$123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Unquoted(string)]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_17() {
        let string = "$abc$123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![Token::Punctuation("$"), Token::Unquoted("abc$123"),]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_18() {
        let string = "_$abc_123$";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("_"),
                Token::Punctuation("$"),
                Token::Unquoted("abc_123$"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_19() {
        let string = r#""a\"bc""#;
        let tokenizer = Tokenizer::new(string);
        assert_eq!(tokenizer.unquote(), "a\\\"bc".to_owned());
    }

    #[test]
    fn test_20() {
        let string = r#""a""bc""#;
        let tokenizer = Tokenizer::new(string);
        assert_eq!(tokenizer.unquote(), "a\"bc".to_owned());
    }

    #[test]
    fn test_21() {
        assert_eq!(
            Token::Quoted("'a\\nb'").unquote().unwrap(),
            "a\\nb".to_owned()
        );
    }

    #[test]
    fn test_22() {
        let string = r#" "Hello\nWorld" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("\"Hello\\nWorld\""),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_23() {
        let string = "{ab} '{cd}'";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("{"),
                Token::Unquoted("ab"),
                Token::Punctuation("}"),
                Token::Space(" "),
                Token::Quoted("'{cd}'"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_24() {
        let string = r#"新"老虎","#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("新"),
                Token::Quoted("\"老虎\""),
                Token::Punctuation(","),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_25() {
        let string = r#"{a.1:2}"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("{"),
                Token::Unquoted("a"),
                Token::Punctuation("."),
                Token::Unquoted("1"),
                Token::Punctuation(":"),
                Token::Unquoted("2"),
                Token::Punctuation("}"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_26() {
        let string = r#"{..(a.1:2)}"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("{"),
                Token::Punctuation("."),
                Token::Punctuation("."),
                Token::Punctuation("("),
                Token::Unquoted("a"),
                Token::Punctuation("."),
                Token::Unquoted("1"),
                Token::Punctuation(":"),
                Token::Unquoted("2"),
                Token::Punctuation(")"),
                Token::Punctuation("}"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }
}