#![allow(dead_code)]

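/// A tokenizer that splits an input string into [`Token`]s without copying:
/// every token borrows a slice of the original input.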
#[derive(Debug)]
pub struct Tokenizer<'a> {
    input: &'a str,
    chars: std::str::Chars<'a>,
    c: Option<char>,
    p: usize,
}

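/// A lexical token produced by [`Tokenizer`].
///
/// Each variant holds the exact slice of the input it was read from, so
/// concatenating the tokens in order reproduces the input.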
#[derive(Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum Token<'a> {
    Quoted(&'a str),
    Unquoted(&'a str),
    Space(&'a str),
    Punctuation(&'a str),
}

impl<'a> Tokenizer<'a> {
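    /// Creates a tokenizer over `string`, positioned at its first character.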
    pub fn new(string: &'a str) -> Self {
        let mut chars = string.chars();
        let c = chars.next();
        Self {
            input: string,
            chars,
            c,
            p: 0,
        }
    }

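    /// Consumes the tokenizer and returns it as an iterator over [`Token`]s.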
    pub fn iter(self) -> impl Iterator<Item = Token<'a>> {
        self
    }

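    /// Returns the current character; panics if the input is exhausted.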
    fn get(&self) -> char {
        self.c.unwrap()
    }

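    /// Advances to the next character, moving the byte offset `p` past the
    /// current one.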
    fn inc(&mut self) {
        let c = self.get();
        self.c = self.chars.next();
        self.p += c.len_utf8();
    }

    fn end(&self) -> bool {
        self.c.is_none()
    }

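    /// Returns the byte offset just past `c`, assuming `c` starts at `p`.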
    fn p_c(&self, c: char) -> usize {
        self.p + c.len_utf8()
    }

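    /// Consumes a run of whitespace and returns it as [`Token::Space`], or
    /// `None` if the current character is not whitespace.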
    fn space(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        while !self.end() {
            let c = self.get();
            if Self::is_space(c) {
                b = self.p_c(c);
            } else {
                break;
            }
            self.inc();
        }

        if a != b {
            Some(Token::Space(&self.input[a..b]))
        } else {
            None
        }
    }

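    /// Consumes an unquoted word and returns it as [`Token::Unquoted`].
    /// A word starts with an alphanumeric character and may also contain
    /// `_` and `$` after the first character.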
    fn unquoted(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        let mut first = true;
        while !self.end() {
            let c = self.get();
            if Self::is_alphanumeric(c) {
                b = self.p_c(c);
                first = false;
                self.inc();
            } else if !first && Self::is_identifier(c) {
                b = self.p_c(c);
                self.inc();
            } else {
                break;
            }
        }

        if a != b {
            Some(Token::Unquoted(&self.input[a..b]))
        } else {
            None
        }
    }

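    /// Consumes a quoted string delimited by backticks, brackets, single
    /// quotes or double quotes and returns it as [`Token::Quoted`], keeping
    /// the delimiters. A closing delimiter can be escaped with a backslash;
    /// backtick- and quote-delimited strings can also escape it by doubling
    /// it (e.g. `''`).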
    fn quoted(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        let mut first = true;
        let mut escape = false;
        let mut start = ' ';
        while !self.end() {
            let c = self.get();
            if first && Self::is_string_delimiter_start(c) {
                b = self.p_c(c);
                first = false;
                start = c;
                self.inc();
            } else if !first && !escape && Self::is_string_delimiter_end_for(start, c) {
                b = self.p_c(c);
                self.inc();
                if self.end() {
                    break;
                }
                if !Self::is_string_escape_for(start, self.get()) {
                    break;
                } else {
                    b = self.p_c(c);
                    self.inc();
                }
            } else if !first {
                escape = !escape && Self::is_escape_char(c);
                b = self.p_c(c);
                self.inc();
            } else {
                break;
            }
        }
        if a != b {
            Some(Token::Quoted(&self.input[a..b]))
        } else {
            None
        }
    }

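    /// Consumes the tokenizer and returns the contents of a leading quoted
    /// string with its delimiters removed and doubled delimiters collapsed.
    /// Backslash escape sequences are kept as-is.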
    fn unquote(mut self) -> String {
        let mut string = String::new();
        let mut first = true;
        let mut escape = false;
        let mut start = ' ';
        while !self.end() {
            let c = self.get();
            if first && Self::is_string_delimiter_start(c) {
                first = false;
                start = c;
                self.inc();
            } else if !first && !escape && Self::is_string_delimiter_end_for(start, c) {
                self.inc();
                if self.end() {
                    break;
                }
                if !Self::is_string_escape_for(start, self.get()) {
                    break;
                } else {
                    string.push(c);
                    self.inc();
                }
            } else if !first {
                escape = !escape && Self::is_escape_char(c);
                string.push(c);
                self.inc();
            } else {
                break;
            }
        }
        string
    }

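    /// Consumes a single character that is neither whitespace nor
    /// alphanumeric and returns it as [`Token::Punctuation`].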
    fn punctuation(&mut self) -> Option<Token<'a>> {
        let a = self.p;
        let mut b = a;

        if !self.end() {
            let c = self.get();
            if !Self::is_space(c) && !Self::is_alphanumeric(c) {
                b = self.p_c(c);
                self.inc();
            }
        }

        if a != b {
            Some(Token::Punctuation(&self.input[a..b]))
        } else {
            None
        }
    }

    fn is_space(c: char) -> bool {
        matches!(c, ' ' | '\t' | '\r' | '\n')
    }

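    /// Characters allowed inside an unquoted word after its first character.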
    fn is_identifier(c: char) -> bool {
        matches!(c, '_' | '$')
    }

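    /// Any Unicode letter or an ASCII digit.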
    fn is_alphanumeric(c: char) -> bool {
        c.is_alphabetic() || c.is_ascii_digit()
    }

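    /// Characters that can open a quoted string.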
    fn is_string_delimiter_start(c: char) -> bool {
        matches!(c, '`' | '[' | '\'' | '"')
    }

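    /// Whether `c`, seen right after a closing delimiter, escapes it by
    /// doubling. Supported for backticks and quotes, but not for brackets.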
    fn is_string_escape_for(start: char, c: char) -> bool {
        match start {
            '`' => c == '`',
            '\'' => c == '\'',
            '"' => c == '"',
            _ => false,
        }
    }

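    /// Whether `c` closes a string opened with `start`.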
    fn is_string_delimiter_end_for(start: char, c: char) -> bool {
        match start {
            '`' => c == '`',
            '[' => c == ']',
            '\'' => c == '\'',
            '"' => c == '"',
            _ => false,
        }
    }

    fn is_escape_char(c: char) -> bool {
        c == '\\'
    }
}

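/// Yields tokens in input order; at each position the first matching rule
/// wins: whitespace, then unquoted words, then quoted strings, then
/// punctuation.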
impl<'a> Iterator for Tokenizer<'a> {
    type Item = Token<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        if let Some(space) = self.space() {
            return Some(space);
        }
        if let Some(unquoted) = self.unquoted() {
            return Some(unquoted);
        }
        if let Some(quoted) = self.quoted() {
            return Some(quoted);
        }
        if let Some(punctuation) = self.punctuation() {
            return Some(punctuation);
        }
        None
    }
}

impl Token<'_> {
    pub fn is_quoted(&self) -> bool {
        matches!(self, Self::Quoted(_))
    }

    pub fn is_unquoted(&self) -> bool {
        matches!(self, Self::Unquoted(_))
    }

    pub fn is_space(&self) -> bool {
        matches!(self, Self::Space(_))
    }

    pub fn is_punctuation(&self) -> bool {
        matches!(self, Self::Punctuation(_))
    }

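    /// Returns the raw input slice backing this token, including any quote
    /// delimiters.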
    pub fn as_str(&self) -> &str {
        match self {
            Self::Quoted(string) => string,
            Self::Unquoted(string) => string,
            Self::Space(string) => string,
            Self::Punctuation(string) => string,
        }
    }

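    /// For a quoted token, returns its contents with the delimiters removed
    /// and doubled delimiters collapsed; returns `None` for any other token.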
    pub fn unquote(&self) -> Option<String> {
        if self.is_quoted() {
            let tokenizer = Tokenizer::new(self.as_str());
            Some(tokenizer.unquote())
        } else {
            None
        }
    }
}

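/// Writes the token's raw input slice, so formatting a token stream
/// reproduces the original input.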
impl std::fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{}",
            match self {
                Token::Unquoted(string) => string,
                Token::Space(string) => string,
                Token::Quoted(string) => string,
                Token::Punctuation(string) => string,
            }
        )
    }
}

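// Most tests also check that concatenating `as_str()` of every token
// reproduces the original input, i.e. tokenization is lossless.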
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_0() {
        let tokenizer = Tokenizer::new("");
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![]);
    }

    #[test]
    fn test_1() {
        let string = "SELECT * FROM `character`";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("SELECT"),
                Token::Space(" "),
                Token::Punctuation("*"),
                Token::Space(" "),
                Token::Unquoted("FROM"),
                Token::Space(" "),
                Token::Quoted("`character`"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_2() {
        let string = "SELECT * FROM `character` WHERE id = ?";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("SELECT"),
                Token::Space(" "),
                Token::Punctuation("*"),
                Token::Space(" "),
                Token::Unquoted("FROM"),
                Token::Space(" "),
                Token::Quoted("`character`"),
                Token::Space(" "),
                Token::Unquoted("WHERE"),
                Token::Space(" "),
                Token::Unquoted("id"),
                Token::Space(" "),
                Token::Punctuation("="),
                Token::Space(" "),
                Token::Punctuation("?"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_3() {
        let string = r#"? = "?" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("?"),
                Token::Space(" "),
                Token::Punctuation("="),
                Token::Space(" "),
                Token::Quoted(r#""?""#),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_4() {
        let string = r#""a\"bc""#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("\"a\\\"bc\"")]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_5() {
        let string = "abc123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Unquoted(string)]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_6() {
        let string = "2.3*4";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("2"),
                Token::Punctuation("."),
                Token::Unquoted("3"),
                Token::Punctuation("*"),
                Token::Unquoted("4"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_7() {
        let string = r#""a\\" B"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Quoted("\"a\\\\\""),
                Token::Space(" "),
                Token::Unquoted("B"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_8() {
        let string = r#"`a"b` "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("`a\"b`"), Token::Space(" ")]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_9() {
        let string = r"[ab] ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("[ab]"), Token::Space(" ")]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_10() {
        let string = r#" 'a"b' "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("'a\"b'"),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_11() {
        let string = r" `a``b` ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("`a``b`"),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_12() {
        let string = r" 'a''b' ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("'a''b'"),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_13() {
        let string = r"(?)";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("("),
                Token::Punctuation("?"),
                Token::Punctuation(")"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_14() {
        let string = r"($1 = $2)";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("("),
                Token::Punctuation("$"),
                Token::Unquoted("1"),
                Token::Space(" "),
                Token::Punctuation("="),
                Token::Space(" "),
                Token::Punctuation("$"),
                Token::Unquoted("2"),
                Token::Punctuation(")"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_15() {
        let string = r#" "Hello World" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("\"Hello World\""),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_16() {
        let string = "abc_$123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Unquoted(string)]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_17() {
        let string = "$abc$123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![Token::Punctuation("$"), Token::Unquoted("abc$123")]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_18() {
        let string = "_$abc_123$";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("_"),
                Token::Punctuation("$"),
                Token::Unquoted("abc_123$"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_19() {
        let string = r#""a\"bc""#;
        let tokenizer = Tokenizer::new(string);
        assert_eq!(tokenizer.unquote(), "a\\\"bc".to_owned());
    }

    #[test]
    fn test_20() {
        let string = r#""a""bc""#;
        let tokenizer = Tokenizer::new(string);
        assert_eq!(tokenizer.unquote(), "a\"bc".to_owned());
    }

    #[test]
    fn test_21() {
        assert_eq!(
            Token::Quoted("'a\\nb'").unquote().unwrap(),
            "a\\nb".to_owned()
        );
    }

    #[test]
    fn test_22() {
        let string = r#" "Hello\nWorld" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" "),
                Token::Quoted("\"Hello\\nWorld\""),
                Token::Space(" "),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_23() {
        let string = "{ab} '{cd}'";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("{"),
                Token::Unquoted("ab"),
                Token::Punctuation("}"),
                Token::Space(" "),
                Token::Quoted("'{cd}'"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_24() {
        let string = r#"新"老虎","#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("新"),
                Token::Quoted("\"老虎\""),
                Token::Punctuation(","),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_25() {
        let string = r#"{a.1:2}"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("{"),
                Token::Unquoted("a"),
                Token::Punctuation("."),
                Token::Unquoted("1"),
                Token::Punctuation(":"),
                Token::Unquoted("2"),
                Token::Punctuation("}"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }

    #[test]
    fn test_26() {
        let string = r#"{..(a.1:2)}"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("{"),
                Token::Punctuation("."),
                Token::Punctuation("."),
                Token::Punctuation("("),
                Token::Unquoted("a"),
                Token::Punctuation("."),
                Token::Unquoted("1"),
                Token::Punctuation(":"),
                Token::Unquoted("2"),
                Token::Punctuation(")"),
                Token::Punctuation("}"),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.as_str()).collect::<String>()
        );
    }
}