icu_segmenter/complex/
language.rs1#[derive(PartialEq, Debug, Copy, Clone)]
6pub(super) enum Language {
7 Burmese,
8 ChineseOrJapanese,
9 Khmer,
10 Lao,
11 Thai,
12 Unknown,
13}
14
15fn get_language(codepoint: u32) -> Language {
17 match codepoint {
18 0xe01..=0xe7f => Language::Thai,
19 0x0E80..=0x0EFF => Language::Lao,
20 0x1000..=0x109f => Language::Burmese,
21 0x1780..=0x17FF => Language::Khmer,
22 0x19E0..=0x19FF => Language::Khmer,
23 0x2E80..=0x2EFF => Language::ChineseOrJapanese,
24 0x2F00..=0x2FDF => Language::ChineseOrJapanese,
25 0x3040..=0x30FF => Language::ChineseOrJapanese,
26 0x31F0..=0x31FF => Language::ChineseOrJapanese,
27 0x32D0..=0x32FE => Language::ChineseOrJapanese,
28 0x3400..=0x4DBF => Language::ChineseOrJapanese,
29 0x4E00..=0x9FFF => Language::ChineseOrJapanese,
30 0xa9e0..=0xa9ff => Language::Burmese,
31 0xaa60..=0xaa7f => Language::Burmese,
32 0xF900..=0xFAFF => Language::ChineseOrJapanese,
33 0xFF66..=0xFF9D => Language::ChineseOrJapanese,
34 0x16FE2..=0x16FE3 => Language::ChineseOrJapanese,
35 0x16FF0..=0x16FF1 => Language::ChineseOrJapanese,
36 0x1AFF0..=0x1B16F => Language::ChineseOrJapanese,
37 0x1F200 => Language::ChineseOrJapanese,
38 0x20000..=0x2FA1F => Language::ChineseOrJapanese,
39 0x30000..=0x3134F => Language::ChineseOrJapanese,
40 _ => Language::Unknown,
41 }
42}
43
44pub(super) struct LanguageIterator<'s> {
47 rest: &'s str,
48}
49
50impl<'s> LanguageIterator<'s> {
51 pub(super) fn new(input: &'s str) -> Self {
52 Self { rest: input }
53 }
54}
55
56impl<'s> Iterator for LanguageIterator<'s> {
57 type Item = (&'s str, Language);
58
59 fn next(&mut self) -> Option<Self::Item> {
60 let mut indices = self.rest.char_indices();
61 let lang = get_language(indices.next()?.1 as u32);
62 match indices.find(|&(_, ch)| get_language(ch as u32) != lang) {
63 Some((i, _)) => {
64 let (result, rest) = self.rest.split_at(i);
65 self.rest = rest;
66 Some((result, lang))
67 }
68 None => Some((core::mem::take(&mut self.rest), lang)),
69 }
70 }
71}
72
73pub(super) struct LanguageIteratorUtf16<'s> {
74 rest: &'s [u16],
75}
76
77impl<'s> LanguageIteratorUtf16<'s> {
78 pub(super) fn new(input: &'s [u16]) -> Self {
79 Self { rest: input }
80 }
81}
82
83impl<'s> Iterator for LanguageIteratorUtf16<'s> {
84 type Item = (&'s [u16], Language);
85
86 fn next(&mut self) -> Option<Self::Item> {
87 let lang = get_language(*self.rest.first()? as u32);
88 match self
89 .rest
90 .iter()
91 .position(|&ch| get_language(ch as u32) != lang)
92 {
93 Some(i) => {
94 let (result, rest) = self.rest.split_at(i);
95 self.rest = rest;
96 Some((result, lang))
97 }
98 None => Some((core::mem::take(&mut self.rest), lang)),
99 }
100 }
101}
102
#[cfg(test)]
mod tests {
    use super::*;

    /// Input in a single language comes back as exactly one run from both iterators,
    /// after which each iterator is exhausted.
    #[test]
    fn test_thai_only() {
        let s = "ภาษาไทยภาษาไทย";
        let utf16: Vec<u16> = s.encode_utf16().collect();
        let mut iter = LanguageIteratorUtf16::new(&utf16);
        assert_eq!(
            iter.next(),
            Some((utf16.as_slice(), Language::Thai)),
            "Thai language only with UTF-16"
        );
        // Mirrors the UTF-8 exhaustion check below and the checks in `test_combine`.
        assert_eq!(iter.next(), None, "Iterator for UTF-16 is finished");

        let mut iter = LanguageIterator::new(s);
        assert_eq!(
            iter.next(),
            Some((s, Language::Thai)),
            "Thai language only with UTF-8"
        );
        assert_eq!(iter.next(), None, "Iterator for UTF-8 is finished");
    }

    /// Thai text followed by Burmese text is split into two runs, in order, by both
    /// iterators.
    #[test]
    fn test_combine() {
        const TEST_STR_THAI: &str = "ภาษาไทยภาษาไทย";
        const TEST_STR_BURMESE: &str = "ဗမာနွယ်ဘာသာစကားမျာ";
        let s = format!("{TEST_STR_THAI}{TEST_STR_BURMESE}");
        let utf16: Vec<u16> = s.encode_utf16().collect();
        let thai_utf16: Vec<u16> = TEST_STR_THAI.encode_utf16().collect();
        let burmese_utf16: Vec<u16> = TEST_STR_BURMESE.encode_utf16().collect();

        let mut iter = LanguageIteratorUtf16::new(&utf16);
        assert_eq!(
            iter.next(),
            Some((thai_utf16.as_slice(), Language::Thai)),
            "Thai language with UTF-16 at first"
        );
        assert_eq!(
            iter.next(),
            Some((burmese_utf16.as_slice(), Language::Burmese)),
            "Burmese language with UTF-16 at second"
        );
        assert_eq!(iter.next(), None, "Iterator for UTF-16 is finished");

        let mut iter = LanguageIterator::new(&s);
        assert_eq!(
            iter.next(),
            Some((TEST_STR_THAI, Language::Thai)),
            "Thai language with UTF-8 at first"
        );
        assert_eq!(
            iter.next(),
            Some((TEST_STR_BURMESE, Language::Burmese)),
            "Burmese language with UTF-8 at second"
        );
        assert_eq!(iter.next(), None, "Iterator for UTF-8 is finished");
    }
}