icu_segmenter/
rule_segmenter.rs1use crate::complex::ComplexPayloads;
6use crate::indices::{Latin1Indices, Utf16Indices};
7use crate::provider::*;
8use crate::WordType;
9use core::str::CharIndices;
10use utf8_iter::Utf8CharIndices;
11
12pub trait RuleBreakType<'l, 's> {
15 type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone + core::fmt::Debug;
17
18 type CharType: Copy + Into<u32> + core::fmt::Debug;
20
21 fn get_current_position_character_len(iter: &RuleBreakIterator<'l, 's, Self>) -> usize;
22
23 fn handle_complex_language(
24 iter: &mut RuleBreakIterator<'l, 's, Self>,
25 left_codepoint: Self::CharType,
26 ) -> Option<usize>;
27}
28
29#[derive(Debug)]
40pub struct RuleBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> {
41 pub(crate) iter: Y::IterAttr,
42 pub(crate) len: usize,
43 pub(crate) current_pos_data: Option<(usize, Y::CharType)>,
44 pub(crate) result_cache: alloc::vec::Vec<usize>,
45 pub(crate) data: &'l RuleBreakDataV1<'l>,
46 pub(crate) complex: Option<&'l ComplexPayloads>,
47 pub(crate) boundary_property: u8,
48}
49
50impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'l, 's, Y> {
51 type Item = usize;
52
53 fn next(&mut self) -> Option<Self::Item> {
54 if let Some(&first_result) = self.result_cache.first() {
56 let mut i = 0;
57 loop {
58 if i == first_result {
59 self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
60 return self.get_current_position();
61 }
62 i += Y::get_current_position_character_len(self);
63 self.advance_iter();
64 if self.is_eof() {
65 self.result_cache.clear();
66 self.boundary_property = self.data.complex_property;
67 return Some(self.len);
68 }
69 }
70 }
71
72 if self.is_eof() {
73 self.advance_iter();
74 if self.is_eof() && self.len == 0 {
75 self.len = 1;
79 return Some(0);
80 }
81 let Some(right_prop) = self.get_current_break_property() else {
82 self.boundary_property = 0;
84 return None;
85 };
86 if matches!(
88 self.get_break_state_from_table(self.data.sot_property, right_prop),
89 BreakState::Break | BreakState::NoMatch
90 ) {
91 self.boundary_property = 0; return self.get_current_position();
93 }
94 }
95
96 'a: loop {
97 debug_assert!(!self.is_eof());
98 let left_codepoint = self.get_current_codepoint()?;
99 let left_prop = self.get_break_property(left_codepoint);
100 self.advance_iter();
101
102 let Some(right_prop) = self.get_current_break_property() else {
103 self.boundary_property = left_prop;
104 return Some(self.len);
105 };
106
107 if right_prop == self.data.complex_property {
110 if left_prop != self.data.complex_property {
111 self.boundary_property = left_prop;
113 return self.get_current_position();
114 }
115 let break_offset = Y::handle_complex_language(self, left_codepoint);
116 if break_offset.is_some() {
117 return break_offset;
118 }
119 }
120
121 match self.get_break_state_from_table(left_prop, right_prop) {
122 BreakState::Keep => continue,
123 BreakState::Break | BreakState::NoMatch => {
124 self.boundary_property = left_prop;
125 return self.get_current_position();
126 }
127 BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
128 let mut previous_iter = self.iter.clone();
130 let mut previous_pos_data = self.current_pos_data;
131 let mut previous_left_prop = left_prop;
132
133 loop {
134 self.advance_iter();
135
136 let Some(prop) = self.get_current_break_property() else {
137 self.boundary_property = index;
139 if self.get_break_state_from_table(index, self.data.eot_property)
140 == BreakState::NoMatch
141 {
142 self.boundary_property = previous_left_prop;
143 self.iter = previous_iter;
144 self.current_pos_data = previous_pos_data;
145 return self.get_current_position();
146 }
147 return Some(self.len);
149 };
150
151 let previous_break_state_is_cp_prop =
152 index <= self.data.last_codepoint_property;
153
154 match self.get_break_state_from_table(index, prop) {
155 BreakState::Keep => continue 'a,
156 BreakState::NoMatch => {
157 self.boundary_property = previous_left_prop;
158 self.iter = previous_iter;
159 self.current_pos_data = previous_pos_data;
160 return self.get_current_position();
161 }
162 BreakState::Break => return self.get_current_position(),
163 BreakState::Intermediate(i) => {
164 index = i;
165 if previous_break_state_is_cp_prop {
166 previous_left_prop = index;
168 }
169 previous_iter = self.iter.clone();
170 previous_pos_data = self.current_pos_data;
171 }
172 BreakState::Index(i) => {
173 index = i;
174 if previous_break_state_is_cp_prop {
175 previous_iter = self.iter.clone();
177 previous_pos_data = self.current_pos_data;
178 previous_left_prop = index;
179 }
180 }
181 }
182 }
183 }
184 }
185 }
186 }
187}
188
189impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> RuleBreakIterator<'l, 's, Y> {
190 pub(crate) fn advance_iter(&mut self) {
191 self.current_pos_data = self.iter.next();
192 }
193
194 pub(crate) fn is_eof(&self) -> bool {
195 self.current_pos_data.is_none()
196 }
197
198 pub(crate) fn get_current_break_property(&self) -> Option<u8> {
199 self.get_current_codepoint()
200 .map(|c| self.get_break_property(c))
201 }
202
203 pub(crate) fn get_current_position(&self) -> Option<usize> {
204 self.current_pos_data.map(|(pos, _)| pos)
205 }
206
207 pub(crate) fn get_current_codepoint(&self) -> Option<Y::CharType> {
208 self.current_pos_data.map(|(_, codepoint)| codepoint)
209 }
210
211 fn get_break_property(&self, codepoint: Y::CharType) -> u8 {
212 self.data.property_table.get32(codepoint.into())
214 }
215
216 fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
217 let idx = left as usize * self.data.property_count as usize + right as usize;
218 self.data
220 .break_state_table
221 .get(idx)
222 .unwrap_or(BreakState::Keep)
223 }
224
225 pub fn word_type(&self) -> WordType {
228 if self.result_cache.first().is_some() {
229 return WordType::Letter;
231 }
232 if self.boundary_property == 0 {
233 return WordType::None;
235 }
236 self.data
237 .word_type_table
238 .get((self.boundary_property - 1) as usize)
239 .unwrap_or(WordType::None)
240 }
241
242 pub fn is_word_like(&self) -> bool {
245 self.word_type().is_word_like()
246 }
247}
248
249#[derive(Debug)]
250pub struct RuleBreakTypeUtf8;
251
252impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeUtf8 {
253 type IterAttr = CharIndices<'s>;
254 type CharType = char;
255
256 fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
257 iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
258 }
259
260 fn handle_complex_language(
261 _: &mut RuleBreakIterator<Self>,
262 _: Self::CharType,
263 ) -> Option<usize> {
264 unreachable!()
265 }
266}
267
268#[derive(Debug)]
269pub struct RuleBreakTypePotentiallyIllFormedUtf8;
270
271impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypePotentiallyIllFormedUtf8 {
272 type IterAttr = Utf8CharIndices<'s>;
273 type CharType = char;
274
275 fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
276 iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
277 }
278
279 fn handle_complex_language(
280 _: &mut RuleBreakIterator<Self>,
281 _: Self::CharType,
282 ) -> Option<usize> {
283 unreachable!()
284 }
285}
286
287#[derive(Debug)]
288pub struct RuleBreakTypeLatin1;
289
290impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeLatin1 {
291 type IterAttr = Latin1Indices<'s>;
292 type CharType = u8;
293
294 fn get_current_position_character_len(_: &RuleBreakIterator<Self>) -> usize {
295 unreachable!()
296 }
297
298 fn handle_complex_language(
299 _: &mut RuleBreakIterator<Self>,
300 _: Self::CharType,
301 ) -> Option<usize> {
302 unreachable!()
303 }
304}
305
306#[derive(Debug)]
307pub struct RuleBreakTypeUtf16;
308
309impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeUtf16 {
310 type IterAttr = Utf16Indices<'s>;
311 type CharType = u32;
312
313 fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
314 match iter.get_current_codepoint() {
315 None => 0,
316 Some(ch) if ch >= 0x10000 => 2,
317 _ => 1,
318 }
319 }
320
321 fn handle_complex_language(
322 _: &mut RuleBreakIterator<Self>,
323 _: Self::CharType,
324 ) -> Option<usize> {
325 unreachable!()
326 }
327}