icu_segmenter/
rule_segmenter.rs1use crate::complex::ComplexPayloads;
6use crate::indices::{Latin1Indices, Utf16Indices};
7use crate::provider::*;
8use crate::WordType;
9use core::str::CharIndices;
10use utf8_iter::Utf8CharIndices;
11
12pub trait RuleBreakType<'l, 's> {
15    type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone + core::fmt::Debug;
17
18    type CharType: Copy + Into<u32> + core::fmt::Debug;
20
21    fn get_current_position_character_len(iter: &RuleBreakIterator<'l, 's, Self>) -> usize;
22
23    fn handle_complex_language(
24        iter: &mut RuleBreakIterator<'l, 's, Self>,
25        left_codepoint: Self::CharType,
26    ) -> Option<usize>;
27}
28
29#[derive(Debug)]
40pub struct RuleBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> {
41    pub(crate) iter: Y::IterAttr,
42    pub(crate) len: usize,
43    pub(crate) current_pos_data: Option<(usize, Y::CharType)>,
44    pub(crate) result_cache: alloc::vec::Vec<usize>,
45    pub(crate) data: &'l RuleBreakDataV1<'l>,
46    pub(crate) complex: Option<&'l ComplexPayloads>,
47    pub(crate) boundary_property: u8,
48}
49
50impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'l, 's, Y> {
51    type Item = usize;
52
53    fn next(&mut self) -> Option<Self::Item> {
54        if let Some(&first_result) = self.result_cache.first() {
56            let mut i = 0;
57            loop {
58                if i == first_result {
59                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
60                    return self.get_current_position();
61                }
62                i += Y::get_current_position_character_len(self);
63                self.advance_iter();
64                if self.is_eof() {
65                    self.result_cache.clear();
66                    self.boundary_property = self.data.complex_property;
67                    return Some(self.len);
68                }
69            }
70        }
71
72        if self.is_eof() {
73            self.advance_iter();
74            if self.is_eof() && self.len == 0 {
75                self.len = 1;
79                return Some(0);
80            }
81            let Some(right_prop) = self.get_current_break_property() else {
82                self.boundary_property = 0;
84                return None;
85            };
86            if matches!(
88                self.get_break_state_from_table(self.data.sot_property, right_prop),
89                BreakState::Break | BreakState::NoMatch
90            ) {
91                self.boundary_property = 0; return self.get_current_position();
93            }
94        }
95
96        'a: loop {
97            debug_assert!(!self.is_eof());
98            let left_codepoint = self.get_current_codepoint()?;
99            let left_prop = self.get_break_property(left_codepoint);
100            self.advance_iter();
101
102            let Some(right_prop) = self.get_current_break_property() else {
103                self.boundary_property = left_prop;
104                return Some(self.len);
105            };
106
107            if right_prop == self.data.complex_property {
110                if left_prop != self.data.complex_property {
111                    self.boundary_property = left_prop;
113                    return self.get_current_position();
114                }
115                let break_offset = Y::handle_complex_language(self, left_codepoint);
116                if break_offset.is_some() {
117                    return break_offset;
118                }
119            }
120
121            match self.get_break_state_from_table(left_prop, right_prop) {
122                BreakState::Keep => continue,
123                BreakState::Break | BreakState::NoMatch => {
124                    self.boundary_property = left_prop;
125                    return self.get_current_position();
126                }
127                BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
128                    let mut previous_iter = self.iter.clone();
130                    let mut previous_pos_data = self.current_pos_data;
131                    let mut previous_left_prop = left_prop;
132
133                    loop {
134                        self.advance_iter();
135
136                        let Some(prop) = self.get_current_break_property() else {
137                            self.boundary_property = index;
139                            if self.get_break_state_from_table(index, self.data.eot_property)
140                                == BreakState::NoMatch
141                            {
142                                self.boundary_property = previous_left_prop;
143                                self.iter = previous_iter;
144                                self.current_pos_data = previous_pos_data;
145                                return self.get_current_position();
146                            }
147                            return Some(self.len);
149                        };
150
151                        let previous_break_state_is_cp_prop =
152                            index <= self.data.last_codepoint_property;
153
154                        match self.get_break_state_from_table(index, prop) {
155                            BreakState::Keep => continue 'a,
156                            BreakState::NoMatch => {
157                                self.boundary_property = previous_left_prop;
158                                self.iter = previous_iter;
159                                self.current_pos_data = previous_pos_data;
160                                return self.get_current_position();
161                            }
162                            BreakState::Break => return self.get_current_position(),
163                            BreakState::Intermediate(i) => {
164                                index = i;
165                                if previous_break_state_is_cp_prop {
166                                    previous_left_prop = index;
168                                }
169                                previous_iter = self.iter.clone();
170                                previous_pos_data = self.current_pos_data;
171                            }
172                            BreakState::Index(i) => {
173                                index = i;
174                                if previous_break_state_is_cp_prop {
175                                    previous_iter = self.iter.clone();
177                                    previous_pos_data = self.current_pos_data;
178                                    previous_left_prop = index;
179                                }
180                            }
181                        }
182                    }
183                }
184            }
185        }
186    }
187}
188
189impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> RuleBreakIterator<'l, 's, Y> {
190    pub(crate) fn advance_iter(&mut self) {
191        self.current_pos_data = self.iter.next();
192    }
193
194    pub(crate) fn is_eof(&self) -> bool {
195        self.current_pos_data.is_none()
196    }
197
198    pub(crate) fn get_current_break_property(&self) -> Option<u8> {
199        self.get_current_codepoint()
200            .map(|c| self.get_break_property(c))
201    }
202
203    pub(crate) fn get_current_position(&self) -> Option<usize> {
204        self.current_pos_data.map(|(pos, _)| pos)
205    }
206
207    pub(crate) fn get_current_codepoint(&self) -> Option<Y::CharType> {
208        self.current_pos_data.map(|(_, codepoint)| codepoint)
209    }
210
211    fn get_break_property(&self, codepoint: Y::CharType) -> u8 {
212        self.data.property_table.get32(codepoint.into())
214    }
215
216    fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
217        let idx = left as usize * self.data.property_count as usize + right as usize;
218        self.data
220            .break_state_table
221            .get(idx)
222            .unwrap_or(BreakState::Keep)
223    }
224
225    pub fn word_type(&self) -> WordType {
228        if self.result_cache.first().is_some() {
229            return WordType::Letter;
231        }
232        if self.boundary_property == 0 {
233            return WordType::None;
235        }
236        self.data
237            .word_type_table
238            .get((self.boundary_property - 1) as usize)
239            .unwrap_or(WordType::None)
240    }
241
242    pub fn is_word_like(&self) -> bool {
245        self.word_type().is_word_like()
246    }
247}
248
249#[derive(Debug)]
250pub struct RuleBreakTypeUtf8;
251
252impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeUtf8 {
253    type IterAttr = CharIndices<'s>;
254    type CharType = char;
255
256    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
257        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
258    }
259
260    fn handle_complex_language(
261        _: &mut RuleBreakIterator<Self>,
262        _: Self::CharType,
263    ) -> Option<usize> {
264        unreachable!()
265    }
266}
267
268#[derive(Debug)]
269pub struct RuleBreakTypePotentiallyIllFormedUtf8;
270
271impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypePotentiallyIllFormedUtf8 {
272    type IterAttr = Utf8CharIndices<'s>;
273    type CharType = char;
274
275    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
276        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
277    }
278
279    fn handle_complex_language(
280        _: &mut RuleBreakIterator<Self>,
281        _: Self::CharType,
282    ) -> Option<usize> {
283        unreachable!()
284    }
285}
286
287#[derive(Debug)]
288pub struct RuleBreakTypeLatin1;
289
290impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeLatin1 {
291    type IterAttr = Latin1Indices<'s>;
292    type CharType = u8;
293
294    fn get_current_position_character_len(_: &RuleBreakIterator<Self>) -> usize {
295        unreachable!()
296    }
297
298    fn handle_complex_language(
299        _: &mut RuleBreakIterator<Self>,
300        _: Self::CharType,
301    ) -> Option<usize> {
302        unreachable!()
303    }
304}
305
306#[derive(Debug)]
307pub struct RuleBreakTypeUtf16;
308
309impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeUtf16 {
310    type IterAttr = Utf16Indices<'s>;
311    type CharType = u32;
312
313    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
314        match iter.get_current_codepoint() {
315            None => 0,
316            Some(ch) if ch >= 0x10000 => 2,
317            _ => 1,
318        }
319    }
320
321    fn handle_complex_language(
322        _: &mut RuleBreakIterator<Self>,
323        _: Self::CharType,
324    ) -> Option<usize> {
325        unreachable!()
326    }
327}