icu_segmenter/
rule_segmenter.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::complex::ComplexPayloads;
6use crate::indices::{Latin1Indices, Utf16Indices};
7use crate::provider::*;
8use crate::WordType;
9use core::str::CharIndices;
10use utf8_iter::Utf8CharIndices;
11
12/// A trait allowing for RuleBreakIterator to be generalized to multiple string
13/// encoding methods and granularity such as grapheme cluster, word, etc.
14pub trait RuleBreakType<'l, 's> {
15    /// The iterator over characters.
16    type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone + core::fmt::Debug;
17
18    /// The character type.
19    type CharType: Copy + Into<u32> + core::fmt::Debug;
20
21    fn get_current_position_character_len(iter: &RuleBreakIterator<'l, 's, Self>) -> usize;
22
23    fn handle_complex_language(
24        iter: &mut RuleBreakIterator<'l, 's, Self>,
25        left_codepoint: Self::CharType,
26    ) -> Option<usize>;
27}
28
29/// Implements the [`Iterator`] trait over the segmenter boundaries of the given string.
30///
31/// Lifetimes:
32///
33/// - `'l` = lifetime of the segmenter object from which this iterator was created
34/// - `'s` = lifetime of the string being segmented
35///
36/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
37/// _after_ the boundary (for a boundary at the end of text, this index is the length
38/// of the [`str`] or array of code units).
39#[derive(Debug)]
40pub struct RuleBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> {
41    pub(crate) iter: Y::IterAttr,
42    pub(crate) len: usize,
43    pub(crate) current_pos_data: Option<(usize, Y::CharType)>,
44    pub(crate) result_cache: alloc::vec::Vec<usize>,
45    pub(crate) data: &'l RuleBreakDataV1<'l>,
46    pub(crate) complex: Option<&'l ComplexPayloads>,
47    pub(crate) boundary_property: u8,
48}
49
50impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'l, 's, Y> {
51    type Item = usize;
52
53    fn next(&mut self) -> Option<Self::Item> {
54        // If we have break point cache by previous run, return this result
55        if let Some(&first_result) = self.result_cache.first() {
56            let mut i = 0;
57            loop {
58                if i == first_result {
59                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
60                    return self.get_current_position();
61                }
62                i += Y::get_current_position_character_len(self);
63                self.advance_iter();
64                if self.is_eof() {
65                    self.result_cache.clear();
66                    self.boundary_property = self.data.complex_property;
67                    return Some(self.len);
68                }
69            }
70        }
71
72        if self.is_eof() {
73            self.advance_iter();
74            if self.is_eof() && self.len == 0 {
75                // Empty string. Since `self.current_pos_data` is always going to be empty,
76                // we never read `self.len` except for here, so we can use it to mark that
77                // we have already returned the single empty-string breakpoint.
78                self.len = 1;
79                return Some(0);
80            }
81            let Some(right_prop) = self.get_current_break_property() else {
82                // iterator already reaches to EOT. Reset boundary property for word-like.
83                self.boundary_property = 0;
84                return None;
85            };
86            // SOT x anything
87            if matches!(
88                self.get_break_state_from_table(self.data.sot_property, right_prop),
89                BreakState::Break | BreakState::NoMatch
90            ) {
91                self.boundary_property = 0; // SOT is special type
92                return self.get_current_position();
93            }
94        }
95
96        'a: loop {
97            debug_assert!(!self.is_eof());
98            let left_codepoint = self.get_current_codepoint()?;
99            let left_prop = self.get_break_property(left_codepoint);
100            self.advance_iter();
101
102            let Some(right_prop) = self.get_current_break_property() else {
103                self.boundary_property = left_prop;
104                return Some(self.len);
105            };
106
107            // Some segmenter rules doesn't have language-specific rules, we have to use LSTM (or dictionary) segmenter.
108            // If property is marked as SA, use it
109            if right_prop == self.data.complex_property {
110                if left_prop != self.data.complex_property {
111                    // break before SA
112                    self.boundary_property = left_prop;
113                    return self.get_current_position();
114                }
115                let break_offset = Y::handle_complex_language(self, left_codepoint);
116                if break_offset.is_some() {
117                    return break_offset;
118                }
119            }
120
121            match self.get_break_state_from_table(left_prop, right_prop) {
122                BreakState::Keep => continue,
123                BreakState::Break | BreakState::NoMatch => {
124                    self.boundary_property = left_prop;
125                    return self.get_current_position();
126                }
127                BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
128                    // This isn't simple rule set. We need marker to restore iterator to previous position.
129                    let mut previous_iter = self.iter.clone();
130                    let mut previous_pos_data = self.current_pos_data;
131                    let mut previous_left_prop = left_prop;
132
133                    loop {
134                        self.advance_iter();
135
136                        let Some(prop) = self.get_current_break_property() else {
137                            // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point.
138                            self.boundary_property = index;
139                            if self.get_break_state_from_table(index, self.data.eot_property)
140                                == BreakState::NoMatch
141                            {
142                                self.boundary_property = previous_left_prop;
143                                self.iter = previous_iter;
144                                self.current_pos_data = previous_pos_data;
145                                return self.get_current_position();
146                            }
147                            // EOF
148                            return Some(self.len);
149                        };
150
151                        let previous_break_state_is_cp_prop =
152                            index <= self.data.last_codepoint_property;
153
154                        match self.get_break_state_from_table(index, prop) {
155                            BreakState::Keep => continue 'a,
156                            BreakState::NoMatch => {
157                                self.boundary_property = previous_left_prop;
158                                self.iter = previous_iter;
159                                self.current_pos_data = previous_pos_data;
160                                return self.get_current_position();
161                            }
162                            BreakState::Break => return self.get_current_position(),
163                            BreakState::Intermediate(i) => {
164                                index = i;
165                                if previous_break_state_is_cp_prop {
166                                    // Move marker
167                                    previous_left_prop = index;
168                                }
169                                previous_iter = self.iter.clone();
170                                previous_pos_data = self.current_pos_data;
171                            }
172                            BreakState::Index(i) => {
173                                index = i;
174                                if previous_break_state_is_cp_prop {
175                                    // Move marker
176                                    previous_iter = self.iter.clone();
177                                    previous_pos_data = self.current_pos_data;
178                                    previous_left_prop = index;
179                                }
180                            }
181                        }
182                    }
183                }
184            }
185        }
186    }
187}
188
189impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> RuleBreakIterator<'l, 's, Y> {
190    pub(crate) fn advance_iter(&mut self) {
191        self.current_pos_data = self.iter.next();
192    }
193
194    pub(crate) fn is_eof(&self) -> bool {
195        self.current_pos_data.is_none()
196    }
197
198    pub(crate) fn get_current_break_property(&self) -> Option<u8> {
199        self.get_current_codepoint()
200            .map(|c| self.get_break_property(c))
201    }
202
203    pub(crate) fn get_current_position(&self) -> Option<usize> {
204        self.current_pos_data.map(|(pos, _)| pos)
205    }
206
207    pub(crate) fn get_current_codepoint(&self) -> Option<Y::CharType> {
208        self.current_pos_data.map(|(_, codepoint)| codepoint)
209    }
210
211    fn get_break_property(&self, codepoint: Y::CharType) -> u8 {
212        // Note: Default value is 0 == UNKNOWN
213        self.data.property_table.get32(codepoint.into())
214    }
215
216    fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
217        let idx = left as usize * self.data.property_count as usize + right as usize;
218        // We use unwrap_or to fall back to the base case and prevent panics on bad data.
219        self.data
220            .break_state_table
221            .get(idx)
222            .unwrap_or(BreakState::Keep)
223    }
224
225    /// Return the status value of break boundary.
226    /// If segmenter isn't word, always return WordType::None
227    pub fn word_type(&self) -> WordType {
228        if self.result_cache.first().is_some() {
229            // Dictionary type (CJ and East Asian) is letter.
230            return WordType::Letter;
231        }
232        if self.boundary_property == 0 {
233            // break position is SOT / Any
234            return WordType::None;
235        }
236        self.data
237            .word_type_table
238            .get((self.boundary_property - 1) as usize)
239            .unwrap_or(WordType::None)
240    }
241
242    /// Return true when break boundary is word-like such as letter/number/CJK
243    /// If segmenter isn't word, return false
244    pub fn is_word_like(&self) -> bool {
245        self.word_type().is_word_like()
246    }
247}
248
249#[derive(Debug)]
250pub struct RuleBreakTypeUtf8;
251
252impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeUtf8 {
253    type IterAttr = CharIndices<'s>;
254    type CharType = char;
255
256    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
257        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
258    }
259
260    fn handle_complex_language(
261        _: &mut RuleBreakIterator<Self>,
262        _: Self::CharType,
263    ) -> Option<usize> {
264        unreachable!()
265    }
266}
267
268#[derive(Debug)]
269pub struct RuleBreakTypePotentiallyIllFormedUtf8;
270
271impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypePotentiallyIllFormedUtf8 {
272    type IterAttr = Utf8CharIndices<'s>;
273    type CharType = char;
274
275    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
276        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
277    }
278
279    fn handle_complex_language(
280        _: &mut RuleBreakIterator<Self>,
281        _: Self::CharType,
282    ) -> Option<usize> {
283        unreachable!()
284    }
285}
286
287#[derive(Debug)]
288pub struct RuleBreakTypeLatin1;
289
290impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeLatin1 {
291    type IterAttr = Latin1Indices<'s>;
292    type CharType = u8;
293
294    fn get_current_position_character_len(_: &RuleBreakIterator<Self>) -> usize {
295        unreachable!()
296    }
297
298    fn handle_complex_language(
299        _: &mut RuleBreakIterator<Self>,
300        _: Self::CharType,
301    ) -> Option<usize> {
302        unreachable!()
303    }
304}
305
306#[derive(Debug)]
307pub struct RuleBreakTypeUtf16;
308
309impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeUtf16 {
310    type IterAttr = Utf16Indices<'s>;
311    type CharType = u32;
312
313    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
314        match iter.get_current_codepoint() {
315            None => 0,
316            Some(ch) if ch >= 0x10000 => 2,
317            _ => 1,
318        }
319    }
320
321    fn handle_complex_language(
322        _: &mut RuleBreakIterator<Self>,
323        _: Self::CharType,
324    ) -> Option<usize> {
325        unreachable!()
326    }
327}