icu_capi/
segmenter_word.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#[diplomat::bridge]
6pub mod ffi {
7    use crate::errors::ffi::ICU4XError;
8    use crate::provider::ffi::ICU4XDataProvider;
9    use alloc::boxed::Box;
10    use core::convert::TryFrom;
11    use icu_segmenter::{
12        WordBreakIteratorLatin1, WordBreakIteratorPotentiallyIllFormedUtf8, WordBreakIteratorUtf16,
13        WordSegmenter, WordType,
14    };
15
    /// The word type reported for a break boundary by the word break iterators.
    ///
    /// FFI counterpart of `icu::segmenter::WordType`; the conversions between the two are
    /// generated by the `enum_convert` attribute below.
    #[diplomat::enum_convert(WordType, needs_wildcard)]
    #[diplomat::rust_link(icu::segmenter::WordType, Enum)]
    pub enum ICU4XSegmenterWordType {
        None = 0,
        Number = 1,
        Letter = 2,
    }
23
    #[diplomat::opaque]
    /// An ICU4X word-break segmenter, capable of finding word breakpoints in strings.
    ///
    /// Opaque wrapper around `icu::segmenter::WordSegmenter`. Instances are obtained through
    /// `create_auto`, `create_lstm` or `create_dictionary`.
    #[diplomat::rust_link(icu::segmenter::WordSegmenter, Struct)]
    pub struct ICU4XWordSegmenter(WordSegmenter);
28
    #[diplomat::opaque]
    /// Iterator over the word breakpoints of a potentially ill-formed UTF-8 string, as returned
    /// by `ICU4XWordSegmenter::segment_utf8`.
    ///
    /// It borrows both the segmenter and the input string for the lifetime `'a`.
    #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)]
    #[diplomat::rust_link(
        icu::segmenter::WordBreakIteratorPotentiallyIllFormedUtf8,
        Typedef,
        hidden
    )]
    #[diplomat::rust_link(icu::segmenter::WordBreakIteratorUtf8, Typedef, hidden)]
    pub struct ICU4XWordBreakIteratorUtf8<'a>(WordBreakIteratorPotentiallyIllFormedUtf8<'a, 'a>);
38
    #[diplomat::opaque]
    /// Iterator over the word breakpoints of a UTF-16 string, as returned by
    /// `ICU4XWordSegmenter::segment_utf16`.
    ///
    /// It borrows both the segmenter and the input string for the lifetime `'a`.
    #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)]
    #[diplomat::rust_link(icu::segmenter::WordBreakIteratorUtf16, Typedef, hidden)]
    pub struct ICU4XWordBreakIteratorUtf16<'a>(WordBreakIteratorUtf16<'a, 'a>);
43
    #[diplomat::opaque]
    /// Iterator over the word breakpoints of a Latin-1 string, as returned by
    /// `ICU4XWordSegmenter::segment_latin1`.
    ///
    /// It borrows both the segmenter and the input bytes for the lifetime `'a`.
    #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)]
    #[diplomat::rust_link(icu::segmenter::WordBreakIteratorLatin1, Typedef, hidden)]
    pub struct ICU4XWordBreakIteratorLatin1<'a>(WordBreakIteratorLatin1<'a, 'a>);
48
49    impl ICU4XSegmenterWordType {
50        #[diplomat::rust_link(icu::segmenter::WordType::is_word_like, FnInEnum)]
51        #[diplomat::attr(supports = accessors, getter)]
52        pub fn is_word_like(self) -> bool {
53            WordType::from(self).is_word_like()
54        }
55    }
56
57    impl ICU4XWordSegmenter {
58        /// Construct an [`ICU4XWordSegmenter`] with automatically selecting the best available LSTM
59        /// or dictionary payload data.
60        ///
61        /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
62        /// Khmer, Lao, and Thai.
63        #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_auto, FnInStruct)]
64        #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "auto")]
65        pub fn create_auto(
66            provider: &ICU4XDataProvider,
67        ) -> Result<Box<ICU4XWordSegmenter>, ICU4XError> {
68            Ok(Box::new(ICU4XWordSegmenter(call_constructor!(
69                WordSegmenter::new_auto [r => Ok(r)],
70                WordSegmenter::try_new_auto_with_any_provider,
71                WordSegmenter::try_new_auto_with_buffer_provider,
72                provider
73            )?)))
74        }
75
76        /// Construct an [`ICU4XWordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and
77        /// Thai.
78        ///
79        /// Warning: [`ICU4XWordSegmenter`] created by this function doesn't handle Chinese or
80        /// Japanese.
81        #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_lstm, FnInStruct)]
82        #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm")]
83        pub fn create_lstm(
84            provider: &ICU4XDataProvider,
85        ) -> Result<Box<ICU4XWordSegmenter>, ICU4XError> {
86            Ok(Box::new(ICU4XWordSegmenter(call_constructor!(
87                WordSegmenter::new_lstm [r => Ok(r)],
88                WordSegmenter::try_new_lstm_with_any_provider,
89                WordSegmenter::try_new_lstm_with_buffer_provider,
90                provider,
91            )?)))
92        }
93
94        /// Construct an [`ICU4XWordSegmenter`] with dictionary payload data for Chinese, Japanese,
95        /// Burmese, Khmer, Lao, and Thai.
96        #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_dictionary, FnInStruct)]
97        #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary")]
98        pub fn create_dictionary(
99            provider: &ICU4XDataProvider,
100        ) -> Result<Box<ICU4XWordSegmenter>, ICU4XError> {
101            Ok(Box::new(ICU4XWordSegmenter(call_constructor!(
102                WordSegmenter::new_dictionary [r => Ok(r)],
103                WordSegmenter::try_new_dictionary_with_any_provider,
104                WordSegmenter::try_new_dictionary_with_buffer_provider,
105                provider,
106            )?)))
107        }
108
109        /// Segments a string.
110        ///
111        /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
112        /// to the WHATWG Encoding Standard.
113        #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_utf8, FnInStruct)]
114        #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_str, FnInStruct, hidden)]
115        #[diplomat::attr(dart, disable)]
116        pub fn segment_utf8<'a>(
117            &'a self,
118            input: &'a DiplomatStr,
119        ) -> Box<ICU4XWordBreakIteratorUtf8<'a>> {
120            Box::new(ICU4XWordBreakIteratorUtf8(self.0.segment_utf8(input)))
121        }
122
123        /// Segments a string.
124        ///
125        /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
126        /// to the WHATWG Encoding Standard.
127        #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_utf16, FnInStruct)]
128        #[diplomat::attr(dart, rename = "segment")]
129        pub fn segment_utf16<'a>(
130            &'a self,
131            input: &'a DiplomatStr16,
132        ) -> Box<ICU4XWordBreakIteratorUtf16<'a>> {
133            Box::new(ICU4XWordBreakIteratorUtf16(self.0.segment_utf16(input)))
134        }
135
136        /// Segments a Latin-1 string.
137        #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_latin1, FnInStruct)]
138        #[diplomat::attr(dart, disable)]
139        pub fn segment_latin1<'a>(
140            &'a self,
141            input: &'a [u8],
142        ) -> Box<ICU4XWordBreakIteratorLatin1<'a>> {
143            Box::new(ICU4XWordBreakIteratorLatin1(self.0.segment_latin1(input)))
144        }
145    }
146
147    impl<'a> ICU4XWordBreakIteratorUtf8<'a> {
148        /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
149        /// out of range of a 32-bit signed integer.
150        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)]
151        #[diplomat::rust_link(
152            icu::segmenter::WordBreakIterator::Item,
153            AssociatedTypeInStruct,
154            hidden
155        )]
156        pub fn next(&mut self) -> i32 {
157            self.0
158                .next()
159                .and_then(|u| i32::try_from(u).ok())
160                .unwrap_or(-1)
161        }
162
163        /// Return the status value of break boundary.
164        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)]
165        #[diplomat::attr(supports = accessors, getter)]
166        pub fn word_type(&self) -> ICU4XSegmenterWordType {
167            self.0.word_type().into()
168        }
169
170        /// Return true when break boundary is word-like such as letter/number/CJK
171        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)]
172        #[diplomat::attr(supports = accessors, getter)]
173        pub fn is_word_like(&self) -> bool {
174            self.0.is_word_like()
175        }
176    }
177
178    impl<'a> ICU4XWordBreakIteratorUtf16<'a> {
179        /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
180        /// out of range of a 32-bit signed integer.
181        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)]
182        #[diplomat::rust_link(
183            icu::segmenter::WordBreakIterator::Item,
184            AssociatedTypeInStruct,
185            hidden
186        )]
187        pub fn next(&mut self) -> i32 {
188            self.0
189                .next()
190                .and_then(|u| i32::try_from(u).ok())
191                .unwrap_or(-1)
192        }
193
194        /// Return the status value of break boundary.
195        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)]
196        #[diplomat::rust_link(
197            icu::segmenter::WordBreakIterator::iter_with_word_type,
198            FnInStruct,
199            hidden
200        )]
201        #[diplomat::attr(supports = accessors, getter)]
202        pub fn word_type(&self) -> ICU4XSegmenterWordType {
203            self.0.word_type().into()
204        }
205
206        /// Return true when break boundary is word-like such as letter/number/CJK
207        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)]
208        #[diplomat::attr(supports = accessors, getter)]
209        pub fn is_word_like(&self) -> bool {
210            self.0.is_word_like()
211        }
212    }
213
214    impl<'a> ICU4XWordBreakIteratorLatin1<'a> {
215        /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
216        /// out of range of a 32-bit signed integer.
217        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)]
218        #[diplomat::rust_link(
219            icu::segmenter::WordBreakIterator::Item,
220            AssociatedTypeInStruct,
221            hidden
222        )]
223        pub fn next(&mut self) -> i32 {
224            self.0
225                .next()
226                .and_then(|u| i32::try_from(u).ok())
227                .unwrap_or(-1)
228        }
229
230        /// Return the status value of break boundary.
231        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)]
232        #[diplomat::attr(supports = accessors, getter)]
233        pub fn word_type(&self) -> ICU4XSegmenterWordType {
234            self.0.word_type().into()
235        }
236
237        /// Return true when break boundary is word-like such as letter/number/CJK
238        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)]
239        #[diplomat::attr(supports = accessors, getter)]
240        pub fn is_word_like(&self) -> bool {
241            self.0.is_word_like()
242        }
243    }
244}