icu_capi/
segmenter_line.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use icu_segmenter::LineBreakOptions;
6use icu_segmenter::LineBreakStrictness;
7use icu_segmenter::LineBreakWordOption;
8
9#[diplomat::bridge]
10pub mod ffi {
11    use crate::errors::ffi::ICU4XError;
12    use crate::provider::ffi::ICU4XDataProvider;
13    use alloc::boxed::Box;
14    use core::convert::TryFrom;
15    use icu_segmenter::{
16        LineBreakIteratorLatin1, LineBreakIteratorPotentiallyIllFormedUtf8, LineBreakIteratorUtf16,
17        LineSegmenter,
18    };
19
20    #[diplomat::opaque]
21    /// An ICU4X line-break segmenter, capable of finding breakpoints in strings.
22    #[diplomat::rust_link(icu::segmenter::LineSegmenter, Struct)]
23    pub struct ICU4XLineSegmenter(LineSegmenter);
24
25    #[diplomat::rust_link(icu::segmenter::LineBreakStrictness, Enum)]
26    pub enum ICU4XLineBreakStrictness {
27        Loose,
28        Normal,
29        Strict,
30        Anywhere,
31    }
32
33    #[diplomat::rust_link(icu::segmenter::LineBreakWordOption, Enum)]
34    pub enum ICU4XLineBreakWordOption {
35        Normal,
36        BreakAll,
37        KeepAll,
38    }
39
40    #[diplomat::rust_link(icu::segmenter::LineBreakOptions, Struct)]
41    #[diplomat::attr(dart, rename = "LineBreakOptions")]
42    pub struct ICU4XLineBreakOptionsV1 {
43        pub strictness: ICU4XLineBreakStrictness,
44        pub word_option: ICU4XLineBreakWordOption,
45        pub ja_zh: bool,
46    }
47
48    #[diplomat::opaque]
49    #[diplomat::rust_link(icu::segmenter::LineBreakIterator, Struct)]
50    #[diplomat::rust_link(
51        icu::segmenter::LineBreakIteratorPotentiallyIllFormedUtf8,
52        Typedef,
53        compact
54    )]
55    #[diplomat::rust_link(icu::segmenter::LineBreakIteratorUtf8, Typedef, hidden)]
56    pub struct ICU4XLineBreakIteratorUtf8<'a>(LineBreakIteratorPotentiallyIllFormedUtf8<'a, 'a>);
57
58    #[diplomat::opaque]
59    #[diplomat::rust_link(icu::segmenter::LineBreakIterator, Struct)]
60    #[diplomat::rust_link(icu::segmenter::LineBreakIteratorUtf16, Typedef, compact)]
61    pub struct ICU4XLineBreakIteratorUtf16<'a>(LineBreakIteratorUtf16<'a, 'a>);
62
63    #[diplomat::opaque]
64    #[diplomat::rust_link(icu::segmenter::LineBreakIterator, Struct)]
65    #[diplomat::rust_link(icu::segmenter::LineBreakIteratorLatin1, Typedef, compact)]
66    pub struct ICU4XLineBreakIteratorLatin1<'a>(LineBreakIteratorLatin1<'a, 'a>);
67
68    impl ICU4XLineSegmenter {
69        /// Construct a [`ICU4XLineSegmenter`] with default options. It automatically loads the best
70        /// available payload data for Burmese, Khmer, Lao, and Thai.
71        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto, FnInStruct)]
72        #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "auto")]
73        pub fn create_auto(
74            provider: &ICU4XDataProvider,
75        ) -> Result<Box<ICU4XLineSegmenter>, ICU4XError> {
76            Ok(Box::new(ICU4XLineSegmenter(call_constructor!(
77                LineSegmenter::new_auto [r => Ok(r)],
78                LineSegmenter::try_new_auto_with_any_provider,
79                LineSegmenter::try_new_auto_with_buffer_provider,
80                provider
81            )?)))
82        }
83
84        /// Construct a [`ICU4XLineSegmenter`] with default options and LSTM payload data for
85        /// Burmese, Khmer, Lao, and Thai.
86        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm, FnInStruct)]
87        #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm")]
88        pub fn create_lstm(
89            provider: &ICU4XDataProvider,
90        ) -> Result<Box<ICU4XLineSegmenter>, ICU4XError> {
91            Ok(Box::new(ICU4XLineSegmenter(call_constructor!(
92                LineSegmenter::new_lstm [r => Ok(r)],
93                LineSegmenter::try_new_lstm_with_any_provider,
94                LineSegmenter::try_new_lstm_with_buffer_provider,
95                provider,
96            )?)))
97        }
98
99        /// Construct a [`ICU4XLineSegmenter`] with default options and dictionary payload data for
100        /// Burmese, Khmer, Lao, and Thai..
101        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_dictionary, FnInStruct)]
102        #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary")]
103        pub fn create_dictionary(
104            provider: &ICU4XDataProvider,
105        ) -> Result<Box<ICU4XLineSegmenter>, ICU4XError> {
106            Ok(Box::new(ICU4XLineSegmenter(call_constructor!(
107                LineSegmenter::new_dictionary [r => Ok(r)],
108                LineSegmenter::try_new_dictionary_with_any_provider,
109                LineSegmenter::try_new_dictionary_with_buffer_provider,
110                provider,
111            )?)))
112        }
113
114        /// Construct a [`ICU4XLineSegmenter`] with custom options. It automatically loads the best
115        /// available payload data for Burmese, Khmer, Lao, and Thai.
116        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto_with_options, FnInStruct)]
117        #[diplomat::attr(dart, rename = "auto_with_options")]
118        #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_options_v1")]
119        pub fn create_auto_with_options_v1(
120            provider: &ICU4XDataProvider,
121            options: ICU4XLineBreakOptionsV1,
122        ) -> Result<Box<ICU4XLineSegmenter>, ICU4XError> {
123            Ok(Box::new(ICU4XLineSegmenter(call_constructor!(
124                LineSegmenter::new_auto_with_options [r => Ok(r)],
125                LineSegmenter::try_new_auto_with_options_with_any_provider,
126                LineSegmenter::try_new_auto_with_options_with_buffer_provider,
127                provider,
128                options.into(),
129            )?)))
130        }
131
132        /// Construct a [`ICU4XLineSegmenter`] with custom options and LSTM payload data for
133        /// Burmese, Khmer, Lao, and Thai.
134        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm_with_options, FnInStruct)]
135        #[diplomat::attr(dart, rename = "lstm_with_options")]
136        #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_options_v1")]
137        pub fn create_lstm_with_options_v1(
138            provider: &ICU4XDataProvider,
139            options: ICU4XLineBreakOptionsV1,
140        ) -> Result<Box<ICU4XLineSegmenter>, ICU4XError> {
141            Ok(Box::new(ICU4XLineSegmenter(call_constructor!(
142                LineSegmenter::new_lstm_with_options [r => Ok(r)],
143                LineSegmenter::try_new_lstm_with_options_with_any_provider,
144                LineSegmenter::try_new_lstm_with_options_with_buffer_provider,
145                provider,
146                options.into(),
147            )?)))
148        }
149
150        /// Construct a [`ICU4XLineSegmenter`] with custom options and dictionary payload data for
151        /// Burmese, Khmer, Lao, and Thai.
152        #[diplomat::rust_link(
153            icu::segmenter::LineSegmenter::new_dictionary_with_options,
154            FnInStruct
155        )]
156        #[diplomat::attr(dart, rename = "dictionary_with_options")]
157        #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_options_v1")]
158        pub fn create_dictionary_with_options_v1(
159            provider: &ICU4XDataProvider,
160            options: ICU4XLineBreakOptionsV1,
161        ) -> Result<Box<ICU4XLineSegmenter>, ICU4XError> {
162            Ok(Box::new(ICU4XLineSegmenter(call_constructor!(
163                LineSegmenter::new_dictionary_with_options [r => Ok(r)],
164                LineSegmenter::try_new_dictionary_with_options_with_any_provider,
165                LineSegmenter::try_new_dictionary_with_options_with_buffer_provider,
166                provider,
167                options.into(),
168            )?)))
169        }
170
171        /// Segments a string.
172        ///
173        /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
174        /// to the WHATWG Encoding Standard.
175        #[diplomat::rust_link(icu::segmenter::LineSegmenter::segment_utf8, FnInStruct)]
176        #[diplomat::rust_link(icu::segmenter::LineSegmenter::segment_str, FnInStruct, hidden)]
177        #[diplomat::attr(dart, disable)]
178        pub fn segment_utf8<'a>(
179            &'a self,
180            input: &'a DiplomatStr,
181        ) -> Box<ICU4XLineBreakIteratorUtf8<'a>> {
182            Box::new(ICU4XLineBreakIteratorUtf8(self.0.segment_utf8(input)))
183        }
184
185        /// Segments a string.
186        ///
187        /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
188        /// to the WHATWG Encoding Standard.
189        #[diplomat::rust_link(icu::segmenter::LineSegmenter::segment_utf16, FnInStruct)]
190        #[diplomat::attr(dart, rename = "segment")]
191        pub fn segment_utf16<'a>(
192            &'a self,
193            input: &'a DiplomatStr16,
194        ) -> Box<ICU4XLineBreakIteratorUtf16<'a>> {
195            Box::new(ICU4XLineBreakIteratorUtf16(self.0.segment_utf16(input)))
196        }
197
198        /// Segments a Latin-1 string.
199        #[diplomat::rust_link(icu::segmenter::LineSegmenter::segment_latin1, FnInStruct)]
200        #[diplomat::attr(dart, disable)]
201        pub fn segment_latin1<'a>(
202            &'a self,
203            input: &'a [u8],
204        ) -> Box<ICU4XLineBreakIteratorLatin1<'a>> {
205            Box::new(ICU4XLineBreakIteratorLatin1(self.0.segment_latin1(input)))
206        }
207    }
208
209    impl<'a> ICU4XLineBreakIteratorUtf8<'a> {
210        /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
211        /// out of range of a 32-bit signed integer.
212        #[diplomat::rust_link(icu::segmenter::LineBreakIterator::next, FnInStruct)]
213        #[diplomat::rust_link(
214            icu::segmenter::LineBreakIterator::Item,
215            AssociatedTypeInStruct,
216            hidden
217        )]
218        pub fn next(&mut self) -> i32 {
219            self.0
220                .next()
221                .and_then(|u| i32::try_from(u).ok())
222                .unwrap_or(-1)
223        }
224    }
225
226    impl<'a> ICU4XLineBreakIteratorUtf16<'a> {
227        /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
228        /// out of range of a 32-bit signed integer.
229        #[diplomat::rust_link(icu::segmenter::LineBreakIterator::next, FnInStruct)]
230        #[diplomat::rust_link(
231            icu::segmenter::LineBreakIterator::Item,
232            AssociatedTypeInStruct,
233            hidden
234        )]
235        pub fn next(&mut self) -> i32 {
236            self.0
237                .next()
238                .and_then(|u| i32::try_from(u).ok())
239                .unwrap_or(-1)
240        }
241    }
242
243    impl<'a> ICU4XLineBreakIteratorLatin1<'a> {
244        /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
245        /// out of range of a 32-bit signed integer.
246        #[diplomat::rust_link(icu::segmenter::LineBreakIterator::next, FnInStruct)]
247        #[diplomat::rust_link(
248            icu::segmenter::LineBreakIterator::Item,
249            AssociatedTypeInStruct,
250            hidden
251        )]
252        pub fn next(&mut self) -> i32 {
253            self.0
254                .next()
255                .and_then(|u| i32::try_from(u).ok())
256                .unwrap_or(-1)
257        }
258    }
259}
260
261impl From<ffi::ICU4XLineBreakStrictness> for LineBreakStrictness {
262    fn from(other: ffi::ICU4XLineBreakStrictness) -> Self {
263        match other {
264            ffi::ICU4XLineBreakStrictness::Loose => Self::Loose,
265            ffi::ICU4XLineBreakStrictness::Normal => Self::Normal,
266            ffi::ICU4XLineBreakStrictness::Strict => Self::Strict,
267            ffi::ICU4XLineBreakStrictness::Anywhere => Self::Anywhere,
268        }
269    }
270}
271
272impl From<ffi::ICU4XLineBreakWordOption> for LineBreakWordOption {
273    fn from(other: ffi::ICU4XLineBreakWordOption) -> Self {
274        match other {
275            ffi::ICU4XLineBreakWordOption::Normal => Self::Normal,
276            ffi::ICU4XLineBreakWordOption::BreakAll => Self::BreakAll,
277            ffi::ICU4XLineBreakWordOption::KeepAll => Self::KeepAll,
278        }
279    }
280}
281
282impl From<ffi::ICU4XLineBreakOptionsV1> for LineBreakOptions {
283    fn from(other: ffi::ICU4XLineBreakOptionsV1) -> Self {
284        let mut options = LineBreakOptions::default();
285        options.strictness = other.strictness.into();
286        options.word_option = other.word_option.into();
287        options.ja_zh = other.ja_zh;
288        options
289    }
290}