icu_segmenter/sentence.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use alloc::vec::Vec;
6use icu_provider::prelude::*;
7
8use crate::indices::{Latin1Indices, Utf16Indices};
9use crate::iterator_helpers::derive_usize_iterator_with_type;
10use crate::rule_segmenter::*;
11use crate::{provider::*, SegmenterError};
12use utf8_iter::Utf8CharIndices;
13
14/// Implements the [`Iterator`] trait over the sentence boundaries of the given string.
15///
16/// Lifetimes:
17///
18/// - `'l` = lifetime of the segmenter object from which this iterator was created
19/// - `'s` = lifetime of the string being segmented
20///
21/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
22/// _after_ the boundary (for a boundary at the end of text, this index is the length
23/// of the [`str`] or array of code units).
24///
25/// For examples of use, see [`SentenceSegmenter`].
26#[derive(Debug)]
27pub struct SentenceBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
28 RuleBreakIterator<'l, 's, Y>,
29);
30
31derive_usize_iterator_with_type!(SentenceBreakIterator);
32
33/// Sentence break iterator for an `str` (a UTF-8 string).
34///
35/// For examples of use, see [`SentenceSegmenter`].
36pub type SentenceBreakIteratorUtf8<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf8>;
37
38/// Sentence break iterator for a potentially invalid UTF-8 string.
39///
40/// For examples of use, see [`SentenceSegmenter`].
41pub type SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
42 SentenceBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>;
43
44/// Sentence break iterator for a Latin-1 (8-bit) string.
45///
46/// For examples of use, see [`SentenceSegmenter`].
47pub type SentenceBreakIteratorLatin1<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeLatin1>;
48
49/// Sentence break iterator for a UTF-16 string.
50///
51/// For examples of use, see [`SentenceSegmenter`].
52pub type SentenceBreakIteratorUtf16<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf16>;
53
54/// Supports loading sentence break data, and creating sentence break iterators for different string
55/// encodings.
56///
57/// # Examples
58///
59/// Segment a string:
60///
61/// ```rust
62/// use icu::segmenter::SentenceSegmenter;
63/// let segmenter = SentenceSegmenter::new();
64///
65/// let breakpoints: Vec<usize> =
66/// segmenter.segment_str("Hello World").collect();
67/// assert_eq!(&breakpoints, &[0, 11]);
68/// ```
69///
70/// Segment a Latin1 byte string:
71///
72/// ```rust
73/// use icu::segmenter::SentenceSegmenter;
74/// let segmenter = SentenceSegmenter::new();
75///
76/// let breakpoints: Vec<usize> =
77/// segmenter.segment_latin1(b"Hello World").collect();
78/// assert_eq!(&breakpoints, &[0, 11]);
79/// ```
80///
81/// Successive boundaries can be used to retrieve the sentences.
82/// In particular, the first boundary is always 0, and the last one is the
83/// length of the segmented text in code units.
84///
85/// ```rust
86/// # use icu::segmenter::SentenceSegmenter;
87/// # let segmenter = SentenceSegmenter::new();
88/// use itertools::Itertools;
89/// let text = "Ceci tuera cela. Le livre tuera l’édifice.";
90/// let sentences: Vec<&str> = segmenter
91/// .segment_str(text)
92/// .tuple_windows()
93/// .map(|(i, j)| &text[i..j])
94/// .collect();
95/// assert_eq!(
96/// &sentences,
97/// &["Ceci tuera cela. ", "Le livre tuera l’édifice."]
98/// );
99/// ```
100#[derive(Debug)]
101pub struct SentenceSegmenter {
102 payload: DataPayload<SentenceBreakDataV1Marker>,
103}
104
// `Default` simply forwards to `SentenceSegmenter::new`, so it is only available when the
// `compiled_data` feature (which gates `new`) is enabled.
#[cfg(feature = "compiled_data")]
impl Default for SentenceSegmenter {
    fn default() -> Self {
        SentenceSegmenter::new()
    }
}
111
112impl SentenceSegmenter {
113 /// Constructs a [`SentenceSegmenter`] with an invariant locale and compiled data.
114 ///
115 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
116 ///
117 /// [📚 Help choosing a constructor](icu_provider::constructors)
118 #[cfg(feature = "compiled_data")]
119 pub fn new() -> Self {
120 Self {
121 payload: DataPayload::from_static_ref(
122 crate::provider::Baked::SINGLETON_SEGMENTER_SENTENCE_V1,
123 ),
124 }
125 }
126
127 icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError,
128 #[cfg(skip)]
129 functions: [
130 new,
131 try_new_with_any_provider,
132 try_new_with_buffer_provider,
133 try_new_unstable,
134 Self,
135 ]
136 );
137
138 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
139 pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
140 where
141 D: DataProvider<SentenceBreakDataV1Marker> + ?Sized,
142 {
143 let payload = provider.load(Default::default())?.take_payload()?;
144 Ok(Self { payload })
145 }
146
147 /// Creates a sentence break iterator for an `str` (a UTF-8 string).
148 ///
149 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
150 pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> SentenceBreakIteratorUtf8<'l, 's> {
151 SentenceBreakIterator(RuleBreakIterator {
152 iter: input.char_indices(),
153 len: input.len(),
154 current_pos_data: None,
155 result_cache: Vec::new(),
156 data: self.payload.get(),
157 complex: None,
158 boundary_property: 0,
159 })
160 }
161 /// Creates a sentence break iterator for a potentially ill-formed UTF8 string
162 ///
163 /// Invalid characters are treated as REPLACEMENT CHARACTER
164 ///
165 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
166 pub fn segment_utf8<'l, 's>(
167 &'l self,
168 input: &'s [u8],
169 ) -> SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
170 SentenceBreakIterator(RuleBreakIterator {
171 iter: Utf8CharIndices::new(input),
172 len: input.len(),
173 current_pos_data: None,
174 result_cache: Vec::new(),
175 data: self.payload.get(),
176 complex: None,
177 boundary_property: 0,
178 })
179 }
180 /// Creates a sentence break iterator for a Latin-1 (8-bit) string.
181 ///
182 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
183 pub fn segment_latin1<'l, 's>(
184 &'l self,
185 input: &'s [u8],
186 ) -> SentenceBreakIteratorLatin1<'l, 's> {
187 SentenceBreakIterator(RuleBreakIterator {
188 iter: Latin1Indices::new(input),
189 len: input.len(),
190 current_pos_data: None,
191 result_cache: Vec::new(),
192 data: self.payload.get(),
193 complex: None,
194 boundary_property: 0,
195 })
196 }
197
198 /// Creates a sentence break iterator for a UTF-16 string.
199 ///
200 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
201 pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> SentenceBreakIteratorUtf16<'l, 's> {
202 SentenceBreakIterator(RuleBreakIterator {
203 iter: Utf16Indices::new(input),
204 len: input.len(),
205 current_pos_data: None,
206 result_cache: Vec::new(),
207 data: self.payload.get(),
208 complex: None,
209 boundary_property: 0,
210 })
211 }
212}
213
// Checks that segmenting the empty string yields exactly one boundary, at index 0.
//
// The test is gated on `compiled_data` (not `serde`): the only API it needs beyond the
// always-available `segment_str` is `SentenceSegmenter::new`, which exists only when the
// `compiled_data` feature is enabled. Gating on `serde` both broke the test build when
// `compiled_data` was off and skipped the test when only `compiled_data` was on.
#[cfg(all(test, feature = "compiled_data"))]
#[test]
fn empty_string() {
    let segmenter = SentenceSegmenter::new();
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}