icu_segmenter/
sentence.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use alloc::vec::Vec;
6use icu_provider::prelude::*;
7
8use crate::indices::{Latin1Indices, Utf16Indices};
9use crate::iterator_helpers::derive_usize_iterator_with_type;
10use crate::rule_segmenter::*;
11use crate::{provider::*, SegmenterError};
12use utf8_iter::Utf8CharIndices;
13
14/// Implements the [`Iterator`] trait over the sentence boundaries of the given string.
15///
16/// Lifetimes:
17///
18/// - `'l` = lifetime of the segmenter object from which this iterator was created
19/// - `'s` = lifetime of the string being segmented
20///
21/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
22/// _after_ the boundary (for a boundary at the end of text, this index is the length
23/// of the [`str`] or array of code units).
24///
25/// For examples of use, see [`SentenceSegmenter`].
26#[derive(Debug)]
27pub struct SentenceBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
28    RuleBreakIterator<'l, 's, Y>,
29);
30
31derive_usize_iterator_with_type!(SentenceBreakIterator);
32
33/// Sentence break iterator for an `str` (a UTF-8 string).
34///
35/// For examples of use, see [`SentenceSegmenter`].
36pub type SentenceBreakIteratorUtf8<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf8>;
37
38/// Sentence break iterator for a potentially invalid UTF-8 string.
39///
40/// For examples of use, see [`SentenceSegmenter`].
41pub type SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
42    SentenceBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>;
43
44/// Sentence break iterator for a Latin-1 (8-bit) string.
45///
46/// For examples of use, see [`SentenceSegmenter`].
47pub type SentenceBreakIteratorLatin1<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeLatin1>;
48
49/// Sentence break iterator for a UTF-16 string.
50///
51/// For examples of use, see [`SentenceSegmenter`].
52pub type SentenceBreakIteratorUtf16<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf16>;
53
54/// Supports loading sentence break data, and creating sentence break iterators for different string
55/// encodings.
56///
57/// # Examples
58///
59/// Segment a string:
60///
61/// ```rust
62/// use icu::segmenter::SentenceSegmenter;
63/// let segmenter = SentenceSegmenter::new();
64///
65/// let breakpoints: Vec<usize> =
66///     segmenter.segment_str("Hello World").collect();
67/// assert_eq!(&breakpoints, &[0, 11]);
68/// ```
69///
70/// Segment a Latin1 byte string:
71///
72/// ```rust
73/// use icu::segmenter::SentenceSegmenter;
74/// let segmenter = SentenceSegmenter::new();
75///
76/// let breakpoints: Vec<usize> =
77///     segmenter.segment_latin1(b"Hello World").collect();
78/// assert_eq!(&breakpoints, &[0, 11]);
79/// ```
80///
81/// Successive boundaries can be used to retrieve the sentences.
82/// In particular, the first boundary is always 0, and the last one is the
83/// length of the segmented text in code units.
84///
85/// ```rust
86/// # use icu::segmenter::SentenceSegmenter;
87/// # let segmenter = SentenceSegmenter::new();
88/// use itertools::Itertools;
89/// let text = "Ceci tuera cela. Le livre tuera l’édifice.";
90/// let sentences: Vec<&str> = segmenter
91///     .segment_str(text)
92///     .tuple_windows()
93///     .map(|(i, j)| &text[i..j])
94///     .collect();
95/// assert_eq!(
96///     &sentences,
97///     &["Ceci tuera cela. ", "Le livre tuera l’édifice."]
98/// );
99/// ```
100#[derive(Debug)]
101pub struct SentenceSegmenter {
102    payload: DataPayload<SentenceBreakDataV1Marker>,
103}
104
#[cfg(feature = "compiled_data")]
impl Default for SentenceSegmenter {
    /// Equivalent to [`SentenceSegmenter::new`].
    fn default() -> Self {
        SentenceSegmenter::new()
    }
}
111
112impl SentenceSegmenter {
113    /// Constructs a [`SentenceSegmenter`] with an invariant locale and compiled data.
114    ///
115    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
116    ///
117    /// [📚 Help choosing a constructor](icu_provider::constructors)
118    #[cfg(feature = "compiled_data")]
119    pub fn new() -> Self {
120        Self {
121            payload: DataPayload::from_static_ref(
122                crate::provider::Baked::SINGLETON_SEGMENTER_SENTENCE_V1,
123            ),
124        }
125    }
126
127    icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError,
128        #[cfg(skip)]
129        functions: [
130            new,
131            try_new_with_any_provider,
132            try_new_with_buffer_provider,
133            try_new_unstable,
134            Self,
135        ]
136    );
137
138    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
139    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
140    where
141        D: DataProvider<SentenceBreakDataV1Marker> + ?Sized,
142    {
143        let payload = provider.load(Default::default())?.take_payload()?;
144        Ok(Self { payload })
145    }
146
147    /// Creates a sentence break iterator for an `str` (a UTF-8 string).
148    ///
149    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
150    pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> SentenceBreakIteratorUtf8<'l, 's> {
151        SentenceBreakIterator(RuleBreakIterator {
152            iter: input.char_indices(),
153            len: input.len(),
154            current_pos_data: None,
155            result_cache: Vec::new(),
156            data: self.payload.get(),
157            complex: None,
158            boundary_property: 0,
159        })
160    }
161    /// Creates a sentence break iterator for a potentially ill-formed UTF8 string
162    ///
163    /// Invalid characters are treated as REPLACEMENT CHARACTER
164    ///
165    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
166    pub fn segment_utf8<'l, 's>(
167        &'l self,
168        input: &'s [u8],
169    ) -> SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
170        SentenceBreakIterator(RuleBreakIterator {
171            iter: Utf8CharIndices::new(input),
172            len: input.len(),
173            current_pos_data: None,
174            result_cache: Vec::new(),
175            data: self.payload.get(),
176            complex: None,
177            boundary_property: 0,
178        })
179    }
180    /// Creates a sentence break iterator for a Latin-1 (8-bit) string.
181    ///
182    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
183    pub fn segment_latin1<'l, 's>(
184        &'l self,
185        input: &'s [u8],
186    ) -> SentenceBreakIteratorLatin1<'l, 's> {
187        SentenceBreakIterator(RuleBreakIterator {
188            iter: Latin1Indices::new(input),
189            len: input.len(),
190            current_pos_data: None,
191            result_cache: Vec::new(),
192            data: self.payload.get(),
193            complex: None,
194            boundary_property: 0,
195        })
196    }
197
198    /// Creates a sentence break iterator for a UTF-16 string.
199    ///
200    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
201    pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> SentenceBreakIteratorUtf16<'l, 's> {
202        SentenceBreakIterator(RuleBreakIterator {
203            iter: Utf16Indices::new(input),
204            len: input.len(),
205            current_pos_data: None,
206            result_cache: Vec::new(),
207            data: self.payload.get(),
208            complex: None,
209            boundary_property: 0,
210        })
211    }
212}
213
// Regression test: segmenting the empty string yields exactly one boundary, at index 0.
//
// `SentenceSegmenter::new()` is only compiled with the `compiled_data` feature, so the
// test is gated on that feature rather than on `serde`, which this test never uses.
#[cfg(all(test, feature = "compiled_data"))]
#[test]
fn empty_string() {
    let segmenter = SentenceSegmenter::new();
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}