icu_segmenter/grapheme.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use alloc::vec::Vec;
6use icu_provider::prelude::*;
7
8use crate::indices::{Latin1Indices, Utf16Indices};
9use crate::iterator_helpers::derive_usize_iterator_with_type;
10use crate::rule_segmenter::*;
11use crate::{provider::*, SegmenterError};
12use utf8_iter::Utf8CharIndices;
13
14/// Implements the [`Iterator`] trait over the grapheme cluster boundaries of the given string.
15///
16/// Lifetimes:
17///
18/// - `'l` = lifetime of the segmenter object from which this iterator was created
19/// - `'s` = lifetime of the string being segmented
20///
21/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
22/// _after_ the boundary (for a boundary at the end of text, this index is the length
23/// of the [`str`] or array of code units).
24///
25/// For examples of use, see [`GraphemeClusterSegmenter`].
26#[derive(Debug)]
27pub struct GraphemeClusterBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
28 RuleBreakIterator<'l, 's, Y>,
29);
30
31derive_usize_iterator_with_type!(GraphemeClusterBreakIterator);
32
/// Grapheme cluster break iterator for an `str` (a UTF-8 string).
///
/// This is [`GraphemeClusterBreakIterator`] with its rule break type fixed to
/// `RuleBreakTypeUtf8`; it is the type returned by
/// [`GraphemeClusterSegmenter::segment_str`].
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorUtf8<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf8>;
38
/// Grapheme cluster break iterator for a potentially invalid UTF-8 string.
///
/// This is [`GraphemeClusterBreakIterator`] with its rule break type fixed to
/// `RuleBreakTypePotentiallyIllFormedUtf8`; it is the type returned by
/// [`GraphemeClusterSegmenter::segment_utf8`].
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>;
44
/// Grapheme cluster break iterator for a Latin-1 (8-bit) string.
///
/// This is [`GraphemeClusterBreakIterator`] with its rule break type fixed to
/// `RuleBreakTypeLatin1`; it is the type returned by
/// [`GraphemeClusterSegmenter::segment_latin1`].
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorLatin1<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeLatin1>;
50
/// Grapheme cluster break iterator for a UTF-16 string.
///
/// This is [`GraphemeClusterBreakIterator`] with its rule break type fixed to
/// `RuleBreakTypeUtf16`; it is the type returned by
/// [`GraphemeClusterSegmenter::segment_utf16`].
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorUtf16<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf16>;
56
57/// Segments a string into grapheme clusters.
58///
59/// Supports loading grapheme cluster break data, and creating grapheme cluster break iterators for
60/// different string encodings.
61///
62/// # Examples
63///
64/// Segment a string:
65///
66/// ```rust
67/// use icu::segmenter::GraphemeClusterSegmenter;
68/// let segmenter = GraphemeClusterSegmenter::new();
69///
70/// let breakpoints: Vec<usize> = segmenter.segment_str("Hello 🗺").collect();
71/// // World Map (U+1F5FA) is encoded in four bytes in UTF-8.
72/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 10]);
73/// ```
74///
75/// Segment a Latin1 byte string:
76///
77/// ```rust
78/// use icu::segmenter::GraphemeClusterSegmenter;
79/// let segmenter = GraphemeClusterSegmenter::new();
80///
81/// let breakpoints: Vec<usize> =
82/// segmenter.segment_latin1(b"Hello World").collect();
83/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
84/// ```
85///
86/// Successive boundaries can be used to retrieve the grapheme clusters.
87/// In particular, the first boundary is always 0, and the last one is the
88/// length of the segmented text in code units.
89///
90/// ```rust
91/// # use icu::segmenter::GraphemeClusterSegmenter;
92/// # let segmenter =
93/// # GraphemeClusterSegmenter::new();
94/// use itertools::Itertools;
95/// let text = "मांजर";
96/// let grapheme_clusters: Vec<&str> = segmenter
97/// .segment_str(text)
98/// .tuple_windows()
99/// .map(|(i, j)| &text[i..j])
100/// .collect();
101/// assert_eq!(&grapheme_clusters, &["मां", "ज", "र"]);
102/// ```
103///
104/// This segmenter applies all rules provided to the constructor.
105/// Thus, if the data supplied by the provider comprises all
106/// [grapheme cluster boundary rules][Rules] from Unicode Standard Annex #29,
107/// _Unicode Text Segmentation_, which is the case of default data
108/// (both test data and data produced by `icu_datagen`), the `segment_*`
109/// functions return extended grapheme cluster boundaries, as opposed to
110/// legacy grapheme cluster boundaries. See [_Section 3, Grapheme Cluster
111/// Boundaries_][GC], and [_Table 1a, Sample Grapheme Clusters_][Sample_GC],
112/// in Unicode Standard Annex #29, _Unicode Text Segmentation_.
113///
114/// [Rules]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
115/// [GC]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
116/// [Sample_GC]: https://www.unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
117///
118/// ```rust
119/// use icu::segmenter::GraphemeClusterSegmenter;
120/// let segmenter =
121/// GraphemeClusterSegmenter::new();
122///
123/// // நி (TAMIL LETTER NA, TAMIL VOWEL SIGN I) is an extended grapheme cluster,
124/// // but not a legacy grapheme cluster.
125/// let ni = "நி";
126/// let egc_boundaries: Vec<usize> = segmenter.segment_str(ni).collect();
127/// assert_eq!(&egc_boundaries, &[0, ni.len()]);
128/// ```
#[derive(Debug)]
pub struct GraphemeClusterSegmenter {
    // Grapheme cluster break data. Every `segment_*` method reads it through
    // `payload.get()` and hands the borrowed data to the iterator it creates.
    payload: DataPayload<GraphemeClusterBreakDataV1Marker>,
}
133
#[cfg(feature = "compiled_data")]
impl Default for GraphemeClusterSegmenter {
    /// Same as [`GraphemeClusterSegmenter::new`]; only available with the
    /// `compiled_data` Cargo feature.
    fn default() -> Self {
        GraphemeClusterSegmenter::new()
    }
}
140
141impl GraphemeClusterSegmenter {
142 /// Constructs a [`GraphemeClusterSegmenter`] with an invariant locale from compiled data.
143 ///
144 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
145 ///
146 /// [📚 Help choosing a constructor](icu_provider::constructors)
147 #[cfg(feature = "compiled_data")]
148 pub fn new() -> Self {
149 Self {
150 payload: DataPayload::from_static_ref(
151 crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
152 ),
153 }
154 }
155
156 icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError,
157 #[cfg(skip)]
158 functions: [
159 new,
160 try_new_with_any_provider,
161 try_new_with_buffer_provider,
162 try_new_unstable,
163 Self,
164 ]);
165
166 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
167 pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
168 where
169 D: DataProvider<GraphemeClusterBreakDataV1Marker> + ?Sized,
170 {
171 let payload = provider.load(Default::default())?.take_payload()?;
172 Ok(Self { payload })
173 }
174
175 /// Creates a grapheme cluster break iterator for an `str` (a UTF-8 string).
176 pub fn segment_str<'l, 's>(
177 &'l self,
178 input: &'s str,
179 ) -> GraphemeClusterBreakIteratorUtf8<'l, 's> {
180 GraphemeClusterSegmenter::new_and_segment_str(input, self.payload.get())
181 }
182
183 /// Creates a grapheme cluster break iterator from grapheme cluster rule payload.
184 ///
185 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
186 pub(crate) fn new_and_segment_str<'l, 's>(
187 input: &'s str,
188 payload: &'l RuleBreakDataV1<'l>,
189 ) -> GraphemeClusterBreakIteratorUtf8<'l, 's> {
190 GraphemeClusterBreakIterator(RuleBreakIterator {
191 iter: input.char_indices(),
192 len: input.len(),
193 current_pos_data: None,
194 result_cache: Vec::new(),
195 data: payload,
196 complex: None,
197 boundary_property: 0,
198 })
199 }
200
201 /// Creates a grapheme cluster break iterator for a potentially ill-formed UTF8 string
202 ///
203 /// Invalid characters are treated as REPLACEMENT CHARACTER
204 ///
205 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
206 pub fn segment_utf8<'l, 's>(
207 &'l self,
208 input: &'s [u8],
209 ) -> GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
210 GraphemeClusterBreakIterator(RuleBreakIterator {
211 iter: Utf8CharIndices::new(input),
212 len: input.len(),
213 current_pos_data: None,
214 result_cache: Vec::new(),
215 data: self.payload.get(),
216 complex: None,
217 boundary_property: 0,
218 })
219 }
220 /// Creates a grapheme cluster break iterator for a Latin-1 (8-bit) string.
221 ///
222 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
223 pub fn segment_latin1<'l, 's>(
224 &'l self,
225 input: &'s [u8],
226 ) -> GraphemeClusterBreakIteratorLatin1<'l, 's> {
227 GraphemeClusterBreakIterator(RuleBreakIterator {
228 iter: Latin1Indices::new(input),
229 len: input.len(),
230 current_pos_data: None,
231 result_cache: Vec::new(),
232 data: self.payload.get(),
233 complex: None,
234 boundary_property: 0,
235 })
236 }
237
238 /// Creates a grapheme cluster break iterator for a UTF-16 string.
239 ///
240 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
241 pub fn segment_utf16<'l, 's>(
242 &'l self,
243 input: &'s [u16],
244 ) -> GraphemeClusterBreakIteratorUtf16<'l, 's> {
245 GraphemeClusterSegmenter::new_and_segment_utf16(input, self.payload.get())
246 }
247
248 /// Creates a grapheme cluster break iterator from grapheme cluster rule payload.
249 pub(crate) fn new_and_segment_utf16<'l, 's>(
250 input: &'s [u16],
251 payload: &'l RuleBreakDataV1<'l>,
252 ) -> GraphemeClusterBreakIteratorUtf16<'l, 's> {
253 GraphemeClusterBreakIterator(RuleBreakIterator {
254 iter: Utf16Indices::new(input),
255 len: input.len(),
256 current_pos_data: None,
257 result_cache: Vec::new(),
258 data: payload,
259 complex: None,
260 boundary_property: 0,
261 })
262 }
263}
264
// `GraphemeClusterSegmenter::new` only exists with the `compiled_data` feature, so the
// test is gated on the same feature to keep feature-less test builds compiling.
#[cfg(feature = "compiled_data")]
#[test]
fn empty_string() {
    // The empty string has exactly one boundary, at index 0.
    let segmenter = GraphemeClusterSegmenter::new();
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}
271
// `GraphemeClusterSegmenter::new` only exists with the `compiled_data` feature, so the
// test is gated on the same feature to keep feature-less test builds compiling.
#[cfg(feature = "compiled_data")]
#[test]
fn emoji_flags() {
    // Regression test for https://github.com/unicode-org/icu4x/issues/4780
    //
    // The input is spelled with escapes because the tag characters are invisible and
    // are easily dropped by editors and copy/paste, which would silently change the
    // expected offsets:
    //   - U+1F1FA U+1F1F8: regional indicators "US"            (2 x 4 =  8 UTF-8 bytes)
    //   - U+1F3F4, tags "gbeng", U+E007F CANCEL TAG: a subdivision flag (7 x 4 = 28 bytes)
    let text = "\u{1F1FA}\u{1F1F8}\u{1F3F4}\u{E0067}\u{E0062}\u{E0065}\u{E006E}\u{E0067}\u{E007F}";
    let segmenter = GraphemeClusterSegmenter::new();
    let breaks: Vec<usize> = segmenter.segment_str(text).collect();
    // Each flag must remain a single grapheme cluster: boundaries at 0, 8 and 8 + 28.
    assert_eq!(breaks, [0, 8, 36]);
}