icu_segmenter/
grapheme.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use alloc::vec::Vec;
6use icu_provider::prelude::*;
7
8use crate::indices::{Latin1Indices, Utf16Indices};
9use crate::iterator_helpers::derive_usize_iterator_with_type;
10use crate::rule_segmenter::*;
11use crate::{provider::*, SegmenterError};
12use utf8_iter::Utf8CharIndices;
13
14/// Implements the [`Iterator`] trait over the grapheme cluster boundaries of the given string.
15///
16/// Lifetimes:
17///
18/// - `'l` = lifetime of the segmenter object from which this iterator was created
19/// - `'s` = lifetime of the string being segmented
20///
21/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
22/// _after_ the boundary (for a boundary at the end of text, this index is the length
23/// of the [`str`] or array of code units).
24///
25/// For examples of use, see [`GraphemeClusterSegmenter`].
26#[derive(Debug)]
27pub struct GraphemeClusterBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
28    RuleBreakIterator<'l, 's, Y>,
29);
30
31derive_usize_iterator_with_type!(GraphemeClusterBreakIterator);
32
/// Grapheme cluster break iterator for an `str` (a UTF-8 string).
///
/// This is [`GraphemeClusterBreakIterator`] specialized with `RuleBreakTypeUtf8`, and is the
/// type returned by [`GraphemeClusterSegmenter::segment_str`].
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorUtf8<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf8>;
38
/// Grapheme cluster break iterator for a potentially invalid UTF-8 string.
///
/// This is [`GraphemeClusterBreakIterator`] specialized with
/// `RuleBreakTypePotentiallyIllFormedUtf8`, and is the type returned by
/// [`GraphemeClusterSegmenter::segment_utf8`]; see that method for how ill-formed
/// input is treated.
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>;
44
/// Grapheme cluster break iterator for a Latin-1 (8-bit) string.
///
/// This is [`GraphemeClusterBreakIterator`] specialized with `RuleBreakTypeLatin1`, and is the
/// type returned by [`GraphemeClusterSegmenter::segment_latin1`].
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorLatin1<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeLatin1>;
50
/// Grapheme cluster break iterator for a UTF-16 string.
///
/// This is [`GraphemeClusterBreakIterator`] specialized with `RuleBreakTypeUtf16`, and is the
/// type returned by [`GraphemeClusterSegmenter::segment_utf16`].
///
/// For examples of use, see [`GraphemeClusterSegmenter`].
pub type GraphemeClusterBreakIteratorUtf16<'l, 's> =
    GraphemeClusterBreakIterator<'l, 's, RuleBreakTypeUtf16>;
56
/// Segments a string into grapheme clusters.
///
/// Supports loading grapheme cluster break data, and creating grapheme cluster break iterators for
/// different string encodings.
///
/// # Examples
///
/// Segment a string:
///
/// ```rust
/// use icu::segmenter::GraphemeClusterSegmenter;
/// let segmenter = GraphemeClusterSegmenter::new();
///
/// let breakpoints: Vec<usize> = segmenter.segment_str("Hello 🗺").collect();
/// // World Map (U+1F5FA) is encoded in four bytes in UTF-8.
/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 10]);
/// ```
///
/// Segment a Latin-1 byte string:
///
/// ```rust
/// use icu::segmenter::GraphemeClusterSegmenter;
/// let segmenter = GraphemeClusterSegmenter::new();
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_latin1(b"Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
/// ```
///
/// Successive boundaries can be used to retrieve the grapheme clusters.
/// In particular, the first boundary is always 0, and the last one is the
/// length of the segmented text in code units.
///
/// ```rust
/// # use icu::segmenter::GraphemeClusterSegmenter;
/// # let segmenter =
/// #     GraphemeClusterSegmenter::new();
/// use itertools::Itertools;
/// let text = "मांजर";
/// let grapheme_clusters: Vec<&str> = segmenter
///     .segment_str(text)
///     .tuple_windows()
///     .map(|(i, j)| &text[i..j])
///     .collect();
/// assert_eq!(&grapheme_clusters, &["मां", "ज", "र"]);
/// ```
///
/// This segmenter applies all rules provided to the constructor.
/// Thus, if the data supplied by the provider comprises all
/// [grapheme cluster boundary rules][Rules] from Unicode Standard Annex #29,
/// _Unicode Text Segmentation_, which is the case for the default data
/// (both test data and data produced by `icu_datagen`), the `segment_*`
/// functions return extended grapheme cluster boundaries, as opposed to
/// legacy grapheme cluster boundaries. See [_Section 3, Grapheme Cluster
/// Boundaries_][GC], and [_Table 1a, Sample Grapheme Clusters_][Sample_GC],
/// in Unicode Standard Annex #29, _Unicode Text Segmentation_.
///
/// [Rules]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
/// [GC]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
/// [Sample_GC]: https://www.unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
///
/// ```rust
/// use icu::segmenter::GraphemeClusterSegmenter;
/// let segmenter =
///     GraphemeClusterSegmenter::new();
///
/// // நி (TAMIL LETTER NA, TAMIL VOWEL SIGN I) is an extended grapheme cluster,
/// // but not a legacy grapheme cluster.
/// let ni = "நி";
/// let egc_boundaries: Vec<usize> = segmenter.segment_str(ni).collect();
/// assert_eq!(&egc_boundaries, &[0, ni.len()]);
/// ```
#[derive(Debug)]
pub struct GraphemeClusterSegmenter {
    /// Grapheme cluster break rule data, taken either from compiled data or from a
    /// data provider; every iterator created by this segmenter borrows it.
    payload: DataPayload<GraphemeClusterBreakDataV1Marker>,
}
133
// `Default` simply forwards to `GraphemeClusterSegmenter::new`, which is why this impl is
// gated on the same `compiled_data` Cargo feature as that constructor.
#[cfg(feature = "compiled_data")]
impl Default for GraphemeClusterSegmenter {
    fn default() -> Self {
        GraphemeClusterSegmenter::new()
    }
}
140
141impl GraphemeClusterSegmenter {
142    /// Constructs a [`GraphemeClusterSegmenter`] with an invariant locale from compiled data.
143    ///
144    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
145    ///
146    /// [📚 Help choosing a constructor](icu_provider::constructors)
147    #[cfg(feature = "compiled_data")]
148    pub fn new() -> Self {
149        Self {
150            payload: DataPayload::from_static_ref(
151                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
152            ),
153        }
154    }
155
156    icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError,
157        #[cfg(skip)]
158        functions: [
159            new,
160            try_new_with_any_provider,
161            try_new_with_buffer_provider,
162            try_new_unstable,
163            Self,
164    ]);
165
166    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
167    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
168    where
169        D: DataProvider<GraphemeClusterBreakDataV1Marker> + ?Sized,
170    {
171        let payload = provider.load(Default::default())?.take_payload()?;
172        Ok(Self { payload })
173    }
174
175    /// Creates a grapheme cluster break iterator for an `str` (a UTF-8 string).
176    pub fn segment_str<'l, 's>(
177        &'l self,
178        input: &'s str,
179    ) -> GraphemeClusterBreakIteratorUtf8<'l, 's> {
180        GraphemeClusterSegmenter::new_and_segment_str(input, self.payload.get())
181    }
182
183    /// Creates a grapheme cluster break iterator from grapheme cluster rule payload.
184    ///
185    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
186    pub(crate) fn new_and_segment_str<'l, 's>(
187        input: &'s str,
188        payload: &'l RuleBreakDataV1<'l>,
189    ) -> GraphemeClusterBreakIteratorUtf8<'l, 's> {
190        GraphemeClusterBreakIterator(RuleBreakIterator {
191            iter: input.char_indices(),
192            len: input.len(),
193            current_pos_data: None,
194            result_cache: Vec::new(),
195            data: payload,
196            complex: None,
197            boundary_property: 0,
198        })
199    }
200
201    /// Creates a grapheme cluster break iterator for a potentially ill-formed UTF8 string
202    ///
203    /// Invalid characters are treated as REPLACEMENT CHARACTER
204    ///
205    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
206    pub fn segment_utf8<'l, 's>(
207        &'l self,
208        input: &'s [u8],
209    ) -> GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
210        GraphemeClusterBreakIterator(RuleBreakIterator {
211            iter: Utf8CharIndices::new(input),
212            len: input.len(),
213            current_pos_data: None,
214            result_cache: Vec::new(),
215            data: self.payload.get(),
216            complex: None,
217            boundary_property: 0,
218        })
219    }
220    /// Creates a grapheme cluster break iterator for a Latin-1 (8-bit) string.
221    ///
222    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
223    pub fn segment_latin1<'l, 's>(
224        &'l self,
225        input: &'s [u8],
226    ) -> GraphemeClusterBreakIteratorLatin1<'l, 's> {
227        GraphemeClusterBreakIterator(RuleBreakIterator {
228            iter: Latin1Indices::new(input),
229            len: input.len(),
230            current_pos_data: None,
231            result_cache: Vec::new(),
232            data: self.payload.get(),
233            complex: None,
234            boundary_property: 0,
235        })
236    }
237
238    /// Creates a grapheme cluster break iterator for a UTF-16 string.
239    ///
240    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
241    pub fn segment_utf16<'l, 's>(
242        &'l self,
243        input: &'s [u16],
244    ) -> GraphemeClusterBreakIteratorUtf16<'l, 's> {
245        GraphemeClusterSegmenter::new_and_segment_utf16(input, self.payload.get())
246    }
247
248    /// Creates a grapheme cluster break iterator from grapheme cluster rule payload.
249    pub(crate) fn new_and_segment_utf16<'l, 's>(
250        input: &'s [u16],
251        payload: &'l RuleBreakDataV1<'l>,
252    ) -> GraphemeClusterBreakIteratorUtf16<'l, 's> {
253        GraphemeClusterBreakIterator(RuleBreakIterator {
254            iter: Utf16Indices::new(input),
255            len: input.len(),
256            current_pos_data: None,
257            result_cache: Vec::new(),
258            data: payload,
259            complex: None,
260            boundary_property: 0,
261        })
262    }
263}
264
/// Segmenting the empty string yields exactly one boundary, at index 0.
// Gated on `compiled_data`: `GraphemeClusterSegmenter::new` only exists with that feature,
// so without the gate the test target fails to compile when the feature is disabled.
#[cfg(feature = "compiled_data")]
#[test]
fn empty_string() {
    let segmenter = GraphemeClusterSegmenter::new();
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}
271
/// Two adjacent flag emoji must be segmented as two grapheme clusters.
// Gated on `compiled_data`: `GraphemeClusterSegmenter::new` only exists with that feature,
// so without the gate the test target fails to compile when the feature is disabled.
#[cfg(feature = "compiled_data")]
#[test]
fn emoji_flags() {
    // https://github.com/unicode-org/icu4x/issues/4780
    let segmenter = GraphemeClusterSegmenter::new();
    // The first flag occupies 8 bytes of UTF-8; the second one (which contains
    // invisible characters after the visible flag) occupies the remaining 28 bytes.
    let breaks: Vec<usize> = segmenter.segment_str("🇺🇸🏴󠁧󠁢󠁥󠁮󠁧󠁿").collect();
    assert_eq!(breaks, [0, 8, 36]);
}
278}