Struct GraphemeClusterSegmenter

Source

/// Segments a string into grapheme clusters.
///
/// Supports loading grapheme cluster break data, and creating grapheme
/// cluster break iterators for different string encodings.
pub struct GraphemeClusterSegmenter {
    /// Data payload for the `GraphemeClusterBreakDataV1Marker` data marker.
    // NOTE(review): presumably the break rules consulted by the `segment_*`
    // methods — confirm against the implementation.
    payload: DataPayload<GraphemeClusterBreakDataV1Marker>,
}

Expand description

Segments a string into grapheme clusters.

Supports loading grapheme cluster break data, and creating grapheme cluster break iterators for different string encodings.

§Examples

Segment a string:

use icu::segmenter::GraphemeClusterSegmenter;
// Construct the segmenter with its no-argument constructor.
let segmenter = GraphemeClusterSegmenter::new();

// Collect every boundary the iterator yields for this UTF-8 string.
let breakpoints: Vec<usize> = segmenter.segment_str("Hello 🗺").collect();
// World Map (U+1F5FA) is encoded in four bytes in UTF-8.
// Boundaries are offsets in code units (bytes here), so the last one jumps
// from 6 to 10 rather than to 7.
assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 10]);

Segment a Latin1 byte string:

use icu::segmenter::GraphemeClusterSegmenter;
let segmenter = GraphemeClusterSegmenter::new();

// The input is passed as a byte slice rather than a `&str`.
let breakpoints: Vec<usize> =
    segmenter.segment_latin1(b"Hello World").collect();
// The 11-byte input yields a boundary at every offset from 0 through 11.
assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);

Successive boundaries can be used to retrieve the grapheme clusters. In particular, the first boundary is always 0, and the last one is the length of the segmented text in code units.

// NOTE(review): `segmenter` is not defined in this snippet; presumably it is a
// `GraphemeClusterSegmenter` built with `GraphemeClusterSegmenter::new()` as
// in the preceding examples — confirm.
use itertools::Itertools;
let text = "मांजर";
// Pair each boundary with its successor and slice the text between the two
// offsets to obtain one grapheme cluster per pair.
let grapheme_clusters: Vec<&str> = segmenter
    .segment_str(text)
    .tuple_windows()
    .map(|(i, j)| &text[i..j])
    .collect();
assert_eq!(&grapheme_clusters, &["मां", "ज", "र"]);

This segmenter applies all rules provided to the constructor. Thus, if the data supplied by the provider comprises all grapheme cluster boundary rules from Unicode Standard Annex #29, Unicode Text Segmentation — which is the case for the default data (both the test data and the data produced by icu_datagen) — the segment_* functions return extended grapheme cluster boundaries, as opposed to legacy grapheme cluster boundaries. See Section 3, Grapheme Cluster Boundaries, and Table 1a, Sample Grapheme Clusters, in Unicode Standard Annex #29, Unicode Text Segmentation.

use icu::segmenter::GraphemeClusterSegmenter;
let segmenter =
    GraphemeClusterSegmenter::new();

// நி (TAMIL LETTER NA, TAMIL VOWEL SIGN I) is an extended grapheme cluster,
// but not a legacy grapheme cluster.
let ni = "நி";
let egc_boundaries: Vec<usize> = segmenter.segment_str(ni).collect();
// Only the start (0) and the end of the string are reported as boundaries,
// i.e. the two code points are kept together as a single cluster.
assert_eq!(&egc_boundaries, &[0, ni.len()]);

Fields§

§payload: DataPayload<GraphemeClusterBreakDataV1Marker>

Struct GraphemeClusterSegmenter

§Examples

Fields§

Implementations§

impl GraphemeClusterSegmenter

pub fn new() -> Self

pub fn try_new_with_any_provider( provider: &(impl AnyProvider + ?Sized), ) -> Result<Self, SegmenterError>

pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError> where D: DataProvider<GraphemeClusterBreakDataV1Marker> + ?Sized,

pub fn segment_str<'l, 's>( &'l self, input: &'s str, ) -> GraphemeClusterBreakIteratorUtf8<'l, 's>

pub(crate) fn new_and_segment_str<'l, 's>( input: &'s str, payload: &'l RuleBreakDataV1<'l>, ) -> GraphemeClusterBreakIteratorUtf8<'l, 's>

pub fn segment_utf8<'l, 's>( &'l self, input: &'s [u8], ) -> GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'l, 's>

pub fn segment_latin1<'l, 's>( &'l self, input: &'s [u8], ) -> GraphemeClusterBreakIteratorLatin1<'l, 's>

pub fn segment_utf16<'l, 's>( &'l self, input: &'s [u16], ) -> GraphemeClusterBreakIteratorUtf16<'l, 's>

pub(crate) fn new_and_segment_utf16<'l, 's>( input: &'s [u16], payload: &'l RuleBreakDataV1<'l>, ) -> GraphemeClusterBreakIteratorUtf16<'l, 's>

Trait Implementations§

impl Debug for GraphemeClusterSegmenter

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl Default for GraphemeClusterSegmenter

fn default() -> Self

Auto Trait Implementations§

impl Freeze for GraphemeClusterSegmenter

impl RefUnwindSafe for GraphemeClusterSegmenter

impl !Send for GraphemeClusterSegmenter

impl !Sync for GraphemeClusterSegmenter

impl Unpin for GraphemeClusterSegmenter

impl UnwindSafe for GraphemeClusterSegmenter

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<T> ErasedDestructor for T where T: 'static,

impl<T> MaybeSendSync for T

Struct GraphemeClusterSegmenter

pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
where D: DataProvider<GraphemeClusterBreakDataV1Marker> + ?Sized,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<T> ErasedDestructor for T
where T: 'static,