icu_segmenter/
lib.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! Segment strings by lines, graphemes, words, and sentences.
6//!
7//! This module is published as its own crate ([`icu_segmenter`](https://docs.rs/icu_segmenter/latest/icu_segmenter/))
8//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
9//!
10//! This module contains segmenter implementations for the following rules.
11//!
12//! - Line segmenter that is compatible with [Unicode Standard Annex #14][UAX14], _Unicode Line
13//!   Breaking Algorithm_, with options to tailor line-breaking behavior for CSS [`line-break`] and
14//!   [`word-break`] properties.
15//! - Grapheme cluster segmenter, word segmenter, and sentence segmenter that are compatible with
16//!   [Unicode Standard Annex #29][UAX29], _Unicode Text Segmentation_.
17//!
18//! [UAX14]: https://www.unicode.org/reports/tr14/
19//! [UAX29]: https://www.unicode.org/reports/tr29/
20//! [`line-break`]: https://drafts.csswg.org/css-text-3/#line-break-property
21//! [`word-break`]: https://drafts.csswg.org/css-text-3/#word-break-property
22//!
23//! # Examples
24//!
25//! ## Line Break
26//!
27//! Find line break opportunities:
28//!
29//!```rust
30//! use icu::segmenter::LineSegmenter;
31//!
32//! let segmenter = LineSegmenter::new_auto();
33//!
34//! let breakpoints: Vec<usize> = segmenter
35//!     .segment_str("Hello World. Xin chào thế giới!")
36//!     .collect();
37//! assert_eq!(&breakpoints, &[0, 6, 13, 17, 23, 29, 36]);
38//! ```
39//!
40//! See [`LineSegmenter`] for more examples.
41//!
42//! ## Grapheme Cluster Break
43//!
44//! Find all grapheme cluster boundaries:
45//!
46//!```rust
47//! use icu::segmenter::GraphemeClusterSegmenter;
48//!
49//! let segmenter = GraphemeClusterSegmenter::new();
50//!
51//! let breakpoints: Vec<usize> = segmenter
52//!     .segment_str("Hello World. Xin chào thế giới!")
53//!     .collect();
54//! assert_eq!(
55//!     &breakpoints,
56//!     &[
57//!         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
58//!         19, 21, 22, 23, 24, 25, 28, 29, 30, 31, 34, 35, 36
59//!     ]
60//! );
61//! ```
62//!
63//! See [`GraphemeClusterSegmenter`] for more examples.
64//!
65//! ## Word Break
66//!
67//! Find all word boundaries:
68//!
69//!```rust
70//! use icu::segmenter::WordSegmenter;
71//!
72//! let segmenter = WordSegmenter::new_auto();
73//!
74//! let breakpoints: Vec<usize> = segmenter
75//!     .segment_str("Hello World. Xin chào thế giới!")
76//!     .collect();
77//! assert_eq!(
78//!     &breakpoints,
79//!     &[0, 5, 6, 11, 12, 13, 16, 17, 22, 23, 28, 29, 35, 36]
80//! );
81//! ```
82//!
83//! See [`WordSegmenter`] for more examples.
84//!
85//! ## Sentence Break
86//!
87//! Segment the string into sentences:
88//!
89//!```rust
90//! use icu::segmenter::SentenceSegmenter;
91//!
92//! let segmenter = SentenceSegmenter::new();
93//!
94//! let breakpoints: Vec<usize> = segmenter
95//!     .segment_str("Hello World. Xin chào thế giới!")
96//!     .collect();
97//! assert_eq!(&breakpoints, &[0, 13, 36]);
98//! ```
99//!
100//! See [`SentenceSegmenter`] for more examples.
101
102// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
// Build as `no_std` unless compiling tests or the `std` Cargo feature is enabled.
103#![cfg_attr(not(any(test, feature = "std")), no_std)]
// Outside of test builds, promote the listed lints to hard errors: panic-prone
// constructs (indexing/slicing, `unwrap`, `expect`, `panic!`), exhaustive public
// structs and enums, and public types that lack a `Debug` implementation.
104#![cfg_attr(
105    not(test),
106    deny(
107        clippy::indexing_slicing,
108        clippy::unwrap_used,
109        clippy::expect_used,
110        clippy::panic,
111        clippy::exhaustive_structs,
112        clippy::exhaustive_enums,
113        missing_debug_implementations,
114    )
115)]
// Undocumented public items produce a warning in every build configuration.
116#![warn(missing_docs)]
117
// Make the `alloc` crate nameable from this crate, which is built `no_std`
// unless tests or the `std` feature are enabled.
118extern crate alloc;
119
// Private modules. Among these, only `error` has an item re-exported from this file.
120mod complex;
121mod error;
122mod indices;
123mod iterator_helpers;
124mod rule_segmenter;
125
// Private modules, one per segmenter kind; their public types are re-exported
// at the crate root below.
126mod grapheme;
127mod line;
128mod sentence;
129mod word;
130
// The only module declared here that is public by path.
131pub mod provider;
132
133// Main Segmenter and BreakIterator public types
134pub use crate::grapheme::GraphemeClusterBreakIterator;
135pub use crate::grapheme::GraphemeClusterSegmenter;
136pub use crate::line::LineBreakIterator;
137pub use crate::line::LineSegmenter;
138pub use crate::sentence::SentenceBreakIterator;
139pub use crate::sentence::SentenceSegmenter;
140pub use crate::word::WordBreakIterator;
141pub use crate::word::WordSegmenter;
142
143// Options structs and enums
144pub use crate::line::LineBreakOptions;
145pub use crate::line::LineBreakStrictness;
146pub use crate::line::LineBreakWordOption;
147pub use crate::word::WordType;
148
149// Typedefs
150pub use crate::grapheme::GraphemeClusterBreakIteratorLatin1;
151pub use crate::grapheme::GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8;
152pub use crate::grapheme::GraphemeClusterBreakIteratorUtf16;
153pub use crate::grapheme::GraphemeClusterBreakIteratorUtf8;
154pub use crate::line::LineBreakIteratorLatin1;
155pub use crate::line::LineBreakIteratorPotentiallyIllFormedUtf8;
156pub use crate::line::LineBreakIteratorUtf16;
157pub use crate::line::LineBreakIteratorUtf8;
158pub use crate::sentence::SentenceBreakIteratorLatin1;
159pub use crate::sentence::SentenceBreakIteratorPotentiallyIllFormedUtf8;
160pub use crate::sentence::SentenceBreakIteratorUtf16;
161pub use crate::sentence::SentenceBreakIteratorUtf8;
162pub use crate::word::WordBreakIteratorLatin1;
163pub use crate::word::WordBreakIteratorPotentiallyIllFormedUtf8;
164pub use crate::word::WordBreakIteratorUtf16;
165pub use crate::word::WordBreakIteratorUtf8;
166
// Re-export the crate's error type from the private `error` module.
167pub use error::SegmenterError;
168
// Shorter alias for `SegmenterError`. `#[doc(no_inline)]` asks rustdoc to list
// this as a plain re-export instead of inlining the item's documentation again.
169#[doc(no_inline)]
170pub use SegmenterError as Error;