icu_segmenter/
line.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::complex::*;
6use crate::indices::*;
7use crate::provider::*;
8use crate::SegmenterError;
9use alloc::string::String;
10use alloc::vec;
11use alloc::vec::Vec;
12use core::char;
13use core::str::CharIndices;
14use icu_provider::prelude::*;
15use utf8_iter::Utf8CharIndices;
16
// TODO(#1637): These constants should be data driven.
//
// Numeric identifiers of the line break property classes used by the rule
// code in this file. Values looked up in the property table are compared
// against these constants, so the numbering presumably has to stay in sync
// with the generated segmenter data (NOTE(review): confirm against the data
// generator before renumbering).
//
// Most names follow the UAX #14 Line_Break class abbreviations; names such as
// `ID_CN`, `OP_EA`, `OP_OP30`, `PO_EAW` and `PR_EAW` look like refinements of
// a base class. Every constant carries `#[allow(dead_code)]` because not all
// of them are referenced by the code.
#[allow(dead_code)]
const UNKNOWN: u8 = 0;
#[allow(dead_code)]
const AI: u8 = 1;
#[allow(dead_code)]
const AL: u8 = 2;
#[allow(dead_code)]
const B2: u8 = 3;
#[allow(dead_code)]
const BA: u8 = 4;
#[allow(dead_code)]
const BB: u8 = 5;
#[allow(dead_code)]
const BK: u8 = 6;
#[allow(dead_code)]
const CB: u8 = 7;
#[allow(dead_code)]
const CJ: u8 = 8;
#[allow(dead_code)]
const CL: u8 = 9;
#[allow(dead_code)]
const CM: u8 = 10;
#[allow(dead_code)]
const CP: u8 = 11;
#[allow(dead_code)]
const CR: u8 = 12;
#[allow(dead_code)]
const EB: u8 = 13;
#[allow(dead_code)]
const EM: u8 = 14;
#[allow(dead_code)]
const EX: u8 = 15;
#[allow(dead_code)]
const GL: u8 = 16;
#[allow(dead_code)]
const H2: u8 = 17;
#[allow(dead_code)]
const H3: u8 = 18;
#[allow(dead_code)]
const HL: u8 = 19;
#[allow(dead_code)]
const HY: u8 = 20;
#[allow(dead_code)]
const ID: u8 = 21;
#[allow(dead_code)]
const ID_CN: u8 = 22;
#[allow(dead_code)]
const IN: u8 = 23;
#[allow(dead_code)]
const IS: u8 = 24;
#[allow(dead_code)]
const JL: u8 = 25;
#[allow(dead_code)]
const JT: u8 = 26;
#[allow(dead_code)]
const JV: u8 = 27;
#[allow(dead_code)]
const LF: u8 = 28;
#[allow(dead_code)]
const NL: u8 = 29;
#[allow(dead_code)]
const NS: u8 = 30;
#[allow(dead_code)]
const NU: u8 = 31;
#[allow(dead_code)]
const OP_EA: u8 = 32;
#[allow(dead_code)]
const OP_OP30: u8 = 33;
#[allow(dead_code)]
const PO: u8 = 34;
#[allow(dead_code)]
const PO_EAW: u8 = 35;
#[allow(dead_code)]
const PR: u8 = 36;
#[allow(dead_code)]
const PR_EAW: u8 = 37;
#[allow(dead_code)]
const QU: u8 = 38;
#[allow(dead_code)]
const RI: u8 = 39;
#[allow(dead_code)]
const SA: u8 = 40;
#[allow(dead_code)]
const SG: u8 = 41;
#[allow(dead_code)]
const SP: u8 = 42;
#[allow(dead_code)]
const SY: u8 = 43;
#[allow(dead_code)]
const WJ: u8 = 44;
#[allow(dead_code)]
const XX: u8 = 45;
#[allow(dead_code)]
const ZW: u8 = 46;
#[allow(dead_code)]
const ZWJ: u8 = 47;
114
/// An enum that specifies the strictness of line-breaking rules. It can be
/// passed as an argument when creating a line segmenter.
///
/// Each enum value has the same meaning as the corresponding `line-break`
/// property value in the CSS Text spec. See the details in
/// <https://drafts.csswg.org/css-text-3/#line-break-property>.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum LineBreakStrictness {
    /// Breaks text using the least restrictive set of line-breaking rules.
    /// Typically used for short lines, such as in newspapers.
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-loose>
    Loose,

    /// Breaks text using the most common set of line-breaking rules.
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-normal>
    Normal,

    /// Breaks text using the most stringent set of line-breaking rules.
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-strict>
    ///
    /// This is the default behaviour of the Unicode Line Breaking Algorithm,
    /// resolving class [CJ](https://www.unicode.org/reports/tr14/#CJ) to
    /// [NS](https://www.unicode.org/reports/tr14/#NS);
    /// see rule [LB1](https://www.unicode.org/reports/tr14/#LB1).
    Strict,

    /// Breaks text assuming there is a soft wrap opportunity around every
    /// typographic character unit, disregarding any prohibition against line
    /// breaks. See more details in
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-anywhere>.
    Anywhere,
}
148
/// An enum that specifies the line break opportunities between letters. It
/// can be passed as an argument when creating a line segmenter.
///
/// Each enum value has the same meaning as the corresponding `word-break`
/// property value in the CSS Text spec. See the details in
/// <https://drafts.csswg.org/css-text-3/#word-break-property>.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum LineBreakWordOption {
    /// Words break according to their customary rules. See the details in
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-normal>.
    Normal,

    /// Breaking is allowed within "words".
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-break-all>
    BreakAll,

    /// Breaking is forbidden within "words".
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all>
    KeepAll,
}
170
/// Options to tailor line-breaking behavior.
///
/// The [`Default`] value is strict line breaking, normal word breaking, and
/// no Chinese/Japanese hint.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub struct LineBreakOptions {
    /// Strictness of line-breaking rules. See [`LineBreakStrictness`].
    pub strictness: LineBreakStrictness,

    /// Line break opportunities between letters. See [`LineBreakWordOption`].
    pub word_option: LineBreakWordOption,

    /// Use `true` as a hint to the line segmenter that the writing
    /// system is Chinese or Japanese. This allows more break opportunities when
    /// `LineBreakStrictness` is `Normal` or `Loose`. See
    /// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
    ///
    /// This option has no effect in Latin-1 mode.
    pub ja_zh: bool,
}
189
190impl Default for LineBreakOptions {
191    fn default() -> Self {
192        Self {
193            strictness: LineBreakStrictness::Strict,
194            word_option: LineBreakWordOption::Normal,
195            ja_zh: false,
196        }
197    }
198}
199
/// Line break iterator for a `str` (a UTF-8 string).
///
/// For examples of use, see [`LineSegmenter`].
pub type LineBreakIteratorUtf8<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf8>;

/// Line break iterator for a potentially ill-formed UTF-8 string.
///
/// For examples of use, see [`LineSegmenter`].
pub type LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
    LineBreakIterator<'l, 's, LineBreakTypePotentiallyIllFormedUtf8>;

/// Line break iterator for a Latin-1 (8-bit) string.
///
/// For examples of use, see [`LineSegmenter`].
pub type LineBreakIteratorLatin1<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeLatin1>;

/// Line break iterator for a UTF-16 string.
///
/// For examples of use, see [`LineSegmenter`].
pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf16>;
220
/// Supports loading line break data, and creating line break iterators for different string
/// encodings.
///
/// The segmenter returns mandatory breaks (as defined by [definition LD7][LD7] of
/// Unicode Standard Annex #14, _Unicode Line Breaking Algorithm_) as well as
/// line break opportunities ([definition LD3][LD3]).
/// It does not distinguish them.  Callers requiring that distinction can check
/// the Line_Break property of the code point preceding the break against those
/// listed in rules [LB4][LB4] and [LB5][LB5], special-casing the end of text
/// according to [LB3][LB3].
///
/// For consistency with the grapheme, word, and sentence segmenters, there is
/// always a breakpoint returned at index 0, but this breakpoint is not a
/// meaningful line break opportunity.
///
/// [LD3]: https://www.unicode.org/reports/tr14/#LD3
/// [LD7]: https://www.unicode.org/reports/tr14/#LD7
/// [LB3]: https://www.unicode.org/reports/tr14/#LB3
/// [LB4]: https://www.unicode.org/reports/tr14/#LB4
/// [LB5]: https://www.unicode.org/reports/tr14/#LB5
///
/// ```rust
/// # use icu::segmenter::LineSegmenter;
/// #
/// # let segmenter = LineSegmenter::new_auto();
/// #
/// let text = "Summary\r\nThis annex…";
/// let breakpoints: Vec<usize> = segmenter.segment_str(text).collect();
/// // 9 and 22 are mandatory breaks, 14 is a line break opportunity.
/// assert_eq!(&breakpoints, &[0, 9, 14, 22]);
///
/// // There is a break opportunity between emoji, but not within the ZWJ sequence 🏳️‍🌈.
/// let flag_equation = "🏳️➕🌈🟰🏳️\u{200D}🌈";
/// let possible_first_lines: Vec<&str> =
///     segmenter.segment_str(flag_equation).skip(1).map(|i| &flag_equation[..i]).collect();
/// assert_eq!(
///     &possible_first_lines,
///     &[
///         "🏳️",
///         "🏳️➕",
///         "🏳️➕🌈",
///         "🏳️➕🌈🟰",
///         "🏳️➕🌈🟰🏳️‍🌈"
///     ]
/// );
/// ```
///
/// # Examples
///
/// Segment a string with default options:
///
/// ```rust
/// use icu::segmenter::LineSegmenter;
///
/// let segmenter = LineSegmenter::new_auto();
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_str("Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 6, 11]);
/// ```
///
/// Segment a string with CSS option overrides:
///
/// ```rust
/// use icu::segmenter::{
///     LineBreakOptions, LineBreakStrictness, LineBreakWordOption,
///     LineSegmenter,
/// };
///
/// let mut options = LineBreakOptions::default();
/// options.strictness = LineBreakStrictness::Strict;
/// options.word_option = LineBreakWordOption::BreakAll;
/// options.ja_zh = false;
/// let segmenter = LineSegmenter::new_auto_with_options(options);
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_str("Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11]);
/// ```
///
/// Segment a Latin1 byte string:
///
/// ```rust
/// use icu::segmenter::LineSegmenter;
///
/// let segmenter = LineSegmenter::new_auto();
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_latin1(b"Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 6, 11]);
/// ```
///
/// Separate mandatory breaks from the break opportunities:
///
/// ```rust
/// use icu::properties::{maps, LineBreak};
/// use icu::segmenter::LineSegmenter;
///
/// # let segmenter = LineSegmenter::new_auto();
/// #
/// let text = "Summary\r\nThis annex…";
///
/// let mandatory_breaks: Vec<usize> = segmenter
///     .segment_str(text)
///     .into_iter()
///     .filter(|&i| {
///         text[..i].chars().next_back().map_or(false, |c| {
///             matches!(
///                 maps::line_break().get(c),
///                 LineBreak::MandatoryBreak
///                     | LineBreak::CarriageReturn
///                     | LineBreak::LineFeed
///                     | LineBreak::NextLine
///             ) || i == text.len()
///         })
///     })
///     .collect();
/// assert_eq!(&mandatory_breaks, &[9, 22]);
/// ```
#[derive(Debug)]
pub struct LineSegmenter {
    // Tailoring chosen at construction time; every iterator borrows it.
    options: LineBreakOptions,
    // Line break rule data (property table and break state table).
    payload: DataPayload<LineBreakDataV1Marker>,
    // Data for complex scripts, built from either the LSTM or the
    // Southeast Asian dictionary constructor of `ComplexPayloads`.
    complex: ComplexPayloads,
}
346
347impl LineSegmenter {
348    /// Constructs a [`LineSegmenter`] with an invariant locale and the best available compiled data for
349    /// complex scripts (Khmer, Lao, Myanmar, and Thai).
350    ///
351    /// The current behavior, which is subject to change, is to use the LSTM model when available.
352    ///
353    /// See also [`Self::new_auto_with_options`].
354    ///
355    /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
356    ///
357    /// [πŸ“š Help choosing a constructor](icu_provider::constructors)
358    #[cfg(feature = "compiled_data")]
359    #[cfg(feature = "auto")]
360    pub fn new_auto() -> Self {
361        Self::new_auto_with_options(Default::default())
362    }
363
    // Invokes `icu_provider::gen_any_buffer_data_constructors!` for the
    // option-less `auto` constructor family. `new_auto` is written by hand
    // above; the `#[cfg(skip)]` marker presumably keeps the macro from
    // emitting it again (NOTE(review): confirm against the macro's
    // documentation).
    #[cfg(feature = "auto")]
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: SegmenterError,
        #[cfg(skip)]
        functions: [
            new_auto,
            try_new_auto_with_any_provider,
            try_new_auto_with_buffer_provider,
            try_new_auto_unstable,
            Self,
        ]
    );
378
379    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
380    #[cfg(feature = "auto")]
381    pub fn try_new_auto_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
382    where
383        D: DataProvider<LineBreakDataV1Marker>
384            + DataProvider<LstmForWordLineAutoV1Marker>
385            + DataProvider<GraphemeClusterBreakDataV1Marker>
386            + ?Sized,
387    {
388        Self::try_new_auto_with_options_unstable(provider, Default::default())
389    }
390
391    /// Constructs a [`LineSegmenter`] with an invariant locale and compiled LSTM data for
392    /// complex scripts (Khmer, Lao, Myanmar, and Thai).
393    ///
394    /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
395    /// the full dictionary but more expensive during segmentation (inference).
396    ///
397    /// See also [`Self::new_lstm_with_options`].
398    ///
399    /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
400    ///
401    /// [πŸ“š Help choosing a constructor](icu_provider::constructors)
402    #[cfg(feature = "compiled_data")]
403    #[cfg(feature = "lstm")]
404    pub fn new_lstm() -> Self {
405        Self::new_lstm_with_options(Default::default())
406    }
407
    // Invokes `icu_provider::gen_any_buffer_data_constructors!` for the
    // option-less `lstm` constructor family. `new_lstm` is written by hand
    // above; the `#[cfg(skip)]` marker presumably keeps the macro from
    // emitting it again (NOTE(review): confirm against the macro's
    // documentation).
    #[cfg(feature = "lstm")]
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: SegmenterError,
        #[cfg(skip)]
        functions: [
            new_lstm,
            try_new_lstm_with_any_provider,
            try_new_lstm_with_buffer_provider,
            try_new_lstm_unstable,
            Self,
        ]
    );
422
423    #[cfg(feature = "lstm")]
424    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
425    pub fn try_new_lstm_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
426    where
427        D: DataProvider<LineBreakDataV1Marker>
428            + DataProvider<LstmForWordLineAutoV1Marker>
429            + DataProvider<GraphemeClusterBreakDataV1Marker>
430            + ?Sized,
431    {
432        Self::try_new_lstm_with_options_unstable(provider, Default::default())
433    }
434
435    /// Constructs a [`LineSegmenter`] with an invariant locale and compiled dictionary data for
436    /// complex scripts (Khmer, Lao, Myanmar, and Thai).
437    ///
438    /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
439    /// faster than the LSTM model but requires more data.
440    ///
441    /// See also [`Self::new_dictionary_with_options`].
442    ///
443    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
444    ///
445    /// [πŸ“š Help choosing a constructor](icu_provider::constructors)
446    #[cfg(feature = "compiled_data")]
447    pub fn new_dictionary() -> Self {
448        Self::new_dictionary_with_options(Default::default())
449    }
450
    // Invokes `icu_provider::gen_any_buffer_data_constructors!` for the
    // option-less `dictionary` constructor family. `new_dictionary` is
    // written by hand above; the `#[cfg(skip)]` marker presumably keeps the
    // macro from emitting it again (NOTE(review): confirm against the macro's
    // documentation).
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: skip,
        error: SegmenterError,
        #[cfg(skip)]
        functions: [
            new_dictionary,
            try_new_dictionary_with_any_provider,
            try_new_dictionary_with_buffer_provider,
            try_new_dictionary_unstable,
            Self,
        ]
    );
464
465    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
466    pub fn try_new_dictionary_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
467    where
468        D: DataProvider<LineBreakDataV1Marker>
469            + DataProvider<DictionaryForWordLineExtendedV1Marker>
470            + DataProvider<GraphemeClusterBreakDataV1Marker>
471            + ?Sized,
472    {
473        Self::try_new_dictionary_with_options_unstable(provider, Default::default())
474    }
475
476    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
477    /// the best available compiled data for complex scripts (Khmer, Lao, Myanmar, and Thai).
478    ///
479    /// The current behavior, which is subject to change, is to use the LSTM model when available.
480    ///
481    /// See also [`Self::new_auto`].
482    ///
483    /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
484    ///
485    /// [πŸ“š Help choosing a constructor](icu_provider::constructors)
486    #[cfg(feature = "auto")]
487    #[cfg(feature = "compiled_data")]
488    pub fn new_auto_with_options(options: LineBreakOptions) -> Self {
489        Self::new_lstm_with_options(options)
490    }
491
    // Invokes `icu_provider::gen_any_buffer_data_constructors!` for the
    // `auto` constructor family that takes `LineBreakOptions`.
    // `new_auto_with_options` is written by hand above; the `#[cfg(skip)]`
    // marker presumably keeps the macro from emitting it again
    // (NOTE(review): confirm against the macro's documentation).
    #[cfg(feature = "auto")]
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: LineBreakOptions,
        error: SegmenterError,
        #[cfg(skip)]
        functions: [
            new_auto_with_options,
            try_new_auto_with_options_with_any_provider,
            try_new_auto_with_options_with_buffer_provider,
            try_new_auto_with_options_unstable,
            Self,
        ]
    );
506
507    #[cfg(feature = "auto")]
508    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto_with_options)]
509    pub fn try_new_auto_with_options_unstable<D>(
510        provider: &D,
511        options: LineBreakOptions,
512    ) -> Result<Self, SegmenterError>
513    where
514        D: DataProvider<LineBreakDataV1Marker>
515            + DataProvider<LstmForWordLineAutoV1Marker>
516            + DataProvider<GraphemeClusterBreakDataV1Marker>
517            + ?Sized,
518    {
519        Self::try_new_lstm_with_options_unstable(provider, options)
520    }
521
522    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
523    /// compiled LSTM data for complex scripts (Khmer, Lao, Myanmar, and Thai).
524    ///
525    /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
526    /// the full dictionary but more expensive during segmentation (inference).
527    ///
528    /// See also [`Self::new_dictionary`].
529    ///
530    /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
531    ///
532    /// [πŸ“š Help choosing a constructor](icu_provider::constructors)
533    #[cfg(feature = "lstm")]
534    #[cfg(feature = "compiled_data")]
535    pub fn new_lstm_with_options(options: LineBreakOptions) -> Self {
536        Self {
537            options,
538            payload: DataPayload::from_static_ref(
539                crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1,
540            ),
541            complex: ComplexPayloads::new_lstm(),
542        }
543    }
544
545    #[cfg(feature = "lstm")]
546    icu_provider::gen_any_buffer_data_constructors!(
547        locale: skip,
548        options: LineBreakOptions,
549        error: SegmenterError,
550        #[cfg(skip)]
551        functions: [
552            try_new_lstm_with_options,
553            try_new_lstm_with_options_with_any_provider,
554            try_new_lstm_with_options_with_buffer_provider,
555            try_new_lstm_with_options_unstable,
556            Self,
557        ]
558    );
559
560    #[cfg(feature = "lstm")]
561    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm_with_options)]
562    pub fn try_new_lstm_with_options_unstable<D>(
563        provider: &D,
564        options: LineBreakOptions,
565    ) -> Result<Self, SegmenterError>
566    where
567        D: DataProvider<LineBreakDataV1Marker>
568            + DataProvider<LstmForWordLineAutoV1Marker>
569            + DataProvider<GraphemeClusterBreakDataV1Marker>
570            + ?Sized,
571    {
572        Ok(Self {
573            options,
574            payload: provider.load(Default::default())?.take_payload()?,
575            complex: ComplexPayloads::try_new_lstm(provider)?,
576        })
577    }
578
579    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
580    /// compiled dictionary data for complex scripts (Khmer, Lao, Myanmar, and Thai).
581    ///
582    /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
583    /// faster than the LSTM model but requires more data.
584    ///
585    /// See also [`Self::new_dictionary`].
586    ///
587    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
588    ///
589    /// [πŸ“š Help choosing a constructor](icu_provider::constructors)
590    #[cfg(feature = "compiled_data")]
591    pub fn new_dictionary_with_options(options: LineBreakOptions) -> Self {
592        Self {
593            options,
594            payload: DataPayload::from_static_ref(
595                crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1,
596            ),
597            // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
598            // characters [1]. Southeast Asian languages however require complex context analysis
599            // [2].
600            //
601            // [1]: https://www.unicode.org/reports/tr14/#ID
602            // [2]: https://www.unicode.org/reports/tr14/#SA
603            complex: ComplexPayloads::new_southeast_asian(),
604        }
605    }
606
    // Invokes `icu_provider::gen_any_buffer_data_constructors!` for the
    // `dictionary` constructor family that takes `LineBreakOptions`.
    // `new_dictionary_with_options` is written by hand above; the
    // `#[cfg(skip)]` marker presumably keeps the macro from emitting it again
    // (NOTE(review): confirm against the macro's documentation).
    icu_provider::gen_any_buffer_data_constructors!(
        locale: skip,
        options: LineBreakOptions,
        error: SegmenterError,
        #[cfg(skip)]
        functions: [
            new_dictionary_with_options,
            try_new_dictionary_with_options_with_any_provider,
            try_new_dictionary_with_options_with_buffer_provider,
            try_new_dictionary_with_options_unstable,
            Self,
        ]
    );
620
621    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary_with_options)]
622    pub fn try_new_dictionary_with_options_unstable<D>(
623        provider: &D,
624        options: LineBreakOptions,
625    ) -> Result<Self, SegmenterError>
626    where
627        D: DataProvider<LineBreakDataV1Marker>
628            + DataProvider<DictionaryForWordLineExtendedV1Marker>
629            + DataProvider<GraphemeClusterBreakDataV1Marker>
630            + ?Sized,
631    {
632        Ok(Self {
633            options,
634            payload: provider.load(Default::default())?.take_payload()?,
635            // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
636            // characters [1]. Southeast Asian languages however require complex context analysis
637            // [2].
638            //
639            // [1]: https://www.unicode.org/reports/tr14/#ID
640            // [2]: https://www.unicode.org/reports/tr14/#SA
641            complex: ComplexPayloads::try_new_southeast_asian(provider)?,
642        })
643    }
644
645    /// Creates a line break iterator for an `str` (a UTF-8 string).
646    ///
647    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
648    pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> LineBreakIteratorUtf8<'l, 's> {
649        LineBreakIterator {
650            iter: input.char_indices(),
651            len: input.len(),
652            current_pos_data: None,
653            result_cache: Vec::new(),
654            data: self.payload.get(),
655            options: &self.options,
656            complex: &self.complex,
657        }
658    }
659    /// Creates a line break iterator for a potentially ill-formed UTF8 string
660    ///
661    /// Invalid characters are treated as REPLACEMENT CHARACTER
662    ///
663    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
664    pub fn segment_utf8<'l, 's>(
665        &'l self,
666        input: &'s [u8],
667    ) -> LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
668        LineBreakIterator {
669            iter: Utf8CharIndices::new(input),
670            len: input.len(),
671            current_pos_data: None,
672            result_cache: Vec::new(),
673            data: self.payload.get(),
674            options: &self.options,
675            complex: &self.complex,
676        }
677    }
678    /// Creates a line break iterator for a Latin-1 (8-bit) string.
679    ///
680    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
681    pub fn segment_latin1<'l, 's>(&'l self, input: &'s [u8]) -> LineBreakIteratorLatin1<'l, 's> {
682        LineBreakIterator {
683            iter: Latin1Indices::new(input),
684            len: input.len(),
685            current_pos_data: None,
686            result_cache: Vec::new(),
687            data: self.payload.get(),
688            options: &self.options,
689            complex: &self.complex,
690        }
691    }
692
693    /// Creates a line break iterator for a UTF-16 string.
694    ///
695    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
696    pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> LineBreakIteratorUtf16<'l, 's> {
697        LineBreakIterator {
698            iter: Utf16Indices::new(input),
699            len: input.len(),
700            current_pos_data: None,
701            result_cache: Vec::new(),
702            data: self.payload.get(),
703            options: &self.options,
704            complex: &self.complex,
705        }
706    }
707}
708
709impl RuleBreakDataV1<'_> {
710    fn get_linebreak_property_utf32_with_rule(
711        &self,
712        codepoint: u32,
713        strictness: LineBreakStrictness,
714        word_option: LineBreakWordOption,
715    ) -> u8 {
716        // Note: Default value is 0 == UNKNOWN
717        let prop = self.property_table.get32(codepoint);
718
719        if word_option == LineBreakWordOption::BreakAll
720            || strictness == LineBreakStrictness::Loose
721            || strictness == LineBreakStrictness::Normal
722        {
723            return match prop {
724                CJ => ID, // All CJ's General_Category is Other_Letter (Lo).
725                _ => prop,
726            };
727        }
728
729        // CJ is treated as NS by default, yielding strict line breaking.
730        // https://www.unicode.org/reports/tr14/#CJ
731        prop
732    }
733
734    #[inline]
735    fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
736        let idx = (left as usize) * (self.property_count as usize) + (right as usize);
737        // We use unwrap_or to fall back to the base case and prevent panics on bad data.
738        self.break_state_table.get(idx).unwrap_or(BreakState::Keep)
739    }
740
741    #[inline]
742    fn use_complex_breaking_utf32(&self, codepoint: u32) -> bool {
743        let line_break_property = self.get_linebreak_property_utf32_with_rule(
744            codepoint,
745            LineBreakStrictness::Strict,
746            LineBreakWordOption::Normal,
747        );
748
749        line_break_property == SA
750    }
751}
752
753#[inline]
754fn is_break_utf32_by_loose(
755    right_codepoint: u32,
756    left_prop: u8,
757    right_prop: u8,
758    ja_zh: bool,
759) -> Option<bool> {
760    // breaks before hyphens
761    if right_prop == BA {
762        if left_prop == ID && (right_codepoint == 0x2010 || right_codepoint == 0x2013) {
763            return Some(true);
764        }
765    } else if right_prop == NS {
766        // breaks before certain CJK hyphen-like characters
767        if right_codepoint == 0x301C || right_codepoint == 0x30A0 {
768            return Some(ja_zh);
769        }
770
771        // breaks before iteration marks
772        if right_codepoint == 0x3005
773            || right_codepoint == 0x303B
774            || right_codepoint == 0x309D
775            || right_codepoint == 0x309E
776            || right_codepoint == 0x30FD
777            || right_codepoint == 0x30FE
778        {
779            return Some(true);
780        }
781
782        // breaks before certain centered punctuation marks:
783        if right_codepoint == 0x30FB
784            || right_codepoint == 0xFF1A
785            || right_codepoint == 0xFF1B
786            || right_codepoint == 0xFF65
787            || right_codepoint == 0x203C
788            || (0x2047..=0x2049).contains(&right_codepoint)
789        {
790            return Some(ja_zh);
791        }
792    } else if right_prop == IN {
793        // breaks between inseparable characters such as U+2025, U+2026 i.e. characters with the Unicode Line Break property IN
794        return Some(true);
795    } else if right_prop == EX {
796        // breaks before certain centered punctuation marks:
797        if right_codepoint == 0xFF01 || right_codepoint == 0xFF1F {
798            return Some(ja_zh);
799        }
800    }
801
802    // breaks before suffixes:
803    // Characters with the Unicode Line Break property PO and the East Asian Width property
804    if right_prop == PO_EAW {
805        return Some(ja_zh);
806    }
807    // breaks after prefixes:
808    // Characters with the Unicode Line Break property PR and the East Asian Width property
809    if left_prop == PR_EAW {
810        return Some(ja_zh);
811    }
812    None
813}
814
/// A trait allowing for LineBreakIterator to be generalized to multiple string iteration methods.
///
/// This is implemented by ICU4X for several common string types.
pub trait LineBreakType<'l, 's> {
    /// The iterator over characters.
    ///
    /// It yields `(index, character)` pairs and must be cloneable.
    type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone;

    /// The character type.
    ///
    /// It must be convertible into a `u32`.
    type CharType: Copy + Into<u32>;

    // Reports whether `c` needs the complex-script path.
    // NOTE(review): the implementations are outside this view; presumably
    // they test for line break class SA — confirm.
    fn use_complex_breaking(iterator: &LineBreakIterator<'l, 's, Self>, c: Self::CharType) -> bool;

    // Looks up the line break class of `c` for `iterator`.
    // NOTE(review): presumably applies the iterator's strictness and word
    // options — confirm against the implementations.
    fn get_linebreak_property_with_rule(
        iterator: &LineBreakIterator<'l, 's, Self>,
        c: Self::CharType,
    ) -> u8;

    // Returns the length of the character at the iterator's current
    // position. `LineBreakIterator::next` adds this to its running offset
    // while replaying cached break points.
    fn get_current_position_character_len(iterator: &LineBreakIterator<'l, 's, Self>) -> usize;

    // Handles a run of complex-script text that begins at `left_codepoint`.
    // NOTE(review): the meaning of the returned `Option<usize>` is not
    // visible here; presumably it is the next break position — confirm.
    fn handle_complex_language(
        iterator: &mut LineBreakIterator<'l, 's, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize>;
}
839
/// Implements the [`Iterator`] trait over the line break opportunities of the given string.
///
/// Lifetimes:
///
/// - `'l` = lifetime of the [`LineSegmenter`] object from which this iterator was created
/// - `'s` = lifetime of the string being segmented
///
/// The [`Iterator::Item`] is a [`usize`] representing the index of a code unit
/// _after_ the break (for a break at the end of text, this index is the length
/// of the [`str`] or array of code units).
///
/// For examples of use, see [`LineSegmenter`].
#[derive(Debug)]
pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
    // Underlying iterator over `(index, character)` pairs of the input.
    iter: Y::IterAttr,
    // Length of the input in code units; returned as the final break point.
    len: usize,
    // Starts out as `None` when the iterator is created.
    // NOTE(review): presumably the item most recently read from `iter` —
    // confirm against the helper methods.
    current_pos_data: Option<(usize, Y::CharType)>,
    // Break points computed by a previous run and not yet returned.
    result_cache: Vec<usize>,
    // Rule data borrowed from the segmenter's payload.
    data: &'l RuleBreakDataV1<'l>,
    // Tailoring options borrowed from the segmenter.
    options: &'l LineBreakOptions,
    // Complex-script data borrowed from the segmenter.
    complex: &'l ComplexPayloads,
}
862
impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y> {
    type Item = usize;

    /// Advances to the next line break opportunity and returns its position.
    ///
    /// Yields `Some(0)` when `check_eof` reports the start of the text, `Some(self.len)` for
    /// the break at the end of the text, and `None` once `check_eof` reports the end.
    fn next(&mut self) -> Option<Self::Item> {
        match self.check_eof() {
            StringBoundaryPosType::Start => return Some(0),
            StringBoundaryPosType::End => return None,
            _ => (),
        }

        // If a previous run cached break offsets, consume those first. `first_pos` is an
        // offset from the current position, so walk forward until the accumulated length `i`
        // reaches it.
        if let Some(&first_pos) = self.result_cache.first() {
            let mut i = 0;
            loop {
                if i == first_pos {
                    // Drop the offset just reached and rebase the rest onto the new position.
                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
                    return self.get_current_position();
                }
                i += Y::get_current_position_character_len(self);
                self.advance_iter();
                if self.is_eof() {
                    // Ran out of text before reaching the cached offset: discard the cache
                    // and report the end-of-text break.
                    self.result_cache.clear();
                    return Some(self.len);
                }
            }
        }

        // Each pass examines the pair (left, right) of adjacent characters and decides
        // whether a break is allowed between them.
        'a: loop {
            debug_assert!(!self.is_eof());
            let left_codepoint = self.get_current_codepoint()?;
            let mut left_prop = self.get_linebreak_property(left_codepoint);
            self.advance_iter();

            // No right-hand character: the only remaining break is the end of the text.
            let Some(right_codepoint) = self.get_current_codepoint() else {
                return Some(self.len);
            };
            let right_prop = self.get_linebreak_property(right_codepoint);

            // CSS word-break property handling
            match (self.options.word_option, left_prop, right_prop) {
                // With break-all, AL/NU/SA on the left is looked up as if it were ID.
                (LineBreakWordOption::BreakAll, AL | NU | SA, _) => {
                    left_prop = ID;
                }
                // With keep-all, typographic letter units must not be broken apart.
                (
                    LineBreakWordOption::KeepAll,
                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
                ) => {
                    continue;
                }
                _ => (),
            }

            // CSS line-break property handling
            match self.options.strictness {
                LineBreakStrictness::Normal => {
                    if self.is_break_by_normal(right_codepoint) {
                        return self.get_current_position();
                    }
                }
                LineBreakStrictness::Loose => {
                    // `Some(..)` is a final decision for this pair; `None` falls through to
                    // the table-driven rules below.
                    if let Some(breakable) = is_break_utf32_by_loose(
                        right_codepoint.into(),
                        left_prop,
                        right_prop,
                        self.options.ja_zh,
                    ) {
                        if breakable {
                            return self.get_current_position();
                        }
                        continue;
                    }
                }
                LineBreakStrictness::Anywhere => {
                    return self.get_current_position();
                }
                _ => (),
            };

            // UAX14 doesn't have Thai etc, so use another way.
            if self.options.word_option != LineBreakWordOption::BreakAll
                && Y::use_complex_breaking(self, left_codepoint)
                && Y::use_complex_breaking(self, right_codepoint)
            {
                let result = Y::handle_complex_language(self, left_codepoint);
                if result.is_some() {
                    return result;
                }
                // TODO(review): original author's open question - should text be fetched up
                // to the next non-SA character here?
            }

            // An `Index`/`Intermediate` result is not a decision yet: its value is used as
            // the left-hand input of the next table lookup.
            let mut index = match self.data.get_break_state_from_table(left_prop, right_prop) {
                BreakState::Index(index) => index,
                // Line break uses more than 64 states, so they spill over into the intermediate range,
                // and we cannot change that at the moment
                BreakState::Intermediate(index) => index + 64,
                BreakState::Break | BreakState::NoMatch => return self.get_current_position(),
                BreakState::Keep => continue,
            };

            // Snapshot of the position to rewind to if the multi-character rule fails to match.
            let mut previous_iter = self.iter.clone();
            let mut previous_pos_data = self.current_pos_data;

            loop {
                self.advance_iter();

                let Some(prop) = self.get_current_linebreak_property() else {
                    // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point.
                    let break_state = self
                        .data
                        .get_break_state_from_table(index, self.data.eot_property);
                    if break_state == BreakState::NoMatch {
                        self.iter = previous_iter;
                        self.current_pos_data = previous_pos_data;
                        return self.get_current_position();
                    }
                    // EOF
                    return Some(self.len);
                };

                match self.data.get_break_state_from_table(index, prop) {
                    BreakState::Keep => continue 'a,
                    BreakState::NoMatch => {
                        // Rewind to the last snapshot and break there.
                        self.iter = previous_iter;
                        self.current_pos_data = previous_pos_data;
                        return self.get_current_position();
                    }
                    BreakState::Break => return self.get_current_position(),
                    BreakState::Index(i) => {
                        // Still undecided: move to the new state and refresh the snapshot.
                        index = i;
                        previous_iter = self.iter.clone();
                        previous_pos_data = self.current_pos_data;
                    }
                    BreakState::Intermediate(i) => {
                        // Same as above, with the +64 offset used for intermediate states.
                        index = i + 64;
                        previous_iter = self.iter.clone();
                        previous_pos_data = self.current_pos_data;
                    }
                }
            }
        }
    }
}
1008
/// Where the iterator stands relative to the text, as reported by `check_eof`.
enum StringBoundaryPosType {
    /// Iteration is just beginning; `next` answers with `Some(0)`.
    Start,
    /// A current character is available, so normal break analysis proceeds.
    Middle,
    /// Nothing is left to report; `next` answers with `None`.
    End,
}
1014
1015impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> {
1016    fn advance_iter(&mut self) {
1017        self.current_pos_data = self.iter.next();
1018    }
1019
1020    fn is_eof(&self) -> bool {
1021        self.current_pos_data.is_none()
1022    }
1023
1024    #[inline]
1025    fn check_eof(&mut self) -> StringBoundaryPosType {
1026        if self.is_eof() {
1027            self.advance_iter();
1028            if self.is_eof() {
1029                if self.len == 0 {
1030                    // Empty string. Since `self.current_pos_data` is always going to be empty,
1031                    // we never read `self.len` except for here, so we can use it to mark that
1032                    // we have already returned the single empty-string breakpoint.
1033                    self.len = 1;
1034                    StringBoundaryPosType::Start
1035                } else {
1036                    StringBoundaryPosType::End
1037                }
1038            } else {
1039                StringBoundaryPosType::Start
1040            }
1041        } else {
1042            StringBoundaryPosType::Middle
1043        }
1044    }
1045
1046    fn get_current_position(&self) -> Option<usize> {
1047        self.current_pos_data.map(|(pos, _)| pos)
1048    }
1049
1050    fn get_current_codepoint(&self) -> Option<Y::CharType> {
1051        self.current_pos_data.map(|(_, codepoint)| codepoint)
1052    }
1053
1054    fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 {
1055        Y::get_linebreak_property_with_rule(self, codepoint)
1056    }
1057
1058    fn get_current_linebreak_property(&self) -> Option<u8> {
1059        self.get_current_codepoint()
1060            .map(|c| self.get_linebreak_property(c))
1061    }
1062
1063    fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
1064        match codepoint.into() {
1065            0x301C | 0x30A0 => self.options.ja_zh,
1066            _ => false,
1067        }
1068    }
1069}
1070
1071#[derive(Debug)]
1072pub struct LineBreakTypeUtf8;
1073
1074impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf8 {
1075    type IterAttr = CharIndices<'s>;
1076    type CharType = char;
1077
1078    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1079        iterator.data.get_linebreak_property_utf32_with_rule(
1080            c as u32,
1081            iterator.options.strictness,
1082            iterator.options.word_option,
1083        )
1084    }
1085
1086    #[inline]
1087    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1088        iterator.data.use_complex_breaking_utf32(c as u32)
1089    }
1090
1091    fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
1092        iterator.get_current_codepoint().map_or(0, |c| c.len_utf8())
1093    }
1094
1095    fn handle_complex_language(
1096        iter: &mut LineBreakIterator<'l, 's, Self>,
1097        left_codepoint: char,
1098    ) -> Option<usize> {
1099        handle_complex_language_utf8(iter, left_codepoint)
1100    }
1101}
1102
1103#[derive(Debug)]
1104pub struct LineBreakTypePotentiallyIllFormedUtf8;
1105
1106impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypePotentiallyIllFormedUtf8 {
1107    type IterAttr = Utf8CharIndices<'s>;
1108    type CharType = char;
1109
1110    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1111        iterator.data.get_linebreak_property_utf32_with_rule(
1112            c as u32,
1113            iterator.options.strictness,
1114            iterator.options.word_option,
1115        )
1116    }
1117
1118    #[inline]
1119    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1120        iterator.data.use_complex_breaking_utf32(c as u32)
1121    }
1122
1123    fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
1124        iterator.get_current_codepoint().map_or(0, |c| c.len_utf8())
1125    }
1126
1127    fn handle_complex_language(
1128        iter: &mut LineBreakIterator<'l, 's, Self>,
1129        left_codepoint: char,
1130    ) -> Option<usize> {
1131        handle_complex_language_utf8(iter, left_codepoint)
1132    }
1133}
1134/// handle_complex_language impl for UTF8 iterators
1135fn handle_complex_language_utf8<'l, 's, T>(
1136    iter: &mut LineBreakIterator<'l, 's, T>,
1137    left_codepoint: char,
1138) -> Option<usize>
1139where
1140    T: LineBreakType<'l, 's, CharType = char>,
1141{
1142    // word segmenter doesn't define break rules for some languages such as Thai.
1143    let start_iter = iter.iter.clone();
1144    let start_point = iter.current_pos_data;
1145    let mut s = String::new();
1146    s.push(left_codepoint);
1147    loop {
1148        debug_assert!(!iter.is_eof());
1149        s.push(iter.get_current_codepoint()?);
1150        iter.advance_iter();
1151        if let Some(current_codepoint) = iter.get_current_codepoint() {
1152            if !T::use_complex_breaking(iter, current_codepoint) {
1153                break;
1154            }
1155        } else {
1156            // EOF
1157            break;
1158        }
1159    }
1160
1161    // Restore iterator to move to head of complex string
1162    iter.iter = start_iter;
1163    iter.current_pos_data = start_point;
1164    let breaks = complex_language_segment_str(iter.complex, &s);
1165    iter.result_cache = breaks;
1166    let first_pos = *iter.result_cache.first()?;
1167    let mut i = left_codepoint.len_utf8();
1168    loop {
1169        if i == first_pos {
1170            // Re-calculate breaking offset
1171            iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
1172            return iter.get_current_position();
1173        }
1174        debug_assert!(
1175            i < first_pos,
1176            "we should always arrive at first_pos: near index {:?}",
1177            iter.get_current_position()
1178        );
1179        i += T::get_current_position_character_len(iter);
1180        iter.advance_iter();
1181        if iter.is_eof() {
1182            iter.result_cache.clear();
1183            return Some(iter.len);
1184        }
1185    }
1186}
1187
1188#[derive(Debug)]
1189pub struct LineBreakTypeLatin1;
1190
1191impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeLatin1 {
1192    type IterAttr = Latin1Indices<'s>;
1193    type CharType = u8;
1194
1195    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
1196        // No CJ on Latin1
1197        // Note: Default value is 0 == UNKNOWN
1198        iterator.data.property_table.get32(c as u32)
1199    }
1200
1201    #[inline]
1202    fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
1203        false
1204    }
1205
1206    fn get_current_position_character_len(_: &LineBreakIterator<Self>) -> usize {
1207        unreachable!()
1208    }
1209
1210    fn handle_complex_language(
1211        _: &mut LineBreakIterator<Self>,
1212        _: Self::CharType,
1213    ) -> Option<usize> {
1214        unreachable!()
1215    }
1216}
1217
1218#[derive(Debug)]
1219pub struct LineBreakTypeUtf16;
1220
1221impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf16 {
1222    type IterAttr = Utf16Indices<'s>;
1223    type CharType = u32;
1224
1225    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
1226        iterator.data.get_linebreak_property_utf32_with_rule(
1227            c,
1228            iterator.options.strictness,
1229            iterator.options.word_option,
1230        )
1231    }
1232
1233    #[inline]
1234    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
1235        iterator.data.use_complex_breaking_utf32(c)
1236    }
1237
1238    fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
1239        match iterator.get_current_codepoint() {
1240            None => 0,
1241            Some(ch) if ch >= 0x10000 => 2,
1242            _ => 1,
1243        }
1244    }
1245
1246    fn handle_complex_language(
1247        iterator: &mut LineBreakIterator<Self>,
1248        left_codepoint: Self::CharType,
1249    ) -> Option<usize> {
1250        // word segmenter doesn't define break rules for some languages such as Thai.
1251        let start_iter = iterator.iter.clone();
1252        let start_point = iterator.current_pos_data;
1253        let mut s = vec![left_codepoint as u16];
1254        loop {
1255            debug_assert!(!iterator.is_eof());
1256            s.push(iterator.get_current_codepoint()? as u16);
1257            iterator.advance_iter();
1258            if let Some(current_codepoint) = iterator.get_current_codepoint() {
1259                if !Self::use_complex_breaking(iterator, current_codepoint) {
1260                    break;
1261                }
1262            } else {
1263                // EOF
1264                break;
1265            }
1266        }
1267
1268        // Restore iterator to move to head of complex string
1269        iterator.iter = start_iter;
1270        iterator.current_pos_data = start_point;
1271        let breaks = complex_language_segment_utf16(iterator.complex, &s);
1272        iterator.result_cache = breaks;
1273        // result_cache vector is utf-16 index that is in BMP.
1274        let first_pos = *iterator.result_cache.first()?;
1275        let mut i = 1;
1276        loop {
1277            if i == first_pos {
1278                // Re-calculate breaking offset
1279                iterator.result_cache = iterator
1280                    .result_cache
1281                    .iter()
1282                    .skip(1)
1283                    .map(|r| r - i)
1284                    .collect();
1285                return iterator.get_current_position();
1286            }
1287            debug_assert!(
1288                i < first_pos,
1289                "we should always arrive at first_pos: near index {:?}",
1290                iterator.get_current_position()
1291            );
1292            i += 1;
1293            iterator.advance_iter();
1294            if iterator.is_eof() {
1295                iterator.result_cache.clear();
1296                return Some(iterator.len);
1297            }
1298        }
1299    }
1300}
1301
#[cfg(test)]
#[cfg(feature = "serde")]
mod tests {
    use super::*;
    use crate::LineSegmenter;

    /// Spot-checks the property lookup for a handful of code points.
    #[test]
    fn linebreak_property() {
        let payload = DataProvider::<LineBreakDataV1Marker>::load(
            &crate::provider::Baked,
            Default::default(),
        )
        .expect("Loading should succeed!")
        .take_payload()
        .expect("Data should be present!");

        let get_linebreak_property = |codepoint| {
            payload.get().get_linebreak_property_utf32_with_rule(
                codepoint as u32,
                LineBreakStrictness::Strict,
                LineBreakWordOption::Normal,
            )
        };

        assert_eq!(get_linebreak_property('\u{0020}'), SP);
        assert_eq!(get_linebreak_property('\u{0022}'), QU);
        assert_eq!(get_linebreak_property('('), OP_OP30);
        assert_eq!(get_linebreak_property('\u{0030}'), NU);
        assert_eq!(get_linebreak_property('['), OP_OP30);
        assert_eq!(get_linebreak_property('\u{1f3fb}'), EM);
        assert_eq!(get_linebreak_property('\u{20000}'), ID);
        assert_eq!(get_linebreak_property('\u{e0020}'), CM);
        assert_eq!(get_linebreak_property('\u{3041}'), CJ);
        assert_eq!(get_linebreak_property('\u{0025}'), PO);
        assert_eq!(get_linebreak_property('\u{00A7}'), AI);
        assert_eq!(get_linebreak_property('\u{50005}'), XX);
        assert_eq!(get_linebreak_property('\u{17D6}'), NS);
        assert_eq!(get_linebreak_property('\u{2014}'), B2);
    }

    /// Checks individual (left, right) property pairs against the break state table.
    /// `Break` and `NoMatch` both count as "break" here.
    #[test]
    #[allow(clippy::bool_assert_comparison)] // clearer when we're testing bools directly
    fn break_rule() {
        let payload = DataProvider::<LineBreakDataV1Marker>::load(
            &crate::provider::Baked,
            Default::default(),
        )
        .expect("Loading should succeed!")
        .take_payload()
        .expect("Data should be present!");
        let lb_data: &RuleBreakDataV1 = payload.get();

        let is_break = |left, right| {
            matches!(
                lb_data.get_break_state_from_table(left, right),
                BreakState::Break | BreakState::NoMatch
            )
        };

        // LB4
        assert_eq!(is_break(BK, AL), true);
        // LB5
        assert_eq!(is_break(CR, LF), false);
        assert_eq!(is_break(CR, AL), true);
        assert_eq!(is_break(LF, AL), true);
        assert_eq!(is_break(NL, AL), true);
        // LB6
        assert_eq!(is_break(AL, BK), false);
        assert_eq!(is_break(AL, CR), false);
        assert_eq!(is_break(AL, LF), false);
        assert_eq!(is_break(AL, NL), false);
        // LB7
        assert_eq!(is_break(AL, SP), false);
        assert_eq!(is_break(AL, ZW), false);
        // LB8
        // LB8a
        assert_eq!(is_break(ZWJ, AL), false);
        // LB9
        assert_eq!(is_break(AL, ZWJ), false);
        assert_eq!(is_break(AL, CM), false);
        assert_eq!(is_break(ID, ZWJ), false);
        // LB10
        assert_eq!(is_break(ZWJ, SP), false);
        assert_eq!(is_break(SP, CM), true);
        // LB11
        assert_eq!(is_break(AL, WJ), false);
        assert_eq!(is_break(WJ, AL), false);
        // LB12
        assert_eq!(is_break(GL, AL), false);
        // LB12a
        assert_eq!(is_break(AL, GL), false);
        assert_eq!(is_break(SP, GL), true);
        // LB13
        assert_eq!(is_break(AL, CL), false);
        assert_eq!(is_break(AL, CP), false);
        assert_eq!(is_break(AL, EX), false);
        assert_eq!(is_break(AL, IS), false);
        assert_eq!(is_break(AL, SY), false);
        // LB18
        assert_eq!(is_break(SP, AL), true);
        // LB19
        assert_eq!(is_break(AL, QU), false);
        assert_eq!(is_break(QU, AL), false);
        // LB20
        assert_eq!(is_break(AL, CB), true);
        assert_eq!(is_break(CB, AL), true);
        // LB21 (no break before BA, HY, NS)
        assert_eq!(is_break(AL, BA), false);
        assert_eq!(is_break(AL, HY), false);
        assert_eq!(is_break(AL, NS), false);
        // LB21 (continued, including no break after BB)
        assert_eq!(is_break(AL, BA), false);
        assert_eq!(is_break(BB, AL), false);
        assert_eq!(is_break(ID, BA), false);
        assert_eq!(is_break(ID, NS), false);
        // LB21a
        // LB21b
        assert_eq!(is_break(SY, HL), false);
        // LB22
        assert_eq!(is_break(AL, IN), false);
        // LB 23
        assert_eq!(is_break(AL, NU), false);
        assert_eq!(is_break(HL, NU), false);
        // LB 23a
        assert_eq!(is_break(PR, ID), false);
        assert_eq!(is_break(PR, EB), false);
        assert_eq!(is_break(PR, EM), false);
        assert_eq!(is_break(ID, PO), false);
        assert_eq!(is_break(EB, PO), false);
        assert_eq!(is_break(EM, PO), false);
        // LB26
        assert_eq!(is_break(JL, JL), false);
        assert_eq!(is_break(JL, JV), false);
        assert_eq!(is_break(JL, H2), false);
        // LB27
        assert_eq!(is_break(JL, IN), false);
        assert_eq!(is_break(JL, PO), false);
        assert_eq!(is_break(PR, JL), false);
        // LB28
        assert_eq!(is_break(AL, AL), false);
        assert_eq!(is_break(HL, AL), false);
        // LB29
        assert_eq!(is_break(IS, AL), false);
        assert_eq!(is_break(IS, HL), false);
        // LB30b
        assert_eq!(is_break(EB, EM), false);
        // LB31
        assert_eq!(is_break(ID, ID), true);
    }

    /// End-to-end break positions for `str`, Latin-1 and UTF-16 inputs.
    #[test]
    fn linebreak() {
        let segmenter = LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked)
            .expect("Data exists");

        let mut iter = segmenter.segment_str("hello world");
        assert_eq!(Some(0), iter.next());
        assert_eq!(Some(6), iter.next());
        assert_eq!(Some(11), iter.next());
        assert_eq!(None, iter.next());

        iter = segmenter.segment_str("$10 $10");
        assert_eq!(Some(0), iter.next());
        assert_eq!(Some(4), iter.next());
        assert_eq!(Some(7), iter.next());
        assert_eq!(None, iter.next());

        // LB10

        // LB14
        iter = segmenter.segment_str("[  abc def");
        assert_eq!(Some(0), iter.next());
        assert_eq!(Some(7), iter.next());
        assert_eq!(Some(10), iter.next());
        assert_eq!(None, iter.next());

        let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
        let mut iter_u8 = segmenter.segment_latin1(&input);
        assert_eq!(Some(0), iter_u8.next());
        assert_eq!(Some(7), iter_u8.next());
        assert_eq!(Some(10), iter_u8.next());
        assert_eq!(None, iter_u8.next());

        let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
        let mut iter_u16 = segmenter.segment_utf16(&input);
        assert_eq!(Some(0), iter_u16.next());
        assert_eq!(Some(7), iter_u16.next());
        assert_eq!(Some(10), iter_u16.next());
        assert_eq!(None, iter_u16.next());

        // LB15
        iter = segmenter.segment_str("abc\u{0022}  (def");
        assert_eq!(Some(0), iter.next());
        assert_eq!(Some(10), iter.next());
        assert_eq!(None, iter.next());

        let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
        let mut iter_u8 = segmenter.segment_latin1(&input);
        assert_eq!(Some(0), iter_u8.next());
        assert_eq!(Some(10), iter_u8.next());
        assert_eq!(None, iter_u8.next());

        let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
        let mut iter_u16 = segmenter.segment_utf16(&input);
        assert_eq!(Some(0), iter_u16.next());
        assert_eq!(Some(10), iter_u16.next());
        assert_eq!(None, iter_u16.next());

        // LB16
        iter = segmenter.segment_str("\u{0029}\u{203C}");
        assert_eq!(Some(0), iter.next());
        assert_eq!(Some(4), iter.next());
        assert_eq!(None, iter.next());
        iter = segmenter.segment_str("\u{0029}  \u{203C}");
        assert_eq!(Some(0), iter.next());
        assert_eq!(Some(6), iter.next());
        assert_eq!(None, iter.next());

        let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
        let mut iter_u16 = segmenter.segment_utf16(&input);
        assert_eq!(Some(0), iter_u16.next());
        assert_eq!(Some(4), iter_u16.next());
        assert_eq!(None, iter_u16.next());

        // LB17
        iter = segmenter.segment_str("\u{2014}\u{2014}aa");
        assert_eq!(Some(0), iter.next());
        assert_eq!(Some(6), iter.next());
        assert_eq!(Some(8), iter.next());
        assert_eq!(None, iter.next());
        iter = segmenter.segment_str("\u{2014}  \u{2014}aa");
        assert_eq!(Some(0), iter.next());
        assert_eq!(Some(8), iter.next());
        assert_eq!(Some(10), iter.next());
        assert_eq!(None, iter.next());

        iter = segmenter.segment_str("\u{2014}\u{2014}  \u{2014}\u{2014}123 abc");
        assert_eq!(Some(0), iter.next());
        assert_eq!(Some(14), iter.next());
        assert_eq!(Some(18), iter.next());
        assert_eq!(Some(21), iter.next());
        assert_eq!(None, iter.next());

        // LB25
        let mut iter = segmenter.segment_str("(0,1)+(2,3)");
        assert_eq!(Some(0), iter.next());
        assert_eq!(Some(11), iter.next());
        assert_eq!(None, iter.next());
        let input: [u16; 11] = [
            0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
        ];
        let mut iter_u16 = segmenter.segment_utf16(&input);
        assert_eq!(Some(0), iter_u16.next());
        assert_eq!(Some(11), iter_u16.next());
        assert_eq!(None, iter_u16.next());

        let input: [u16; 13] = [
            0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
        ];
        let mut iter_u16 = segmenter.segment_utf16(&input);
        assert_eq!(Some(0), iter_u16.next());
        assert_eq!(Some(6), iter_u16.next());
        assert_eq!(Some(10), iter_u16.next());
        assert_eq!(Some(13), iter_u16.next());
        assert_eq!(None, iter_u16.next());

        iter = segmenter.segment_str("\u{1F3FB} \u{1F3FB}");
        assert_eq!(Some(0), iter.next());
        assert_eq!(Some(5), iter.next());
        assert_eq!(Some(9), iter.next());
        assert_eq!(None, iter.next());
    }

    /// Thai text through the LSTM segmenter, as UTF-8 and as UTF-16.
    #[test]
    #[cfg(feature = "lstm")]
    fn thai_line_break() {
        // 14 Thai characters, 3 UTF-8 bytes each (42 bytes in total).
        const TEST_STR: &str = "ภาษาไทยภาษาไทย";

        let segmenter = LineSegmenter::new_lstm();
        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
        assert_eq!(breaks, [0, 12, 21, 33, TEST_STR.len()], "Thai test");

        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
        assert_eq!(breaks, [0, 4, 7, 11, utf16.len()], "Thai test");

        let utf16: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32];
        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
        assert_eq!(breaks, [0, 4], "Thai test");
    }

    /// Burmese text through the LSTM segmenter, as UTF-8 and as UTF-16.
    #[test]
    #[cfg(feature = "lstm")]
    fn burmese_line_break() {
        // "Burmese Language" in Burmese
        const TEST_STR: &str = "မြန်မာဘာသာစကား";

        let segmenter = LineSegmenter::new_lstm();
        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
        // LSTM model breaks more characters, but it is better to return [30].
        assert_eq!(breaks, [0, 12, 18, 30, TEST_STR.len()], "Burmese test");

        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
        // LSTM model breaks more characters, but it is better to return [10].
        assert_eq!(breaks, [0, 4, 6, 10, utf16.len()], "Burmese utf-16 test");
    }

    /// Khmer text through the LSTM segmenter, as UTF-8 and as UTF-16.
    #[test]
    #[cfg(feature = "lstm")]
    fn khmer_line_break() {
        const TEST_STR: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស";

        let segmenter = LineSegmenter::new_lstm();
        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
        // Note: This small sample matches the ICU dictionary segmenter
        assert_eq!(breaks, [0, 39, 48, 54, 72, TEST_STR.len()], "Khmer test");

        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
        assert_eq!(
            breaks,
            [0, 13, 16, 18, 24, utf16.len()],
            "Khmer utf-16 test"
        );
    }

    /// Lao text through the LSTM segmenter, as UTF-8 and as UTF-16.
    #[test]
    #[cfg(feature = "lstm")]
    fn lao_line_break() {
        const TEST_STR: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ";

        let segmenter = LineSegmenter::new_lstm();
        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
        // Note: LSTM finds a break at '12' that the dictionary does not find
        assert_eq!(breaks, [0, 12, 21, 30, 39, TEST_STR.len()], "Lao test");

        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
        assert_eq!(breaks, [0, 4, 7, 10, 13, utf16.len()], "Lao utf-16 test");
    }

    /// An empty string yields exactly one break, at position 0.
    #[test]
    fn empty_string() {
        let segmenter = LineSegmenter::new_auto();
        let breaks: Vec<usize> = segmenter.segment_str("").collect();
        assert_eq!(breaks, [0]);
    }
}