1use crate::complex::*;
6use crate::indices::*;
7use crate::provider::*;
8use crate::SegmenterError;
9use alloc::string::String;
10use alloc::vec;
11use alloc::vec::Vec;
12use core::char;
13use core::str::CharIndices;
14use icu_provider::prelude::*;
15use utf8_iter::Utf8CharIndices;
16
// Numeric identifiers for the line-break classes stored in the rule data's
// property table. The break-state table is indexed with these values, so they
// must stay in sync with the generated data.
//
// The names follow the UAX #14 line-breaking class abbreviations; the glosses
// below are the UAX #14 class names. Classes with a suffix (`ID_CN`, `OP_EA`,
// `OP_OP30`, `PO_EAW`, `PR_EAW`) are refinements specific to this
// implementation -- NOTE(review): confirm their exact definition against the
// data generator.
#[allow(dead_code)]
const UNKNOWN: u8 = 0; // default value for code points without an entry
#[allow(dead_code)]
const AI: u8 = 1; // Ambiguous (alphabetic or ideographic)
#[allow(dead_code)]
const AL: u8 = 2; // Alphabetic
#[allow(dead_code)]
const B2: u8 = 3; // Break opportunity before and after
#[allow(dead_code)]
const BA: u8 = 4; // Break after
#[allow(dead_code)]
const BB: u8 = 5; // Break before
#[allow(dead_code)]
const BK: u8 = 6; // Mandatory break
#[allow(dead_code)]
const CB: u8 = 7; // Contingent break opportunity
#[allow(dead_code)]
const CJ: u8 = 8; // Conditional Japanese starter
#[allow(dead_code)]
const CL: u8 = 9; // Close punctuation
#[allow(dead_code)]
const CM: u8 = 10; // Combining mark
#[allow(dead_code)]
const CP: u8 = 11; // Close parenthesis
#[allow(dead_code)]
const CR: u8 = 12; // Carriage return
#[allow(dead_code)]
const EB: u8 = 13; // Emoji base
#[allow(dead_code)]
const EM: u8 = 14; // Emoji modifier
#[allow(dead_code)]
const EX: u8 = 15; // Exclamation / interrogation
#[allow(dead_code)]
const GL: u8 = 16; // Non-breaking ("glue")
#[allow(dead_code)]
const H2: u8 = 17; // Hangul LV syllable
#[allow(dead_code)]
const H3: u8 = 18; // Hangul LVT syllable
#[allow(dead_code)]
const HL: u8 = 19; // Hebrew letter
#[allow(dead_code)]
const HY: u8 = 20; // Hyphen
#[allow(dead_code)]
const ID: u8 = 21; // Ideographic
#[allow(dead_code)]
const ID_CN: u8 = 22; // refinement of ID -- presumably unassigned code points; verify
#[allow(dead_code)]
const IN: u8 = 23; // Inseparable
#[allow(dead_code)]
const IS: u8 = 24; // Infix numeric separator
#[allow(dead_code)]
const JL: u8 = 25; // Hangul L jamo
#[allow(dead_code)]
const JT: u8 = 26; // Hangul T jamo
#[allow(dead_code)]
const JV: u8 = 27; // Hangul V jamo
#[allow(dead_code)]
const LF: u8 = 28; // Line feed
#[allow(dead_code)]
const NL: u8 = 29; // Next line
#[allow(dead_code)]
const NS: u8 = 30; // Nonstarter
#[allow(dead_code)]
const NU: u8 = 31; // Numeric
#[allow(dead_code)]
const OP_EA: u8 = 32; // Open punctuation, presumably East Asian width; verify
#[allow(dead_code)]
const OP_OP30: u8 = 33; // Open punctuation, presumably the non-East-Asian rest; verify
#[allow(dead_code)]
const PO: u8 = 34; // Postfix numeric
#[allow(dead_code)]
const PO_EAW: u8 = 35; // Postfix numeric, presumably East Asian width; verify
#[allow(dead_code)]
const PR: u8 = 36; // Prefix numeric
#[allow(dead_code)]
const PR_EAW: u8 = 37; // Prefix numeric, presumably East Asian width; verify
#[allow(dead_code)]
const QU: u8 = 38; // Quotation
#[allow(dead_code)]
const RI: u8 = 39; // Regional indicator
#[allow(dead_code)]
const SA: u8 = 40; // Complex-context dependent (South East Asian)
#[allow(dead_code)]
const SG: u8 = 41; // Surrogate
#[allow(dead_code)]
const SP: u8 = 42; // Space
#[allow(dead_code)]
const SY: u8 = 43; // Symbols allowing break after
#[allow(dead_code)]
const WJ: u8 = 44; // Word joiner
#[allow(dead_code)]
const XX: u8 = 45; // Unknown
#[allow(dead_code)]
const ZW: u8 = 46; // Zero width space
#[allow(dead_code)]
const ZWJ: u8 = 47; // Zero width joiner
114
/// How strictly the line-break rules are applied.
///
/// The iterator consults this value for every candidate position; see the
/// per-variant notes for what each setting changes.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum LineBreakStrictness {
    /// Least restrictive rule set: `CJ` characters are resolved as `ID`, and
    /// the additional "loose" checks (`is_break_utf32_by_loose`) may allow or
    /// forbid a break before the rule table is consulted.
    Loose,

    /// `CJ` characters are resolved as `ID`; in addition a break is allowed
    /// before U+301C and U+30A0 when `ja_zh` is set.
    Normal,

    /// The property table and the break-state table are used as-is (`CJ` is
    /// not rewritten). This is the default.
    Strict,

    /// A break is reported between every pair of adjacent characters that is
    /// not suppressed by the word option.
    Anywhere,
}
148
/// Controls breaking opportunities between letters.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum LineBreakWordOption {
    /// Words break according to the regular rules.
    Normal,

    /// A left-hand character of class `AL`, `NU` or `SA` is treated as `ID`,
    /// `CJ` is resolved as `ID`, and complex (dictionary/LSTM) breaking is
    /// skipped.
    BreakAll,

    /// No break is reported between two adjacent letter-like characters
    /// (`AI`, `AL`, `ID`, `NU`, `HY`, `H2`, `H3`, `JL`, `JV`, `JT`, `CJ`).
    KeepAll,
}
170
171#[non_exhaustive]
173#[derive(Copy, Clone, PartialEq, Eq, Debug)]
174pub struct LineBreakOptions {
175 pub strictness: LineBreakStrictness,
177
178 pub word_option: LineBreakWordOption,
180
181 pub ja_zh: bool,
188}
189
190impl Default for LineBreakOptions {
191 fn default() -> Self {
192 Self {
193 strictness: LineBreakStrictness::Strict,
194 word_option: LineBreakWordOption::Normal,
195 ja_zh: false,
196 }
197 }
198}
199
200pub type LineBreakIteratorUtf8<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf8>;
204
205pub type LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
209 LineBreakIterator<'l, 's, LineBreakTypePotentiallyIllFormedUtf8>;
210
211pub type LineBreakIteratorLatin1<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeLatin1>;
215
216pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf16>;
220
221#[derive(Debug)]
341pub struct LineSegmenter {
342 options: LineBreakOptions,
343 payload: DataPayload<LineBreakDataV1Marker>,
344 complex: ComplexPayloads,
345}
346
347impl LineSegmenter {
348 #[cfg(feature = "compiled_data")]
359 #[cfg(feature = "auto")]
360 pub fn new_auto() -> Self {
361 Self::new_auto_with_options(Default::default())
362 }
363
364 #[cfg(feature = "auto")]
365 icu_provider::gen_any_buffer_data_constructors!(
366 locale: skip,
367 options: skip,
368 error: SegmenterError,
369 #[cfg(skip)]
370 functions: [
371 new_auto,
372 try_new_auto_with_any_provider,
373 try_new_auto_with_buffer_provider,
374 try_new_auto_unstable,
375 Self,
376 ]
377 );
378
379 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
380 #[cfg(feature = "auto")]
381 pub fn try_new_auto_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
382 where
383 D: DataProvider<LineBreakDataV1Marker>
384 + DataProvider<LstmForWordLineAutoV1Marker>
385 + DataProvider<GraphemeClusterBreakDataV1Marker>
386 + ?Sized,
387 {
388 Self::try_new_auto_with_options_unstable(provider, Default::default())
389 }
390
391 #[cfg(feature = "compiled_data")]
403 #[cfg(feature = "lstm")]
404 pub fn new_lstm() -> Self {
405 Self::new_lstm_with_options(Default::default())
406 }
407
408 #[cfg(feature = "lstm")]
409 icu_provider::gen_any_buffer_data_constructors!(
410 locale: skip,
411 options: skip,
412 error: SegmenterError,
413 #[cfg(skip)]
414 functions: [
415 new_lstm,
416 try_new_lstm_with_any_provider,
417 try_new_lstm_with_buffer_provider,
418 try_new_lstm_unstable,
419 Self,
420 ]
421 );
422
423 #[cfg(feature = "lstm")]
424 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
425 pub fn try_new_lstm_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
426 where
427 D: DataProvider<LineBreakDataV1Marker>
428 + DataProvider<LstmForWordLineAutoV1Marker>
429 + DataProvider<GraphemeClusterBreakDataV1Marker>
430 + ?Sized,
431 {
432 Self::try_new_lstm_with_options_unstable(provider, Default::default())
433 }
434
435 #[cfg(feature = "compiled_data")]
447 pub fn new_dictionary() -> Self {
448 Self::new_dictionary_with_options(Default::default())
449 }
450
451 icu_provider::gen_any_buffer_data_constructors!(
452 locale: skip,
453 options: skip,
454 error: SegmenterError,
455 #[cfg(skip)]
456 functions: [
457 new_dictionary,
458 try_new_dictionary_with_any_provider,
459 try_new_dictionary_with_buffer_provider,
460 try_new_dictionary_unstable,
461 Self,
462 ]
463 );
464
465 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
466 pub fn try_new_dictionary_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
467 where
468 D: DataProvider<LineBreakDataV1Marker>
469 + DataProvider<DictionaryForWordLineExtendedV1Marker>
470 + DataProvider<GraphemeClusterBreakDataV1Marker>
471 + ?Sized,
472 {
473 Self::try_new_dictionary_with_options_unstable(provider, Default::default())
474 }
475
476 #[cfg(feature = "auto")]
487 #[cfg(feature = "compiled_data")]
488 pub fn new_auto_with_options(options: LineBreakOptions) -> Self {
489 Self::new_lstm_with_options(options)
490 }
491
492 #[cfg(feature = "auto")]
493 icu_provider::gen_any_buffer_data_constructors!(
494 locale: skip,
495 options: LineBreakOptions,
496 error: SegmenterError,
497 #[cfg(skip)]
498 functions: [
499 new_auto_with_options,
500 try_new_auto_with_options_with_any_provider,
501 try_new_auto_with_options_with_buffer_provider,
502 try_new_auto_with_options_unstable,
503 Self,
504 ]
505 );
506
507 #[cfg(feature = "auto")]
508 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto_with_options)]
509 pub fn try_new_auto_with_options_unstable<D>(
510 provider: &D,
511 options: LineBreakOptions,
512 ) -> Result<Self, SegmenterError>
513 where
514 D: DataProvider<LineBreakDataV1Marker>
515 + DataProvider<LstmForWordLineAutoV1Marker>
516 + DataProvider<GraphemeClusterBreakDataV1Marker>
517 + ?Sized,
518 {
519 Self::try_new_lstm_with_options_unstable(provider, options)
520 }
521
522 #[cfg(feature = "lstm")]
534 #[cfg(feature = "compiled_data")]
535 pub fn new_lstm_with_options(options: LineBreakOptions) -> Self {
536 Self {
537 options,
538 payload: DataPayload::from_static_ref(
539 crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1,
540 ),
541 complex: ComplexPayloads::new_lstm(),
542 }
543 }
544
545 #[cfg(feature = "lstm")]
546 icu_provider::gen_any_buffer_data_constructors!(
547 locale: skip,
548 options: LineBreakOptions,
549 error: SegmenterError,
550 #[cfg(skip)]
551 functions: [
552 try_new_lstm_with_options,
553 try_new_lstm_with_options_with_any_provider,
554 try_new_lstm_with_options_with_buffer_provider,
555 try_new_lstm_with_options_unstable,
556 Self,
557 ]
558 );
559
560 #[cfg(feature = "lstm")]
561 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm_with_options)]
562 pub fn try_new_lstm_with_options_unstable<D>(
563 provider: &D,
564 options: LineBreakOptions,
565 ) -> Result<Self, SegmenterError>
566 where
567 D: DataProvider<LineBreakDataV1Marker>
568 + DataProvider<LstmForWordLineAutoV1Marker>
569 + DataProvider<GraphemeClusterBreakDataV1Marker>
570 + ?Sized,
571 {
572 Ok(Self {
573 options,
574 payload: provider.load(Default::default())?.take_payload()?,
575 complex: ComplexPayloads::try_new_lstm(provider)?,
576 })
577 }
578
579 #[cfg(feature = "compiled_data")]
591 pub fn new_dictionary_with_options(options: LineBreakOptions) -> Self {
592 Self {
593 options,
594 payload: DataPayload::from_static_ref(
595 crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1,
596 ),
597 complex: ComplexPayloads::new_southeast_asian(),
604 }
605 }
606
607 icu_provider::gen_any_buffer_data_constructors!(
608 locale: skip,
609 options: LineBreakOptions,
610 error: SegmenterError,
611 #[cfg(skip)]
612 functions: [
613 new_dictionary_with_options,
614 try_new_dictionary_with_options_with_any_provider,
615 try_new_dictionary_with_options_with_buffer_provider,
616 try_new_dictionary_with_options_unstable,
617 Self,
618 ]
619 );
620
621 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary_with_options)]
622 pub fn try_new_dictionary_with_options_unstable<D>(
623 provider: &D,
624 options: LineBreakOptions,
625 ) -> Result<Self, SegmenterError>
626 where
627 D: DataProvider<LineBreakDataV1Marker>
628 + DataProvider<DictionaryForWordLineExtendedV1Marker>
629 + DataProvider<GraphemeClusterBreakDataV1Marker>
630 + ?Sized,
631 {
632 Ok(Self {
633 options,
634 payload: provider.load(Default::default())?.take_payload()?,
635 complex: ComplexPayloads::try_new_southeast_asian(provider)?,
642 })
643 }
644
645 pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> LineBreakIteratorUtf8<'l, 's> {
649 LineBreakIterator {
650 iter: input.char_indices(),
651 len: input.len(),
652 current_pos_data: None,
653 result_cache: Vec::new(),
654 data: self.payload.get(),
655 options: &self.options,
656 complex: &self.complex,
657 }
658 }
659 pub fn segment_utf8<'l, 's>(
665 &'l self,
666 input: &'s [u8],
667 ) -> LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
668 LineBreakIterator {
669 iter: Utf8CharIndices::new(input),
670 len: input.len(),
671 current_pos_data: None,
672 result_cache: Vec::new(),
673 data: self.payload.get(),
674 options: &self.options,
675 complex: &self.complex,
676 }
677 }
678 pub fn segment_latin1<'l, 's>(&'l self, input: &'s [u8]) -> LineBreakIteratorLatin1<'l, 's> {
682 LineBreakIterator {
683 iter: Latin1Indices::new(input),
684 len: input.len(),
685 current_pos_data: None,
686 result_cache: Vec::new(),
687 data: self.payload.get(),
688 options: &self.options,
689 complex: &self.complex,
690 }
691 }
692
693 pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> LineBreakIteratorUtf16<'l, 's> {
697 LineBreakIterator {
698 iter: Utf16Indices::new(input),
699 len: input.len(),
700 current_pos_data: None,
701 result_cache: Vec::new(),
702 data: self.payload.get(),
703 options: &self.options,
704 complex: &self.complex,
705 }
706 }
707}
708
709impl RuleBreakDataV1<'_> {
710 fn get_linebreak_property_utf32_with_rule(
711 &self,
712 codepoint: u32,
713 strictness: LineBreakStrictness,
714 word_option: LineBreakWordOption,
715 ) -> u8 {
716 let prop = self.property_table.get32(codepoint);
718
719 if word_option == LineBreakWordOption::BreakAll
720 || strictness == LineBreakStrictness::Loose
721 || strictness == LineBreakStrictness::Normal
722 {
723 return match prop {
724 CJ => ID, _ => prop,
726 };
727 }
728
729 prop
732 }
733
734 #[inline]
735 fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
736 let idx = (left as usize) * (self.property_count as usize) + (right as usize);
737 self.break_state_table.get(idx).unwrap_or(BreakState::Keep)
739 }
740
741 #[inline]
742 fn use_complex_breaking_utf32(&self, codepoint: u32) -> bool {
743 let line_break_property = self.get_linebreak_property_utf32_with_rule(
744 codepoint,
745 LineBreakStrictness::Strict,
746 LineBreakWordOption::Normal,
747 );
748
749 line_break_property == SA
750 }
751}
752
753#[inline]
754fn is_break_utf32_by_loose(
755 right_codepoint: u32,
756 left_prop: u8,
757 right_prop: u8,
758 ja_zh: bool,
759) -> Option<bool> {
760 if right_prop == BA {
762 if left_prop == ID && (right_codepoint == 0x2010 || right_codepoint == 0x2013) {
763 return Some(true);
764 }
765 } else if right_prop == NS {
766 if right_codepoint == 0x301C || right_codepoint == 0x30A0 {
768 return Some(ja_zh);
769 }
770
771 if right_codepoint == 0x3005
773 || right_codepoint == 0x303B
774 || right_codepoint == 0x309D
775 || right_codepoint == 0x309E
776 || right_codepoint == 0x30FD
777 || right_codepoint == 0x30FE
778 {
779 return Some(true);
780 }
781
782 if right_codepoint == 0x30FB
784 || right_codepoint == 0xFF1A
785 || right_codepoint == 0xFF1B
786 || right_codepoint == 0xFF65
787 || right_codepoint == 0x203C
788 || (0x2047..=0x2049).contains(&right_codepoint)
789 {
790 return Some(ja_zh);
791 }
792 } else if right_prop == IN {
793 return Some(true);
795 } else if right_prop == EX {
796 if right_codepoint == 0xFF01 || right_codepoint == 0xFF1F {
798 return Some(ja_zh);
799 }
800 }
801
802 if right_prop == PO_EAW {
805 return Some(ja_zh);
806 }
807 if left_prop == PR_EAW {
810 return Some(ja_zh);
811 }
812 None
813}
814
815pub trait LineBreakType<'l, 's> {
819 type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone;
821
822 type CharType: Copy + Into<u32>;
824
825 fn use_complex_breaking(iterator: &LineBreakIterator<'l, 's, Self>, c: Self::CharType) -> bool;
826
827 fn get_linebreak_property_with_rule(
828 iterator: &LineBreakIterator<'l, 's, Self>,
829 c: Self::CharType,
830 ) -> u8;
831
832 fn get_current_position_character_len(iterator: &LineBreakIterator<'l, 's, Self>) -> usize;
833
834 fn handle_complex_language(
835 iterator: &mut LineBreakIterator<'l, 's, Self>,
836 left_codepoint: Self::CharType,
837 ) -> Option<usize>;
838}
839
840#[derive(Debug)]
853pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
854 iter: Y::IterAttr,
855 len: usize,
856 current_pos_data: Option<(usize, Y::CharType)>,
857 result_cache: Vec<usize>,
858 data: &'l RuleBreakDataV1<'l>,
859 options: &'l LineBreakOptions,
860 complex: &'l ComplexPayloads,
861}
862
863impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y> {
864 type Item = usize;
865
866 fn next(&mut self) -> Option<Self::Item> {
867 match self.check_eof() {
868 StringBoundaryPosType::Start => return Some(0),
869 StringBoundaryPosType::End => return None,
870 _ => (),
871 }
872
873 if let Some(&first_pos) = self.result_cache.first() {
875 let mut i = 0;
876 loop {
877 if i == first_pos {
878 self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
879 return self.get_current_position();
880 }
881 i += Y::get_current_position_character_len(self);
882 self.advance_iter();
883 if self.is_eof() {
884 self.result_cache.clear();
885 return Some(self.len);
886 }
887 }
888 }
889
890 'a: loop {
891 debug_assert!(!self.is_eof());
892 let left_codepoint = self.get_current_codepoint()?;
893 let mut left_prop = self.get_linebreak_property(left_codepoint);
894 self.advance_iter();
895
896 let Some(right_codepoint) = self.get_current_codepoint() else {
897 return Some(self.len);
898 };
899 let right_prop = self.get_linebreak_property(right_codepoint);
900
901 match (self.options.word_option, left_prop, right_prop) {
903 (LineBreakWordOption::BreakAll, AL | NU | SA, _) => {
904 left_prop = ID;
905 }
906 (
908 LineBreakWordOption::KeepAll,
909 AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
910 AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
911 ) => {
912 continue;
913 }
914 _ => (),
915 }
916
917 match self.options.strictness {
919 LineBreakStrictness::Normal => {
920 if self.is_break_by_normal(right_codepoint) {
921 return self.get_current_position();
922 }
923 }
924 LineBreakStrictness::Loose => {
925 if let Some(breakable) = is_break_utf32_by_loose(
926 right_codepoint.into(),
927 left_prop,
928 right_prop,
929 self.options.ja_zh,
930 ) {
931 if breakable {
932 return self.get_current_position();
933 }
934 continue;
935 }
936 }
937 LineBreakStrictness::Anywhere => {
938 return self.get_current_position();
939 }
940 _ => (),
941 };
942
943 if self.options.word_option != LineBreakWordOption::BreakAll
945 && Y::use_complex_breaking(self, left_codepoint)
946 && Y::use_complex_breaking(self, right_codepoint)
947 {
948 let result = Y::handle_complex_language(self, left_codepoint);
949 if result.is_some() {
950 return result;
951 }
952 }
954
955 let mut index = match self.data.get_break_state_from_table(left_prop, right_prop) {
957 BreakState::Index(index) => index,
958 BreakState::Intermediate(index) => index + 64,
961 BreakState::Break | BreakState::NoMatch => return self.get_current_position(),
962 BreakState::Keep => continue,
963 };
964
965 let mut previous_iter = self.iter.clone();
966 let mut previous_pos_data = self.current_pos_data;
967
968 loop {
969 self.advance_iter();
970
971 let Some(prop) = self.get_current_linebreak_property() else {
972 let break_state = self
974 .data
975 .get_break_state_from_table(index, self.data.eot_property);
976 if break_state == BreakState::NoMatch {
977 self.iter = previous_iter;
978 self.current_pos_data = previous_pos_data;
979 return self.get_current_position();
980 }
981 return Some(self.len);
983 };
984
985 match self.data.get_break_state_from_table(index, prop) {
986 BreakState::Keep => continue 'a,
987 BreakState::NoMatch => {
988 self.iter = previous_iter;
989 self.current_pos_data = previous_pos_data;
990 return self.get_current_position();
991 }
992 BreakState::Break => return self.get_current_position(),
993 BreakState::Index(i) => {
994 index = i;
995 previous_iter = self.iter.clone();
996 previous_pos_data = self.current_pos_data;
997 }
998 BreakState::Intermediate(i) => {
999 index = i + 64;
1000 previous_iter = self.iter.clone();
1001 previous_pos_data = self.current_pos_data;
1002 }
1003 }
1004 }
1005 }
1006 }
1007}
1008
/// Where the iterator stands relative to the input, as reported by
/// `LineBreakIterator::check_eof`.
enum StringBoundaryPosType {
    /// Nothing has been yielded yet; the break at position 0 is due.
    Start,
    /// The iterator points at a character inside the input.
    Middle,
    /// The input is exhausted and nothing more will be yielded.
    End,
}
1014
1015impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> {
1016 fn advance_iter(&mut self) {
1017 self.current_pos_data = self.iter.next();
1018 }
1019
1020 fn is_eof(&self) -> bool {
1021 self.current_pos_data.is_none()
1022 }
1023
1024 #[inline]
1025 fn check_eof(&mut self) -> StringBoundaryPosType {
1026 if self.is_eof() {
1027 self.advance_iter();
1028 if self.is_eof() {
1029 if self.len == 0 {
1030 self.len = 1;
1034 StringBoundaryPosType::Start
1035 } else {
1036 StringBoundaryPosType::End
1037 }
1038 } else {
1039 StringBoundaryPosType::Start
1040 }
1041 } else {
1042 StringBoundaryPosType::Middle
1043 }
1044 }
1045
1046 fn get_current_position(&self) -> Option<usize> {
1047 self.current_pos_data.map(|(pos, _)| pos)
1048 }
1049
1050 fn get_current_codepoint(&self) -> Option<Y::CharType> {
1051 self.current_pos_data.map(|(_, codepoint)| codepoint)
1052 }
1053
1054 fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 {
1055 Y::get_linebreak_property_with_rule(self, codepoint)
1056 }
1057
1058 fn get_current_linebreak_property(&self) -> Option<u8> {
1059 self.get_current_codepoint()
1060 .map(|c| self.get_linebreak_property(c))
1061 }
1062
1063 fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
1064 match codepoint.into() {
1065 0x301C | 0x30A0 => self.options.ja_zh,
1066 _ => false,
1067 }
1068 }
1069}
1070
1071#[derive(Debug)]
1072pub struct LineBreakTypeUtf8;
1073
1074impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf8 {
1075 type IterAttr = CharIndices<'s>;
1076 type CharType = char;
1077
1078 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1079 iterator.data.get_linebreak_property_utf32_with_rule(
1080 c as u32,
1081 iterator.options.strictness,
1082 iterator.options.word_option,
1083 )
1084 }
1085
1086 #[inline]
1087 fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1088 iterator.data.use_complex_breaking_utf32(c as u32)
1089 }
1090
1091 fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
1092 iterator.get_current_codepoint().map_or(0, |c| c.len_utf8())
1093 }
1094
1095 fn handle_complex_language(
1096 iter: &mut LineBreakIterator<'l, 's, Self>,
1097 left_codepoint: char,
1098 ) -> Option<usize> {
1099 handle_complex_language_utf8(iter, left_codepoint)
1100 }
1101}
1102
1103#[derive(Debug)]
1104pub struct LineBreakTypePotentiallyIllFormedUtf8;
1105
1106impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypePotentiallyIllFormedUtf8 {
1107 type IterAttr = Utf8CharIndices<'s>;
1108 type CharType = char;
1109
1110 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1111 iterator.data.get_linebreak_property_utf32_with_rule(
1112 c as u32,
1113 iterator.options.strictness,
1114 iterator.options.word_option,
1115 )
1116 }
1117
1118 #[inline]
1119 fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1120 iterator.data.use_complex_breaking_utf32(c as u32)
1121 }
1122
1123 fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
1124 iterator.get_current_codepoint().map_or(0, |c| c.len_utf8())
1125 }
1126
1127 fn handle_complex_language(
1128 iter: &mut LineBreakIterator<'l, 's, Self>,
1129 left_codepoint: char,
1130 ) -> Option<usize> {
1131 handle_complex_language_utf8(iter, left_codepoint)
1132 }
1133}
1134fn handle_complex_language_utf8<'l, 's, T>(
1136 iter: &mut LineBreakIterator<'l, 's, T>,
1137 left_codepoint: char,
1138) -> Option<usize>
1139where
1140 T: LineBreakType<'l, 's, CharType = char>,
1141{
1142 let start_iter = iter.iter.clone();
1144 let start_point = iter.current_pos_data;
1145 let mut s = String::new();
1146 s.push(left_codepoint);
1147 loop {
1148 debug_assert!(!iter.is_eof());
1149 s.push(iter.get_current_codepoint()?);
1150 iter.advance_iter();
1151 if let Some(current_codepoint) = iter.get_current_codepoint() {
1152 if !T::use_complex_breaking(iter, current_codepoint) {
1153 break;
1154 }
1155 } else {
1156 break;
1158 }
1159 }
1160
1161 iter.iter = start_iter;
1163 iter.current_pos_data = start_point;
1164 let breaks = complex_language_segment_str(iter.complex, &s);
1165 iter.result_cache = breaks;
1166 let first_pos = *iter.result_cache.first()?;
1167 let mut i = left_codepoint.len_utf8();
1168 loop {
1169 if i == first_pos {
1170 iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
1172 return iter.get_current_position();
1173 }
1174 debug_assert!(
1175 i < first_pos,
1176 "we should always arrive at first_pos: near index {:?}",
1177 iter.get_current_position()
1178 );
1179 i += T::get_current_position_character_len(iter);
1180 iter.advance_iter();
1181 if iter.is_eof() {
1182 iter.result_cache.clear();
1183 return Some(iter.len);
1184 }
1185 }
1186}
1187
1188#[derive(Debug)]
1189pub struct LineBreakTypeLatin1;
1190
1191impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeLatin1 {
1192 type IterAttr = Latin1Indices<'s>;
1193 type CharType = u8;
1194
1195 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
1196 iterator.data.property_table.get32(c as u32)
1199 }
1200
1201 #[inline]
1202 fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
1203 false
1204 }
1205
1206 fn get_current_position_character_len(_: &LineBreakIterator<Self>) -> usize {
1207 unreachable!()
1208 }
1209
1210 fn handle_complex_language(
1211 _: &mut LineBreakIterator<Self>,
1212 _: Self::CharType,
1213 ) -> Option<usize> {
1214 unreachable!()
1215 }
1216}
1217
1218#[derive(Debug)]
1219pub struct LineBreakTypeUtf16;
1220
1221impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf16 {
1222 type IterAttr = Utf16Indices<'s>;
1223 type CharType = u32;
1224
1225 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
1226 iterator.data.get_linebreak_property_utf32_with_rule(
1227 c,
1228 iterator.options.strictness,
1229 iterator.options.word_option,
1230 )
1231 }
1232
1233 #[inline]
1234 fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
1235 iterator.data.use_complex_breaking_utf32(c)
1236 }
1237
1238 fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
1239 match iterator.get_current_codepoint() {
1240 None => 0,
1241 Some(ch) if ch >= 0x10000 => 2,
1242 _ => 1,
1243 }
1244 }
1245
1246 fn handle_complex_language(
1247 iterator: &mut LineBreakIterator<Self>,
1248 left_codepoint: Self::CharType,
1249 ) -> Option<usize> {
1250 let start_iter = iterator.iter.clone();
1252 let start_point = iterator.current_pos_data;
1253 let mut s = vec![left_codepoint as u16];
1254 loop {
1255 debug_assert!(!iterator.is_eof());
1256 s.push(iterator.get_current_codepoint()? as u16);
1257 iterator.advance_iter();
1258 if let Some(current_codepoint) = iterator.get_current_codepoint() {
1259 if !Self::use_complex_breaking(iterator, current_codepoint) {
1260 break;
1261 }
1262 } else {
1263 break;
1265 }
1266 }
1267
1268 iterator.iter = start_iter;
1270 iterator.current_pos_data = start_point;
1271 let breaks = complex_language_segment_utf16(iterator.complex, &s);
1272 iterator.result_cache = breaks;
1273 let first_pos = *iterator.result_cache.first()?;
1275 let mut i = 1;
1276 loop {
1277 if i == first_pos {
1278 iterator.result_cache = iterator
1280 .result_cache
1281 .iter()
1282 .skip(1)
1283 .map(|r| r - i)
1284 .collect();
1285 return iterator.get_current_position();
1286 }
1287 debug_assert!(
1288 i < first_pos,
1289 "we should always arrive at first_pos: near index {:?}",
1290 iterator.get_current_position()
1291 );
1292 i += 1;
1293 iterator.advance_iter();
1294 if iterator.is_eof() {
1295 iterator.result_cache.clear();
1296 return Some(iterator.len);
1297 }
1298 }
1299 }
1300}
1301
1302#[cfg(test)]
1303#[cfg(feature = "serde")]
1304mod tests {
1305 use super::*;
1306 use crate::LineSegmenter;
1307
1308 #[test]
1309 fn linebreak_property() {
1310 let payload = DataProvider::<LineBreakDataV1Marker>::load(
1311 &crate::provider::Baked,
1312 Default::default(),
1313 )
1314 .expect("Loading should succeed!")
1315 .take_payload()
1316 .expect("Data should be present!");
1317
1318 let get_linebreak_property = |codepoint| {
1319 payload.get().get_linebreak_property_utf32_with_rule(
1320 codepoint as u32,
1321 LineBreakStrictness::Strict,
1322 LineBreakWordOption::Normal,
1323 )
1324 };
1325
1326 assert_eq!(get_linebreak_property('\u{0020}'), SP);
1327 assert_eq!(get_linebreak_property('\u{0022}'), QU);
1328 assert_eq!(get_linebreak_property('('), OP_OP30);
1329 assert_eq!(get_linebreak_property('\u{0030}'), NU);
1330 assert_eq!(get_linebreak_property('['), OP_OP30);
1331 assert_eq!(get_linebreak_property('\u{1f3fb}'), EM);
1332 assert_eq!(get_linebreak_property('\u{20000}'), ID);
1333 assert_eq!(get_linebreak_property('\u{e0020}'), CM);
1334 assert_eq!(get_linebreak_property('\u{3041}'), CJ);
1335 assert_eq!(get_linebreak_property('\u{0025}'), PO);
1336 assert_eq!(get_linebreak_property('\u{00A7}'), AI);
1337 assert_eq!(get_linebreak_property('\u{50005}'), XX);
1338 assert_eq!(get_linebreak_property('\u{17D6}'), NS);
1339 assert_eq!(get_linebreak_property('\u{2014}'), B2);
1340 }
1341
1342 #[test]
1343 #[allow(clippy::bool_assert_comparison)] fn break_rule() {
1345 let payload = DataProvider::<LineBreakDataV1Marker>::load(
1346 &crate::provider::Baked,
1347 Default::default(),
1348 )
1349 .expect("Loading should succeed!")
1350 .take_payload()
1351 .expect("Data should be present!");
1352 let lb_data: &RuleBreakDataV1 = payload.get();
1353
1354 let is_break = |left, right| {
1355 matches!(
1356 lb_data.get_break_state_from_table(left, right),
1357 BreakState::Break | BreakState::NoMatch
1358 )
1359 };
1360
1361 assert_eq!(is_break(BK, AL), true);
1363 assert_eq!(is_break(CR, LF), false);
1365 assert_eq!(is_break(CR, AL), true);
1366 assert_eq!(is_break(LF, AL), true);
1367 assert_eq!(is_break(NL, AL), true);
1368 assert_eq!(is_break(AL, BK), false);
1370 assert_eq!(is_break(AL, CR), false);
1371 assert_eq!(is_break(AL, LF), false);
1372 assert_eq!(is_break(AL, NL), false);
1373 assert_eq!(is_break(AL, SP), false);
1375 assert_eq!(is_break(AL, ZW), false);
1376 assert_eq!(is_break(ZWJ, AL), false);
1379 assert_eq!(is_break(AL, ZWJ), false);
1381 assert_eq!(is_break(AL, CM), false);
1382 assert_eq!(is_break(ID, ZWJ), false);
1383 assert_eq!(is_break(ZWJ, SP), false);
1385 assert_eq!(is_break(SP, CM), true);
1386 assert_eq!(is_break(AL, WJ), false);
1388 assert_eq!(is_break(WJ, AL), false);
1389 assert_eq!(is_break(GL, AL), false);
1391 assert_eq!(is_break(AL, GL), false);
1393 assert_eq!(is_break(SP, GL), true);
1394 assert_eq!(is_break(AL, CL), false);
1396 assert_eq!(is_break(AL, CP), false);
1397 assert_eq!(is_break(AL, EX), false);
1398 assert_eq!(is_break(AL, IS), false);
1399 assert_eq!(is_break(AL, SY), false);
1400 assert_eq!(is_break(SP, AL), true);
1402 assert_eq!(is_break(AL, QU), false);
1404 assert_eq!(is_break(QU, AL), false);
1405 assert_eq!(is_break(AL, CB), true);
1407 assert_eq!(is_break(CB, AL), true);
1408 assert_eq!(is_break(AL, BA), false);
1410 assert_eq!(is_break(AL, HY), false);
1411 assert_eq!(is_break(AL, NS), false);
1412 assert_eq!(is_break(AL, BA), false);
1414 assert_eq!(is_break(BB, AL), false);
1415 assert_eq!(is_break(ID, BA), false);
1416 assert_eq!(is_break(ID, NS), false);
1417 assert_eq!(is_break(SY, HL), false);
1420 assert_eq!(is_break(AL, IN), false);
1422 assert_eq!(is_break(AL, NU), false);
1424 assert_eq!(is_break(HL, NU), false);
1425 assert_eq!(is_break(PR, ID), false);
1427 assert_eq!(is_break(PR, EB), false);
1428 assert_eq!(is_break(PR, EM), false);
1429 assert_eq!(is_break(ID, PO), false);
1430 assert_eq!(is_break(EB, PO), false);
1431 assert_eq!(is_break(EM, PO), false);
1432 assert_eq!(is_break(JL, JL), false);
1434 assert_eq!(is_break(JL, JV), false);
1435 assert_eq!(is_break(JL, H2), false);
1436 assert_eq!(is_break(JL, IN), false);
1438 assert_eq!(is_break(JL, PO), false);
1439 assert_eq!(is_break(PR, JL), false);
1440 assert_eq!(is_break(AL, AL), false);
1442 assert_eq!(is_break(HL, AL), false);
1443 assert_eq!(is_break(IS, AL), false);
1445 assert_eq!(is_break(IS, HL), false);
1446 assert_eq!(is_break(EB, EM), false);
1448 assert_eq!(is_break(ID, ID), true);
1450 }
1451
#[test]
fn linebreak() {
    // Checks break offsets from a segmenter built with
    // `try_new_dictionary_unstable`, feeding the same text through the
    // UTF-8 (`segment_str`), Latin-1 (`segment_latin1`) and UTF-16
    // (`segment_utf16`) entry points. Offsets are in code units of the
    // respective encoding.
    //
    // Several inputs below deliberately contain TWO consecutive spaces; the
    // expected offsets and the parallel byte/u16 arrays depend on that.
    let segmenter = LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked)
        .expect("Data exists");

    // A single space between two words: break opportunity after the space.
    let mut iter = segmenter.segment_str("hello world");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(6), iter.next());
    assert_eq!(Some(11), iter.next());
    assert_eq!(None, iter.next());

    // No break between the currency sign and the digits that follow it.
    iter = segmenter.segment_str("$10 $10");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(4), iter.next());
    assert_eq!(Some(7), iter.next());
    assert_eq!(None, iter.next());

    // UAX #14 LB14: no break after an opening bracket, even across spaces.
    // Input is '[' + two spaces + "abc def" (10 bytes).
    iter = segmenter.segment_str("[  abc def");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(7), iter.next());
    assert_eq!(Some(10), iter.next());
    assert_eq!(None, iter.next());

    // Same text as Latin-1 code units.
    let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
    let mut iter_u8 = segmenter.segment_latin1(&input);
    assert_eq!(Some(0), iter_u8.next());
    assert_eq!(Some(7), iter_u8.next());
    assert_eq!(Some(10), iter_u8.next());
    assert_eq!(None, iter_u8.next());

    // Same text as UTF-16 code units.
    let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
    let mut iter_u16 = segmenter.segment_utf16(&input);
    assert_eq!(Some(0), iter_u16.next());
    assert_eq!(Some(7), iter_u16.next());
    assert_eq!(Some(10), iter_u16.next());
    assert_eq!(None, iter_u16.next());

    // UAX #14 LB15 (QU SP* x OP): no break between a quotation mark and an
    // opening bracket across spaces.
    // Input is "abc" + '"' + two spaces + "(def" (10 bytes).
    iter = segmenter.segment_str("abc\u{0022}  (def");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(10), iter.next());
    assert_eq!(None, iter.next());

    let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
    let mut iter_u8 = segmenter.segment_latin1(&input);
    assert_eq!(Some(0), iter_u8.next());
    assert_eq!(Some(10), iter_u8.next());
    assert_eq!(None, iter_u8.next());

    let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
    let mut iter_u16 = segmenter.segment_utf16(&input);
    assert_eq!(Some(0), iter_u16.next());
    assert_eq!(Some(10), iter_u16.next());
    assert_eq!(None, iter_u16.next());

    // UAX #14 LB16 ((CL | CP) SP* x NS): no break between a closing
    // parenthesis and U+203C, with or without spaces in between.
    iter = segmenter.segment_str("\u{0029}\u{203C}");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(4), iter.next());
    assert_eq!(None, iter.next());
    // ')' + two spaces + U+203C (1 + 2 + 3 = 6 bytes).
    iter = segmenter.segment_str("\u{0029}  \u{203C}");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(6), iter.next());
    assert_eq!(None, iter.next());

    let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
    let mut iter_u16 = segmenter.segment_utf16(&input);
    assert_eq!(Some(0), iter_u16.next());
    assert_eq!(Some(4), iter_u16.next());
    assert_eq!(None, iter_u16.next());

    // UAX #14 LB17 (B2 SP* x B2): no break between two em dashes, with or
    // without spaces in between.
    iter = segmenter.segment_str("\u{2014}\u{2014}aa");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(6), iter.next());
    assert_eq!(Some(8), iter.next());
    assert_eq!(None, iter.next());
    // U+2014 + two spaces + U+2014 + "aa" (3 + 2 + 3 + 2 = 10 bytes).
    iter = segmenter.segment_str("\u{2014}  \u{2014}aa");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(8), iter.next());
    assert_eq!(Some(10), iter.next());
    assert_eq!(None, iter.next());

    // Two em dashes + two spaces + two em dashes + "123 abc" (21 bytes).
    iter = segmenter.segment_str("\u{2014}\u{2014}  \u{2014}\u{2014}123 abc");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(14), iter.next());
    assert_eq!(Some(18), iter.next());
    assert_eq!(Some(21), iter.next());
    assert_eq!(None, iter.next());

    // UAX #14 LB25: a numeric expression with brackets, commas and a sign
    // offers no interior break opportunity.
    let mut iter = segmenter.segment_str("(0,1)+(2,3)");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(11), iter.next());
    assert_eq!(None, iter.next());
    let input: [u16; 11] = [
        0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
    ];
    let mut iter_u16 = segmenter.segment_utf16(&input);
    assert_eq!(Some(0), iter_u16.next());
    assert_eq!(Some(11), iter_u16.next());
    assert_eq!(None, iter_u16.next());

    // UTF-16 form of the em-dash / digits input checked above.
    let input: [u16; 13] = [
        0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
    ];
    let mut iter_u16 = segmenter.segment_utf16(&input);
    assert_eq!(Some(0), iter_u16.next());
    assert_eq!(Some(6), iter_u16.next());
    assert_eq!(Some(10), iter_u16.next());
    assert_eq!(Some(13), iter_u16.next());
    assert_eq!(None, iter_u16.next());

    // U+1F3FB + one space + U+1F3FB (4 + 1 + 4 bytes): break after the space.
    iter = segmenter.segment_str("\u{1F3FB} \u{1F3FB}");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(5), iter.next());
    assert_eq!(Some(9), iter.next());
    assert_eq!(None, iter.next());
}
1574
#[test]
#[cfg(feature = "lstm")]
fn thai_line_break() {
    // "ภาษาไทย" ("Thai language") written twice: 14 scalar values of three
    // UTF-8 bytes each, i.e. 42 bytes and 14 UTF-16 code units.
    // NOTE(review): this literal was restored from a mis-decoded copy of the
    // file; confirm it against the upstream source.
    const TEST_STR: &str = "ภาษาไทยภาษาไทย";

    let segmenter = LineSegmenter::new_lstm();

    // UTF-8 input: offsets are byte positions.
    let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
    assert_eq!(breaks, [0, 12, 21, 33, TEST_STR.len()], "Thai test");

    // UTF-16 input: the same boundaries expressed in code units.
    let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
    let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
    assert_eq!(breaks, [0, 4, 7, 11, utf16.len()], "Thai test");

    // Only the first four code units of the sample: no interior boundary.
    let utf16: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32];
    let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
    assert_eq!(breaks, [0, 4], "Thai test");
}
1592
#[test]
#[cfg(feature = "lstm")]
fn burmese_line_break() {
    // "Burmese language" in Burmese: 14 scalar values of three UTF-8 bytes
    // each, i.e. 42 bytes and 14 UTF-16 code units.
    // NOTE(review): this literal was restored from a mis-decoded copy of the
    // file (which had also split it across two lines); confirm it against
    // the upstream source.
    const TEST_STR: &str = "မြန်မာဘာသာစကား";

    let segmenter = LineSegmenter::new_lstm();

    // UTF-8 input: offsets are byte positions.
    let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
    assert_eq!(breaks, [0, 12, 18, 30, TEST_STR.len()], "Burmese test");

    // UTF-16 input: the same boundaries expressed in code units.
    let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
    let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
    assert_eq!(breaks, [0, 4, 6, 10, utf16.len()], "Burmese utf-16 test");
}
1609
#[test]
#[cfg(feature = "lstm")]
fn khmer_line_break() {
    // Khmer sample: 36 scalar values of three UTF-8 bytes each, i.e. 108
    // bytes and 36 UTF-16 code units.
    // NOTE(review): this literal was restored from a mis-decoded copy of the
    // file (which had also split it across two lines). The two consonants
    // written under U+17D2 could not be recovered unambiguously (U+178A vs
    // U+178F); confirm the exact text against the upstream source before
    // relying on the expected offsets.
    const TEST_STR: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស";

    let segmenter = LineSegmenter::new_lstm();

    // UTF-8 input: offsets are byte positions.
    let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
    assert_eq!(breaks, [0, 39, 48, 54, 72, TEST_STR.len()], "Khmer test");

    // UTF-16 input: the same boundaries expressed in code units.
    let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
    let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
    assert_eq!(
        breaks,
        [0, 13, 16, 18, 24, utf16.len()],
        "Khmer utf-16 test"
    );
}
1628
#[test]
#[cfg(feature = "lstm")]
fn lao_line_break() {
    // Lao sample: 18 scalar values of three UTF-8 bytes each, i.e. 54 bytes
    // and 18 UTF-16 code units.
    // NOTE(review): this literal was restored from a mis-decoded copy of the
    // file; confirm it against the upstream source.
    const TEST_STR: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ";

    let segmenter = LineSegmenter::new_lstm();

    // UTF-8 input: offsets are byte positions.
    let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
    assert_eq!(breaks, [0, 12, 21, 30, 39, TEST_STR.len()], "Lao test");

    // UTF-16 input: the same boundaries expressed in code units.
    let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
    let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
    assert_eq!(breaks, [0, 4, 7, 10, 13, utf16.len()], "Lao utf-16 test");
}
1643
#[test]
fn empty_string() {
    // Segmenting an empty string is expected to report exactly one
    // boundary, at offset 0.
    let boundaries = LineSegmenter::new_auto()
        .segment_str("")
        .collect::<Vec<usize>>();
    assert_eq!(boundaries, [0]);
}
1650}