1#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
17
18mod lstm;
19pub use lstm::*;
20
21use crate::WordType;
22use icu_collections::codepointtrie::CodePointTrie;
23use icu_provider::prelude::*;
24use zerovec::ZeroVec;
25
/// Unit struct that the compiled-data provider implementations in this module
/// are generated for.
///
/// Only available when the `compiled_data` feature is enabled.
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
pub struct Baked;
36
// Anonymous const scope holding the baked-data provider implementations for
// `Baked`. Everything inside is compiled only with the `compiled_data` feature.
#[cfg(feature = "compiled_data")]
const _: () = {
    // Local `icu` module exposing this crate as `icu::segmenter` and
    // `icu_collections` as `icu::collections`.
    // NOTE(review): presumably these are the paths the code generated by the
    // `icu_segmenter_data` macros refers to — confirm against the macros.
    pub mod icu {
        pub use crate as segmenter;
        pub use icu_collections as collections;
    }
    icu_segmenter_data::make_provider!(Baked);
    icu_segmenter_data::impl_segmenter_dictionary_w_auto_v1!(Baked);
    icu_segmenter_data::impl_segmenter_dictionary_wl_ext_v1!(Baked);
    icu_segmenter_data::impl_segmenter_grapheme_v1!(Baked);
    icu_segmenter_data::impl_segmenter_line_v1!(Baked);
    // The LSTM implementation is additionally gated on the `lstm` feature.
    #[cfg(feature = "lstm")]
    icu_segmenter_data::impl_segmenter_lstm_wl_auto_v1!(Baked);
    icu_segmenter_data::impl_segmenter_sentence_v1!(Baked);
    icu_segmenter_data::impl_segmenter_word_v1!(Baked);
};
53
/// Data keys of the segmenter data markers: both dictionary markers, the
/// grapheme-cluster, line, sentence and word rule markers, and the LSTM marker.
///
/// Only available when the `datagen` feature is enabled.
#[cfg(feature = "datagen")]
pub const KEYS: &[DataKey] = &[
    DictionaryForWordLineExtendedV1Marker::KEY,
    DictionaryForWordOnlyAutoV1Marker::KEY,
    GraphemeClusterBreakDataV1Marker::KEY,
    LineBreakDataV1Marker::KEY,
    LstmForWordLineAutoV1Marker::KEY,
    SentenceBreakDataV1Marker::KEY,
    WordBreakDataV1Marker::KEY,
];
65
/// Rule-based break data.
///
/// The same struct backs four singleton data markers: line
/// (`segmenter/line@1`), word (`segmenter/word@1`), grapheme cluster
/// (`segmenter/grapheme@1`) and sentence (`segmenter/sentence@1`).
#[icu_provider::data_struct(
    marker(LineBreakDataV1Marker, "segmenter/line@1", singleton),
    marker(WordBreakDataV1Marker, "segmenter/word@1", singleton),
    marker(GraphemeClusterBreakDataV1Marker, "segmenter/grapheme@1", singleton),
    marker(SentenceBreakDataV1Marker, "segmenter/sentence@1", singleton)
)]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize,databake::Bake),
    databake(path = icu_segmenter::provider),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct RuleBreakDataV1<'data> {
    /// Code point trie whose values are `u8` properties.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub property_table: CodePointTrie<'data, u8>,

    /// Table of [`BreakState`] entries.
    /// NOTE(review): presumably indexed by a pair of properties — confirm
    /// against the segmenter that reads it.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub break_state_table: ZeroVec<'data, BreakState>,

    /// Table of [`WordType`] entries.
    ///
    /// With serde this field is (de)serialized under the name
    /// `rule_status_table`.
    #[cfg_attr(feature = "serde", serde(borrow, rename = "rule_status_table"))]
    pub word_type_table: ZeroVec<'data, WordType>,

    /// Number of properties.
    /// NOTE(review): presumably the row/column count of
    /// `break_state_table` — confirm.
    pub property_count: u8,

    /// NOTE(review): by its name, the last property value that is assigned
    /// directly to code points — confirm against the data generator.
    pub last_codepoint_property: u8,

    /// Property value presumably standing for start of text (SOT) — confirm.
    pub sot_property: u8,

    /// Property value presumably standing for end of text (EOT) — confirm.
    pub eot_property: u8,

    /// Property value presumably marking text that needs complex
    /// (dictionary/LSTM) handling — confirm.
    pub complex_property: u8,
}
116
/// Dictionary break data.
///
/// Backs two data markers: `DictionaryForWordOnlyAutoV1Marker`
/// (`segmenter/dictionary/w_auto@1`) and
/// `DictionaryForWordLineExtendedV1Marker` (`segmenter/dictionary/wl_ext@1`).
#[icu_provider::data_struct(
    DictionaryForWordOnlyAutoV1Marker = "segmenter/dictionary/w_auto@1",
    DictionaryForWordLineExtendedV1Marker = "segmenter/dictionary/wl_ext@1"
)]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize,databake::Bake),
    databake(path = icu_segmenter::provider),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct UCharDictionaryBreakDataV1<'data> {
    /// Dictionary data stored as a sequence of `u16` values.
    /// NOTE(review): by its name this is a serialized trie — confirm the
    /// exact layout against the dictionary segmenter.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub trie_data: ZeroVec<'data, u16>,
}
140
/// Crate-private data marker for [`UCharDictionaryBreakDataV1`].
///
/// It only implements `DataMarker`; no data key is attached to it here.
pub(crate) struct UCharDictionaryBreakDataV1Marker;

impl DataMarker for UCharDictionaryBreakDataV1Marker {
    // The payload type carried by this marker.
    type Yokeable = UCharDictionaryBreakDataV1<'static>;
}
146
/// One entry of a break state table.
///
/// Values are stored in `ZeroVec`s as a single byte each; the byte encoding is
/// defined by this type's `zerovec::ule::AsULE` implementation.
#[derive(Clone, Copy, PartialEq, Debug)]
#[cfg_attr(
    feature = "datagen",
    derive(databake::Bake),
    databake(path = icu_segmenter::provider),
)]
pub enum BreakState {
    /// Break.
    Break,
    /// Keep (presumably: do not break here — confirm against the segmenter).
    Keep,
    /// No rule matched.
    NoMatch,
    /// Intermediate state carrying a `u8` payload.
    /// NOTE(review): presumably means more input has to be examined before a
    /// decision is made — confirm.
    Intermediate(u8),
    /// Index carrying a `u8` payload (presumably the index of a state — confirm).
    Index(u8),
}
172
#[cfg(feature = "datagen")]
impl serde::Serialize for BreakState {
    /// Serializes the state through its one-byte `AsULE` encoding.
    ///
    /// Human-readable serializers receive that byte reinterpreted as an `i8`;
    /// every other serializer receives the byte unchanged as a `u8`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let encoded: u8 = zerovec::ule::AsULE::to_unaligned(*self);

        // Binary formats: emit the raw byte.
        if !serializer.is_human_readable() {
            return serde::Serialize::serialize(&encoded, serializer);
        }

        // Text formats: same bit pattern, viewed as a signed number.
        let signed = i8::from_le_bytes([encoded]);
        serde::Serialize::serialize(&signed, serializer)
    }
}
187
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for BreakState {
    /// Reads the one-byte encoding and decodes it with `AsULE::from_unaligned`.
    ///
    /// Human-readable deserializers are asked for an `i8`, whose bit pattern is
    /// then taken as the byte; every other deserializer is asked for a `u8`.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let encoded: u8 = if deserializer.is_human_readable() {
            // An `i8` is exactly one byte, so the array pattern is irrefutable.
            let [byte] = <i8 as serde::Deserialize>::deserialize(deserializer)?.to_le_bytes();
            byte
        } else {
            <u8 as serde::Deserialize>::deserialize(deserializer)?
        };
        Ok(zerovec::ule::AsULE::from_unaligned(encoded))
    }
}
203
204impl zerovec::ule::AsULE for BreakState {
205 type ULE = u8;
206
207 fn to_unaligned(self) -> Self::ULE {
208 match self {
209 BreakState::Break => 128,
210 BreakState::Keep => 255,
211 BreakState::NoMatch => 254,
212 BreakState::Intermediate(i) => i | 64,
213 BreakState::Index(i) => i,
214 }
215 }
216
217 fn from_unaligned(unaligned: Self::ULE) -> Self {
218 match unaligned {
219 128 => BreakState::Break,
220 255 => BreakState::Keep,
221 254 => BreakState::NoMatch,
222 i if i & 64 != 0 => BreakState::Intermediate(i & !64),
223 i => BreakState::Index(i),
224 }
225 }
226}
227
#[cfg(feature = "datagen")]
impl serde::Serialize for WordType {
    /// Serializes the word type as its `u8` discriminant.
    ///
    /// # Panics
    ///
    /// Panics when the serializer is not human-readable; that path is treated
    /// as unreachable by this impl.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if !serializer.is_human_readable() {
            unreachable!("only used as ULE")
        }
        let discriminant = *self as u8;
        serde::Serialize::serialize(&discriminant, serializer)
    }
}
241
// `Bake` implementation for `WordType` that is never expected to run.
// NOTE(review): presumably it exists only to satisfy a trait bound when
// `WordType` is stored inside a baked `ZeroVec` — confirm.
#[cfg(feature = "datagen")]
impl databake::Bake for WordType {
    /// Always panics: the panic message states the type is only used as ULE.
    fn bake(&self, _crate_env: &databake::CrateEnv) -> databake::TokenStream {
        unreachable!("only used as ULE")
    }
}
248
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for WordType {
    /// Deserializes a word type from its `u8` discriminant
    /// (`0` = `None`, `1` = `Number`, `2` = `Letter`).
    ///
    /// # Errors
    ///
    /// Returns the deserializer's own error if no `u8` can be read, or a custom
    /// `"invalid value"` error for any other number.
    ///
    /// # Panics
    ///
    /// Panics when the deserializer is not human-readable; that path is treated
    /// as unreachable by this impl.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use serde::de::Error;

        if !deserializer.is_human_readable() {
            unreachable!("only used as ULE")
        }

        match <u8 as serde::Deserialize>::deserialize(deserializer)? {
            0 => Ok(WordType::None),
            1 => Ok(WordType::Number),
            2 => Ok(WordType::Letter),
            _ => Err(D::Error::custom("invalid value")),
        }
    }
}