icu_segmenter/provider/
mod.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
6//!
7//! <div class="stab unstable">
8//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
9//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
10//! to be stable, their Rust representation might not be. Use with caution.
11//! </div>
12//!
13//! Read more about data providers: [`icu_provider`]
14
15// Provider structs must be stable
16#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
17
18mod lstm;
19pub use lstm::*;
20
21use crate::WordType;
22use icu_collections::codepointtrie::CodePointTrie;
23use icu_provider::prelude::*;
24use zerovec::ZeroVec;
25
/// Unit struct used as the provider type for baked (compiled-in) data.
///
/// Only present when the `compiled_data` Cargo feature is enabled.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
pub struct Baked;
36
// Anonymous const scope: the helper `icu` module and everything the
// `icu_segmenter_data` macros generate stay out of this module's namespace.
// Only compiled when the `compiled_data` feature is enabled.
#[cfg(feature = "compiled_data")]
const _: () = {
    // Path aliases for this crate and `icu_collections`.
    // NOTE(review): presumably the generated code refers to `icu::segmenter` and
    // `icu::collections` — confirm against the `icu_segmenter_data` macros.
    pub mod icu {
        pub use icu_collections as collections;
        pub use crate as segmenter;
    }
    icu_segmenter_data::make_provider!(Baked);

    // One macro invocation per data key served by `Baked`.
    icu_segmenter_data::impl_segmenter_dictionary_w_auto_v1!(Baked);
    icu_segmenter_data::impl_segmenter_dictionary_wl_ext_v1!(Baked);
    icu_segmenter_data::impl_segmenter_grapheme_v1!(Baked);
    icu_segmenter_data::impl_segmenter_line_v1!(Baked);
    icu_segmenter_data::impl_segmenter_sentence_v1!(Baked);
    icu_segmenter_data::impl_segmenter_word_v1!(Baked);

    // LSTM data is additionally gated behind the `lstm` feature.
    #[cfg(feature = "lstm")]
    icu_segmenter_data::impl_segmenter_lstm_wl_auto_v1!(Baked);
};
53
/// The latest minimum set of keys required by this component.
///
/// Only available with the `datagen` feature. The list contains the `KEY` of each
/// dictionary, rule-based (grapheme, line, sentence, word) and LSTM data marker.
#[cfg(feature = "datagen")]
pub const KEYS: &[DataKey] = &[
    // Dictionary-based segmentation
    DictionaryForWordLineExtendedV1Marker::KEY,
    DictionaryForWordOnlyAutoV1Marker::KEY,
    // Rule-based and LSTM segmentation
    GraphemeClusterBreakDataV1Marker::KEY,
    LineBreakDataV1Marker::KEY,
    LstmForWordLineAutoV1Marker::KEY,
    SentenceBreakDataV1Marker::KEY,
    WordBreakDataV1Marker::KEY,
];
65
66/// Pre-processed Unicode data in the form of tables to be used for rule-based breaking.
67///
68/// <div class="stab unstable">
69/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
70/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
71/// to be stable, their Rust representation might not be. Use with caution.
72/// </div>
73#[icu_provider::data_struct(
74    marker(LineBreakDataV1Marker, "segmenter/line@1", singleton),
75    marker(WordBreakDataV1Marker, "segmenter/word@1", singleton),
76    marker(GraphemeClusterBreakDataV1Marker, "segmenter/grapheme@1", singleton),
77    marker(SentenceBreakDataV1Marker, "segmenter/sentence@1", singleton)
78)]
79#[derive(Debug, PartialEq, Clone)]
80#[cfg_attr(
81    feature = "datagen",
82    derive(serde::Serialize,databake::Bake),
83    databake(path = icu_segmenter::provider),
84)]
85#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
86pub struct RuleBreakDataV1<'data> {
87    /// Property table.
88    #[cfg_attr(feature = "serde", serde(borrow))]
89    pub property_table: CodePointTrie<'data, u8>,
90
91    /// Break state table.
92    #[cfg_attr(feature = "serde", serde(borrow))]
93    pub break_state_table: ZeroVec<'data, BreakState>,
94
95    /// Word type table. Only used for word segmenter.
96    #[cfg_attr(feature = "serde", serde(borrow, rename = "rule_status_table"))]
97    pub word_type_table: ZeroVec<'data, WordType>,
98
99    /// Number of properties; should be the square root of the length of [`Self::break_state_table`].
100    pub property_count: u8,
101
102    /// The index of the last simple state for [`Self::break_state_table`]. (A simple state has no
103    /// `left` nor `right` in SegmenterProperty).
104    pub last_codepoint_property: u8,
105
106    /// The index of SOT (start of text) state for [`Self::break_state_table`].
107    pub sot_property: u8,
108
109    /// The index of EOT (end of text) state [`Self::break_state_table`].
110    pub eot_property: u8,
111
112    /// The index of "SA" state (or 127 if the complex language isn't handled) for
113    /// [`Self::break_state_table`].
114    pub complex_property: u8,
115}
116
117/// char16trie data for dictionary break
118///
119/// <div class="stab unstable">
120/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
121/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
122/// to be stable, their Rust representation might not be. Use with caution.
123/// </div>
124#[icu_provider::data_struct(
125    DictionaryForWordOnlyAutoV1Marker = "segmenter/dictionary/w_auto@1",
126    DictionaryForWordLineExtendedV1Marker = "segmenter/dictionary/wl_ext@1"
127)]
128#[derive(Debug, PartialEq, Clone)]
129#[cfg_attr(
130    feature = "datagen",
131    derive(serde::Serialize,databake::Bake),
132    databake(path = icu_segmenter::provider),
133)]
134#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
135pub struct UCharDictionaryBreakDataV1<'data> {
136    /// Dictionary data of char16trie.
137    #[cfg_attr(feature = "serde", serde(borrow))]
138    pub trie_data: ZeroVec<'data, u16>,
139}
140
141pub(crate) struct UCharDictionaryBreakDataV1Marker;
142
143impl DataMarker for UCharDictionaryBreakDataV1Marker {
144    type Yokeable = UCharDictionaryBreakDataV1<'static>;
145}
146
/// Break state
///
/// One entry of a rule-break state table. Each value is stored as a single byte; the
/// encoding noted on each variant is the one used by this type's `AsULE` implementation.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
#[derive(Debug, Clone, Copy, PartialEq)]
#[cfg_attr(
    feature = "datagen",
    derive(databake::Bake),
    databake(path = icu_segmenter::provider),
)]
pub enum BreakState {
    /// Break (encoded as `128`).
    Break,
    /// Keep rule (encoded as `255`).
    Keep,
    /// Non-matching rule (encoded as `254`).
    NoMatch,
    /// We have to look ahead one more character (encoded as the payload with bit `64` set).
    Intermediate(u8),
    /// Index of a state (encoded as the payload itself).
    Index(u8),
}
172
/// Serializes a [`BreakState`] through its one-byte ULE form.
///
/// Non-human-readable formats receive the raw `u8`; human-readable formats receive the
/// same byte reinterpreted as an `i8`.
#[cfg(feature = "datagen")]
impl serde::Serialize for BreakState {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let byte: u8 = zerovec::ule::AsULE::to_unaligned(*self);

        if !serializer.is_human_readable() {
            return serde::Serialize::serialize(&byte, serializer);
        }

        // would be nice to use the derive serde for JSON, but can't break serialization
        let signed = i8::from_le_bytes([byte]);
        serde::Serialize::serialize(&signed, serializer)
    }
}
187
/// Deserializes a [`BreakState`] from its one-byte ULE form.
///
/// Human-readable formats supply the byte as an `i8`, other formats as a `u8`; either way
/// the resulting byte is decoded with `AsULE::from_unaligned`.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for BreakState {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let byte: u8 = if deserializer.is_human_readable() {
            // Reinterpret the signed value as the raw byte it encodes.
            let signed = <i8 as serde::Deserialize<'de>>::deserialize(deserializer)?;
            signed.to_le_bytes()[0]
        } else {
            <u8 as serde::Deserialize<'de>>::deserialize(deserializer)?
        };

        Ok(zerovec::ule::AsULE::from_unaligned(byte))
    }
}
203
204impl zerovec::ule::AsULE for BreakState {
205    type ULE = u8;
206
207    fn to_unaligned(self) -> Self::ULE {
208        match self {
209            BreakState::Break => 128,
210            BreakState::Keep => 255,
211            BreakState::NoMatch => 254,
212            BreakState::Intermediate(i) => i | 64,
213            BreakState::Index(i) => i,
214        }
215    }
216
217    fn from_unaligned(unaligned: Self::ULE) -> Self {
218        match unaligned {
219            128 => BreakState::Break,
220            255 => BreakState::Keep,
221            254 => BreakState::NoMatch,
222            i if i & 64 != 0 => BreakState::Intermediate(i & !64),
223            i => BreakState::Index(i),
224        }
225    }
226}
227
/// Serializes a [`WordType`] as its `u8` discriminant.
///
/// # Panics
///
/// Panics when the serializer is not human-readable.
#[cfg(feature = "datagen")]
impl serde::Serialize for WordType {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if !serializer.is_human_readable() {
            unreachable!("only used as ULE")
        }

        let discriminant = *self as u8;
        serde::Serialize::serialize(&discriminant, serializer)
    }
}
241
/// Placeholder `Bake` implementation for [`WordType`].
///
/// # Panics
///
/// `bake` always panics.
#[cfg(feature = "datagen")]
impl databake::Bake for WordType {
    fn bake(&self, _env: &databake::CrateEnv) -> databake::TokenStream {
        unreachable!("only used as ULE")
    }
}
248
/// Deserializes a [`WordType`] from its `u8` discriminant.
///
/// Only human-readable formats are supported: `0` is [`WordType::None`], `1` is
/// [`WordType::Number`] and `2` is [`WordType::Letter`].
///
/// # Errors
///
/// Returns an error when the deserializer is not human-readable, when the underlying
/// `u8` cannot be read, or when the value is not one of the discriminants above.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for WordType {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use serde::de::Error;

        // This impl is public, so a caller can hand it a binary deserializer directly.
        // Report that as an error instead of panicking on external input.
        if !deserializer.is_human_readable() {
            return Err(D::Error::custom(
                "WordType can only be deserialized from human-readable formats",
            ));
        }

        match <u8 as serde::Deserialize<'de>>::deserialize(deserializer)? {
            0 => Ok(WordType::None),
            1 => Ok(WordType::Number),
            2 => Ok(WordType::Letter),
            _ => Err(D::Error::custom("invalid value")),
        }
    }
}