rustybuzz/hb/
unicode.rs

1use core::convert::TryFrom;
2
3pub use unicode_ccc::CanonicalCombiningClass;
4// TODO: prefer unic-ucd-normal::CanonicalCombiningClass
5pub use unicode_properties::GeneralCategory as hb_unicode_general_category_t;
6
7use crate::Script;
8
9// Space estimates based on:
10// https://unicode.org/charts/PDF/U2000.pdf
11// https://docs.microsoft.com/en-us/typography/develop/character-design-standards/whitespace
pub mod hb_unicode_funcs_t {
    /// Tag naming the kind of fallback a space character maps to.
    pub type space_t = u8;

    /// The character is not a space that can use a fallback.
    pub const NOT_SPACE: space_t = 0;

    // Em-based tags: `SPACE_EM_<N>` carries the value `N` (and `SPACE_EM` is 1).
    pub const SPACE_EM: space_t = 1;
    pub const SPACE_EM_2: space_t = 2;
    pub const SPACE_EM_3: space_t = 3;
    pub const SPACE_EM_4: space_t = 4;
    pub const SPACE_EM_5: space_t = 5;
    pub const SPACE_EM_6: space_t = 6;
    pub const SPACE_EM_16: space_t = 16;

    /// 4/18th of an EM!
    pub const SPACE_4_EM_18: space_t = 17;

    // The remaining tags are plain identifiers numbered consecutively after 17.
    pub const SPACE: space_t = 18;
    pub const SPACE_FIGURE: space_t = 19;
    pub const SPACE_PUNCTUATION: space_t = 20;
    pub const SPACE_NARROW: space_t = 21;
}
28
/// Replacement values for selected canonical combining classes.
///
/// Each `CCC<n>` constant is the value stored at index `n` of the
/// `MODIFIED_COMBINING_CLASS` table, i.e. the class that is used in place of
/// canonical combining class `n`.  Constants whose value equals `n` leave the
/// class unchanged and only name the slot.
// NOTE(review): `dead_code` is presumably allowed because not every constant is
// referenced outside the table — confirm before removing the attribute.
#[allow(dead_code)]
pub mod modified_combining_class {
    // Hebrew
    //
    // We permute the "fixed-position" classes 10-26 into the order
    // described in the SBL Hebrew manual:
    //
    // https://www.sbl-site.org/Fonts/SBLHebrewUserManual1.5x.pdf
    //
    // (as recommended by:
    //  https://forum.fontlab.com/archive-old-microsoft-volt-group/vista-and-diacritic-ordering/msg22823/)
    //
    // More details here:
    // https://bugzilla.mozilla.org/show_bug.cgi?id=662055
    pub const CCC10: u8 = 22; // sheva
    pub const CCC11: u8 = 15; // hataf segol
    pub const CCC12: u8 = 16; // hataf patah
    pub const CCC13: u8 = 17; // hataf qamats
    pub const CCC14: u8 = 23; // hiriq
    pub const CCC15: u8 = 18; // tsere
    pub const CCC16: u8 = 19; // segol
    pub const CCC17: u8 = 20; // patah
    pub const CCC18: u8 = 21; // qamats & qamats qatan
    pub const CCC19: u8 = 14; // holam & holam haser for vav
    pub const CCC20: u8 = 24; // qubuts
    pub const CCC21: u8 = 12; // dagesh
    pub const CCC22: u8 = 25; // meteg
    pub const CCC23: u8 = 13; // rafe
    pub const CCC24: u8 = 10; // shin dot
    pub const CCC25: u8 = 11; // sin dot
    pub const CCC26: u8 = 26; // point varika

    // Arabic
    //
    // Modify to move Shadda (ccc=33) before other marks.  See:
    // https://unicode.org/faq/normalization.html#8
    // https://unicode.org/faq/normalization.html#9
    //
    // Shadda takes 27 and the classes 27..=32 each shift up by one.
    pub const CCC27: u8 = 28; // fathatan
    pub const CCC28: u8 = 29; // dammatan
    pub const CCC29: u8 = 30; // kasratan
    pub const CCC30: u8 = 31; // fatha
    pub const CCC31: u8 = 32; // damma
    pub const CCC32: u8 = 33; // kasra
    pub const CCC33: u8 = 27; // shadda
    pub const CCC34: u8 = 34; // sukun
    pub const CCC35: u8 = 35; // superscript alef

    // Syriac
    pub const CCC36: u8 = 36; // superscript alaph

    // Telugu
    //
    // Modify Telugu length marks (ccc=84, ccc=91).
    // These are the only matras in the main Indic scripts range that have
    // a non-zero ccc.  That makes them reorder with the Halant that is
    // ccc=9.  Just zero them, we don't need them in our Indic shaper.
    pub const CCC84: u8 = 0; // length mark
    pub const CCC91: u8 = 0; // ai length mark

    // Thai
    //
    // Modify U+0E38 and U+0E39 (ccc=103) to be reordered before U+0E3A (ccc=9).
    // Assign 3, which is unassigned otherwise.
    // Uniscribe does this reordering too.
    pub const CCC103: u8 = 3; // sara u / sara uu
    pub const CCC107: u8 = 107; // mai *

    // Lao
    pub const CCC118: u8 = 118; // sign u / sign uu
    pub const CCC122: u8 = 122; // mai *

    // Tibetan
    //
    // In case of multiple vowel-signs, use u first (but after achung)
    // this allows Dzongkha multi-vowel shortcuts to render correctly
    //
    // Note that 130 maps to 132 while 132 maps to 131, so "sign u" sorts
    // ahead of "sign i".
    pub const CCC129: u8 = 129; // sign aa
    pub const CCC130: u8 = 132; // sign i
    pub const CCC132: u8 = 131; // sign u
}
108
/// Lookup table from canonical combining class to modified combining class.
///
/// The table is indexed by the numeric canonical combining class (0..=255);
/// the entry at index `n` is the class to use instead.  Most entries are the
/// identity (`n` itself, written either as a literal or as the matching
/// `CanonicalCombiningClass` variant); the script-specific overrides come
/// from the `modified_combining_class::CCC<n>` constants.
///
/// The `[u8; 256]` type makes the compiler reject the table if an entry is
/// added or dropped by accident.
#[rustfmt::skip]
const MODIFIED_COMBINING_CLASS: &[u8; 256] = &[
    CanonicalCombiningClass::NotReordered as u8,
    CanonicalCombiningClass::Overlay as u8,
    2, 3, 4, 5, 6,
    CanonicalCombiningClass::Nukta as u8,
    CanonicalCombiningClass::KanaVoicing as u8,
    CanonicalCombiningClass::Virama as u8,

    // Hebrew
    modified_combining_class::CCC10,
    modified_combining_class::CCC11,
    modified_combining_class::CCC12,
    modified_combining_class::CCC13,
    modified_combining_class::CCC14,
    modified_combining_class::CCC15,
    modified_combining_class::CCC16,
    modified_combining_class::CCC17,
    modified_combining_class::CCC18,
    modified_combining_class::CCC19,
    modified_combining_class::CCC20,
    modified_combining_class::CCC21,
    modified_combining_class::CCC22,
    modified_combining_class::CCC23,
    modified_combining_class::CCC24,
    modified_combining_class::CCC25,
    modified_combining_class::CCC26,

    // Arabic
    modified_combining_class::CCC27,
    modified_combining_class::CCC28,
    modified_combining_class::CCC29,
    modified_combining_class::CCC30,
    modified_combining_class::CCC31,
    modified_combining_class::CCC32,
    modified_combining_class::CCC33,
    modified_combining_class::CCC34,
    modified_combining_class::CCC35,

    // Syriac
    modified_combining_class::CCC36,

    37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83,

    // Telugu
    modified_combining_class::CCC84,
    85, 86, 87, 88, 89, 90,
    modified_combining_class::CCC91,
    92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,

    // Thai
    modified_combining_class::CCC103,
    104, 105, 106,
    modified_combining_class::CCC107,
    108, 109, 110, 111, 112, 113, 114, 115, 116, 117,

    // Lao
    modified_combining_class::CCC118,
    119, 120, 121,
    modified_combining_class::CCC122,
    123, 124, 125, 126, 127, 128,

    // Tibetan
    modified_combining_class::CCC129,
    modified_combining_class::CCC130,
    131,
    modified_combining_class::CCC132,
    133, 134, 135, 136, 137, 138, 139,


    140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
    150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
    160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
    170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
    180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
    190, 191, 192, 193, 194, 195, 196, 197, 198, 199,

    CanonicalCombiningClass::AttachedBelowLeft as u8,
    201,
    CanonicalCombiningClass::AttachedBelow as u8,
    203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213,
    CanonicalCombiningClass::AttachedAbove as u8,
    215,
    CanonicalCombiningClass::AttachedAboveRight as u8,
    217,
    CanonicalCombiningClass::BelowLeft as u8,
    219,
    CanonicalCombiningClass::Below as u8,
    221,
    CanonicalCombiningClass::BelowRight as u8,
    223,
    CanonicalCombiningClass::Left as u8,
    225,
    CanonicalCombiningClass::Right as u8,
    227,
    CanonicalCombiningClass::AboveLeft as u8,
    229,
    CanonicalCombiningClass::Above as u8,
    231,
    CanonicalCombiningClass::AboveRight as u8,
    CanonicalCombiningClass::DoubleBelow as u8,
    CanonicalCombiningClass::DoubleAbove as u8,
    235, 236, 237, 238, 239,
    CanonicalCombiningClass::IotaSubscript as u8,
    241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
    255, // RB_UNICODE_COMBINING_CLASS_INVALID
];
219
/// Conversions and classification helpers for a Unicode general category.
pub trait GeneralCategoryExt {
    /// Returns the numeric (`u32`) code that represents this category.
    fn to_rb(&self) -> u32;
    /// Builds a category from its numeric code; the inverse of [`to_rb`](Self::to_rb).
    ///
    /// The implementation for `hb_unicode_general_category_t` in this file
    /// panics when `gc` is not one of the known codes.
    fn from_rb(gc: u32) -> Self;
    /// Whether the category is one of the mark categories.
    fn is_mark(&self) -> bool;
    /// Whether the category is one of the letter categories.
    fn is_letter(&self) -> bool;
}
226
227#[rustfmt::skip]
228impl GeneralCategoryExt for hb_unicode_general_category_t {
229    fn to_rb(&self) -> u32 {
230        match *self {
231            hb_unicode_general_category_t::ClosePunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION,
232            hb_unicode_general_category_t::ConnectorPunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION,
233            hb_unicode_general_category_t::Control => hb_gc::RB_UNICODE_GENERAL_CATEGORY_CONTROL,
234            hb_unicode_general_category_t::CurrencySymbol => hb_gc::RB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL,
235            hb_unicode_general_category_t::DashPunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION,
236            hb_unicode_general_category_t::DecimalNumber => hb_gc::RB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER,
237            hb_unicode_general_category_t::EnclosingMark => hb_gc::RB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK,
238            hb_unicode_general_category_t::FinalPunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION,
239            hb_unicode_general_category_t::Format => hb_gc::RB_UNICODE_GENERAL_CATEGORY_FORMAT,
240            hb_unicode_general_category_t::InitialPunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION,
241            hb_unicode_general_category_t::LetterNumber => hb_gc::RB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER,
242            hb_unicode_general_category_t::LineSeparator => hb_gc::RB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR,
243            hb_unicode_general_category_t::LowercaseLetter => hb_gc::RB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER,
244            hb_unicode_general_category_t::MathSymbol => hb_gc::RB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL,
245            hb_unicode_general_category_t::ModifierLetter => hb_gc::RB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER,
246            hb_unicode_general_category_t::ModifierSymbol => hb_gc::RB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL,
247            hb_unicode_general_category_t::NonspacingMark => hb_gc::RB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK,
248            hb_unicode_general_category_t::OpenPunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION,
249            hb_unicode_general_category_t::OtherLetter => hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER,
250            hb_unicode_general_category_t::OtherNumber => hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER,
251            hb_unicode_general_category_t::OtherPunctuation => hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION,
252            hb_unicode_general_category_t::OtherSymbol => hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL,
253            hb_unicode_general_category_t::ParagraphSeparator => hb_gc::RB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR,
254            hb_unicode_general_category_t::PrivateUse => hb_gc::RB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE,
255            hb_unicode_general_category_t::SpaceSeparator => hb_gc::RB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR,
256            hb_unicode_general_category_t::SpacingMark => hb_gc::RB_UNICODE_GENERAL_CATEGORY_SPACING_MARK,
257            hb_unicode_general_category_t::Surrogate => hb_gc::RB_UNICODE_GENERAL_CATEGORY_SURROGATE,
258            hb_unicode_general_category_t::TitlecaseLetter => hb_gc::RB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER,
259            hb_unicode_general_category_t::Unassigned => hb_gc::RB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,
260            hb_unicode_general_category_t::UppercaseLetter => hb_gc::RB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER
261        }
262    }
263
264    fn from_rb(gc: u32) -> Self {
265        match gc {
266            hb_gc::RB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION => hb_unicode_general_category_t::ClosePunctuation,
267            hb_gc::RB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION => hb_unicode_general_category_t::ConnectorPunctuation,
268            hb_gc::RB_UNICODE_GENERAL_CATEGORY_CONTROL => hb_unicode_general_category_t::Control,
269            hb_gc::RB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL => hb_unicode_general_category_t::CurrencySymbol,
270            hb_gc::RB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION => hb_unicode_general_category_t::DashPunctuation,
271            hb_gc::RB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER => hb_unicode_general_category_t::DecimalNumber,
272            hb_gc::RB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK => hb_unicode_general_category_t::EnclosingMark,
273            hb_gc::RB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION => hb_unicode_general_category_t::FinalPunctuation,
274            hb_gc::RB_UNICODE_GENERAL_CATEGORY_FORMAT => hb_unicode_general_category_t::Format,
275            hb_gc::RB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION => hb_unicode_general_category_t::InitialPunctuation,
276            hb_gc::RB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER => hb_unicode_general_category_t::LetterNumber,
277            hb_gc::RB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR => hb_unicode_general_category_t::LineSeparator,
278            hb_gc::RB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER => hb_unicode_general_category_t::LowercaseLetter,
279            hb_gc::RB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL => hb_unicode_general_category_t::MathSymbol,
280            hb_gc::RB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER => hb_unicode_general_category_t::ModifierLetter,
281            hb_gc::RB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL => hb_unicode_general_category_t::ModifierSymbol,
282            hb_gc::RB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK => hb_unicode_general_category_t::NonspacingMark,
283            hb_gc::RB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION => hb_unicode_general_category_t::OpenPunctuation,
284            hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER => hb_unicode_general_category_t::OtherLetter,
285            hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER => hb_unicode_general_category_t::OtherNumber,
286            hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION => hb_unicode_general_category_t::OtherPunctuation,
287            hb_gc::RB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL => hb_unicode_general_category_t::OtherSymbol,
288            hb_gc::RB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR => hb_unicode_general_category_t::ParagraphSeparator,
289            hb_gc::RB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE => hb_unicode_general_category_t::PrivateUse,
290            hb_gc::RB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR => hb_unicode_general_category_t::SpaceSeparator,
291            hb_gc::RB_UNICODE_GENERAL_CATEGORY_SPACING_MARK => hb_unicode_general_category_t::SpacingMark,
292            hb_gc::RB_UNICODE_GENERAL_CATEGORY_SURROGATE => hb_unicode_general_category_t::Surrogate,
293            hb_gc::RB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER => hb_unicode_general_category_t::TitlecaseLetter,
294            hb_gc::RB_UNICODE_GENERAL_CATEGORY_UNASSIGNED => hb_unicode_general_category_t::Unassigned,
295            hb_gc::RB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER => hb_unicode_general_category_t::UppercaseLetter,
296            _ => unreachable!()
297        }
298    }
299
300    fn is_mark(&self) -> bool {
301        matches!(*self, 
302            hb_unicode_general_category_t::SpacingMark |
303            hb_unicode_general_category_t::EnclosingMark |
304            hb_unicode_general_category_t::NonspacingMark)
305    }
306
307    fn is_letter(&self) -> bool {
308        matches!(*self, 
309            hb_unicode_general_category_t::LowercaseLetter |
310            hb_unicode_general_category_t::ModifierLetter |
311            hb_unicode_general_category_t::OtherLetter |
312            hb_unicode_general_category_t::TitlecaseLetter |
313            hb_unicode_general_category_t::UppercaseLetter)
314    }
315}
316
/// Unicode property lookups used by the shaper, implemented for `char`.
pub trait CharExt {
    /// Returns the script the character is mapped to.
    fn script(self) -> Script;
    /// Returns the character's Unicode general category.
    fn general_category(self) -> hb_unicode_general_category_t;
    /// Returns the space tag (one of the `hb_unicode_funcs_t` constants) for
    /// the character, or `NOT_SPACE` when it has no fallback.
    fn space_fallback(self) -> hb_unicode_funcs_t::space_t;
    /// Returns the character's combining class after the table-driven and
    /// per-character adjustments made in this file.
    fn modified_combining_class(self) -> u8;
    /// Returns the character's bidi-mirrored counterpart, if it has one.
    fn mirrored(self) -> Option<char>;
    /// Whether the character is in the generated extended-pictographic emoji list.
    fn is_emoji_extended_pictographic(self) -> bool;
    /// Whether the character is treated as a default-ignorable code point.
    // NOTE(review): the implementation's doc comment lists deliberate
    // exceptions (e.g. the Hangul fillers) — see it for the exact set.
    fn is_default_ignorable(self) -> bool;
    /// Whether the character is a variation selector.
    // NOTE(review): implementation not visible from this part of the file;
    // description is taken from the method name — confirm.
    fn is_variation_selector(self) -> bool;
    /// Presumably the vertical-presentation counterpart of the character, if
    /// any — TODO confirm against the implementation, which is not visible here.
    fn vertical(self) -> Option<char>;
}
328
329impl CharExt for char {
330    fn script(self) -> Script {
331        use crate::script;
332        use unicode_script as us;
333
334        match unicode_script::UnicodeScript::script(&self) {
335            us::Script::Common => script::COMMON,
336            us::Script::Inherited => script::INHERITED,
337            us::Script::Adlam => script::ADLAM,
338            us::Script::Ahom => script::AHOM,
339            us::Script::Anatolian_Hieroglyphs => script::ANATOLIAN_HIEROGLYPHS,
340            us::Script::Arabic => script::ARABIC,
341            us::Script::Armenian => script::ARMENIAN,
342            us::Script::Avestan => script::AVESTAN,
343            us::Script::Balinese => script::BALINESE,
344            us::Script::Bamum => script::BAMUM,
345            us::Script::Bassa_Vah => script::BASSA_VAH,
346            us::Script::Batak => script::BATAK,
347            us::Script::Bengali => script::BENGALI,
348            us::Script::Bhaiksuki => script::BHAIKSUKI,
349            us::Script::Bopomofo => script::BOPOMOFO,
350            us::Script::Brahmi => script::BRAHMI,
351            us::Script::Braille => script::BRAILLE,
352            us::Script::Buginese => script::BUGINESE,
353            us::Script::Buhid => script::BUHID,
354            us::Script::Canadian_Aboriginal => script::CANADIAN_SYLLABICS,
355            us::Script::Carian => script::CARIAN,
356            us::Script::Caucasian_Albanian => script::CAUCASIAN_ALBANIAN,
357            us::Script::Chakma => script::CHAKMA,
358            us::Script::Cham => script::CHAM,
359            us::Script::Cherokee => script::CHEROKEE,
360            us::Script::Chorasmian => script::CHORASMIAN,
361            us::Script::Coptic => script::COPTIC,
362            us::Script::Cuneiform => script::CUNEIFORM,
363            us::Script::Cypriot => script::CYPRIOT,
364            us::Script::Cyrillic => script::CYRILLIC,
365            us::Script::Deseret => script::DESERET,
366            us::Script::Devanagari => script::DEVANAGARI,
367            us::Script::Dives_Akuru => script::DIVES_AKURU,
368            us::Script::Dogra => script::DOGRA,
369            us::Script::Duployan => script::DUPLOYAN,
370            us::Script::Egyptian_Hieroglyphs => script::EGYPTIAN_HIEROGLYPHS,
371            us::Script::Elbasan => script::ELBASAN,
372            us::Script::Elymaic => script::ELYMAIC,
373            us::Script::Ethiopic => script::ETHIOPIC,
374            us::Script::Georgian => script::GEORGIAN,
375            us::Script::Glagolitic => script::GLAGOLITIC,
376            us::Script::Gothic => script::GOTHIC,
377            us::Script::Grantha => script::GRANTHA,
378            us::Script::Greek => script::GREEK,
379            us::Script::Gujarati => script::GUJARATI,
380            us::Script::Gunjala_Gondi => script::GUNJALA_GONDI,
381            us::Script::Gurmukhi => script::GURMUKHI,
382            us::Script::Han => script::HAN,
383            us::Script::Hangul => script::HANGUL,
384            us::Script::Hanifi_Rohingya => script::HANIFI_ROHINGYA,
385            us::Script::Hanunoo => script::HANUNOO,
386            us::Script::Hatran => script::HATRAN,
387            us::Script::Hebrew => script::HEBREW,
388            us::Script::Hiragana => script::HIRAGANA,
389            us::Script::Imperial_Aramaic => script::IMPERIAL_ARAMAIC,
390            us::Script::Inscriptional_Pahlavi => script::INSCRIPTIONAL_PAHLAVI,
391            us::Script::Inscriptional_Parthian => script::INSCRIPTIONAL_PARTHIAN,
392            us::Script::Javanese => script::JAVANESE,
393            us::Script::Kaithi => script::KAITHI,
394            us::Script::Kannada => script::KANNADA,
395            us::Script::Katakana => script::KATAKANA,
396            us::Script::Kayah_Li => script::KAYAH_LI,
397            us::Script::Kharoshthi => script::KHAROSHTHI,
398            us::Script::Khitan_Small_Script => script::KHITAN_SMALL_SCRIPT,
399            us::Script::Khmer => script::KHMER,
400            us::Script::Khojki => script::KHOJKI,
401            us::Script::Khudawadi => script::KHUDAWADI,
402            us::Script::Lao => script::LAO,
403            us::Script::Latin => script::LATIN,
404            us::Script::Lepcha => script::LEPCHA,
405            us::Script::Limbu => script::LIMBU,
406            us::Script::Linear_A => script::LINEAR_A,
407            us::Script::Linear_B => script::LINEAR_B,
408            us::Script::Lisu => script::LISU,
409            us::Script::Lycian => script::LYCIAN,
410            us::Script::Lydian => script::LYDIAN,
411            us::Script::Mahajani => script::MAHAJANI,
412            us::Script::Makasar => script::MAKASAR,
413            us::Script::Malayalam => script::MALAYALAM,
414            us::Script::Mandaic => script::MANDAIC,
415            us::Script::Manichaean => script::MANICHAEAN,
416            us::Script::Marchen => script::MARCHEN,
417            us::Script::Masaram_Gondi => script::MASARAM_GONDI,
418            us::Script::Medefaidrin => script::MEDEFAIDRIN,
419            us::Script::Meetei_Mayek => script::MEETEI_MAYEK,
420            us::Script::Mende_Kikakui => script::MENDE_KIKAKUI,
421            us::Script::Meroitic_Cursive => script::MEROITIC_CURSIVE,
422            us::Script::Meroitic_Hieroglyphs => script::MEROITIC_HIEROGLYPHS,
423            us::Script::Miao => script::MIAO,
424            us::Script::Modi => script::MODI,
425            us::Script::Mongolian => script::MONGOLIAN,
426            us::Script::Mro => script::MRO,
427            us::Script::Multani => script::MULTANI,
428            us::Script::Myanmar => script::MYANMAR,
429            us::Script::Nabataean => script::NABATAEAN,
430            us::Script::Nandinagari => script::NANDINAGARI,
431            us::Script::New_Tai_Lue => script::NEW_TAI_LUE,
432            us::Script::Newa => script::NEWA,
433            us::Script::Nko => script::NKO,
434            us::Script::Nushu => script::NUSHU,
435            us::Script::Nyiakeng_Puachue_Hmong => script::NYIAKENG_PUACHUE_HMONG,
436            us::Script::Ogham => script::OGHAM,
437            us::Script::Ol_Chiki => script::OL_CHIKI,
438            us::Script::Old_Hungarian => script::OLD_HUNGARIAN,
439            us::Script::Old_Italic => script::OLD_ITALIC,
440            us::Script::Old_North_Arabian => script::OLD_NORTH_ARABIAN,
441            us::Script::Old_Permic => script::OLD_PERMIC,
442            us::Script::Old_Persian => script::OLD_PERSIAN,
443            us::Script::Old_Sogdian => script::OLD_SOGDIAN,
444            us::Script::Old_South_Arabian => script::OLD_SOUTH_ARABIAN,
445            us::Script::Old_Turkic => script::OLD_TURKIC,
446            us::Script::Oriya => script::ORIYA,
447            us::Script::Osage => script::OSAGE,
448            us::Script::Osmanya => script::OSMANYA,
449            us::Script::Pahawh_Hmong => script::PAHAWH_HMONG,
450            us::Script::Palmyrene => script::PALMYRENE,
451            us::Script::Pau_Cin_Hau => script::PAU_CIN_HAU,
452            us::Script::Phags_Pa => script::PHAGS_PA,
453            us::Script::Phoenician => script::PHOENICIAN,
454            us::Script::Psalter_Pahlavi => script::PSALTER_PAHLAVI,
455            us::Script::Rejang => script::REJANG,
456            us::Script::Runic => script::RUNIC,
457            us::Script::Samaritan => script::SAMARITAN,
458            us::Script::Saurashtra => script::SAURASHTRA,
459            us::Script::Sharada => script::SHARADA,
460            us::Script::Shavian => script::SHAVIAN,
461            us::Script::Siddham => script::SIDDHAM,
462            us::Script::SignWriting => script::SIGNWRITING,
463            us::Script::Sinhala => script::SINHALA,
464            us::Script::Sogdian => script::SOGDIAN,
465            us::Script::Sora_Sompeng => script::SORA_SOMPENG,
466            us::Script::Soyombo => script::SOYOMBO,
467            us::Script::Sundanese => script::SUNDANESE,
468            us::Script::Syloti_Nagri => script::SYLOTI_NAGRI,
469            us::Script::Syriac => script::SYRIAC,
470            us::Script::Tagalog => script::TAGALOG,
471            us::Script::Tagbanwa => script::TAGBANWA,
472            us::Script::Tai_Le => script::TAI_LE,
473            us::Script::Tai_Tham => script::TAI_THAM,
474            us::Script::Tai_Viet => script::TAI_VIET,
475            us::Script::Takri => script::TAKRI,
476            us::Script::Tamil => script::TAMIL,
477            us::Script::Tangut => script::TANGUT,
478            us::Script::Telugu => script::TELUGU,
479            us::Script::Thaana => script::THAANA,
480            us::Script::Thai => script::THAI,
481            us::Script::Tibetan => script::TIBETAN,
482            us::Script::Tifinagh => script::TIFINAGH,
483            us::Script::Tirhuta => script::TIRHUTA,
484            us::Script::Ugaritic => script::UGARITIC,
485            us::Script::Vai => script::VAI,
486            us::Script::Wancho => script::WANCHO,
487            us::Script::Warang_Citi => script::WARANG_CITI,
488            us::Script::Yezidi => script::YEZIDI,
489            us::Script::Yi => script::YI,
490            us::Script::Zanabazar_Square => script::ZANABAZAR_SQUARE,
491            _ => script::UNKNOWN,
492        }
493    }
494
495    fn general_category(self) -> hb_unicode_general_category_t {
496        unicode_properties::general_category::UnicodeGeneralCategory::general_category(self)
497    }
498
499    fn space_fallback(self) -> hb_unicode_funcs_t::space_t {
500        use hb_unicode_funcs_t::*;
501
502        // All GC=Zs chars that can use a fallback.
503        match self {
504            '\u{0020}' => SPACE,             // SPACE
505            '\u{00A0}' => SPACE,             // NO-BREAK SPACE
506            '\u{2000}' => SPACE_EM_2,        // EN QUAD
507            '\u{2001}' => SPACE_EM,          // EM QUAD
508            '\u{2002}' => SPACE_EM_2,        // EN SPACE
509            '\u{2003}' => SPACE_EM,          // EM SPACE
510            '\u{2004}' => SPACE_EM_3,        // THREE-PER-EM SPACE
511            '\u{2005}' => SPACE_EM_4,        // FOUR-PER-EM SPACE
512            '\u{2006}' => SPACE_EM_6,        // SIX-PER-EM SPACE
513            '\u{2007}' => SPACE_FIGURE,      // FIGURE SPACE
514            '\u{2008}' => SPACE_PUNCTUATION, // PUNCTUATION SPACE
515            '\u{2009}' => SPACE_EM_5,        // THIN SPACE
516            '\u{200A}' => SPACE_EM_16,       // HAIR SPACE
517            '\u{202F}' => SPACE_NARROW,      // NARROW NO-BREAK SPACE
518            '\u{205F}' => SPACE_4_EM_18,     // MEDIUM MATHEMATICAL SPACE
519            '\u{3000}' => SPACE_EM,          // IDEOGRAPHIC SPACE
520            _ => NOT_SPACE,                  // OGHAM SPACE MARK
521        }
522    }
523
524    fn modified_combining_class(self) -> u8 {
525        let u = self;
526
527        // Reorder SAKOT to ensure it comes after any tone marks.
528        if u == '\u{1A60}' {
529            return 254;
530        }
531
532        // Reorder PADMA to ensure it comes after any vowel marks.
533        if u == '\u{0FC6}' {
534            return 254;
535        }
536
537        // Reorder TSA -PHRU to reorder before U+0F74
538        if u == '\u{0F39}' {
539            return 127;
540        }
541
542        let k = unicode_ccc::get_canonical_combining_class(u);
543        MODIFIED_COMBINING_CLASS[k as usize]
544    }
545
546    fn mirrored(self) -> Option<char> {
547        unicode_bidi_mirroring::get_mirrored(self)
548    }
549
    /// Returns `true` when the character's code point falls in one of the
    /// listed values or inclusive ranges, and `false` otherwise.
    ///
    /// The list is machine-generated (see the comment below); regenerate it
    /// with the script rather than editing the arms by hand.
    // NOTE(review): going by the generator's name, the list presumably mirrors
    // the Unicode Extended_Pictographic property — confirm the Unicode version.
    fn is_emoji_extended_pictographic(self) -> bool {
        // Generated by scripts/gen-unicode-is-emoji-ext-pict.py
        match self as u32 {
            0x00A9 => true,
            0x00AE => true,
            0x203C => true,
            0x2049 => true,
            0x2122 => true,
            0x2139 => true,
            0x2194..=0x2199 => true,
            0x21A9..=0x21AA => true,
            0x231A..=0x231B => true,
            0x2328 => true,
            0x2388 => true,
            0x23CF => true,
            0x23E9..=0x23F3 => true,
            0x23F8..=0x23FA => true,
            0x24C2 => true,
            0x25AA..=0x25AB => true,
            0x25B6 => true,
            0x25C0 => true,
            0x25FB..=0x25FE => true,
            0x2600..=0x2605 => true,
            0x2607..=0x2612 => true,
            0x2614..=0x2685 => true,
            0x2690..=0x2705 => true,
            0x2708..=0x2712 => true,
            0x2714 => true,
            0x2716 => true,
            0x271D => true,
            0x2721 => true,
            0x2728 => true,
            0x2733..=0x2734 => true,
            0x2744 => true,
            0x2747 => true,
            0x274C => true,
            0x274E => true,
            0x2753..=0x2755 => true,
            0x2757 => true,
            0x2763..=0x2767 => true,
            0x2795..=0x2797 => true,
            0x27A1 => true,
            0x27B0 => true,
            0x27BF => true,
            0x2934..=0x2935 => true,
            0x2B05..=0x2B07 => true,
            0x2B1B..=0x2B1C => true,
            0x2B50 => true,
            0x2B55 => true,
            0x3030 => true,
            0x303D => true,
            0x3297 => true,
            0x3299 => true,
            0x1F000..=0x1F0FF => true,
            0x1F10D..=0x1F10F => true,
            0x1F12F => true,
            0x1F16C..=0x1F171 => true,
            0x1F17E..=0x1F17F => true,
            0x1F18E => true,
            0x1F191..=0x1F19A => true,
            0x1F1AD..=0x1F1E5 => true,
            0x1F201..=0x1F20F => true,
            0x1F21A => true,
            0x1F22F => true,
            0x1F232..=0x1F23A => true,
            0x1F23C..=0x1F23F => true,
            0x1F249..=0x1F3FA => true,
            0x1F400..=0x1F53D => true,
            0x1F546..=0x1F64F => true,
            0x1F680..=0x1F6FF => true,
            0x1F774..=0x1F77F => true,
            0x1F7D5..=0x1F7FF => true,
            0x1F80C..=0x1F80F => true,
            0x1F848..=0x1F84F => true,
            0x1F85A..=0x1F85F => true,
            0x1F888..=0x1F88F => true,
            0x1F8AE..=0x1F8FF => true,
            0x1F90C..=0x1F93A => true,
            0x1F93C..=0x1F945 => true,
            0x1F947..=0x1FAFF => true,
            0x1FC00..=0x1FFFD => true,
            _ => false,
        }
    }
634
635    /// Default_Ignorable codepoints:
636    ///
637    /// Note: While U+115F, U+1160, U+3164 and U+FFA0 are Default_Ignorable,
638    /// we do NOT want to hide them, as the way Uniscribe has implemented them
639    /// is with regular spacing glyphs, and that's the way fonts are made to work.
640    /// As such, we make exceptions for those four.
641    /// Also ignoring U+1BCA0..1BCA3. https://github.com/harfbuzz/harfbuzz/issues/503
642    ///
643    /// Unicode 14.0:
644    /// $ grep '; Default_Ignorable_Code_Point ' DerivedCoreProperties.txt | sed 's/;.*#/#/'
645    /// 00AD          # Cf       SOFT HYPHEN
646    /// 034F          # Mn       COMBINING GRAPHEME JOINER
647    /// 061C          # Cf       ARABIC LETTER MARK
648    /// 115F..1160    # Lo   [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER
649    /// 17B4..17B5    # Mn   [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
650    /// 180B..180D    # Mn   [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
651    /// 180E          # Cf       MONGOLIAN VOWEL SEPARATOR
652    /// 180F          # Mn       MONGOLIAN FREE VARIATION SELECTOR FOUR
653    /// 200B..200F    # Cf   [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK
654    /// 202A..202E    # Cf   [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE
655    /// 2060..2064    # Cf   [5] WORD JOINER..INVISIBLE PLUS
656    /// 2065          # Cn       <reserved-2065>
657    /// 2066..206F    # Cf  [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES
658    /// 3164          # Lo       HANGUL FILLER
659    /// FE00..FE0F    # Mn  [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16
660    /// FEFF          # Cf       ZERO WIDTH NO-BREAK SPACE
661    /// FFA0          # Lo       HALFWIDTH HANGUL FILLER
662    /// FFF0..FFF8    # Cn   [9] <reserved-FFF0>..<reserved-FFF8>
663    /// 1BCA0..1BCA3  # Cf   [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
664    /// 1D173..1D17A  # Cf   [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
665    /// E0000         # Cn       <reserved-E0000>
666    /// E0001         # Cf       LANGUAGE TAG
667    /// E0002..E001F  # Cn  [30] <reserved-E0002>..<reserved-E001F>
668    /// E0020..E007F  # Cf  [96] TAG SPACE..CANCEL TAG
669    /// E0080..E00FF  # Cn [128] <reserved-E0080>..<reserved-E00FF>
670    /// E0100..E01EF  # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
671    /// E01F0..E0FFF  # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
672    fn is_default_ignorable(self) -> bool {
673        let ch = u32::from(self);
674        let plane = ch >> 16;
675        if plane == 0 {
676            // BMP
677            let page = ch >> 8;
678            match page {
679                0x00 => ch == 0x00AD,
680                0x03 => ch == 0x034F,
681                0x06 => ch == 0x061C,
682                0x17 => (0x17B4..=0x17B5).contains(&ch),
683                0x18 => (0x180B..=0x180E).contains(&ch),
684                0x20 => {
685                    (0x200B..=0x200F).contains(&ch)
686                        || (0x202A..=0x202E).contains(&ch)
687                        || (0x2060..=0x206F).contains(&ch)
688                }
689                0xFE => (0xFE00..=0xFE0F).contains(&ch) || ch == 0xFEFF,
690                0xFF => (0xFFF0..=0xFFF8).contains(&ch),
691                _ => false,
692            }
693        } else {
694            // Other planes
695            match plane {
696                0x01 => (0x1D173..=0x1D17A).contains(&ch),
697                0x0E => (0xE0000..=0xE0FFF).contains(&ch),
698                _ => false,
699            }
700        }
701    }
702
703    fn is_variation_selector(self) -> bool {
704        // U+180B..180D, U+180F MONGOLIAN FREE VARIATION SELECTORs are handled in the
705        //Arabic shaper. No need to match them here.
706        let ch = u32::from(self);
707        (0x0FE00..=0x0FE0F).contains(&ch) || // VARIATION SELECTOR - 1..16
708        (0xE0100..=0xE01EF).contains(&ch) // VARIATION SELECTOR - 17..256
709    }
710
711    fn vertical(self) -> Option<char> {
712        Some(match u32::from(self) >> 8 {
713            0x20 => match self {
714                '\u{2013}' => '\u{fe32}', // EN DASH
715                '\u{2014}' => '\u{fe31}', // EM DASH
716                '\u{2025}' => '\u{fe30}', // TWO DOT LEADER
717                '\u{2026}' => '\u{fe19}', // HORIZONTAL ELLIPSIS
718                _ => return None,
719            },
720            0x30 => match self {
721                '\u{3001}' => '\u{fe11}', // IDEOGRAPHIC COMMA
722                '\u{3002}' => '\u{fe12}', // IDEOGRAPHIC FULL STOP
723                '\u{3008}' => '\u{fe3f}', // LEFT ANGLE BRACKET
724                '\u{3009}' => '\u{fe40}', // RIGHT ANGLE BRACKET
725                '\u{300a}' => '\u{fe3d}', // LEFT DOUBLE ANGLE BRACKET
726                '\u{300b}' => '\u{fe3e}', // RIGHT DOUBLE ANGLE BRACKET
727                '\u{300c}' => '\u{fe41}', // LEFT CORNER BRACKET
728                '\u{300d}' => '\u{fe42}', // RIGHT CORNER BRACKET
729                '\u{300e}' => '\u{fe43}', // LEFT WHITE CORNER BRACKET
730                '\u{300f}' => '\u{fe44}', // RIGHT WHITE CORNER BRACKET
731                '\u{3010}' => '\u{fe3b}', // LEFT BLACK LENTICULAR BRACKET
732                '\u{3011}' => '\u{fe3c}', // RIGHT BLACK LENTICULAR BRACKET
733                '\u{3014}' => '\u{fe39}', // LEFT TORTOISE SHELL BRACKET
734                '\u{3015}' => '\u{fe3a}', // RIGHT TORTOISE SHELL BRACKET
735                '\u{3016}' => '\u{fe17}', // LEFT WHITE LENTICULAR BRACKET
736                '\u{3017}' => '\u{fe18}', // RIGHT WHITE LENTICULAR BRACKET
737                _ => return None,
738            },
739            0xfe => match self {
740                '\u{fe4f}' => '\u{fe34}', // WAVY LOW LINE
741                _ => return None,
742            },
743            0xff => match self {
744                '\u{ff01}' => '\u{fe15}', // FULLWIDTH EXCLAMATION MARK
745                '\u{ff08}' => '\u{fe35}', // FULLWIDTH LEFT PARENTHESIS
746                '\u{ff09}' => '\u{fe36}', // FULLWIDTH RIGHT PARENTHESIS
747                '\u{ff0c}' => '\u{fe10}', // FULLWIDTH COMMA
748                '\u{ff1a}' => '\u{fe13}', // FULLWIDTH COLON
749                '\u{ff1b}' => '\u{fe14}', // FULLWIDTH SEMICOLON
750                '\u{ff1f}' => '\u{fe16}', // FULLWIDTH QUESTION MARK
751                '\u{ff3b}' => '\u{fe47}', // FULLWIDTH LEFT SQUARE BRACKET
752                '\u{ff3d}' => '\u{fe48}', // FULLWIDTH RIGHT SQUARE BRACKET
753                '\u{ff3f}' => '\u{fe33}', // FULLWIDTH LOW LINE
754                '\u{ff5b}' => '\u{fe37}', // FULLWIDTH LEFT CURLY BRACKET
755                '\u{ff5d}' => '\u{fe38}', // FULLWIDTH RIGHT CURLY BRACKET
756                _ => return None,
757            },
758            _ => return None,
759        })
760    }
761}
762
// Constants for arithmetic Hangul syllable composition and decomposition.
// Each jamo class is described by the code point its indices are counted
// from (`*_BASE`) and the number of indices in the class (`*_COUNT`).

// Leading consonants (L).
const L_BASE: u32 = 0x1100;
const L_COUNT: u32 = 19;

// Vowels (V).
const V_BASE: u32 = 0x1161;
const V_COUNT: u32 = 21;

// Trailing consonants (T). Index 0 stands for "no trailing consonant".
const T_BASE: u32 = 0x11A7;
const T_COUNT: u32 = 28;

// Precomposed syllables (S): `N_COUNT` syllables per leading consonant,
// `S_COUNT` syllables in total, starting at `S_BASE`.
const S_BASE: u32 = 0xAC00;
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;
772
773pub fn compose(a: char, b: char) -> Option<char> {
774    if let Some(ab) = compose_hangul(a, b) {
775        return Some(ab);
776    }
777
778    let needle = (a as u64) << 32 | (b as u64);
779    super::unicode_norm::COMPOSITION_TABLE
780        .binary_search_by(|item| item.0.cmp(&needle))
781        .map(|idx| super::unicode_norm::COMPOSITION_TABLE[idx].1)
782        .ok()
783}
784
785fn compose_hangul(a: char, b: char) -> Option<char> {
786    let l = u32::from(a);
787    let v = u32::from(b);
788    if L_BASE <= l && l < (L_BASE + L_COUNT) && V_BASE <= v && v < (V_BASE + V_COUNT) {
789        let r = S_BASE + (l - L_BASE) * N_COUNT + (v - V_BASE) * T_COUNT;
790        Some(char::try_from(r).unwrap())
791    } else if S_BASE <= l
792        && l <= (S_BASE + S_COUNT - T_COUNT)
793        && T_BASE <= v
794        && v < (T_BASE + T_COUNT)
795        && (l - S_BASE) % T_COUNT == 0
796    {
797        let r = l + (v - T_BASE);
798        Some(char::try_from(r).unwrap())
799    } else {
800        None
801    }
802}
803
804pub fn decompose(ab: char) -> Option<(char, char)> {
805    if let Some(ab) = decompose_hangul(ab) {
806        return Some(ab);
807    }
808
809    super::unicode_norm::DECOMPOSITION_TABLE
810        .binary_search_by(|item| item.0.cmp(&ab))
811        .map(|idx| {
812            let chars = &super::unicode_norm::DECOMPOSITION_TABLE[idx];
813            (chars.1, chars.2.unwrap_or('\0'))
814        })
815        .ok()
816}
817
818pub fn decompose_hangul(ab: char) -> Option<(char, char)> {
819    let si = u32::from(ab).wrapping_sub(S_BASE);
820    if si >= S_COUNT {
821        return None;
822    }
823
824    let (a, b) = if si % T_COUNT != 0 {
825        // LV,T
826        (S_BASE + (si / T_COUNT) * T_COUNT, T_BASE + (si % T_COUNT))
827    } else {
828        // L,V
829        (L_BASE + (si / N_COUNT), V_BASE + (si % N_COUNT) / T_COUNT)
830    };
831
832    Some((char::try_from(a).unwrap(), char::try_from(b).unwrap()))
833}
834
#[cfg(test)]
mod tests {
    use crate::hb::unicode_norm::UNICODE_VERSION as NORM_VERSION;
    use unicode_bidi_mirroring::UNICODE_VERSION as BIDI_MIRRORING_VERSION;
    use unicode_ccc::UNICODE_VERSION as CCC_VERSION;
    use unicode_properties::UNICODE_VERSION as PROPERTIES_VERSION;
    use unicode_script::UNICODE_VERSION as SCRIPT_VERSION;

    /// Every Unicode data source used by this module must report the same
    /// Unicode version, so the tables cannot silently drift apart.
    #[test]
    fn check_unicode_version() {
        assert_eq!(BIDI_MIRRORING_VERSION, (16, 0, 0));
        assert_eq!(CCC_VERSION, (16, 0, 0));
        assert_eq!(PROPERTIES_VERSION, (16, 0, 0));
        assert_eq!(SCRIPT_VERSION, (16, 0, 0));
        assert_eq!(NORM_VERSION, (16, 0, 0));
    }
}
846
// TODO: remove
/// Numeric identifiers for the Unicode general categories, numbered 0 through 29.
pub mod hb_gc {
    // Other (C*)
    pub const RB_UNICODE_GENERAL_CATEGORY_CONTROL: u32 = 0;
    pub const RB_UNICODE_GENERAL_CATEGORY_FORMAT: u32 = 1;
    pub const RB_UNICODE_GENERAL_CATEGORY_UNASSIGNED: u32 = 2;
    pub const RB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE: u32 = 3;
    pub const RB_UNICODE_GENERAL_CATEGORY_SURROGATE: u32 = 4;

    // Letters (L*)
    pub const RB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER: u32 = 5;
    pub const RB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER: u32 = 6;
    pub const RB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER: u32 = 7;
    pub const RB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER: u32 = 8;
    pub const RB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER: u32 = 9;

    // Marks (M*)
    pub const RB_UNICODE_GENERAL_CATEGORY_SPACING_MARK: u32 = 10;
    pub const RB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK: u32 = 11;
    pub const RB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK: u32 = 12;

    // Numbers (N*)
    pub const RB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER: u32 = 13;
    pub const RB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER: u32 = 14;
    pub const RB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER: u32 = 15;

    // Punctuation (P*)
    pub const RB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: u32 = 16;
    pub const RB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION: u32 = 17;
    pub const RB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION: u32 = 18;
    pub const RB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION: u32 = 19;
    pub const RB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: u32 = 20;
    pub const RB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION: u32 = 21;
    pub const RB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION: u32 = 22;

    // Symbols (S*)
    pub const RB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL: u32 = 23;
    pub const RB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL: u32 = 24;
    pub const RB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL: u32 = 25;
    pub const RB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL: u32 = 26;

    // Separators (Z*)
    pub const RB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR: u32 = 27;
    pub const RB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR: u32 = 28;
    pub const RB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR: u32 = 29;
}