rustybuzz/hb/
ot_shaper_thai.rs

1use super::buffer::*;
2use super::ot_layout::*;
3use super::ot_shape_normalize::HB_OT_SHAPE_NORMALIZATION_MODE_AUTO;
4use super::ot_shape_plan::hb_ot_shape_plan_t;
5use super::ot_shaper::*;
6use super::unicode::hb_unicode_general_category_t;
7use super::{hb_font_t, script};
8
9pub const THAI_SHAPER: hb_ot_shaper_t = hb_ot_shaper_t {
10    collect_features: None,
11    override_features: None,
12    create_data: None,
13    preprocess_text: Some(preprocess_text),
14    postprocess_glyphs: None,
15    normalization_preference: HB_OT_SHAPE_NORMALIZATION_MODE_AUTO,
16    decompose: None,
17    compose: None,
18    setup_masks: None,
19    gpos_tag: None,
20    reorder_marks: None,
21    zero_width_marks: HB_OT_SHAPE_ZERO_WIDTH_MARKS_BY_GDEF_LATE,
22    fallback_position: false,
23};
24
/// Classification of a base character for the PUA fallback shaper.
///
/// The discriminants are used as indices into `ABOVE_START_STATE` and
/// `BELOW_START_STATE`, so their numeric values matter.
#[derive(Clone, Copy, PartialEq)]
enum Consonant {
    /// Any code point in U+0E01..=U+0E2E that is not in one of the sets below.
    NC = 0,
    /// U+0E1B, U+0E1D, U+0E1F.
    AC = 1,
    /// U+0E0D, U+0E10 (the bases that `RD_MAPPINGS` can replace).
    RC = 2,
    /// U+0E0E, U+0E0F.
    DC = 3,
    /// Everything outside U+0E01..=U+0E2E.
    NotConsonant = 4,
}

/// Returns the `Consonant` class of code point `u`.
fn get_consonant_type(u: u32) -> Consonant {
    // The specific sets are tested before the catch-all range that contains them.
    if matches!(u, 0x0E1B | 0x0E1D | 0x0E1F) {
        Consonant::AC
    } else if matches!(u, 0x0E0D | 0x0E10) {
        Consonant::RC
    } else if matches!(u, 0x0E0E | 0x0E0F) {
        Consonant::DC
    } else if (0x0E01..=0x0E2E).contains(&u) {
        Consonant::NC
    } else {
        Consonant::NotConsonant
    }
}
43
/// Classification of a combining mark for the PUA fallback shaper.
///
/// The first three discriminants select a column of `ABOVE_STATE_MACHINE`
/// and `BELOW_STATE_MACHINE`.
#[derive(Clone, Copy, PartialEq)]
enum Mark {
    /// U+0E31, U+0E34..=U+0E37, U+0E47, U+0E4D..=U+0E4E.
    AV = 0,
    /// U+0E38..=U+0E3A.
    BV = 1,
    /// U+0E48..=U+0E4C.
    T = 2,
    /// Any other code point.
    NotMark = 3,
}

/// Returns the `Mark` class of code point `u`.
fn get_mark_type(u: u32) -> Mark {
    let is_av = u == 0x0E31
        || (0x0E34..=0x0E37).contains(&u)
        || u == 0x0E47
        || (0x0E4D..=0x0E4E).contains(&u);

    if is_av {
        Mark::AV
    } else if (0x0E38..=0x0E3A).contains(&u) {
        Mark::BV
    } else if (0x0E48..=0x0E4C).contains(&u) {
        Mark::T
    } else {
        Mark::NotMark
    }
}
60
/// What the PUA fallback does to a glyph; each non-`NOP` action selects one
/// of the `*_MAPPINGS` tables in `pua_shape`.
#[derive(PartialEq, Clone, Copy)]
enum Action {
    /// Leave the glyph untouched.
    NOP,
    /// Shift combining-mark down.
    SD,
    /// Shift combining-mark left.
    SL,
    /// Shift combining-mark down-left.
    SDL,
    /// Remove descender from base.
    RD,
}
73
/// One row of a PUA substitution table.
#[derive(Clone, Copy)]
struct PuaMapping {
    /// Code point to be replaced.
    u: u16,
    /// Replacement that `pua_shape` tries first.
    win_pua: u16,
    /// Replacement that `pua_shape` tries when `win_pua` has no glyph.
    mac_pua: u16,
}

impl PuaMapping {
    /// Builds a row; `const` so it can be used in the static tables.
    const fn new(u: u16, win_pua: u16, mac_pua: u16) -> Self {
        Self { u, win_pua, mac_pua }
    }
}
90
91const SD_MAPPINGS: &[PuaMapping] = &[
92    PuaMapping::new(0x0E48, 0xF70A, 0xF88B), // MAI EK
93    PuaMapping::new(0x0E49, 0xF70B, 0xF88E), // MAI THO
94    PuaMapping::new(0x0E4A, 0xF70C, 0xF891), // MAI TRI
95    PuaMapping::new(0x0E4B, 0xF70D, 0xF894), // MAI CHATTAWA
96    PuaMapping::new(0x0E4C, 0xF70E, 0xF897), // THANTHAKHAT
97    PuaMapping::new(0x0E38, 0xF718, 0xF89B), // SARA U
98    PuaMapping::new(0x0E39, 0xF719, 0xF89C), // SARA UU
99    PuaMapping::new(0x0E3A, 0xF71A, 0xF89D), // PHINTHU
100    PuaMapping::new(0x0000, 0x0000, 0x0000),
101];
102
103const SDL_MAPPINGS: &[PuaMapping] = &[
104    PuaMapping::new(0x0E48, 0xF705, 0xF88C), // MAI EK
105    PuaMapping::new(0x0E49, 0xF706, 0xF88F), // MAI THO
106    PuaMapping::new(0x0E4A, 0xF707, 0xF892), // MAI TRI
107    PuaMapping::new(0x0E4B, 0xF708, 0xF895), // MAI CHATTAWA
108    PuaMapping::new(0x0E4C, 0xF709, 0xF898), // THANTHAKHAT
109    PuaMapping::new(0x0000, 0x0000, 0x0000),
110];
111
112const SL_MAPPINGS: &[PuaMapping] = &[
113    PuaMapping::new(0x0E48, 0xF713, 0xF88A), // MAI EK
114    PuaMapping::new(0x0E49, 0xF714, 0xF88D), // MAI THO
115    PuaMapping::new(0x0E4A, 0xF715, 0xF890), // MAI TRI
116    PuaMapping::new(0x0E4B, 0xF716, 0xF893), // MAI CHATTAWA
117    PuaMapping::new(0x0E4C, 0xF717, 0xF896), // THANTHAKHAT
118    PuaMapping::new(0x0E31, 0xF710, 0xF884), // MAI HAN-AKAT
119    PuaMapping::new(0x0E34, 0xF701, 0xF885), // SARA I
120    PuaMapping::new(0x0E35, 0xF702, 0xF886), // SARA II
121    PuaMapping::new(0x0E36, 0xF703, 0xF887), // SARA UE
122    PuaMapping::new(0x0E37, 0xF704, 0xF888), // SARA UEE
123    PuaMapping::new(0x0E47, 0xF712, 0xF889), // MAITAIKHU
124    PuaMapping::new(0x0E4D, 0xF711, 0xF899), // NIKHAHIT
125    PuaMapping::new(0x0000, 0x0000, 0x0000),
126];
127
128const RD_MAPPINGS: &[PuaMapping] = &[
129    PuaMapping::new(0x0E0D, 0xF70F, 0xF89A), // YO YING
130    PuaMapping::new(0x0E10, 0xF700, 0xF89E), // THO THAN
131    PuaMapping::new(0x0000, 0x0000, 0x0000),
132];
133
134fn pua_shape(u: u32, action: Action, face: &hb_font_t) -> u32 {
135    let mappings = match action {
136        Action::NOP => return u,
137        Action::SD => SD_MAPPINGS,
138        Action::SL => SL_MAPPINGS,
139        Action::SDL => SDL_MAPPINGS,
140        Action::RD => RD_MAPPINGS,
141    };
142
143    for m in mappings {
144        if m.u as u32 == u {
145            if face.get_nominal_glyph(m.win_pua as u32).is_some() {
146                return m.win_pua as u32;
147            }
148
149            if face.get_nominal_glyph(m.mac_pua as u32).is_some() {
150                return m.mac_pua as u32;
151            }
152
153            break;
154        }
155    }
156
157    u
158}
159
/// State of the above-base state machine.
///
/// The discriminant is the row index into `ABOVE_STATE_MACHINE`.
#[derive(Clone, Copy)]
enum AboveState {
    // Cluster above looks like:
    T0 = 0, //  ⣤
    T1 = 1, //     ⣼
    T2 = 2, //        ⣾
    T3 = 3, //           ⣿
}
168
169const ABOVE_START_STATE: &[AboveState] = &[
170    AboveState::T0, // NC
171    AboveState::T1, // AC
172    AboveState::T0, // RC
173    AboveState::T0, // DC
174    AboveState::T3, // NotConsonant
175];
176
177#[derive(Clone, Copy)]
178struct AboveStateMachineEdge {
179    action: Action,
180    next_state: AboveState,
181}
182
183impl AboveStateMachineEdge {
184    const fn new(action: Action, next_state: AboveState) -> Self {
185        AboveStateMachineEdge { action, next_state }
186    }
187}
188
189type ASME = AboveStateMachineEdge;
190
191const ABOVE_STATE_MACHINE: &[[ASME; 3]] = &[
192    //        AV                                      BV                                      T
193    /* T0 */
194    [
195        ASME::new(Action::NOP, AboveState::T3),
196        ASME::new(Action::NOP, AboveState::T0),
197        ASME::new(Action::SD, AboveState::T3),
198    ],
199    /* T1 */
200    [
201        ASME::new(Action::SL, AboveState::T2),
202        ASME::new(Action::NOP, AboveState::T1),
203        ASME::new(Action::SDL, AboveState::T2),
204    ],
205    /* T2 */
206    [
207        ASME::new(Action::NOP, AboveState::T3),
208        ASME::new(Action::NOP, AboveState::T2),
209        ASME::new(Action::SL, AboveState::T3),
210    ],
211    /* T3 */
212    [
213        ASME::new(Action::NOP, AboveState::T3),
214        ASME::new(Action::NOP, AboveState::T3),
215        ASME::new(Action::NOP, AboveState::T3),
216    ],
217];
218
/// State of the below-base state machine.
///
/// The discriminant is the row index into `BELOW_STATE_MACHINE`.
#[derive(Clone, Copy)]
enum BelowState {
    /// No descender.
    B0 = 0,
    /// Removable descender.
    B1 = 1,
    /// Strict descender.
    B2 = 2,
}
228
229const BELOW_START_STATE: &[BelowState] = &[
230    BelowState::B0, // NC
231    BelowState::B0, // AC
232    BelowState::B1, // RC
233    BelowState::B2, // DC
234    BelowState::B2, // NotConsonant
235];
236
237#[derive(Clone, Copy)]
238struct BelowStateMachineEdge {
239    action: Action,
240    next_state: BelowState,
241}
242
243impl BelowStateMachineEdge {
244    const fn new(action: Action, next_state: BelowState) -> Self {
245        BelowStateMachineEdge { action, next_state }
246    }
247}
248
249type BSME = BelowStateMachineEdge;
250
251const BELOW_STATE_MACHINE: &[[BSME; 3]] = &[
252    //        AV                                      BV                                      T
253    /* B0 */
254    [
255        BSME::new(Action::NOP, BelowState::B0),
256        BSME::new(Action::NOP, BelowState::B2),
257        BSME::new(Action::NOP, BelowState::B0),
258    ],
259    /* B1 */
260    [
261        BSME::new(Action::NOP, BelowState::B1),
262        BSME::new(Action::RD, BelowState::B2),
263        BSME::new(Action::NOP, BelowState::B1),
264    ],
265    /* B2 */
266    [
267        BSME::new(Action::NOP, BelowState::B2),
268        BSME::new(Action::SD, BelowState::B2),
269        BSME::new(Action::NOP, BelowState::B2),
270    ],
271];
272
273fn do_pua_shaping(face: &hb_font_t, buffer: &mut hb_buffer_t) {
274    let mut above_state = ABOVE_START_STATE[Consonant::NotConsonant as usize];
275    let mut below_state = BELOW_START_STATE[Consonant::NotConsonant as usize];
276    let mut base = 0;
277
278    for i in 0..buffer.len {
279        let mt = get_mark_type(buffer.info[i].glyph_id);
280
281        if mt == Mark::NotMark {
282            let ct = get_consonant_type(buffer.info[i].glyph_id);
283            above_state = ABOVE_START_STATE[ct as usize];
284            below_state = BELOW_START_STATE[ct as usize];
285            base = i;
286            continue;
287        }
288
289        let above_edge = ABOVE_STATE_MACHINE[above_state as usize][mt as usize];
290        let below_edge = BELOW_STATE_MACHINE[below_state as usize][mt as usize];
291        above_state = above_edge.next_state;
292        below_state = below_edge.next_state;
293
294        // At least one of the above/below actions is NOP.
295        let action = if above_edge.action != Action::NOP {
296            above_edge.action
297        } else {
298            below_edge.action
299        };
300
301        buffer.unsafe_to_break(Some(base), Some(i));
302        if action == Action::RD {
303            buffer.info[base].glyph_id = pua_shape(buffer.info[base].glyph_id, action, face);
304        } else {
305            buffer.info[i].glyph_id = pua_shape(buffer.info[i].glyph_id, action, face);
306        }
307    }
308}
309
310// TODO: more tests
311fn preprocess_text(plan: &hb_ot_shape_plan_t, face: &hb_font_t, buffer: &mut hb_buffer_t) {
312    // This function implements the shaping logic documented here:
313    //
314    //   https://linux.thai.net/~thep/th-otf/shaping.html
315    //
316    // The first shaping rule listed there is needed even if the font has Thai
317    // OpenType tables.  The rest do fallback positioning based on PUA codepoints.
318    // We implement that only if there exist no Thai GSUB in the font.
319
320    // The following is NOT specified in the MS OT Thai spec, however, it seems
321    // to be what Uniscribe and other engines implement.  According to Eric Muller:
322    //
323    // When you have a SARA AM, decompose it in NIKHAHIT + SARA AA, *and* move the
324    // NIKHAHIT backwards over any above-base marks (0E31, 0E34-0E37, 0E47-0E4E).
325    //
326    // <0E14, 0E4B, 0E33> -> <0E14, 0E4D, 0E4B, 0E32>
327    //
328    // This reordering is legit only when the NIKHAHIT comes from a SARA AM, not
329    // when it's there to start with. The string <0E14, 0E4B, 0E4D> is probably
330    // not what a user wanted, but the rendering is nevertheless nikhahit above
331    // chattawa.
332    //
333    // Same for Lao.
334    //
335    // Note:
336    //
337    // Uniscribe also does some below-marks reordering.  Namely, it positions U+0E3A
338    // after U+0E38 and U+0E39.  We do that by modifying the ccc for U+0E3A.
339    // See unicode->modified_combining_class ().  Lao does NOT have a U+0E3A
340    // equivalent.
341
342    // Here are the characters of significance:
343    //
344    //              Thai    Lao
345    // SARA AM:     U+0E33  U+0EB3
346    // SARA AA:     U+0E32  U+0EB2
347    // Nikhahit:    U+0E4D  U+0ECD
348    //
349    // Testing shows that Uniscribe reorder the following marks:
350    // Thai:	<0E31,0E34..0E37,0E47..0E4E>
351    // Lao:     <0EB1,0EB4..0EB7,0EBB,0EC8..0ECD>
352    //
353    // Note how the Lao versions are the same as Thai + 0x80.
354
355    // We only get one script at a time, so a script-agnostic implementation
356    // is adequate here.
357    #[inline]
358    fn is_sara_am(u: u32) -> bool {
359        (u & !0x0080) == 0x0E33
360    }
361    #[inline]
362    fn nikhahit_from_sara_am(u: u32) -> u32 {
363        u - 0x0E33 + 0x0E4D
364    }
365    #[inline]
366    fn sara_aa_from_sara_am(u: u32) -> u32 {
367        u - 1
368    }
369    #[inline]
370    fn is_above_base_mark(u: u32) -> bool {
371        let u = u & !0x0080;
372        matches!(u, 0x0E34..=0x0E37 | 0x0E47..=0x0E4E | 0x0E31..=0x0E31 | 0x0E3B..=0x0E3B)
373    }
374
375    buffer.clear_output();
376    buffer.idx = 0;
377    while buffer.idx < buffer.len {
378        let u = buffer.cur(0).glyph_id;
379        if !is_sara_am(u) {
380            buffer.next_glyph();
381            continue;
382        }
383
384        // Is SARA AM. Decompose and reorder.
385        buffer.output_glyph(nikhahit_from_sara_am(u));
386        {
387            let out_idx = buffer.out_len - 1;
388            _hb_glyph_info_set_continuation(&mut buffer.out_info_mut()[out_idx]);
389        }
390        buffer.replace_glyph(sara_aa_from_sara_am(u));
391
392        // Make Nikhahit be recognized as a ccc=0 mark when zeroing widths.
393        let end = buffer.out_len;
394        _hb_glyph_info_set_general_category(
395            &mut buffer.out_info_mut()[end - 2],
396            hb_unicode_general_category_t::NonspacingMark,
397        );
398
399        // Ok, let's see...
400        let mut start = end - 2;
401        while start > 0 && is_above_base_mark(buffer.out_info()[start - 1].glyph_id) {
402            start -= 1;
403        }
404
405        if start + 2 < end {
406            // Move Nikhahit (end-2) to the beginning
407            buffer.merge_out_clusters(start, end);
408            let t = buffer.out_info()[end - 2];
409            for i in 0..(end - start - 2) {
410                buffer.out_info_mut()[i + start + 1] = buffer.out_info()[i + start];
411            }
412            buffer.out_info_mut()[start] = t;
413        } else {
414            // Since we decomposed, and NIKHAHIT is combining, merge clusters with the
415            // previous cluster.
416            if start != 0 && buffer.cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES {
417                buffer.merge_out_clusters(start - 1, end);
418            }
419        }
420    }
421
422    buffer.sync();
423
424    // If font has Thai GSUB, we are done.
425    if plan.script == Some(script::THAI) && !plan.ot_map.found_script(TableIndex::GSUB) {
426        do_pua_shaping(face, buffer);
427    }
428}