rustybuzz/hb/
ot_shape_normalize.rs

1use super::buffer::*;
2use super::common::hb_codepoint_t;
3use super::hb_font_t;
4use super::ot_layout::*;
5use super::ot_shape_plan::hb_ot_shape_plan_t;
6use super::ot_shaper::{ComposeFn, DecomposeFn, MAX_COMBINING_MARKS};
7use super::unicode::{hb_unicode_funcs_t, CharExt};
8
/// State shared by the normalization helpers while one buffer is being normalized.
pub struct hb_ot_shape_normalize_context_t<'a> {
    /// Shape plan in effect; its shaper supplies the optional mark-reordering hook.
    pub plan: &'a hb_ot_shape_plan_t,
    /// Buffer whose characters are being normalized.
    pub buffer: &'a mut hb_buffer_t,
    /// Font used to check which characters have a nominal glyph.
    pub face: &'a hb_font_t<'a>,
    /// Callback that splits one character into a pair of characters.
    pub decompose: DecomposeFn,
    /// Callback that tries to combine two characters into one.
    pub compose: ComposeFn,
}
16
17impl hb_ot_shape_normalize_context_t<'_> {
18    pub(crate) fn override_decompose_and_compose(
19        &mut self,
20        decompose: Option<DecomposeFn>,
21        compose: Option<ComposeFn>,
22    ) {
23        if let Some(decompose) = decompose {
24            self.decompose = decompose;
25        }
26
27        if let Some(compose) = compose {
28            self.compose = compose;
29        }
30    }
31}
32
/// Normalization preference requested by a shaper; holds one of the
/// `HB_OT_SHAPE_NORMALIZATION_MODE_*` constants.
pub type hb_ot_shape_normalization_mode_t = i32;
/// No normalization preference.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_NONE: hb_ot_shape_normalization_mode_t = 0;
/// Prefer decomposed output.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED: hb_ot_shape_normalization_mode_t = 1;
/// Prefer composed diacritics.  Never composes base-to-base.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS: hb_ot_shape_normalization_mode_t = 2;
/// Always fully decomposes and then recomposes back.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT:
    hb_ot_shape_normalization_mode_t = 3;
/// Let the normalizer pick the mode.  See hb-ot-shape-normalize.cc for logic.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_AUTO: hb_ot_shape_normalization_mode_t = 4;
/// Mode used when nothing more specific is requested.
#[allow(dead_code)]
pub const HB_OT_SHAPE_NORMALIZATION_MODE_DEFAULT: hb_ot_shape_normalization_mode_t =
    HB_OT_SHAPE_NORMALIZATION_MODE_AUTO;
41
42// HIGHLEVEL DESIGN:
43//
44// This file exports one main function: normalize().
45//
46// This function closely reflects the Unicode Normalization Algorithm,
47// yet it's different.
48//
49// Each shaper specifies whether it prefers decomposed (NFD) or composed (NFC).
50// The logic however tries to use whatever the font can support.
51//
52// In general what happens is that: each grapheme is decomposed in a chain
53// of 1:2 decompositions, marks reordered, and then recomposed if desired,
54// so far it's like Unicode Normalization.  However, the decomposition and
55// recomposition only happens if the font supports the resulting characters.
56//
57// The goals are:
58//
59//   - Try to render all canonically equivalent strings similarly.  To really
60//     achieve this we have to always do the full decomposition and then
61//     selectively recompose from there.  It's kinda too expensive though, so
62//     we skip some cases.  For example, if composed is desired, we simply
63//     don't touch 1-character clusters that are supported by the font, even
64//     though their NFC may be different.
65//
66//   - When a font has a precomposed character for a sequence but the 'ccmp'
67//     feature in the font is not adequate, use the precomposed character
68//     which typically has better mark positioning.
69//
70//   - When a font does not support a combining mark, but supports it precomposed
71//     with previous base, use that.  This needs the itemizer to have this
72//     knowledge too.  We need to provide assistance to the itemizer.
73//
74//   - When a font does not support a character but supports its canonical
75//     decomposition, well, use the decomposition.
76//
77//   - The shapers can customize the compose and decompose functions to
78//     offload some of their requirements to the normalizer.  For example, the
79//     Indic shaper may want to disallow recomposing of two matras.
80
/// Context-independent decomposition callback.
///
/// Ignores the normalization context and forwards `ab` to
/// `super::unicode::decompose`, returning whatever pair (if any) it yields.
fn decompose_unicode(
    _: &hb_ot_shape_normalize_context_t,
    ab: hb_codepoint_t,
) -> Option<(hb_codepoint_t, hb_codepoint_t)> {
    super::unicode::decompose(ab)
}
87
/// Context-independent composition callback.
///
/// Ignores the normalization context and forwards `a` and `b` to
/// `super::unicode::compose`, returning the composed character if there is one.
fn compose_unicode(
    _: &hb_ot_shape_normalize_context_t,
    a: hb_codepoint_t,
    b: hb_codepoint_t,
) -> Option<hb_codepoint_t> {
    super::unicode::compose(a, b)
}
95
96fn set_glyph(info: &mut hb_glyph_info_t, font: &hb_font_t) {
97    if let Some(glyph_id) = font.get_nominal_glyph(info.glyph_id) {
98        info.set_glyph_index(u32::from(glyph_id.0));
99    }
100}
101
/// Emits `unichar` with glyph index `glyph` while leaving the current input glyph in place.
///
/// The glyph index is written to the current glyph first, then `output_glyph(unichar)`
/// is called, and finally the Unicode properties of the previous output glyph
/// (`prev_mut()`) are re-initialized.
fn output_char(buffer: &mut hb_buffer_t, unichar: u32, glyph: u32) {
    // This is very confusing indeed.
    buffer.cur_mut(0).set_glyph_index(glyph);
    buffer.output_glyph(unichar);
    // TODO: should be _hb_glyph_info_set_unicode_props (&buffer->prev(), buffer);
    // The scratch flags are copied out and written back afterwards, presumably so that
    // `prev_mut()` and `scratch_flags` are not mutably borrowed from `buffer` at once.
    let mut flags = buffer.scratch_flags;
    buffer.prev_mut().init_unicode_props(&mut flags);
    buffer.scratch_flags = flags;
}
111
/// Sets `glyph` as the glyph index of the current glyph, then advances with `next_glyph`.
fn next_char(buffer: &mut hb_buffer_t, glyph: u32) {
    buffer.cur_mut(0).set_glyph_index(glyph);
    buffer.next_glyph();
}
116
/// Moves past the current glyph by calling `skip_glyph` on the buffer.
fn skip_char(buffer: &mut hb_buffer_t) {
    buffer.skip_glyph();
}
120
121/// Returns 0 if didn't decompose, number of resulting characters otherwise.
122fn decompose(ctx: &mut hb_ot_shape_normalize_context_t, shortest: bool, ab: hb_codepoint_t) -> u32 {
123    let (a, b) = match (ctx.decompose)(ctx, ab) {
124        Some(decomposed) => decomposed,
125        _ => return 0,
126    };
127
128    let a_glyph = ctx.face.get_nominal_glyph(u32::from(a));
129    let b_glyph = if b != '\0' {
130        match ctx.face.get_nominal_glyph(u32::from(b)) {
131            Some(glyph_id) => Some(glyph_id),
132            None => return 0,
133        }
134    } else {
135        None
136    };
137
138    if !shortest || a_glyph.is_none() {
139        let ret = decompose(ctx, shortest, a);
140        if ret != 0 {
141            if let Some(b_glyph) = b_glyph {
142                output_char(ctx.buffer, u32::from(b), u32::from(b_glyph.0));
143                return ret + 1;
144            }
145            return ret;
146        }
147    }
148
149    if let Some(a_glyph) = a_glyph {
150        // Output a and b.
151        output_char(ctx.buffer, u32::from(a), u32::from(a_glyph.0));
152        if let Some(b_glyph) = b_glyph {
153            output_char(ctx.buffer, u32::from(b), u32::from(b_glyph.0));
154            return 2;
155        }
156        return 1;
157    }
158
159    0
160}
161
162fn decompose_current_character(ctx: &mut hb_ot_shape_normalize_context_t, shortest: bool) {
163    let u = ctx.buffer.cur(0).as_char();
164    let glyph = ctx.face.get_nominal_glyph(u32::from(u));
165
166    // TODO: different to harfbuzz, sync
167    if !shortest || glyph.is_none() {
168        if decompose(ctx, shortest, u) > 0 {
169            skip_char(ctx.buffer);
170            return;
171        }
172    }
173
174    // TODO: different to harfbuzz, sync
175    if let Some(glyph) = glyph {
176        next_char(ctx.buffer, u32::from(glyph.0));
177        return;
178    }
179
180    if _hb_glyph_info_is_unicode_space(ctx.buffer.cur(0)) {
181        let space_type = u.space_fallback();
182        if space_type != hb_unicode_funcs_t::NOT_SPACE {
183            let space_glyph = ctx.face.get_nominal_glyph(0x0020).or(ctx.buffer.invisible);
184
185            if let Some(space_glyph) = space_glyph {
186                _hb_glyph_info_set_unicode_space_fallback_type(ctx.buffer.cur_mut(0), space_type);
187                next_char(ctx.buffer, u32::from(space_glyph.0));
188                ctx.buffer.scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK;
189                return;
190            }
191        }
192    }
193
194    // U+2011 is the only sensible character that is a no-break version of another character
195    // and not a space.  The space ones are handled already.  Handle this lone one.
196    if u == '\u{2011}' {
197        if let Some(other_glyph) = ctx.face.get_nominal_glyph(0x2010) {
198            next_char(ctx.buffer, u32::from(other_glyph.0));
199            return;
200        }
201    }
202
203    // Insert a .notdef glyph if decomposition failed.
204    next_char(ctx.buffer, 0);
205}
206
207fn handle_variation_selector_cluster(
208    ctx: &mut hb_ot_shape_normalize_context_t,
209    end: usize,
210    _: bool,
211) {
212    let face = ctx.face;
213
214    // Currently if there's a variation-selector we give-up on normalization, it's just too hard.
215    let buffer = &mut ctx.buffer;
216    while buffer.idx < end - 1 && buffer.successful {
217        if buffer.cur(1).as_char().is_variation_selector() {
218            if let Some(glyph_id) =
219                face.glyph_variation_index(buffer.cur(0).as_char(), buffer.cur(1).as_char())
220            {
221                buffer.cur_mut(0).set_glyph_index(u32::from(glyph_id.0));
222                let unicode = buffer.cur(0).glyph_id;
223                buffer.replace_glyphs(2, 1, &[unicode]);
224            } else {
225                // Just pass on the two characters separately, let GSUB do its magic.
226                set_glyph(buffer.cur_mut(0), face);
227                buffer.next_glyph();
228
229                buffer.scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_VARIATION_SELECTOR_FALLBACK;
230
231                _hb_glyph_info_set_variation_selector(buffer.cur_mut(0), true);
232
233                if buffer.not_found_variation_selector.is_some() {
234                    _hb_glyph_info_clear_default_ignorable(buffer.cur_mut(0))
235                }
236
237                set_glyph(buffer.cur_mut(0), face);
238                buffer.next_glyph();
239            }
240
241            // Skip any further variation selectors.
242            while buffer.idx < end && buffer.cur(0).as_char().is_variation_selector() {
243                set_glyph(buffer.cur_mut(0), face);
244                buffer.next_glyph();
245            }
246        } else {
247            set_glyph(buffer.cur_mut(0), face);
248            buffer.next_glyph();
249        }
250    }
251
252    if ctx.buffer.idx < end {
253        set_glyph(ctx.buffer.cur_mut(0), face);
254        ctx.buffer.next_glyph();
255    }
256}
257
258fn decompose_multi_char_cluster(
259    ctx: &mut hb_ot_shape_normalize_context_t,
260    end: usize,
261    short_circuit: bool,
262) {
263    let mut i = ctx.buffer.idx;
264    while i < end && ctx.buffer.successful {
265        if ctx.buffer.info[i].as_char().is_variation_selector() {
266            handle_variation_selector_cluster(ctx, end, short_circuit);
267            return;
268        }
269        i += 1;
270    }
271
272    while ctx.buffer.idx < end && ctx.buffer.successful {
273        decompose_current_character(ctx, short_circuit);
274    }
275}
276
277fn compare_combining_class(pa: &hb_glyph_info_t, pb: &hb_glyph_info_t) -> bool {
278    let a = _hb_glyph_info_get_modified_combining_class(pa);
279    let b = _hb_glyph_info_get_modified_combining_class(pb);
280    a > b
281}
282
/// Normalizes the characters in `buffer` for shaping with `face` under `plan`.
///
/// Does nothing for an empty buffer.  Otherwise the work happens in up to three
/// rounds:
///
/// 1. decompose: characters are mapped to nominal glyphs, decomposing where
///    needed;
/// 2. reorder: runs of characters with a non-zero modified combining class are
///    sorted by that class (only runs of at most `MAX_COMBINING_MARKS`);
/// 3. recompose: marks are composed with their starter when the font has a
///    glyph for the result (only in the two `COMPOSED_DIACRITICS` modes).
///
/// Rounds two and three are skipped when the first round found no marks.
/// The decompose/compose callbacks default to the Unicode ones and are replaced
/// by the shaper's own callbacks when the shaper provides them.
pub fn _hb_ot_shape_normalize(
    plan: &hb_ot_shape_plan_t,
    buffer: &mut hb_buffer_t,
    face: &hb_font_t,
) {
    if buffer.is_empty() {
        return;
    }

    let mut mode = plan.shaper.normalization_preference;
    if mode == HB_OT_SHAPE_NORMALIZATION_MODE_AUTO {
        // Both branches currently resolve AUTO to the same mode; the commented-out
        // line records the alternative discussed in the linked issue.
        if plan.has_gpos_mark {
            // https://github.com/harfbuzz/harfbuzz/issues/653#issuecomment-423905920
            // mode = Some(HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED);
            mode = HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS;
        } else {
            mode = HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS;
        }
    }

    let mut ctx = hb_ot_shape_normalize_context_t {
        plan,
        buffer,
        face,
        decompose: decompose_unicode,
        compose: compose_unicode,
    };
    ctx.override_decompose_and_compose(plan.shaper.decompose, plan.shaper.compose);

    // `buffer` is a view into `ctx`; it is re-derived after every call below that
    // needs `&mut ctx`.
    let mut buffer = &mut ctx.buffer;

    // Short-circuiting is always on for MODE_NONE, and otherwise allowed unless
    // the mode is DECOMPOSED or COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT.
    let always_short_circuit = mode == HB_OT_SHAPE_NORMALIZATION_MODE_NONE;
    let might_short_circuit = always_short_circuit
        || (mode != HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED
            && mode != HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT);

    // We do a fairly straightforward yet custom normalization process in three
    // separate rounds: decompose, reorder, recompose (if desired).  Currently
    // this makes two buffer swaps.  We can make it faster by moving the last
    // two rounds into the inner loop for the first round, but it's more readable
    // this way.

    // First round, decompose
    let mut all_simple = true;
    {
        buffer.clear_output();
        let count = buffer.len;
        buffer.idx = 0;
        loop {
            // Find the next mark; everything before it (minus one base) is simple.
            let mut end = buffer.idx + 1;
            while end < count && !_hb_glyph_info_is_unicode_mark(&buffer.info[end]) {
                end += 1;
            }

            if end < count {
                // Leave one base for the marks to cluster with.
                end -= 1;
            }

            // From idx to end are simple clusters.
            if might_short_circuit {
                // Fast path: map as many leading characters as possible straight to
                // their nominal glyphs, stopping at the first one the font lacks.
                let len = end - buffer.idx;
                let mut done = 0;
                while done < len {
                    let cur = buffer.cur_mut(done);
                    cur.set_glyph_index(match face.get_nominal_glyph(cur.glyph_id) {
                        Some(glyph_id) => u32::from(glyph_id.0),
                        None => break,
                    });
                    done += 1;
                }
                buffer.next_glyphs(done);
            }

            // Whatever the fast path did not cover goes through the full per-character logic.
            while buffer.idx < end && buffer.successful {
                decompose_current_character(&mut ctx, might_short_circuit);
                buffer = &mut ctx.buffer;
            }

            if buffer.idx == count || !buffer.successful {
                break;
            }

            // Reaching this point means a mark follows, so later rounds are needed.
            all_simple = false;

            // Find all the marks now.
            end = buffer.idx + 1;
            while end < count && _hb_glyph_info_is_unicode_mark(&buffer.info[end]) {
                end += 1;
            }

            // idx to end is one non-simple cluster.
            decompose_multi_char_cluster(&mut ctx, end, always_short_circuit);
            buffer = &mut ctx.buffer;

            if buffer.idx >= count || !buffer.successful {
                break;
            }
        }

        buffer.sync();
    }

    // Second round, reorder (inplace)
    if !all_simple {
        let count = buffer.len;
        let mut i = 0;
        while i < count {
            if _hb_glyph_info_get_modified_combining_class(&buffer.info[i]) == 0 {
                i += 1;
                continue;
            }

            // Extend `end` over the whole run of non-zero combining classes.
            let mut end = i + 1;
            while end < count && _hb_glyph_info_get_modified_combining_class(&buffer.info[end]) != 0
            {
                end += 1;
            }

            // We are going to do a O(n^2).  Only do this if the sequence is short.
            if end - i <= MAX_COMBINING_MARKS {
                buffer.sort(i, end, compare_combining_class);

                if let Some(reorder_marks) = ctx.plan.shaper.reorder_marks {
                    reorder_marks(ctx.plan, buffer, i, end);
                }
            }

            // `end` is either past the buffer or has combining class 0, so it can
            // be skipped as well.
            i = end + 1;
        }
    }
    if buffer.scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_CGJ != 0 {
        // For all CGJ, check if it prevented any reordering at all.
        // If it did NOT, then make it skippable.
        // https://github.com/harfbuzz/harfbuzz/issues/554
        for i in 1..buffer.len.saturating_sub(1) {
            if buffer.info[i].glyph_id == 0x034F
            /* CGJ */
            {
                let last = _hb_glyph_info_get_modified_combining_class(&buffer.info[i - 1]);
                let next = _hb_glyph_info_get_modified_combining_class(&buffer.info[i + 1]);
                if next == 0 || last <= next {
                    buffer.info[i].unhide();
                }
            }
        }
    }

    // Third round, recompose
    if !all_simple
        && buffer.successful
        && (mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS
            || mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT)
    {
        // As noted in the comment earlier, we don't try to combine
        // ccc=0 chars with their previous Starter.

        let count = buffer.len;
        // Index into the output of the most recent starter.
        let mut starter = 0;
        buffer.clear_output();
        buffer.next_glyph();
        while buffer.idx < count && buffer.successful {
            // We don't try to compose a non-mark character with its preceding starter.
            // This is both an optimization to avoid trying to compose every two neighboring
            // glyphs in most scripts AND a desired feature for Hangul.  Apparently Hangul
            // fonts are not designed to mix-and-match pre-composed syllables and Jamo.
            let cur = buffer.cur(0);
            if _hb_glyph_info_is_unicode_mark(cur) &&
                // If there's anything between the starter and this char, they should have CCC
                // smaller than this character's.
                (starter == buffer.out_len - 1
                    || _hb_glyph_info_get_modified_combining_class(buffer.prev()) < _hb_glyph_info_get_modified_combining_class(cur))
            {
                let a = buffer.out_info()[starter].as_char();
                let b = cur.as_char();
                // Only accept a composition the font actually has a glyph for.
                if let Some(composed) = (ctx.compose)(&ctx, a, b) {
                    if let Some(glyph_id) = face.get_nominal_glyph(u32::from(composed)) {
                        // Copy to out-buffer.
                        buffer = &mut ctx.buffer;
                        buffer.next_glyph();
                        if !buffer.successful {
                            return;
                        }

                        // Merge and remove the second composable.
                        buffer.merge_out_clusters(starter, buffer.out_len);
                        buffer.out_len -= 1;

                        // Modify starter and carry on.
                        let mut flags = buffer.scratch_flags;
                        let info = &mut buffer.out_info_mut()[starter];
                        info.glyph_id = u32::from(composed);
                        info.set_glyph_index(u32::from(glyph_id.0));
                        info.init_unicode_props(&mut flags);
                        buffer.scratch_flags = flags;

                        continue;
                    }
                }
            }

            // Blocked, or doesn't compose.
            buffer = &mut ctx.buffer;
            buffer.next_glyph();

            // A character with combining class 0 becomes the new starter.
            if _hb_glyph_info_get_modified_combining_class(buffer.prev()) == 0 {
                starter = buffer.out_len - 1;
            }
        }

        buffer.sync();
    }
}