2 * Copyright © 2011,2012 Google, Inc.
4 * This is part of HarfBuzz, a text shaping library.
6 * Permission is hereby granted, without written agreement and without
7 * license or royalty fees, to use, copy, modify, and distribute this
8 * software and its documentation for any purpose, provided that the
9 * above copyright notice and the following two paragraphs appear in
10 * all copies of this software.
12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
20 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
24 * Google Author(s): Behdad Esfahbod
29 #ifndef HB_NO_OT_SHAPE
31 #include "hb-ot-shape-normalize.hh"
32 #include "hb-ot-shaper.hh"
33 #include "hb-ot-shape.hh"
/*
 * HIGHLEVEL DESIGN:
 *
 * This file exports one main function: _hb_ot_shape_normalize().
 *
 * This function closely reflects the Unicode Normalization Algorithm,
 * yet it's different.
 *
 * Each shaper specifies whether it prefers decomposed (NFD) or composed (NFC).
 * The logic however tries to use whatever the font can support.
 *
 * In general what happens is that: each grapheme is decomposed in a chain
 * of 1:2 decompositions, marks reordered, and then recomposed if desired,
 * so far it's like Unicode Normalization.  However, the decomposition and
 * recomposition only happens if the font supports the resulting characters.
 *
 * The goals are:
 *
 *   - Try to render all canonically equivalent strings similarly.  To really
 *     achieve this we have to always do the full decomposition and then
 *     selectively recompose from there.  It's kinda too expensive though, so
 *     we skip some cases.  For example, if composed is desired, we simply
 *     don't touch 1-character clusters that are supported by the font, even
 *     though their NFC may be different.
 *
 *   - When a font has a precomposed character for a sequence but the 'ccmp'
 *     feature in the font is not adequate, use the precomposed character
 *     which typically has better mark positioning.
 *
 *   - When a font does not support a combining mark, but supports it precomposed
 *     with previous base, use that.  This needs the itemizer to have this
 *     knowledge too.  We need to provide assistance to the itemizer.
 *
 *   - When a font does not support a character but supports its canonical
 *     decomposition, well, use the decomposition.
 *
 *   - The shapers can customize the compose and decompose functions to
 *     offload some of their requirements to the normalizer.  For example, the
 *     Indic shaper may want to disallow recomposing of two matras.
 */
78 decompose_unicode (const hb_ot_shape_normalize_context_t
*c
,
83 return (bool) c
->unicode
->decompose (ab
, a
, b
);
87 compose_unicode (const hb_ot_shape_normalize_context_t
*c
,
92 return (bool) c
->unicode
->compose (a
, b
, ab
);
96 set_glyph (hb_glyph_info_t
&info
, hb_font_t
*font
)
98 (void) font
->get_nominal_glyph (info
.codepoint
, &info
.glyph_index());
102 output_char (hb_buffer_t
*buffer
, hb_codepoint_t unichar
, hb_codepoint_t glyph
)
104 /* This is very confusing indeed. */
105 buffer
->cur().glyph_index() = glyph
;
106 (void) buffer
->output_glyph (unichar
);
107 _hb_glyph_info_set_unicode_props (&buffer
->prev(), buffer
);
111 next_char (hb_buffer_t
*buffer
, hb_codepoint_t glyph
)
113 buffer
->cur().glyph_index() = glyph
;
114 (void) buffer
->next_glyph ();
118 skip_char (hb_buffer_t
*buffer
)
120 buffer
->skip_glyph ();
123 /* Returns 0 if didn't decompose, number of resulting characters otherwise. */
124 static inline unsigned int
125 decompose (const hb_ot_shape_normalize_context_t
*c
, bool shortest
, hb_codepoint_t ab
)
127 hb_codepoint_t a
= 0, b
= 0, a_glyph
= 0, b_glyph
= 0;
128 hb_buffer_t
* const buffer
= c
->buffer
;
129 hb_font_t
* const font
= c
->font
;
131 if (!c
->decompose (c
, ab
, &a
, &b
) ||
132 (b
&& !font
->get_nominal_glyph (b
, &b_glyph
)))
135 bool has_a
= (bool) font
->get_nominal_glyph (a
, &a_glyph
);
136 if (shortest
&& has_a
) {
138 output_char (buffer
, a
, a_glyph
);
140 output_char (buffer
, b
, b_glyph
);
146 if (unsigned ret
= decompose (c
, shortest
, a
)) {
148 output_char (buffer
, b
, b_glyph
);
155 output_char (buffer
, a
, a_glyph
);
157 output_char (buffer
, b
, b_glyph
);
167 decompose_current_character (const hb_ot_shape_normalize_context_t
*c
, bool shortest
)
169 hb_buffer_t
* const buffer
= c
->buffer
;
170 hb_codepoint_t u
= buffer
->cur().codepoint
;
171 hb_codepoint_t glyph
= 0;
173 if (shortest
&& c
->font
->get_nominal_glyph (u
, &glyph
, c
->not_found
))
175 next_char (buffer
, glyph
);
179 if (decompose (c
, shortest
, u
))
185 if (!shortest
&& c
->font
->get_nominal_glyph (u
, &glyph
, c
->not_found
))
187 next_char (buffer
, glyph
);
191 if (_hb_glyph_info_is_unicode_space (&buffer
->cur()))
193 hb_codepoint_t space_glyph
;
194 hb_unicode_funcs_t::space_t space_type
= buffer
->unicode
->space_fallback_type (u
);
195 if (space_type
!= hb_unicode_funcs_t::NOT_SPACE
&&
196 (c
->font
->get_nominal_glyph (0x0020, &space_glyph
) || (space_glyph
= buffer
->invisible
)))
198 _hb_glyph_info_set_unicode_space_fallback_type (&buffer
->cur(), space_type
);
199 next_char (buffer
, space_glyph
);
200 buffer
->scratch_flags
|= HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK
;
207 /* U+2011 is the only sensible character that is a no-break version of another character
208 * and not a space. The space ones are handled already. Handle this lone one. */
209 hb_codepoint_t other_glyph
;
210 if (c
->font
->get_nominal_glyph (0x2010u
, &other_glyph
))
212 next_char (buffer
, other_glyph
);
217 next_char (buffer
, glyph
); /* glyph is initialized in earlier branches. */
221 handle_variation_selector_cluster (const hb_ot_shape_normalize_context_t
*c
,
223 bool short_circuit HB_UNUSED
)
225 /* Currently if there's a variation-selector we give-up on normalization, it's just too hard. */
226 hb_buffer_t
* const buffer
= c
->buffer
;
227 hb_font_t
* const font
= c
->font
;
228 for (; buffer
->idx
< end
- 1 && buffer
->successful
;) {
229 if (unlikely (buffer
->unicode
->is_variation_selector (buffer
->cur(+1).codepoint
))) {
230 if (font
->get_variation_glyph (buffer
->cur().codepoint
, buffer
->cur(+1).codepoint
, &buffer
->cur().glyph_index()))
232 hb_codepoint_t unicode
= buffer
->cur().codepoint
;
233 (void) buffer
->replace_glyphs (2, 1, &unicode
);
237 /* Just pass on the two characters separately, let GSUB do its magic. */
238 set_glyph (buffer
->cur(), font
);
239 (void) buffer
->next_glyph ();
240 set_glyph (buffer
->cur(), font
);
241 (void) buffer
->next_glyph ();
243 /* Skip any further variation selectors. */
244 while (buffer
->idx
< end
&&
245 buffer
->successful
&&
246 unlikely (buffer
->unicode
->is_variation_selector (buffer
->cur().codepoint
)))
248 set_glyph (buffer
->cur(), font
);
249 (void) buffer
->next_glyph ();
254 set_glyph (buffer
->cur(), font
);
255 (void) buffer
->next_glyph ();
258 if (likely (buffer
->idx
< end
))
260 set_glyph (buffer
->cur(), font
);
261 (void) buffer
->next_glyph ();
266 decompose_multi_char_cluster (const hb_ot_shape_normalize_context_t
*c
, unsigned int end
, bool short_circuit
)
268 hb_buffer_t
* const buffer
= c
->buffer
;
269 for (unsigned int i
= buffer
->idx
; i
< end
&& buffer
->successful
; i
++)
270 if (unlikely (buffer
->unicode
->is_variation_selector (buffer
->info
[i
].codepoint
))) {
271 handle_variation_selector_cluster (c
, end
, short_circuit
);
275 while (buffer
->idx
< end
&& buffer
->successful
)
276 decompose_current_character (c
, short_circuit
);
281 compare_combining_class (const hb_glyph_info_t
*pa
, const hb_glyph_info_t
*pb
)
283 unsigned int a
= _hb_glyph_info_get_modified_combining_class (pa
);
284 unsigned int b
= _hb_glyph_info_get_modified_combining_class (pb
);
286 return a
< b
? -1 : a
== b
? 0 : +1;
291 _hb_ot_shape_normalize (const hb_ot_shape_plan_t
*plan
,
295 if (unlikely (!buffer
->len
)) return;
297 _hb_buffer_assert_unicode_vars (buffer
);
299 hb_ot_shape_normalization_mode_t mode
= plan
->shaper
->normalization_preference
;
300 if (mode
== HB_OT_SHAPE_NORMALIZATION_MODE_AUTO
)
302 if (plan
->has_gpos_mark
)
303 // https://github.com/harfbuzz/harfbuzz/issues/653#issuecomment-423905920
304 //mode = HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED;
305 mode
= HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS
;
307 mode
= HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS
;
310 const hb_ot_shape_normalize_context_t c
= {
316 plan
->shaper
->decompose
? plan
->shaper
->decompose
: decompose_unicode
,
317 plan
->shaper
->compose
? plan
->shaper
->compose
: compose_unicode
320 bool always_short_circuit
= mode
== HB_OT_SHAPE_NORMALIZATION_MODE_NONE
;
321 bool might_short_circuit
= always_short_circuit
||
322 (mode
!= HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED
&&
323 mode
!= HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT
);
326 /* We do a fairly straightforward yet custom normalization process in three
327 * separate rounds: decompose, reorder, recompose (if desired). Currently
328 * this makes two buffer swaps. We can make it faster by moving the last
329 * two rounds into the inner loop for the first round, but it's more readable
333 /* First round, decompose */
335 bool all_simple
= true;
337 buffer
->clear_output ();
343 for (end
= buffer
->idx
+ 1; end
< count
; end
++)
344 if (_hb_glyph_info_is_unicode_mark (&buffer
->info
[end
]))
348 end
--; /* Leave one base for the marks to cluster with. */
350 /* From idx to end are simple clusters. */
351 if (might_short_circuit
)
353 unsigned int done
= font
->get_nominal_glyphs (end
- buffer
->idx
,
354 &buffer
->cur().codepoint
,
355 sizeof (buffer
->info
[0]),
356 &buffer
->cur().glyph_index(),
357 sizeof (buffer
->info
[0]));
358 if (unlikely (!buffer
->next_glyphs (done
))) break;
360 while (buffer
->idx
< end
&& buffer
->successful
)
361 decompose_current_character (&c
, might_short_circuit
);
363 if (buffer
->idx
== count
|| !buffer
->successful
)
368 /* Find all the marks now. */
369 for (end
= buffer
->idx
+ 1; end
< count
; end
++)
370 if (!_hb_glyph_info_is_unicode_mark(&buffer
->info
[end
]))
373 /* idx to end is one non-simple cluster. */
374 decompose_multi_char_cluster (&c
, end
, always_short_circuit
);
376 while (buffer
->idx
< count
&& buffer
->successful
);
381 /* Second round, reorder (inplace) */
383 if (!all_simple
&& buffer
->message(font
, "start reorder"))
386 hb_glyph_info_t
*info
= buffer
->info
;
387 for (unsigned int i
= 0; i
< count
; i
++)
389 if (_hb_glyph_info_get_modified_combining_class (&info
[i
]) == 0)
393 for (end
= i
+ 1; end
< count
; end
++)
394 if (_hb_glyph_info_get_modified_combining_class (&info
[end
]) == 0)
397 /* We are going to do a O(n^2). Only do this if the sequence is short. */
398 if (end
- i
> HB_OT_SHAPE_MAX_COMBINING_MARKS
) {
403 buffer
->sort (i
, end
, compare_combining_class
);
405 if (plan
->shaper
->reorder_marks
)
406 plan
->shaper
->reorder_marks (plan
, buffer
, i
, end
);
410 (void) buffer
->message(font
, "end reorder");
412 if (buffer
->scratch_flags
& HB_BUFFER_SCRATCH_FLAG_HAS_CGJ
)
414 /* For all CGJ, check if it prevented any reordering at all.
415 * If it did NOT, then make it skippable.
416 * https://github.com/harfbuzz/harfbuzz/issues/554
418 unsigned count
= buffer
->len
;
419 hb_glyph_info_t
*info
= buffer
->info
;
420 for (unsigned int i
= 1; i
+ 1 < count
; i
++)
421 if (info
[i
].codepoint
== 0x034Fu
/*CGJ*/ &&
422 (info_cc(info
[i
+1]) == 0 || info_cc(info
[i
-1]) <= info_cc(info
[i
+1])))
424 _hb_glyph_info_unhide (&info
[i
]);
429 /* Third round, recompose */
432 buffer
->successful
&&
433 (mode
== HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS
||
434 mode
== HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT
))
436 /* As noted in the comment earlier, we don't try to combine
437 * ccc=0 chars with their previous Starter. */
439 buffer
->clear_output ();
441 unsigned int starter
= 0;
442 (void) buffer
->next_glyph ();
443 while (buffer
->idx
< count
/* No need for: && buffer->successful */)
445 hb_codepoint_t composed
, glyph
;
446 if (/* We don't try to compose a non-mark character with it's preceding starter.
447 * This is both an optimization to avoid trying to compose every two neighboring
448 * glyphs in most scripts AND a desired feature for Hangul. Apparently Hangul
449 * fonts are not designed to mix-and-match pre-composed syllables and Jamo. */
450 _hb_glyph_info_is_unicode_mark(&buffer
->cur()))
452 if (/* If there's anything between the starter and this char, they should have CCC
453 * smaller than this character's. */
454 (starter
== buffer
->out_len
- 1 ||
455 info_cc (buffer
->prev()) < info_cc (buffer
->cur())) &&
458 buffer
->out_info
[starter
].codepoint
,
459 buffer
->cur().codepoint
,
461 /* And the font has glyph for the composite. */
462 font
->get_nominal_glyph (composed
, &glyph
))
465 if (unlikely (!buffer
->next_glyph ())) break; /* Copy to out-buffer. */
466 buffer
->merge_out_clusters (starter
, buffer
->out_len
);
467 buffer
->out_len
--; /* Remove the second composable. */
468 /* Modify starter and carry on. */
469 buffer
->out_info
[starter
].codepoint
= composed
;
470 buffer
->out_info
[starter
].glyph_index() = glyph
;
471 _hb_glyph_info_set_unicode_props (&buffer
->out_info
[starter
], buffer
);
477 /* Blocked, or doesn't compose. */
478 if (unlikely (!buffer
->next_glyph ())) break;
480 if (info_cc (buffer
->prev()) == 0)
481 starter
= buffer
->out_len
- 1;