Bumping manifests a=b2g-bump
[gecko.git] / intl / unicharutil / nsUnicodeNormalizer.cpp
blobf8f4ca6b379fe4bc70204adda0524565fa885454
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 /* This file is modified from JPNIC's mDNKit, it is under both MPL and
4 * JPNIC's license.
5 */
7 /* This Source Code Form is subject to the terms of the Mozilla Public
8 * License, v. 2.0. If a copy of the MPL was not distributed with this
9 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
12 * Copyright (c) 2000,2002 Japan Network Information Center.
13 * All rights reserved.
15 * By using this file, you agree to the terms and conditions set forth bellow.
17 * LICENSE TERMS AND CONDITIONS
19 * The following License Terms and Conditions apply, unless a different
20 * license is obtained from Japan Network Information Center ("JPNIC"),
21 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
22 * Chiyoda-ku, Tokyo 101-0047, Japan.
24 * 1. Use, Modification and Redistribution (including distribution of any
25 * modified or derived work) in source and/or binary forms is permitted
26 * under this License Terms and Conditions.
28 * 2. Redistribution of source code must retain the copyright notices as they
29 * appear in each source code file, this License Terms and Conditions.
31 * 3. Redistribution in binary form must reproduce the Copyright Notice,
32 * this License Terms and Conditions, in the documentation and/or other
33 * materials provided with the distribution. For the purposes of binary
34 * distribution the "Copyright Notice" refers to the following language:
35 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
37 * 4. The name of JPNIC may not be used to endorse or promote products
38 * derived from this Software without specific prior written approval of
39 * JPNIC.
41 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
42 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
43 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
44 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
46 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
47 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
48 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
49 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
50 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
51 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
54 #include <string.h>
56 #include "nsMemory.h"
57 #include "nsUnicodeNormalizer.h"
58 #include "nsString.h"
60 NS_IMPL_ISUPPORTS(nsUnicodeNormalizer, nsIUnicodeNormalizer)
63 nsUnicodeNormalizer::nsUnicodeNormalizer()
67 nsUnicodeNormalizer::~nsUnicodeNormalizer()
73 #define END_BIT 0x80000000
77 * Some constants for Hangul decomposition/composition.
78 * These things were taken from unicode book.
80 #define SBase 0xac00
81 #define LBase 0x1100
82 #define VBase 0x1161
83 #define TBase 0x11a7
84 #define LCount 19
85 #define VCount 21
86 #define TCount 28
87 #define SLast (SBase + LCount * VCount * TCount)
89 struct composition {
90 uint32_t c2; /* 2nd character */
91 uint32_t comp; /* composed character */
95 #include "normalization_data.h"
98 * Macro for multi-level index table.
100 #define LOOKUPTBL(vprefix, mprefix, v) \
101 DMAP(vprefix)[\
102 IMAP(vprefix)[\
103 IMAP(vprefix)[IDX0(mprefix, v)] + IDX1(mprefix, v)\
105 ].tbl[IDX2(mprefix, v)]
107 #define IDX0(mprefix, v) IDX_0(v, BITS1(mprefix), BITS2(mprefix))
108 #define IDX1(mprefix, v) IDX_1(v, BITS1(mprefix), BITS2(mprefix))
109 #define IDX2(mprefix, v) IDX_2(v, BITS1(mprefix), BITS2(mprefix))
111 #define IDX_0(v, bits1, bits2) ((v) >> ((bits1) + (bits2)))
112 #define IDX_1(v, bits1, bits2) (((v) >> (bits2)) & ((1 << (bits1)) - 1))
113 #define IDX_2(v, bits1, bits2) ((v) & ((1 << (bits2)) - 1))
115 #define BITS1(mprefix) mprefix ## _BITS_1
116 #define BITS2(mprefix) mprefix ## _BITS_2
118 #define IMAP(vprefix) vprefix ## _imap
119 #define DMAP(vprefix) vprefix ## _table
120 #define SEQ(vprefix) vprefix ## _seq
122 static int32_t
123 canonclass(uint32_t c) {
124 /* Look up canonicalclass table. */
125 return (LOOKUPTBL(canon_class, CANON_CLASS, c));
128 static int32_t
129 decompose_char(uint32_t c, const uint32_t **seqp)
131 /* Look up decomposition table. */
132 int32_t seqidx = LOOKUPTBL(decompose, DECOMP, c);
133 *seqp = SEQ(decompose) + (seqidx & ~DECOMP_COMPAT);
134 return (seqidx);
137 static int32_t
138 compose_char(uint32_t c,
139 const struct composition **compp)
141 /* Look up composition table. */
142 int32_t seqidx = LOOKUPTBL(compose, CANON_COMPOSE, c);
143 *compp = SEQ(compose) + (seqidx & 0xffff);
144 return (seqidx >> 16);
147 static nsresult
148 mdn__unicode_decompose(int32_t compat, uint32_t *v, size_t vlen,
149 uint32_t c, int32_t *decomp_lenp)
151 uint32_t *vorg = v;
152 int32_t seqidx;
153 const uint32_t *seq;
155 //assert(v != nullptr && vlen >= 0 && decomp_lenp != nullptr);
158 * First, check for Hangul.
160 if (SBase <= c && c < SLast) {
161 int32_t idx, t_offset, v_offset, l_offset;
163 idx = c - SBase;
164 t_offset = idx % TCount;
165 idx /= TCount;
166 v_offset = idx % VCount;
167 l_offset = idx / VCount;
168 if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3))
169 return (NS_ERROR_UNORM_MOREOUTPUT);
170 *v++ = LBase + l_offset;
171 *v++ = VBase + v_offset;
172 if (t_offset > 0)
173 *v++ = TBase + t_offset;
174 *decomp_lenp = v - vorg;
175 return (NS_OK);
179 * Look up decomposition table. If no decomposition is defined
180 * or if it is a compatibility decomosition when canonical
181 * decomposition requested, return 'NS_SUCCESS_UNORM_NOTFOUND'.
183 seqidx = decompose_char(c, &seq);
184 if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0))
185 return (NS_SUCCESS_UNORM_NOTFOUND);
188 * Copy the decomposed sequence. The end of the sequence are
189 * marked with END_BIT.
191 do {
192 uint32_t c;
193 int32_t dlen;
194 nsresult r;
196 c = *seq & ~END_BIT;
198 /* Decompose recursively. */
199 r = mdn__unicode_decompose(compat, v, vlen, c, &dlen);
200 if (r == NS_OK) {
201 v += dlen;
202 vlen -= dlen;
203 } else if (r == NS_SUCCESS_UNORM_NOTFOUND) {
204 if (vlen < 1)
205 return (NS_ERROR_UNORM_MOREOUTPUT);
206 *v++ = c;
207 vlen--;
208 } else {
209 return (r);
212 } while ((*seq++ & END_BIT) == 0);
214 *decomp_lenp = v - vorg;
216 return (NS_OK);
219 static int32_t
220 mdn__unicode_iscompositecandidate(uint32_t c)
222 const struct composition *dummy;
224 /* Check for Hangul */
225 if ((LBase <= c && c < LBase + LCount) || (SBase <= c && c < SLast))
226 return (1);
229 * Look up composition table. If there are no composition
230 * that begins with the given character, it is not a
231 * composition candidate.
233 if (compose_char(c, &dummy) == 0)
234 return (0);
235 else
236 return (1);
239 static nsresult
240 mdn__unicode_compose(uint32_t c1, uint32_t c2, uint32_t *compp)
242 int32_t n;
243 int32_t lo, hi;
244 const struct composition *cseq;
246 //assert(compp != nullptr);
249 * Check for Hangul.
251 if (LBase <= c1 && c1 < LBase + LCount &&
252 VBase <= c2 && c2 < VBase + VCount) {
254 * Hangul L and V.
256 *compp = SBase +
257 ((c1 - LBase) * VCount + (c2 - VBase)) * TCount;
258 return (NS_OK);
259 } else if (SBase <= c1 && c1 < SLast &&
260 TBase <= c2 && c2 < TBase + TCount &&
261 (c1 - SBase) % TCount == 0) {
263 * Hangul LV and T.
265 *compp = c1 + (c2 - TBase);
266 return (NS_OK);
270 * Look up composition table. If the result is 0, no composition
271 * is defined. Otherwise, upper 16bits of the result contains
272 * the number of composition that begins with 'c1', and the lower
273 * 16bits is the offset in 'compose_seq'.
275 if ((n = compose_char(c1, &cseq)) == 0)
276 return (NS_SUCCESS_UNORM_NOTFOUND);
279 * The composite sequences are sorted by the 2nd character 'c2'.
280 * So we can use binary search.
282 lo = 0;
283 hi = n - 1;
284 while (lo <= hi) {
285 int32_t mid = (lo + hi) / 2;
287 if (cseq[mid].c2 < c2) {
288 lo = mid + 1;
289 } else if (cseq[mid].c2 > c2) {
290 hi = mid - 1;
291 } else {
292 *compp = cseq[mid].comp;
293 return (NS_OK);
296 return (NS_SUCCESS_UNORM_NOTFOUND);
300 #define WORKBUF_SIZE 128
301 #define WORKBUF_SIZE_MAX 10000
303 typedef struct {
304 int32_t cur; /* pointing now processing character */
305 int32_t last; /* pointing just after the last character */
306 int32_t size; /* size of UCS and CLASS array */
307 uint32_t *ucs; /* UCS-4 characters */
308 int32_t *cclass; /* and their canonical classes */
309 uint32_t ucs_buf[WORKBUF_SIZE]; /* local buffer */
310 int32_t class_buf[WORKBUF_SIZE]; /* ditto */
311 } workbuf_t;
313 static nsresult decompose(workbuf_t *wb, uint32_t c, int32_t compat);
314 static void get_class(workbuf_t *wb);
315 static void reorder(workbuf_t *wb);
316 static void compose(workbuf_t *wb);
317 static nsresult flush_before_cur(workbuf_t *wb, nsAString& aToStr);
318 static void workbuf_init(workbuf_t *wb);
319 static void workbuf_free(workbuf_t *wb);
320 static nsresult workbuf_extend(workbuf_t *wb);
321 static nsresult workbuf_append(workbuf_t *wb, uint32_t c);
322 static void workbuf_shift(workbuf_t *wb, int32_t shift);
323 static void workbuf_removevoid(workbuf_t *wb);
326 static nsresult
327 mdn_normalize(bool do_composition, bool compat,
328 const nsAString& aSrcStr, nsAString& aToStr)
330 workbuf_t wb;
331 nsresult r = NS_OK;
333 * Initialize working buffer.
335 workbuf_init(&wb);
337 nsAString::const_iterator start, end;
338 aSrcStr.BeginReading(start);
339 aSrcStr.EndReading(end);
341 while (start != end) {
342 uint32_t c;
343 char16_t curChar;
345 //assert(wb.cur == wb.last);
348 * Get one character from 'from'.
350 curChar= *start++;
352 if (NS_IS_HIGH_SURROGATE(curChar) && start != end && NS_IS_LOW_SURROGATE(*(start)) ) {
353 c = SURROGATE_TO_UCS4(curChar, *start);
354 ++start;
355 } else {
356 c = curChar;
360 * Decompose it.
362 if ((r = decompose(&wb, c, compat)) != NS_OK)
363 break;
366 * Get canonical class.
368 get_class(&wb);
371 * Reorder & compose.
373 for (; wb.cur < wb.last; wb.cur++) {
374 if (wb.cur == 0) {
375 continue;
376 } else if (wb.cclass[wb.cur] > 0) {
378 * This is not a starter. Try reordering.
379 * Note that characters up to it are
380 * already in canonical order.
382 reorder(&wb);
383 continue;
387 * This is a starter character, and there are
388 * some characters before it. Those characters
389 * have been reordered properly, and
390 * ready for composition.
392 if (do_composition && wb.cclass[0] == 0)
393 compose(&wb);
396 * If CUR points to a starter character,
397 * then process of characters before CUR are
398 * already finished, because any further
399 * reordering/composition for them are blocked
400 * by the starter CUR points.
402 if (wb.cur > 0 && wb.cclass[wb.cur] == 0) {
403 /* Flush everything before CUR. */
404 r = flush_before_cur(&wb, aToStr);
405 if (r != NS_OK)
406 break;
411 if (r == NS_OK) {
412 if (do_composition && wb.cur > 0 && wb.cclass[0] == 0) {
414 * There is some characters left in WB.
415 * They are ordered, but not composed yet.
416 * Now CUR points just after the last character in WB,
417 * and since compose() tries to compose characters
418 * between top and CUR inclusive, we must make CUR
419 * one character back during compose().
421 wb.cur--;
422 compose(&wb);
423 wb.cur++;
426 * Call this even when WB.CUR == 0, to make TO
427 * NUL-terminated.
429 r = flush_before_cur(&wb, aToStr);
432 workbuf_free(&wb);
434 return (r);
437 static nsresult
438 decompose(workbuf_t *wb, uint32_t c, int32_t compat) {
439 nsresult r;
440 int32_t dec_len;
442 again:
443 r = mdn__unicode_decompose(compat, wb->ucs + wb->last,
444 wb->size - wb->last, c, &dec_len);
445 switch (r) {
446 case NS_OK:
447 wb->last += dec_len;
448 return (NS_OK);
449 case NS_SUCCESS_UNORM_NOTFOUND:
450 return (workbuf_append(wb, c));
451 case NS_ERROR_UNORM_MOREOUTPUT:
452 if ((r = workbuf_extend(wb)) != NS_OK)
453 return (r);
454 if (wb->size > WORKBUF_SIZE_MAX) {
455 // "mdn__unormalize_form*: " "working buffer too large\n"
456 return (NS_ERROR_FAILURE);
458 goto again;
459 default:
460 return (r);
462 /* NOTREACHED */
465 static void
466 get_class(workbuf_t *wb) {
467 int32_t i;
469 for (i = wb->cur; i < wb->last; i++)
470 wb->cclass[i] = canonclass(wb->ucs[i]);
473 static void
474 reorder(workbuf_t *wb) {
475 uint32_t c;
476 int32_t i;
477 int32_t cclass;
479 //assert(wb != nullptr);
481 i = wb->cur;
482 c = wb->ucs[i];
483 cclass = wb->cclass[i];
485 while (i > 0 && wb->cclass[i - 1] > cclass) {
486 wb->ucs[i] = wb->ucs[i - 1];
487 wb->cclass[i] =wb->cclass[i - 1];
488 i--;
489 wb->ucs[i] = c;
490 wb->cclass[i] = cclass;
494 static void
495 compose(workbuf_t *wb) {
496 int32_t cur;
497 uint32_t *ucs;
498 int32_t *cclass;
499 int32_t last_class;
500 int32_t nvoids;
501 int32_t i;
503 //assert(wb != nullptr && wb->cclass[0] == 0);
505 cur = wb->cur;
506 ucs = wb->ucs;
507 cclass = wb->cclass;
510 * If there are no decomposition sequence that begins with
511 * the top character, composition is impossible.
513 if (!mdn__unicode_iscompositecandidate(ucs[0]))
514 return;
516 last_class = 0;
517 nvoids = 0;
518 for (i = 1; i <= cur; i++) {
519 uint32_t c;
520 int32_t cl = cclass[i];
522 if ((last_class < cl || cl == 0) &&
523 mdn__unicode_compose(ucs[0], ucs[i],
524 &c) == NS_OK) {
526 * Replace the top character with the composed one.
528 ucs[0] = c;
529 cclass[0] = canonclass(c);
531 cclass[i] = -1; /* void this character */
532 nvoids++;
533 } else {
534 last_class = cl;
538 /* Purge void characters, if any. */
539 if (nvoids > 0)
540 workbuf_removevoid(wb);
543 static nsresult
544 flush_before_cur(workbuf_t *wb, nsAString& aToStr)
546 int32_t i;
548 for (i = 0; i < wb->cur; i++) {
549 if (!IS_IN_BMP(wb->ucs[i])) {
550 aToStr.Append((char16_t)H_SURROGATE(wb->ucs[i]));
551 aToStr.Append((char16_t)L_SURROGATE(wb->ucs[i]));
552 } else {
553 aToStr.Append((char16_t)(wb->ucs[i]));
557 workbuf_shift(wb, wb->cur);
559 return (NS_OK);
562 static void
563 workbuf_init(workbuf_t *wb) {
564 wb->cur = 0;
565 wb->last = 0;
566 wb->size = WORKBUF_SIZE;
567 wb->ucs = wb->ucs_buf;
568 wb->cclass = wb->class_buf;
571 static void
572 workbuf_free(workbuf_t *wb) {
573 if (wb->ucs != wb->ucs_buf) {
574 nsMemory::Free(wb->ucs);
575 nsMemory::Free(wb->cclass);
579 static nsresult
580 workbuf_extend(workbuf_t *wb) {
581 int32_t newsize = wb->size * 3;
583 if (wb->ucs == wb->ucs_buf) {
584 wb->ucs = (uint32_t*)nsMemory::Alloc(sizeof(wb->ucs[0]) * newsize);
585 if (!wb->ucs)
586 return NS_ERROR_OUT_OF_MEMORY;
587 wb->cclass = (int32_t*)nsMemory::Alloc(sizeof(wb->cclass[0]) * newsize);
588 if (!wb->cclass) {
589 nsMemory::Free(wb->ucs);
590 wb->ucs = nullptr;
591 return NS_ERROR_OUT_OF_MEMORY;
593 } else {
594 void* buf = nsMemory::Realloc(wb->ucs, sizeof(wb->ucs[0]) * newsize);
595 if (!buf)
596 return NS_ERROR_OUT_OF_MEMORY;
597 wb->ucs = (uint32_t*)buf;
598 buf = nsMemory::Realloc(wb->cclass, sizeof(wb->cclass[0]) * newsize);
599 if (!buf)
600 return NS_ERROR_OUT_OF_MEMORY;
601 wb->cclass = (int32_t*)buf;
603 return (NS_OK);
606 static nsresult
607 workbuf_append(workbuf_t *wb, uint32_t c) {
608 nsresult r;
610 if (wb->last >= wb->size && (r = workbuf_extend(wb)) != NS_OK)
611 return (r);
612 wb->ucs[wb->last++] = c;
613 return (NS_OK);
616 static void
617 workbuf_shift(workbuf_t *wb, int32_t shift) {
618 int32_t nmove;
620 //assert(wb != nullptr && wb->cur >= shift);
622 nmove = wb->last - shift;
623 memmove(&wb->ucs[0], &wb->ucs[shift],
624 nmove * sizeof(wb->ucs[0]));
625 memmove(&wb->cclass[0], &wb->cclass[shift],
626 nmove * sizeof(wb->cclass[0]));
627 wb->cur -= shift;
628 wb->last -= shift;
631 static void
632 workbuf_removevoid(workbuf_t *wb) {
633 int32_t i, j;
634 int32_t last = wb->last;
636 for (i = j = 0; i < last; i++) {
637 if (wb->cclass[i] >= 0) {
638 if (j < i) {
639 wb->ucs[j] = wb->ucs[i];
640 wb->cclass[j] = wb->cclass[i];
642 j++;
645 wb->cur -= last - j;
646 wb->last = j;
649 nsresult
650 nsUnicodeNormalizer::NormalizeUnicodeNFD( const nsAString& aSrc, nsAString& aDest)
652 return mdn_normalize(false, false, aSrc, aDest);
655 nsresult
656 nsUnicodeNormalizer::NormalizeUnicodeNFC( const nsAString& aSrc, nsAString& aDest)
658 return mdn_normalize(true, false, aSrc, aDest);
661 nsresult
662 nsUnicodeNormalizer::NormalizeUnicodeNFKD( const nsAString& aSrc, nsAString& aDest)
664 return mdn_normalize(false, true, aSrc, aDest);
667 nsresult
668 nsUnicodeNormalizer::NormalizeUnicodeNFKC( const nsAString& aSrc, nsAString& aDest)
670 return mdn_normalize(true, true, aSrc, aDest);
673 bool
674 nsUnicodeNormalizer::Compose(uint32_t a, uint32_t b, uint32_t *ab)
676 return mdn__unicode_compose(a, b, ab) == NS_OK;
679 bool
680 nsUnicodeNormalizer::DecomposeNonRecursively(uint32_t c, uint32_t *c1, uint32_t *c2)
682 // We can't use mdn__unicode_decompose here, because that does a recursive
683 // decomposition that may yield more than two characters, but the harfbuzz
684 // callback wants just a single-step decomp that is guaranteed to produce
685 // no more than two characters. So we do a low-level lookup in the table
686 // of decomp sequences.
687 const uint32_t *seq;
688 uint32_t seqidx = decompose_char(c, &seq);
689 if (seqidx == 0 || ((seqidx & DECOMP_COMPAT) != 0)) {
690 return false;
692 *c1 = *seq & ~END_BIT;
693 if (*seq & END_BIT) {
694 *c2 = 0;
695 } else {
696 *c2 = *++seq & ~END_BIT;
698 return true;