1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 /* This file is modified from JPNIC's mDNKit, it is under both MPL and
7 /* This Source Code Form is subject to the terms of the Mozilla Public
8 * License, v. 2.0. If a copy of the MPL was not distributed with this
9 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
12 * Copyright (c) 2000,2002 Japan Network Information Center.
13 * All rights reserved.
15 * By using this file, you agree to the terms and conditions set forth bellow.
17 * LICENSE TERMS AND CONDITIONS
19 * The following License Terms and Conditions apply, unless a different
20 * license is obtained from Japan Network Information Center ("JPNIC"),
21 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
22 * Chiyoda-ku, Tokyo 101-0047, Japan.
24 * 1. Use, Modification and Redistribution (including distribution of any
25 * modified or derived work) in source and/or binary forms is permitted
26 * under this License Terms and Conditions.
28 * 2. Redistribution of source code must retain the copyright notices as they
29 * appear in each source code file, this License Terms and Conditions.
31 * 3. Redistribution in binary form must reproduce the Copyright Notice,
32 * this License Terms and Conditions, in the documentation and/or other
33 * materials provided with the distribution. For the purposes of binary
34 * distribution the "Copyright Notice" refers to the following language:
35 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
37 * 4. The name of JPNIC may not be used to endorse or promote products
38 * derived from this Software without specific prior written approval of
41 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
42 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
43 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
44 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
46 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
47 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
48 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
49 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
50 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
51 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
57 #include "nsUnicodeNormalizer.h"
60 NS_IMPL_ISUPPORTS(nsUnicodeNormalizer
, nsIUnicodeNormalizer
)
63 nsUnicodeNormalizer::nsUnicodeNormalizer()
67 nsUnicodeNormalizer::~nsUnicodeNormalizer()
73 #define END_BIT 0x80000000
77 * Some constants for Hangul decomposition/composition.
78 * These things were taken from unicode book.
87 #define SLast (SBase + LCount * VCount * TCount)
90 uint32_t c2
; /* 2nd character */
91 uint32_t comp
; /* composed character */
95 #include "normalization_data.h"
98 * Macro for multi-level index table.
100 #define LOOKUPTBL(vprefix, mprefix, v) \
103 IMAP(vprefix)[IDX0(mprefix, v)] + IDX1(mprefix, v)\
105 ].tbl[IDX2(mprefix, v)]
107 #define IDX0(mprefix, v) IDX_0(v, BITS1(mprefix), BITS2(mprefix))
108 #define IDX1(mprefix, v) IDX_1(v, BITS1(mprefix), BITS2(mprefix))
109 #define IDX2(mprefix, v) IDX_2(v, BITS1(mprefix), BITS2(mprefix))
111 #define IDX_0(v, bits1, bits2) ((v) >> ((bits1) + (bits2)))
112 #define IDX_1(v, bits1, bits2) (((v) >> (bits2)) & ((1 << (bits1)) - 1))
113 #define IDX_2(v, bits1, bits2) ((v) & ((1 << (bits2)) - 1))
115 #define BITS1(mprefix) mprefix ## _BITS_1
116 #define BITS2(mprefix) mprefix ## _BITS_2
118 #define IMAP(vprefix) vprefix ## _imap
119 #define DMAP(vprefix) vprefix ## _table
120 #define SEQ(vprefix) vprefix ## _seq
123 canonclass(uint32_t c
) {
124 /* Look up canonicalclass table. */
125 return (LOOKUPTBL(canon_class
, CANON_CLASS
, c
));
129 decompose_char(uint32_t c
, const uint32_t **seqp
)
131 /* Look up decomposition table. */
132 int32_t seqidx
= LOOKUPTBL(decompose
, DECOMP
, c
);
133 *seqp
= SEQ(decompose
) + (seqidx
& ~DECOMP_COMPAT
);
138 compose_char(uint32_t c
,
139 const struct composition
**compp
)
141 /* Look up composition table. */
142 int32_t seqidx
= LOOKUPTBL(compose
, CANON_COMPOSE
, c
);
143 *compp
= SEQ(compose
) + (seqidx
& 0xffff);
144 return (seqidx
>> 16);
148 mdn__unicode_decompose(int32_t compat
, uint32_t *v
, size_t vlen
,
149 uint32_t c
, int32_t *decomp_lenp
)
155 //assert(v != nullptr && vlen >= 0 && decomp_lenp != nullptr);
158 * First, check for Hangul.
160 if (SBase
<= c
&& c
< SLast
) {
161 int32_t idx
, t_offset
, v_offset
, l_offset
;
164 t_offset
= idx
% TCount
;
166 v_offset
= idx
% VCount
;
167 l_offset
= idx
/ VCount
;
168 if ((t_offset
== 0 && vlen
< 2) || (t_offset
> 0 && vlen
< 3))
169 return (NS_ERROR_UNORM_MOREOUTPUT
);
170 *v
++ = LBase
+ l_offset
;
171 *v
++ = VBase
+ v_offset
;
173 *v
++ = TBase
+ t_offset
;
174 *decomp_lenp
= v
- vorg
;
179 * Look up decomposition table. If no decomposition is defined
180 * or if it is a compatibility decomosition when canonical
181 * decomposition requested, return 'NS_SUCCESS_UNORM_NOTFOUND'.
183 seqidx
= decompose_char(c
, &seq
);
184 if (seqidx
== 0 || (compat
== 0 && (seqidx
& DECOMP_COMPAT
) != 0))
185 return (NS_SUCCESS_UNORM_NOTFOUND
);
188 * Copy the decomposed sequence. The end of the sequence are
189 * marked with END_BIT.
198 /* Decompose recursively. */
199 r
= mdn__unicode_decompose(compat
, v
, vlen
, c
, &dlen
);
203 } else if (r
== NS_SUCCESS_UNORM_NOTFOUND
) {
205 return (NS_ERROR_UNORM_MOREOUTPUT
);
212 } while ((*seq
++ & END_BIT
) == 0);
214 *decomp_lenp
= v
- vorg
;
220 mdn__unicode_iscompositecandidate(uint32_t c
)
222 const struct composition
*dummy
;
224 /* Check for Hangul */
225 if ((LBase
<= c
&& c
< LBase
+ LCount
) || (SBase
<= c
&& c
< SLast
))
229 * Look up composition table. If there are no composition
230 * that begins with the given character, it is not a
231 * composition candidate.
233 if (compose_char(c
, &dummy
) == 0)
240 mdn__unicode_compose(uint32_t c1
, uint32_t c2
, uint32_t *compp
)
244 const struct composition
*cseq
;
246 //assert(compp != nullptr);
251 if (LBase
<= c1
&& c1
< LBase
+ LCount
&&
252 VBase
<= c2
&& c2
< VBase
+ VCount
) {
257 ((c1
- LBase
) * VCount
+ (c2
- VBase
)) * TCount
;
259 } else if (SBase
<= c1
&& c1
< SLast
&&
260 TBase
<= c2
&& c2
< TBase
+ TCount
&&
261 (c1
- SBase
) % TCount
== 0) {
265 *compp
= c1
+ (c2
- TBase
);
270 * Look up composition table. If the result is 0, no composition
271 * is defined. Otherwise, upper 16bits of the result contains
272 * the number of composition that begins with 'c1', and the lower
273 * 16bits is the offset in 'compose_seq'.
275 if ((n
= compose_char(c1
, &cseq
)) == 0)
276 return (NS_SUCCESS_UNORM_NOTFOUND
);
279 * The composite sequences are sorted by the 2nd character 'c2'.
280 * So we can use binary search.
285 int32_t mid
= (lo
+ hi
) / 2;
287 if (cseq
[mid
].c2
< c2
) {
289 } else if (cseq
[mid
].c2
> c2
) {
292 *compp
= cseq
[mid
].comp
;
296 return (NS_SUCCESS_UNORM_NOTFOUND
);
300 #define WORKBUF_SIZE 128
301 #define WORKBUF_SIZE_MAX 10000
304 int32_t cur
; /* pointing now processing character */
305 int32_t last
; /* pointing just after the last character */
306 int32_t size
; /* size of UCS and CLASS array */
307 uint32_t *ucs
; /* UCS-4 characters */
308 int32_t *cclass
; /* and their canonical classes */
309 uint32_t ucs_buf
[WORKBUF_SIZE
]; /* local buffer */
310 int32_t class_buf
[WORKBUF_SIZE
]; /* ditto */
313 static nsresult
decompose(workbuf_t
*wb
, uint32_t c
, int32_t compat
);
314 static void get_class(workbuf_t
*wb
);
315 static void reorder(workbuf_t
*wb
);
316 static void compose(workbuf_t
*wb
);
317 static nsresult
flush_before_cur(workbuf_t
*wb
, nsAString
& aToStr
);
318 static void workbuf_init(workbuf_t
*wb
);
319 static void workbuf_free(workbuf_t
*wb
);
320 static nsresult
workbuf_extend(workbuf_t
*wb
);
321 static nsresult
workbuf_append(workbuf_t
*wb
, uint32_t c
);
322 static void workbuf_shift(workbuf_t
*wb
, int32_t shift
);
323 static void workbuf_removevoid(workbuf_t
*wb
);
327 mdn_normalize(bool do_composition
, bool compat
,
328 const nsAString
& aSrcStr
, nsAString
& aToStr
)
333 * Initialize working buffer.
337 nsAString::const_iterator start
, end
;
338 aSrcStr
.BeginReading(start
);
339 aSrcStr
.EndReading(end
);
341 while (start
!= end
) {
345 //assert(wb.cur == wb.last);
348 * Get one character from 'from'.
352 if (NS_IS_HIGH_SURROGATE(curChar
) && start
!= end
&& NS_IS_LOW_SURROGATE(*(start
)) ) {
353 c
= SURROGATE_TO_UCS4(curChar
, *start
);
362 if ((r
= decompose(&wb
, c
, compat
)) != NS_OK
)
366 * Get canonical class.
373 for (; wb
.cur
< wb
.last
; wb
.cur
++) {
376 } else if (wb
.cclass
[wb
.cur
] > 0) {
378 * This is not a starter. Try reordering.
379 * Note that characters up to it are
380 * already in canonical order.
387 * This is a starter character, and there are
388 * some characters before it. Those characters
389 * have been reordered properly, and
390 * ready for composition.
392 if (do_composition
&& wb
.cclass
[0] == 0)
396 * If CUR points to a starter character,
397 * then process of characters before CUR are
398 * already finished, because any further
399 * reordering/composition for them are blocked
400 * by the starter CUR points.
402 if (wb
.cur
> 0 && wb
.cclass
[wb
.cur
] == 0) {
403 /* Flush everything before CUR. */
404 r
= flush_before_cur(&wb
, aToStr
);
412 if (do_composition
&& wb
.cur
> 0 && wb
.cclass
[0] == 0) {
414 * There is some characters left in WB.
415 * They are ordered, but not composed yet.
416 * Now CUR points just after the last character in WB,
417 * and since compose() tries to compose characters
418 * between top and CUR inclusive, we must make CUR
419 * one character back during compose().
426 * Call this even when WB.CUR == 0, to make TO
429 r
= flush_before_cur(&wb
, aToStr
);
438 decompose(workbuf_t
*wb
, uint32_t c
, int32_t compat
) {
443 r
= mdn__unicode_decompose(compat
, wb
->ucs
+ wb
->last
,
444 wb
->size
- wb
->last
, c
, &dec_len
);
449 case NS_SUCCESS_UNORM_NOTFOUND
:
450 return (workbuf_append(wb
, c
));
451 case NS_ERROR_UNORM_MOREOUTPUT
:
452 if ((r
= workbuf_extend(wb
)) != NS_OK
)
454 if (wb
->size
> WORKBUF_SIZE_MAX
) {
455 // "mdn__unormalize_form*: " "working buffer too large\n"
456 return (NS_ERROR_FAILURE
);
466 get_class(workbuf_t
*wb
) {
469 for (i
= wb
->cur
; i
< wb
->last
; i
++)
470 wb
->cclass
[i
] = canonclass(wb
->ucs
[i
]);
474 reorder(workbuf_t
*wb
) {
479 //assert(wb != nullptr);
483 cclass
= wb
->cclass
[i
];
485 while (i
> 0 && wb
->cclass
[i
- 1] > cclass
) {
486 wb
->ucs
[i
] = wb
->ucs
[i
- 1];
487 wb
->cclass
[i
] =wb
->cclass
[i
- 1];
490 wb
->cclass
[i
] = cclass
;
495 compose(workbuf_t
*wb
) {
503 //assert(wb != nullptr && wb->cclass[0] == 0);
510 * If there are no decomposition sequence that begins with
511 * the top character, composition is impossible.
513 if (!mdn__unicode_iscompositecandidate(ucs
[0]))
518 for (i
= 1; i
<= cur
; i
++) {
520 int32_t cl
= cclass
[i
];
522 if ((last_class
< cl
|| cl
== 0) &&
523 mdn__unicode_compose(ucs
[0], ucs
[i
],
526 * Replace the top character with the composed one.
529 cclass
[0] = canonclass(c
);
531 cclass
[i
] = -1; /* void this character */
538 /* Purge void characters, if any. */
540 workbuf_removevoid(wb
);
544 flush_before_cur(workbuf_t
*wb
, nsAString
& aToStr
)
548 for (i
= 0; i
< wb
->cur
; i
++) {
549 if (!IS_IN_BMP(wb
->ucs
[i
])) {
550 aToStr
.Append((char16_t
)H_SURROGATE(wb
->ucs
[i
]));
551 aToStr
.Append((char16_t
)L_SURROGATE(wb
->ucs
[i
]));
553 aToStr
.Append((char16_t
)(wb
->ucs
[i
]));
557 workbuf_shift(wb
, wb
->cur
);
563 workbuf_init(workbuf_t
*wb
) {
566 wb
->size
= WORKBUF_SIZE
;
567 wb
->ucs
= wb
->ucs_buf
;
568 wb
->cclass
= wb
->class_buf
;
572 workbuf_free(workbuf_t
*wb
) {
573 if (wb
->ucs
!= wb
->ucs_buf
) {
574 nsMemory::Free(wb
->ucs
);
575 nsMemory::Free(wb
->cclass
);
580 workbuf_extend(workbuf_t
*wb
) {
581 int32_t newsize
= wb
->size
* 3;
583 if (wb
->ucs
== wb
->ucs_buf
) {
584 wb
->ucs
= (uint32_t*)nsMemory::Alloc(sizeof(wb
->ucs
[0]) * newsize
);
586 return NS_ERROR_OUT_OF_MEMORY
;
587 wb
->cclass
= (int32_t*)nsMemory::Alloc(sizeof(wb
->cclass
[0]) * newsize
);
589 nsMemory::Free(wb
->ucs
);
591 return NS_ERROR_OUT_OF_MEMORY
;
594 void* buf
= nsMemory::Realloc(wb
->ucs
, sizeof(wb
->ucs
[0]) * newsize
);
596 return NS_ERROR_OUT_OF_MEMORY
;
597 wb
->ucs
= (uint32_t*)buf
;
598 buf
= nsMemory::Realloc(wb
->cclass
, sizeof(wb
->cclass
[0]) * newsize
);
600 return NS_ERROR_OUT_OF_MEMORY
;
601 wb
->cclass
= (int32_t*)buf
;
607 workbuf_append(workbuf_t
*wb
, uint32_t c
) {
610 if (wb
->last
>= wb
->size
&& (r
= workbuf_extend(wb
)) != NS_OK
)
612 wb
->ucs
[wb
->last
++] = c
;
617 workbuf_shift(workbuf_t
*wb
, int32_t shift
) {
620 //assert(wb != nullptr && wb->cur >= shift);
622 nmove
= wb
->last
- shift
;
623 memmove(&wb
->ucs
[0], &wb
->ucs
[shift
],
624 nmove
* sizeof(wb
->ucs
[0]));
625 memmove(&wb
->cclass
[0], &wb
->cclass
[shift
],
626 nmove
* sizeof(wb
->cclass
[0]));
632 workbuf_removevoid(workbuf_t
*wb
) {
634 int32_t last
= wb
->last
;
636 for (i
= j
= 0; i
< last
; i
++) {
637 if (wb
->cclass
[i
] >= 0) {
639 wb
->ucs
[j
] = wb
->ucs
[i
];
640 wb
->cclass
[j
] = wb
->cclass
[i
];
650 nsUnicodeNormalizer::NormalizeUnicodeNFD( const nsAString
& aSrc
, nsAString
& aDest
)
652 return mdn_normalize(false, false, aSrc
, aDest
);
656 nsUnicodeNormalizer::NormalizeUnicodeNFC( const nsAString
& aSrc
, nsAString
& aDest
)
658 return mdn_normalize(true, false, aSrc
, aDest
);
662 nsUnicodeNormalizer::NormalizeUnicodeNFKD( const nsAString
& aSrc
, nsAString
& aDest
)
664 return mdn_normalize(false, true, aSrc
, aDest
);
668 nsUnicodeNormalizer::NormalizeUnicodeNFKC( const nsAString
& aSrc
, nsAString
& aDest
)
670 return mdn_normalize(true, true, aSrc
, aDest
);
674 nsUnicodeNormalizer::Compose(uint32_t a
, uint32_t b
, uint32_t *ab
)
676 return mdn__unicode_compose(a
, b
, ab
) == NS_OK
;
680 nsUnicodeNormalizer::DecomposeNonRecursively(uint32_t c
, uint32_t *c1
, uint32_t *c2
)
682 // We can't use mdn__unicode_decompose here, because that does a recursive
683 // decomposition that may yield more than two characters, but the harfbuzz
684 // callback wants just a single-step decomp that is guaranteed to produce
685 // no more than two characters. So we do a low-level lookup in the table
686 // of decomp sequences.
688 uint32_t seqidx
= decompose_char(c
, &seq
);
689 if (seqidx
== 0 || ((seqidx
& DECOMP_COMPAT
) != 0)) {
692 *c1
= *seq
& ~END_BIT
;
693 if (*seq
& END_BIT
) {
696 *c2
= *++seq
& ~END_BIT
;