2 * Copyright (c) 2004 Kungliga Tekniska Högskolan
3 * (Royal Institute of Technology, Stockholm, Sweden).
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * 3. Neither the name of the Institute nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
46 #include "normalize_table.h"
49 translation_cmp(const void *key
, const void *data
)
51 const struct translation
*t1
= (const struct translation
*)key
;
52 const struct translation
*t2
= (const struct translation
*)data
;
54 return t1
->key
- t2
->key
;
57 enum { s_base
= 0xAC00};
58 enum { s_count
= 11172};
59 enum { l_base
= 0x1100};
61 enum { v_base
= 0x1161};
63 enum { t_base
= 0x11A7};
65 enum { n_count
= v_count
* t_count
};
/*
 * Arithmetic decomposition of a code point in the range
 * [s_base, s_base + s_count) into the components l, v and t.
 *
 * in, in_len:   input code points.
 * out, out_len: output buffer and a pointer to its length.
 *
 * Can return WIND_ERR_OVERRUN.
 *
 * NOTE(review): the lines that load `u', compute `s_index', store to
 * `out' / `*out_len' and produce the other return values are not
 * visible in this extract; presumably WIND_ERR_OVERRUN signals that
 * `out' is too small -- confirm against the full source.
 */
68 hangul_decomp(const uint32_t *in
, size_t in_len
,
69 uint32_t *out
, size_t *out_len
)
/* Range test: is u outside [s_base, s_base + s_count)? */
76 if (u
< s_base
|| u
>= s_base
+ s_count
)
/* l: quotient of the index by n_count (= v_count * t_count). */
79 l
= l_base
+ s_index
/ n_count
;
/* v: position inside the n_count-sized group, divided by t_count. */
80 v
= v_base
+ (s_index
% n_count
) / t_count
;
/* t: remainder modulo t_count; equals t_base when the remainder is 0. */
81 t
= t_base
+ s_index
% t_count
;
86 return WIND_ERR_OVERRUN
;
/*
 * Arithmetic composition of the pair in[0], in[1].
 *
 * Two shapes are handled:
 *  - in[0] in [l_base, l_base + l_count) and in[1] in
 *    [v_base, v_base + v_count): returns
 *    (l_index * v_count + v_index) * t_count + s_base.
 *  - in[0] in [s_base, s_base + s_count) with an index that is a
 *    multiple of t_count, and in[1] in [t_base, t_base + t_count):
 *    returns in[0] + t_index.
 *
 * NOTE(review): what is returned when a check fails is not visible in
 * this extract, and neither is any test of in_len before in[1] is
 * read -- confirm both against the full source.
 */
96 hangul_composition(const uint32_t *in
, size_t in_len
)
/* Case 1: in[0] lies in the l_base range. */
100 if (in
[0] >= l_base
&& in
[0] < l_base
+ l_count
) {
101 unsigned l_index
= in
[0] - l_base
;
/* in[1] must lie in the v_base range. */
104 if (in
[1] < v_base
|| in
[1] >= v_base
+ v_count
)
106 v_index
= in
[1] - v_base
;
107 return (l_index
* v_count
+ v_index
) * t_count
+ s_base
;
/* Case 2: in[0] lies in the s_base range. */
108 } else if (in
[0] >= s_base
&& in
[0] < s_base
+ s_count
) {
109 unsigned s_index
= in
[0] - s_base
;
/* Only indices that are a multiple of t_count pass this test. */
112 if (s_index
% t_count
!= 0)
/* in[1] must lie in the t_base range. */
114 if (in
[1] < t_base
|| in
[1] >= t_base
+ t_count
)
116 t_index
= in
[1] - t_base
;
117 return in
[0] + t_index
;
/*
 * Decompose the in_len code points at `in' into `out'.
 *
 * For each input code point hangul_decomp() is tried first; the
 * code point is also looked up with bsearch() in
 * _wind_normalize_table, and for a table entry the function calls
 * itself on the replacement sequence starting at
 * _wind_normalize_val_table + t->val_offset.
 *
 * in, in_len:   input code points.
 * out, out_len: output buffer and a pointer to its length; the space
 *               still free is computed as *out_len - o.
 *
 * Can return WIND_ERR_OVERRUN.
 *
 * NOTE(review): the declarations of i, o and ret, the remaining call
 * arguments, the handling of a missed lookup and the final update of
 * *out_len are not visible in this extract.
 */
123 compat_decomp(const uint32_t *in
, size_t in_len
,
124 uint32_t *out
, size_t *out_len
)
129 for (i
= 0; i
< in_len
; ++i
) {
/* Search key: only the first member is set, from the current input. */
130 struct translation ts
= {in
[i
]};
/* Space remaining in out after the o elements already accounted for. */
131 size_t sub_len
= *out_len
- o
;
134 ret
= hangul_decomp(in
+ i
, in_len
- i
,
137 if (ret
== WIND_ERR_OVERRUN
)
141 void *s
= bsearch(&ts
,
142 _wind_normalize_table
,
143 _wind_normalize_table_size
,
144 sizeof(_wind_normalize_table
[0]),
147 const struct translation
*t
= (const struct translation
*)s
;
/* Recurse on the replacement sequence of the entry that was found. */
149 ret
= compat_decomp(_wind_normalize_val_table
+ t
->val_offset
,
157 return WIND_ERR_OVERRUN
;
/*
 * Helper taking pointers to two code points; it is called by
 * canonical_reorder_sequence() with two adjacent array elements.
 *
 * NOTE(review): the body is not visible in this extract; going by the
 * name it presumably exchanges *a and *b -- confirm.
 */
168 swap_char(uint32_t * a
, uint32_t * b
)
176 /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
177 * that all have Canonical_Combining_Class > 0 */
/*
 * Reorder the len code points at `a' in place by combining class.
 *
 * The outer loop runs i over 1 .. len - 1.  An element a[j] whose
 * _wind_combining_class() is smaller than that of its left neighbour
 * a[j-1] is passed together with that neighbour to swap_char().
 *
 * NOTE(review): how j is initialised and stepped is not visible in
 * this extract.
 */
179 canonical_reorder_sequence(uint32_t * a
, size_t len
)
186 for (i
= 1; i
< len
; i
++) {
/* Continue only while a[j] has a lower class than a[j-1]. */
189 _wind_combining_class(a
[j
]) < _wind_combining_class(a
[j
-1]);
191 swap_char(&a
[j
], &a
[j
-1]);
/*
 * Walk the tmp_len code points at `tmp' and hand sub-ranges
 * &tmp[i] .. &tmp[j-1] to canonical_reorder_sequence().
 *
 * j is advanced while it is inside the buffer and
 * _wind_combining_class(tmp[j]) is non-zero, so the scan stops at the
 * end of the buffer or at the first code point of class 0.
 *
 * NOTE(review): the condition on cc, the initial value of j and any
 * adjustment of i after the call are not visible in this extract.
 */
196 canonical_reorder(uint32_t *tmp
, size_t tmp_len
)
200 for (i
= 0; i
< tmp_len
; ++i
) {
201 int cc
= _wind_combining_class(tmp
[i
]);
/* Scan forward over code points whose combining class is non-zero. */
205 j
< tmp_len
&& _wind_combining_class(tmp
[j
]);
/* Reorder the j - i code points starting at tmp[i]. */
208 canonical_reorder_sequence(&tmp
[i
], j
- i
);
/*
 * Look for a composition of the in_len code points at `in'.
 *
 * hangul_composition() is tried first.  The function also walks
 * _wind_canon_table in a do/while loop, starting at node 0: an index
 * i that lies in [c->next_start, c->next_end) selects the entry
 * _wind_canon_next_table[c->next_offset + i - c->next_start], and the
 * loop ends when canon_index becomes 0.  After each step that yields
 * a non-zero canon_index, cur is shifted left by 4 bits and truncated
 * to its low 20 bits.
 *
 * NOTE(review): how i is derived from cur / the input, what is
 * returned, and how the hangul_composition() result is used are not
 * visible in this extract.
 */
215 find_composition(const uint32_t *in
, unsigned in_len
)
217 unsigned short canon_index
= 0;
221 cur
= hangul_composition(in
, in_len
);
/* Current node of the walk. */
226 const struct canon_node
*c
= &_wind_canon_table
[canon_index
];
/* i outside [next_start, next_end): this node has no entry for it. */
236 if (i
< c
->next_start
|| i
>= c
->next_end
)
240 _wind_canon_next_table
[c
->next_offset
+ i
- c
->next_start
];
241 if (canon_index
!= 0) {
/* Shift left by 4 bits, keeping only the low 20 bits. */
242 cur
= (cur
<< 4) & 0xFFFFF;
245 } while (canon_index
!= 0);
/*
 * Composition pass over the in_len code points at `in', writing to
 * `out'.
 *
 * Visible behaviour: leading code points with a non-zero
 * _wind_combining_class() are consumed by the inner while loop.  A
 * pair v[] whose first element is out[ostarter] is passed to
 * find_composition() when the class of the current input differs
 * from old_cc; a non-zero result replaces out[ostarter].  A current
 * input of class 0 is handled by a separate branch.
 *
 * in, in_len:   input code points.
 * out, out_len: output buffer and a pointer to its length.
 *
 * Can return WIND_ERR_OVERRUN (two return sites are visible).
 *
 * NOTE(review): the declarations, the assignment of v[1], the updates
 * of i, o, ostarter and old_cc, and the final return are not visible
 * in this extract.
 */
250 combine(const uint32_t *in
, size_t in_len
,
251 uint32_t *out
, size_t *out_len
)
258 for (i
= 0; i
< in_len
;) {
/* Consume code points whose combining class is non-zero. */
259 while (i
< in_len
&& _wind_combining_class(in
[i
]) != 0) {
264 return WIND_ERR_OVERRUN
;
/* First element of the candidate pair: the code point at out[ostarter]. */
274 v
[0] = out
[ostarter
];
277 cc
= _wind_combining_class(in
[i
]);
/* Only attempt a composition when the class differs from old_cc. */
278 if (old_cc
!= cc
&& (comb
= find_composition(v
, 2))) {
279 out
[ostarter
] = comb
;
280 } else if (cc
== 0) {
284 return WIND_ERR_OVERRUN
;
/*
 * Normalize the in_len code points at `in' into `out'.
 *
 * Visible steps:
 *  1. size a temporary buffer at in_len * 4 elements, but at least
 *     MAX_LENGTH_CANON, and allocate it with malloc();
 *  2. compat_decomp() the input into the temporary buffer;
 *  3. canonical_reorder() the temporary buffer in place;
 *  4. combine() the temporary buffer into out / *out_len.
 *
 * NOTE(review): `in_len * 4' and `tmp_len * sizeof(uint32_t)' can wrap
 * around for a very large in_len, and no overflow check is visible in
 * this extract.  The check of the malloc() result, the error handling
 * after compat_decomp(), the release of tmp and the return statement
 * are not visible either -- confirm against the full source.
 */
297 _wind_stringprep_normalize(const uint32_t *in
, size_t in_len
,
298 uint32_t *out
, size_t *out_len
)
/* Temporary buffer length: four elements per input code point ... */
309 tmp_len
= in_len
* 4;
/* ... but never fewer than MAX_LENGTH_CANON. */
310 if (tmp_len
< MAX_LENGTH_CANON
)
311 tmp_len
= MAX_LENGTH_CANON
;
312 tmp
= malloc(tmp_len
* sizeof(uint32_t));
/* compat_decomp() receives &tmp_len and may update it. */
316 ret
= compat_decomp(in
, in_len
, tmp
, &tmp_len
);
321 canonical_reorder(tmp
, tmp_len
);
322 ret
= combine(tmp
, tmp_len
, out
, out_len
);