fix sizeof, only matter on platforms where sizeof(unsigned) != sizeof(uint32_t)
[heimdal.git] / lib / wind / normalize.c
blob3fcddc91b0d38c83b88085a507d737222b36dad8
1 /*
2 * Copyright (c) 2004 Kungliga Tekniska Högskolan
3 * (Royal Institute of Technology, Stockholm, Sweden).
4 * All rights reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * 3. Neither the name of the Institute nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
34 #ifdef HAVE_CONFIG_H
35 #include <config.h>
36 #endif
37 #include "windlocl.h"
39 #include <assert.h>
40 #include <stdlib.h>
41 #include <errno.h>
43 #include "normalize_table.h"
45 static int
46 translation_cmp(const void *key, const void *data)
48 const struct translation *t1 = (const struct translation *)key;
49 const struct translation *t2 = (const struct translation *)data;
51 return t1->key - t2->key;
/*
 * Constants for the arithmetic Hangul (de)composition below.  Each
 * *_base is the first code point of a range and each *_count the number
 * of code points in it: s_* for precomposed syllables, l_*, v_* and t_*
 * for the three component ranges.  n_count is the number of syllables
 * sharing one l component.
 */
enum {
    s_base  = 0xAC00,
    s_count = 11172,
    l_base  = 0x1100,
    l_count = 19,
    v_base  = 0x1161,
    v_count = 21,
    t_base  = 0x11A7,
    t_count = 28,
    n_count = v_count * t_count
};
64 static int
65 hangul_decomp(const uint32_t *in, size_t in_len,
66 uint32_t *out, size_t *out_len)
68 uint32_t u = *in;
69 unsigned s_index;
70 unsigned l, v, t;
71 unsigned o;
73 if (u < s_base || u >= s_base + s_count)
74 return 0;
75 s_index = u - s_base;
76 l = l_base + s_index / n_count;
77 v = v_base + (s_index % n_count) / t_count;
78 t = t_base + s_index % t_count;
79 o = 2;
80 if (t != t_base)
81 ++o;
82 if (*out_len < o)
83 return WIND_ERR_OVERRUN;
84 out[0] = l;
85 out[1] = v;
86 if (t != t_base)
87 out[2] = t;
88 *out_len = o;
89 return 1;
92 static uint32_t
93 hangul_composition(const uint32_t *in, size_t in_len)
95 if (in_len < 2)
96 return 0;
97 if (in[0] >= l_base && in[0] < l_base + l_count) {
98 unsigned l_index = in[0] - l_base;
99 unsigned v_index;
101 if (in[1] < v_base || in[1] >= v_base + v_count)
102 return 0;
103 v_index = in[1] - v_base;
104 return (l_index * v_count + v_index) * t_count + s_base;
105 } else if (in[0] >= s_base && in[0] < s_base + s_count) {
106 unsigned s_index = in[0] - s_base;
107 unsigned t_index;
109 if (s_index % t_count != 0)
110 return 0;
111 if (in[1] < t_base || in[1] >= t_base + t_count)
112 return 0;
113 t_index = in[1] - t_base;
114 return in[0] + t_index;
116 return 0;
119 static int
120 compat_decomp(const uint32_t *in, size_t in_len,
121 uint32_t *out, size_t *out_len)
123 unsigned i;
124 unsigned o = 0;
126 for (i = 0; i < in_len; ++i) {
127 struct translation ts = {in[i]};
128 size_t sub_len = *out_len - o;
129 int ret;
131 ret = hangul_decomp(in + i, in_len - i,
132 out + o, &sub_len);
133 if (ret) {
134 if (ret == WIND_ERR_OVERRUN)
135 return ret;
136 o += sub_len;
137 } else {
138 void *s = bsearch(&ts,
139 _wind_normalize_table,
140 _wind_normalize_table_size,
141 sizeof(_wind_normalize_table[0]),
142 translation_cmp);
143 if (s != NULL) {
144 const struct translation *t = (const struct translation *)s;
146 ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
147 t->val_len,
148 out + o, &sub_len);
149 if (ret)
150 return ret;
151 o += sub_len;
152 } else {
153 if (o >= *out_len)
154 return WIND_ERR_OVERRUN;
155 out[o++] = in[i];
160 *out_len = o;
161 return 0;
/*
 * Comparator over uint32_t code points: orders them by the value
 * _wind_combining_class() returns for each.
 *
 * a, b - pointers to the two code points
 *
 * Returns the class of *a minus the class of *b.
 */
static int
cc_cmp(const void *a, const void *b)
{
    const uint32_t lhs = *(const uint32_t *)a;
    const uint32_t rhs = *(const uint32_t *)b;

    return _wind_combining_class(lhs) - _wind_combining_class(rhs);
}
/*
 * Reorder `tmp` in place so that every maximal run of code points with
 * a non-zero combining class is sorted by ascending class.
 *
 * tmp     - code points to reorder
 * tmp_len - number of code points in `tmp`
 *
 * The sort is a stable insertion sort: code points of equal class must
 * keep their relative order, which qsort() does not guarantee.
 */
static void
canonical_reorder(uint32_t *tmp, size_t tmp_len)
{
    size_t i;

    for (i = 0; i < tmp_len; ++i) {
        size_t end, k;

        if (_wind_combining_class(tmp[i]) == 0)
            continue;

        /* Find the end of the run of non-zero classes starting at i. */
        for (end = i + 1;
             end < tmp_len && _wind_combining_class(tmp[end]);
             ++end)
            ;

        /* Stable insertion sort of tmp[i .. end). */
        for (k = i + 1; k < end; ++k) {
            uint32_t moving = tmp[k];
            size_t m = k;

            /* Strict '>' keeps equal-class elements in input order. */
            while (m > i && cc_cmp(&tmp[m - 1], &moving) > 0) {
                tmp[m] = tmp[m - 1];
                --m;
            }
            tmp[m] = moving;
        }

        /* tmp[end], if any, has class 0; the loop increment skips it. */
        i = end;
    }
}
192 static uint32_t
193 find_composition(const uint32_t *in, unsigned in_len)
195 unsigned short canon_index = 0;
196 uint32_t cur;
197 unsigned n = 0;
199 cur = hangul_composition(in, in_len);
200 if (cur)
201 return cur;
203 do {
204 const struct canon_node *c = &_wind_canon_table[canon_index];
205 unsigned i;
207 if (n % 5 == 0) {
208 cur = *in++;
209 if (in_len-- == 0)
210 return c->val;
213 i = cur >> 16;
214 if (i < c->next_start || i >= c->next_end)
215 canon_index = 0;
216 else
217 canon_index =
218 _wind_canon_next_table[c->next_offset + i - c->next_start];
219 if (canon_index != 0) {
220 cur = (cur << 4) & 0xFFFFF;
221 ++n;
223 } while (canon_index != 0);
224 return 0;
227 static int
228 combine(const uint32_t *in, size_t in_len,
229 uint32_t *out, size_t *out_len)
231 unsigned i;
232 int ostarter;
233 unsigned o = 0;
234 int old_cc;
236 for (i = 0; i < in_len;) {
237 while (i < in_len && _wind_combining_class(in[i]) != 0) {
238 out[o++] = in[i++];
240 if (i < in_len) {
241 if (o >= *out_len)
242 return WIND_ERR_OVERRUN;
243 ostarter = o;
244 out[o++] = in[i++];
245 old_cc = -1;
247 while (i < in_len) {
248 uint32_t comb;
249 uint32_t v[2];
250 int cc;
252 v[0] = out[ostarter];
253 v[1] = in[i];
255 cc = _wind_combining_class(in[i]);
256 if (old_cc != cc && (comb = find_composition(v, 2))) {
257 out[ostarter] = comb;
258 } else if (cc == 0) {
259 break;
260 } else {
261 if (o >= *out_len)
262 return WIND_ERR_OVERRUN;
263 out[o++] = in[i];
264 old_cc = cc;
266 ++i;
270 *out_len = o;
271 return 0;
275 _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
276 uint32_t *out, size_t *out_len)
278 size_t tmp_len;
279 uint32_t *tmp;
280 int ret;
282 if (in_len == 0) {
283 *out_len = 0;
284 return 0;
287 tmp_len = in_len * 4;
288 if (tmp_len < MAX_LENGTH_CANON)
289 tmp_len = MAX_LENGTH_CANON;
290 tmp = malloc(tmp_len * sizeof(uint32_t));
291 if (tmp == NULL)
292 return ENOMEM;
294 ret = compat_decomp(in, in_len, tmp, &tmp_len);
295 if (ret) {
296 free(tmp);
297 return ret;
299 canonical_reorder(tmp, tmp_len);
300 ret = combine(tmp, tmp_len, out, out_len);
301 free(tmp);
302 return ret;