Update.
[glibc.git] / iconvdata / johab.c
blob7253ff6cb8ef1d55eafd34a5fe7c70d7f0383092
1 /* Mapping tables for JOHAB handling.
2 Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Jungshik Shin <jshin@pantheon.yale.edu>
5 and Ulrich Drepper <drepper@cygnus.com>, 1998.
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
17 You should have received a copy of the GNU Library General Public
18 License along with the GNU C Library; see the file COPYING.LIB. If not,
19 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 #include <dlfcn.h>
23 #include <stdint.h>
24 #include <ksc5601.h>
/* Tables mapping the three 5-bit fields of a Johab Hangul code to Jamo
   indices.  The fields encode the leading consonant (19 + 1 filler),
   the medial vowel (21 + 1 filler) and the trailing consonant
   (27 + 1 filler).

   KS C 5601-1992 Annex 3 Table 2
   0 : Filler, -1 : invalid, >= 1 : valid  */

/* Leading consonant, indexed by bits 10..14 of the two-byte code.
   All 32 slots are spelled out so that no unused bit pattern is
   zero-initialized and thereby mistaken for the filler.  */
static const int init[32] =
{
  -1,  0,  1,  2,  3,  4,  5,  6,
   7,  8,  9, 10, 11, 12, 13, 14,
  15, 16, 17, 18, 19, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1
};
/* Medial vowel index for bits 5..9 of a Johab Hangul code:
   0 is the filler, -1 marks an unused bit pattern and 1..21 select
   a vowel.  */
static const int mid[32] =
{
  -1, -1,  0,  1,  2,  3,  4,  5,
  -1, -1,  6,  7,  8,  9, 10, 11,
  -1, -1, 12, 13, 14, 15, 16, 17,
  -1, -1, 18, 19, 20, 21, -1, -1
};
/* Trailing consonant index for bits 0..4 of a Johab Hangul code:
   0 is the filler, -1 marks an unused bit pattern and 1..27 select
   a consonant.  Note the hole at bit pattern 0x12.  */
static const int final[32] =
{
  -1,  0,  1,  2,  3,  4,  5,  6,
   7,  8,  9, 10, 11, 12, 13, 14,
  15, 16, -1, 17, 18, 19, 20, 21,
  22, 23, 24, 25, 26, 27, -1, -1
};
/* Hangul Jamo in Johab to Unicode 2.0: the decoder maps isolated Jamo
   to the Hangul Compatibility Jamo block.  The consonants used below
   lie in [0x3131,0x314e]; the vowels follow directly after 0x314e.

   It's to be considered later which Jamo block to use, the
   Compatibility block or the Hangul Conjoining Jamo block
   [0x1100,0x11ff].  */

/* Compatibility Jamo for each of the 19 leading consonants, indexed by
   the init[] value minus one.  */
static const uint32_t init_to_ucs[19] =
{
  0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139, 0x3141, 0x3142,
  0x3143, 0x3145, 0x3146, 0x3147, 0x3148, 0x3149, 0x314a, 0x314b,
  0x314c, 0x314d, 0x314e
};
/* Compatibility Jamo for a trailing consonant that stands alone (both
   other fields are fillers), indexed by the final[] value minus one.
   Only the consonant clusters that cannot appear as a leading
   consonant have an entry; every other slot is 0, which the decoder
   treats as an illegal code.

   Each entry must be the inverse of jamo_from_ucs_table: U+313C is
   encoded with trailing bits 0x0c (final[] value 11, slot 10) and
   U+3144 with trailing bits 0x14 (final[] value 18, slot 17).  */
static const uint32_t final_to_ucs[31] =
{
  L'\0', L'\0', 0x3133, L'\0', 0x3135, 0x3136, L'\0', L'\0',
  0x313a, 0x313b, 0x313c, 0x313d, 0x313e, 0x313f,
  0x3140, L'\0', L'\0', 0x3144, L'\0', L'\0', L'\0', L'\0',
  L'\0', L'\0', L'\0', L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'
};
/* The following three arrays are used to convert precomposed Hangul
   syllables in [0xac00,0xd7a3] to Jamo bit patterns for the Johab
   encoding.

   cf. : KS C 5601-1992, Annex 3 Table 2

   Arrays are used to speed things up although it's possible to get
   the same result arithmetically.  */

/* Leading-consonant bits (including the always-set top bit), indexed
   by the syllable offset divided by 588 (= 21 * 28).  */
static const int init_to_bit[19] =
{
  0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00,
  0xa000, 0xa400, 0xa800, 0xac00, 0xb000, 0xb400,
  0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, 0xcc00,
  0xd000
};
/* Medial-vowel bits of a Johab Hangul code, indexed by the vowel number
   (syllable offset / 28) % 21.  The gaps between the rows correspond
   to the unused patterns in mid[].  */
static const int mid_to_bit[21] =
{
  0x0060, 0x0080, 0x00a0, 0x00c0, 0x00e0,
  0x0140, 0x0160, 0x0180, 0x01a0, 0x01c0, 0x01e0,
  0x0240, 0x0260, 0x0280, 0x02a0, 0x02c0, 0x02e0,
  0x0340, 0x0360, 0x0380, 0x03a0
};
/* Trailing-consonant bits of a Johab Hangul code, indexed by the
   syllable offset modulo 28 (index 0 is "no trailing consonant" and
   yields the filler pattern 1).  Pattern 0x12 is skipped, matching
   the hole in final[].  */
static const int final_to_bit[28] =
{
  0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
  0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
  0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d
};
/* The conversion table from UCS4 Hangul Compatibility Jamo in
   [0x3131,0x3163] to Johab, indexed by the code point minus 0x3131.

   cf. 1. KS C 5601-1992 Annex 3 Table 2
       2. Unicode 2.0 manual

   Every code carries fillers in the two fields it does not use:
     consonant that can lead a syllable   init_to_bit[] value | 0x41
     trailing-only consonant cluster      0x8440 | trailing bits
     vowel                                0x8401 | mid_to_bit[] value
   so all trailing-only entries start with 0x84 and the entry for
   U+314D uses the leading bits 0xcc00.  */
static const uint16_t jamo_from_ucs_table[51] =
{
  /* U+3131 .. U+3140 */
  0x8841, 0x8c41,
  0x8444,
  0x9041,
  0x8446, 0x8447,
  0x9441, 0x9841, 0x9c41,
  0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, 0x8450,
  /* U+3141 .. U+314E */
  0xa041, 0xa441, 0xa841,
  0x8454,
  0xac41, 0xb041, 0xb441, 0xb841, 0xbc41,
  0xc041, 0xc441, 0xc841, 0xcc41, 0xd041,
  /* U+314F .. U+3163 (vowels) */
  0x8461, 0x8481, 0x84a1, 0x84c1, 0x84e1,
  0x8541, 0x8561, 0x8581, 0x85a1, 0x85c1, 0x85e1,
  0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
  0x8741, 0x8761, 0x8781, 0x87a1
};
135 static inline uint32_t
136 johab_sym_hanja_to_ucs (uint_fast32_t idx, uint_fast32_t c1, uint_fast32_t c2)
138 if (idx <= 0xdefe)
139 return (uint32_t) __ksc5601_sym_to_ucs[(c1 - 0xd9) * 188 + c2
140 - (c2 > 0x90 ? 0x43 : 0x31)];
141 else
142 return (uint32_t) __ksc5601_hanja_to_ucs[(c1 - 0xe0) * 188 + c2
143 - (c2 > 0x90 ? 0x43 : 0x31)];
/* Definitions used in the body of the `gconv' function.  One JOHAB
   character occupies one or two bytes, one UCS4 character four.  */
#define CHARSET_NAME		"JOHAB//"
#define FROM_LOOP		from_johab
#define TO_LOOP			to_johab
#define DEFINE_INIT		1
#define DEFINE_FINI		1
#define MIN_NEEDED_FROM		1
#define MAX_NEEDED_FROM		2
#define MIN_NEEDED_TO		4
/* First define the conversion function from JOHAB to UCS4.

   BODY converts one character per expansion.  It is meant to be
   expanded inside the conversion loop that <iconv/loop.c> generates and
   relies on names supplied there: inptr, inend, outptr, result,
   irreversible, ignore_errors_p and put32.  `break' leaves that loop
   with RESULT set, `continue' restarts it with the next input byte.

   Byte 0x5c is passed through as U+005C; mapping it to the half-width
   WON SIGN U+20A9 is deliberately not done.  */
#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
#define LOOPFCT			FROM_LOOP
#define BODY \
  { \
    uint32_t ch = *inptr; \
    \
    if (ch <= 0x7f) \
      /* Plain ASCII, DEL (0x7f) included.  */ \
      ++inptr; \
    else \
      { \
        /* Johab two-byte codes: \
             1. Hangul \
                  1st byte : 0x84-0xd3 \
                  2nd byte : 0x41-0x7e, 0x81-0xfe \
             2. Hanja & Symbol \
                  1st byte : 0xd8-0xde, 0xe0-0xf9 \
                  2nd byte : 0x31-0x7e, 0x91-0xfe \
           0xd831-0xd87e and 0xd891-0xd8fe are a user-defined area and \
           are rejected here together with the unassigned lead bytes.  */ \
        if (__builtin_expect (ch > 0xf9 \
                              || ch == 0xdf \
                              || ch < 0x84 \
                              || (ch > 0xd3 && ch < 0xd9), 0)) \
          { \
            /* Not a valid first byte.  */ \
            if (! ignore_errors_p ()) \
              { \
                result = __GCONV_ILLEGAL_INPUT; \
                break; \
              } \
            \
            ++inptr; \
            ++*irreversible; \
            continue; \
          } \
        else \
          { \
            /* Two-byte character.  First test whether the next \
               character is also available.  */ \
            uint32_t ch2; \
            uint_fast32_t idx; \
            \
            if (__builtin_expect (inptr + 1 >= inend, 0)) \
              { \
                /* The second byte is not available yet; report the \
                   input as incomplete without consuming anything.  */ \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            \
            ch2 = inptr[1]; \
            idx = ch * 256 + ch2; \
            if (__builtin_expect (ch, 0) <= 0xd3) \
              { \
                /* Hangul: split the code into its three 5-bit fields. \
                   The table values are signed (-1 means invalid), so \
                   keep them in signed variables.  */ \
                int i, m, f; \
                \
                i = init[(idx & 0x7c00) >> 10]; \
                m = mid[(idx & 0x03e0) >> 5]; \
                f = final[idx & 0x001f]; \
                \
                if (__builtin_expect (i == -1 || m == -1 || f == -1, 0)) \
                  { \
                    /* One of the fields uses an unassigned pattern.  */ \
                    if (! ignore_errors_p ()) \
                      { \
                        result = __GCONV_ILLEGAL_INPUT; \
                        break; \
                      } \
                    \
                    ++inptr; \
                    ++*irreversible; \
                    continue; \
                  } \
                else if (i > 0 && m > 0) \
                  /* Complete syllable, with or without a trailing \
                     consonant.  */ \
                  ch = ((i - 1) * 21 + (m - 1)) * 28 + f + 0xac00; \
                else if (i > 0 && m == 0 && f == 0) \
                  /* Isolated leading consonant.  */ \
                  ch = init_to_ucs[i - 1]; \
                else if (i == 0 && m > 0 && f == 0) \
                  /* Isolated vowel: 0x314f + (m - 1).  */ \
                  ch = 0x314e + m; \
                else if (i == 0 && m == 0 && f > 0) \
                  /* Isolated trailing consonant; yields 0 when the \
                     table has no entry, which is rejected below.  */ \
                  ch = final_to_ucs[f - 1]; \
                else \
                  { \
                    /* Any other combination of fillers is illegal.  */ \
                    if (! ignore_errors_p ()) \
                      { \
                        result = __GCONV_ILLEGAL_INPUT; \
                        break; \
                      } \
                    \
                    ++inptr; \
                    ++*irreversible; \
                    continue; \
                  } \
              } \
            else \
              { \
                /* Hanja or symbol.  0xda 0xa1..0xd3 is excluded as \
                   well as the second bytes outside the two runs.  */ \
                if (__builtin_expect (ch2 < 0x31 \
                                      || (ch2 > 0x7e && ch2 < 0x91) \
                                      || ch2 == 0xff \
                                      || (ch == 0xda \
                                          && ch2 > 0xa0 && ch2 < 0xd4), 0)) \
                  { \
                    if (! ignore_errors_p ()) \
                      { \
                        result = __GCONV_ILLEGAL_INPUT; \
                        break; \
                      } \
                    \
                    ++inptr; \
                    ++*irreversible; \
                    continue; \
                  } \
                else \
                  ch = johab_sym_hanja_to_ucs (idx, ch, ch2); \
              } \
          } \
        \
        if (__builtin_expect (ch, 1) == 0) \
          { \
            /* The tables have no mapping for this two-byte code.  */ \
            if (! ignore_errors_p ()) \
              { \
                result = __GCONV_ILLEGAL_INPUT; \
                break; \
              } \
            \
            inptr += 2; \
            ++*irreversible; \
            continue; \
          } \
        \
        inptr += 2; \
      } \
    \
    put32 (outptr, ch); \
    outptr += 4; \
  }
#define LOOP_NEED_FLAGS
324 #include <iconv/loop.c>
/* Next, define the other direction (UCS4 to JOHAB).

   BODY converts one character per expansion.  It is meant to be
   expanded inside the conversion loop that <iconv/loop.c> generates and
   relies on names supplied there: inptr, inend, inptrp, outptr, outend,
   outptrp, result, irreversible, step, step_data, ignore_errors_p and
   get32.  `break' leaves that loop with RESULT set, `continue'
   restarts it.

   ucs4_to_ksc5601_hanja and ucs4_to_ksc5601_sym store a two-byte
   KS C 5601 code (both bytes counted from 0x21) at OUTPTR; the code
   below rewrites those two bytes in place into the Johab form that
   johab_sym_hanja_to_ucs decodes: 188 cells per lead byte, trail byte
   0x31..0x7e for cells 0..77 and 0x91..0xfe for cells 78..187.  */
#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
#define LOOPFCT			TO_LOOP
#define BODY \
  { \
    uint32_t ch = get32 (inptr); \
    \
    if (ch <= 0x7f) \
      /* Plain ASCII, DEL (0x7f) included.  */ \
      *outptr++ = ch; \
    else \
      { \
        if (ch >= 0xac00 && ch <= 0xd7a3) \
          { \
            /* Precomposed Hangul syllable.  */ \
            if (__builtin_expect (outptr + 2 > outend, 0)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            \
            ch -= 0xac00; \
            \
            ch = (init_to_bit[ch / 588]         /* 21 * 28 = 588 */ \
                  + mid_to_bit[(ch / 28) % 21]  /* (ch % (21 * 28)) / 28 */ \
                  + final_to_bit[ch % 28]);     /* (ch % (21 * 28)) % 28 */ \
            \
            *outptr++ = ch / 256; \
            *outptr++ = ch % 256; \
          } \
        /* KS C 5601-1992 Annex 3 regards 0xA4DA (Hangul Filler : U3164) \
           as symbol, so only U+3131..U+3163 are handled as Jamo.  */ \
        else if (ch >= 0x3131 && ch <= 0x3163) \
          { \
            ch = jamo_from_ucs_table[ch - 0x3131]; \
            \
            if (__builtin_expect (outptr + 2 > outend, 0)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            \
            *outptr++ = ch / 256; \
            *outptr++ = ch % 256; \
          } \
        else if ((ch >= 0x4e00 && ch <= 0x9fa5) \
                 || (ch >= 0xf900 && ch <= 0xfa0b)) \
          { \
            /* Hanja.  */ \
            size_t written; \
            uint32_t temp; \
            \
            written = ucs4_to_ksc5601_hanja (ch, outptr, outend - outptr); \
            if (__builtin_expect (written, 1) == 0) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            if (__builtin_expect (written == __UNKNOWN_10646_CHAR, 0)) \
              { \
                /* No mapping: try the transliteration hook first, \
                   otherwise report or skip the character.  */ \
                if (step_data->__trans.__trans_fct != NULL) \
                  { \
                    result = DL_CALL_FCT (step_data->__trans.__trans_fct, \
                                          (step, step_data, *inptrp, &inptr, \
                                           inend, *outptrp, &outptr, outend, \
                                           irreversible)); \
                    if (result != __GCONV_OK) \
                      break; \
                  } \
                else if (! ignore_errors_p ()) \
                  { \
                    /* This is an illegal character.  */ \
                    result = __GCONV_ILLEGAL_INPUT; \
                    break; \
                  } \
                else \
                  { \
                    inptr += 4; \
                    ++*irreversible; \
                  } \
                continue; \
              } \
            \
            /* KS C 5601 Hanja rows start at 0x4a, Johab Hanja lead \
               bytes at 0xe0.  */ \
            temp = (outptr[0] - 0x4a) * 94 + (outptr[1] - 0x21); \
            outptr[0] = 0xe0 + temp / 188; \
            temp %= 188; \
            outptr[1] = temp + (temp >= 78 ? 0x43 : 0x31); \
            \
            outptr += 2; \
          } \
        else \
          { \
            /* Everything else can only be a KS C 5601 symbol.  */ \
            size_t written; \
            uint32_t row; \
            uint32_t cell; \
            \
            written = ucs4_to_ksc5601_sym (ch, outptr, outend - outptr); \
            if (__builtin_expect (written, 1) == 0) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            if (__builtin_expect (written == __UNKNOWN_10646_CHAR, 0)) \
              { \
                if (step_data->__trans.__trans_fct != NULL) \
                  { \
                    result = DL_CALL_FCT (step_data->__trans.__trans_fct, \
                                          (step, step_data, *inptrp, &inptr, \
                                           inend, *outptrp, &outptr, outend, \
                                           irreversible)); \
                    if (result != __GCONV_OK) \
                      break; \
                  } \
                else if (! ignore_errors_p ()) \
                  { \
                    /* This is an illegal character.  */ \
                    result = __GCONV_ILLEGAL_INPUT; \
                    break; \
                  } \
                else \
                  { \
                    inptr += 4; \
                    ++*irreversible; \
                  } \
                continue; \
              } \
            \
            /* Two KS C 5601 rows of 94 cells share one Johab lead \
               byte, starting at 0xd9; an odd row occupies the upper \
               94 cells.  This is the exact inverse of the lookup in \
               johab_sym_hanja_to_ucs.  */ \
            row = outptr[0] - 0x21; \
            cell = (row % 2) * 94 + (outptr[1] - 0x21); \
            outptr[0] = 0xd9 + row / 2; \
            outptr[1] = cell + (cell >= 78 ? 0x43 : 0x31); \
            \
            outptr += 2; \
          } \
      } \
    \
    inptr += 4; \
  }
#define LOOP_NEED_FLAGS
484 #include <iconv/loop.c>
487 /* Now define the toplevel functions. */
488 #include <iconv/skeleton.c>