Update.
[glibc.git] / iconvdata / johab.c
blobfccfbabd4efc11d169cf3b26f366c5ef96047c82
1 /* Mapping tables for JOHAB handling.
2 Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Jungshik Shin <jshin@pantheon.yale.edu>
5 and Ulrich Drepper <drepper@cygnus.com>, 1998.
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
17 You should have received a copy of the GNU Library General Public
18 License along with the GNU C Library; see the file COPYING.LIB. If not,
19 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 #include <stdint.h>
23 #include <ksc5601.h>
/* Tables mapping JOHAB bit patterns to Hangul Jamo indices.

   A two-byte JOHAB Hangul code carries three 5-bit fields that encode
   the leading consonant (19 + 1 filler), the medial vowel (21 + 1 filler)
   and the trailing consonant (27 + 1 filler).  The tables `init', `mid'
   and `final' are each indexed by the raw 5-bit field.

   cf. KS C 5601-1992 Annex 3 Table 2

   Entry values:  -1 : invalid bit pattern,  0 : filler,  >= 1 : valid jamo.  */

/* Leading consonants.  Pattern 1 is the filler and patterns 2..20 are the
   19 consonants.  All 32 entries are spelled out: leaving the last one to
   implicit zero-initialization would mark pattern 31 as a valid filler
   instead of an invalid pattern.  */
static const int init[32] =
{
  /* 0x00 - 0x07 */  -1,  0,  1,  2,  3,  4,  5,  6,
  /* 0x08 - 0x0f */   7,  8,  9, 10, 11, 12, 13, 14,
  /* 0x10 - 0x17 */  15, 16, 17, 18, 19, -1, -1, -1,
  /* 0x18 - 0x1f */  -1, -1, -1, -1, -1, -1, -1, -1
};
/* Medial vowels, indexed by the 5-bit medial field of a JOHAB code.
   -1 marks an invalid pattern, 0 the filler and 1..21 the vowels.  In
   every group of eight patterns the first two are invalid.  */
static const int mid[32] =
{
  [0x00] = -1, [0x01] = -1, [0x02] =  0, [0x03] =  1,
  [0x04] =  2, [0x05] =  3, [0x06] =  4, [0x07] =  5,
  [0x08] = -1, [0x09] = -1, [0x0a] =  6, [0x0b] =  7,
  [0x0c] =  8, [0x0d] =  9, [0x0e] = 10, [0x0f] = 11,
  [0x10] = -1, [0x11] = -1, [0x12] = 12, [0x13] = 13,
  [0x14] = 14, [0x15] = 15, [0x16] = 16, [0x17] = 17,
  [0x18] = -1, [0x19] = -1, [0x1a] = 18, [0x1b] = 19,
  [0x1c] = 20, [0x1d] = 21, [0x1e] = -1, [0x1f] = -1
};
/* Trailing consonants, indexed by the 5-bit final field of a JOHAB code.
   -1 marks an invalid pattern, 0 the filler (no trailing consonant) and
   1..27 the consonants.  Pattern 0x12 is a hole in the middle of the
   range.  */
static const int final[32] =
{
  [0x00] = -1, [0x01] =  0, [0x02] =  1, [0x03] =  2,
  [0x04] =  3, [0x05] =  4, [0x06] =  5, [0x07] =  6,
  [0x08] =  7, [0x09] =  8, [0x0a] =  9, [0x0b] = 10,
  [0x0c] = 11, [0x0d] = 12, [0x0e] = 13, [0x0f] = 14,
  [0x10] = 15, [0x11] = 16, [0x12] = -1, [0x13] = 17,
  [0x14] = 18, [0x15] = 19, [0x16] = 20, [0x17] = 21,
  [0x18] = 22, [0x19] = 23, [0x1a] = 24, [0x1b] = 25,
  [0x1c] = 26, [0x1d] = 27, [0x1e] = -1, [0x1f] = -1
};
/* Hangul Jamo in Johab to Unicode 2.0.

   Unicode 2.0 defines 51 Hangul Compatibility Jamo in the block
   [0x3131,0x3163]; the 30 consonants among them occupy [0x3131,0x314e].

   It is still to be decided which Jamo block to use: the Compatibility
   block used here or the Hangul Conjoining Jamo block [0x1100,0x11ff].  */

/* Compatibility Jamo for each of the 19 leading consonants, indexed by
   (leading consonant index - 1).  */
static const uint32_t init_to_ucs[19] =
{
  0x3131,	/* KIYEOK */
  0x3132,	/* SSANGKIYEOK */
  0x3134,	/* NIEUN */
  0x3137,	/* TIKEUT */
  0x3138,	/* SSANGTIKEUT */
  0x3139,	/* RIEUL */
  0x3141,	/* MIEUM */
  0x3142,	/* PIEUP */
  0x3143,	/* SSANGPIEUP */
  0x3145,	/* SIOS */
  0x3146,	/* SSANGSIOS */
  0x3147,	/* IEUNG */
  0x3148,	/* CIEUC */
  0x3149,	/* SSANGCIEUC */
  0x314a,	/* CHIEUCH */
  0x314b,	/* KHIEUKH */
  0x314c,	/* THIEUTH */
  0x314d,	/* PHIEUPH */
  0x314e	/* HIEUH */
};
/* Compatibility Jamo for trailing consonants that occur on their own,
   indexed by (f - 1) where f is the value taken from the `final' table
   (1..27).  Only the consonant clusters that have no leading-consonant
   form get a code point here; a zero entry means "no mapping" and makes
   the caller reject the input.

   Two entries differ from the earlier revision of this table:
     - f = 11 is U+313C; it used to read 0x314c, which broke the
       consecutive run U+313A..U+3140 and duplicated a leading consonant.
     - U+3144 belongs to f = 18 (bit pattern 0x14, JOHAB 0x8454), i.e.
       index 17.  It used to sit at index 18, so 0x8454 decoded to nothing
       and 0x8455 decoded to U+3144.  */
static const uint32_t final_to_ucs[31] =
{
  /* f =  1 ..  8 */  0, 0, 0x3133, 0, 0x3135, 0x3136, 0, 0,
  /* f =  9 .. 15 */  0x313a, 0x313b, 0x313c, 0x313d, 0x313e, 0x313f, 0x3140,
  /* f = 16 .. 18 */  0, 0, 0x3144,
  /* f = 19 .. 27 */  0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* unused       */  0, 0, 0, 0
};
/* The arrays `init_to_bit', `mid_to_bit' and `final_to_bit' convert the
   components of a precomposed Hangul syllable in [0xac00,0xd7a3] into
   the Jamo bit patterns of the Johab encoding.

   cf. KS C 5601-1992, Annex 3 Table 2

   The same values could be computed arithmetically; the tables merely
   speed things up.  */

/* Leading consonant bits: the top bit of the 16-bit code is always set
   and the 5-bit pattern (2..20) sits in bits 10..14.  */
#define JOHAB_INIT_BITS(pattern) (0x8000 | ((pattern) << 10))
static const int init_to_bit[19] =
{
  JOHAB_INIT_BITS (2),  JOHAB_INIT_BITS (3),  JOHAB_INIT_BITS (4),
  JOHAB_INIT_BITS (5),  JOHAB_INIT_BITS (6),  JOHAB_INIT_BITS (7),
  JOHAB_INIT_BITS (8),  JOHAB_INIT_BITS (9),  JOHAB_INIT_BITS (10),
  JOHAB_INIT_BITS (11), JOHAB_INIT_BITS (12), JOHAB_INIT_BITS (13),
  JOHAB_INIT_BITS (14), JOHAB_INIT_BITS (15), JOHAB_INIT_BITS (16),
  JOHAB_INIT_BITS (17), JOHAB_INIT_BITS (18), JOHAB_INIT_BITS (19),
  JOHAB_INIT_BITS (20)
};
#undef JOHAB_INIT_BITS
/* Medial vowel bits, indexed by the vowel number 0..20 of a precomposed
   syllable.  The 5-bit pattern sits in bits 5..9; patterns whose low
   three bits are 0 or 1, and pattern 2 (the filler), are skipped.  */
#define JOHAB_MID_BITS(pattern) ((pattern) << 5)
static const int mid_to_bit[21] =
{
  JOHAB_MID_BITS (3),  JOHAB_MID_BITS (4),  JOHAB_MID_BITS (5),
  JOHAB_MID_BITS (6),  JOHAB_MID_BITS (7),

  JOHAB_MID_BITS (10), JOHAB_MID_BITS (11), JOHAB_MID_BITS (12),
  JOHAB_MID_BITS (13), JOHAB_MID_BITS (14), JOHAB_MID_BITS (15),

  JOHAB_MID_BITS (18), JOHAB_MID_BITS (19), JOHAB_MID_BITS (20),
  JOHAB_MID_BITS (21), JOHAB_MID_BITS (22), JOHAB_MID_BITS (23),

  JOHAB_MID_BITS (26), JOHAB_MID_BITS (27), JOHAB_MID_BITS (28),
  JOHAB_MID_BITS (29)
};
#undef JOHAB_MID_BITS
/* Trailing consonant bits, indexed by the final number 0..27 of a
   precomposed syllable (0 = no trailing consonant, encoded as the filler
   pattern 1).  The pattern occupies the low five bits; 0x12 is not a
   valid pattern and is skipped.  */
static const int final_to_bit[28] =
{
  0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
  0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
  /* 0x12 is skipped.  */
  0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d
};
/* Conversion table from UCS4 Hangul Compatibility Jamo in
   [0x3131,0x3163] to Johab, indexed by (ucs - 0x3131).

   cf. 1. KS C 5601-1992 Annex 3 Table 2
       2. Unicode 2.0 manual

   Each code is the OR of three fields, with the filler pattern in the
   two fields that are not used:
     - a consonant that can lead a syllable:  init_to_bit[n] | 0x0041
     - a consonant cluster (trailing only):   0x8440 | final pattern
     - a vowel:                               0x8401 | mid_to_bit[n]

   Three entries differ from the earlier revision of this table:
   U+313E and U+313F are 0x844e and 0x844f (not 0x884e / 0x884f, whose
   leading field is not the filler), and U+314D is 0xcc41 (not 0xca41,
   which does not match init_to_bit).  */
static const uint16_t jamo_from_ucs_table[51] =
{
  /* U+3131 - U+3132 */
  0x8841, 0x8c41,
  /* U+3133 */
  0x8444,
  /* U+3134 */
  0x9041,
  /* U+3135 - U+3136 */
  0x8446, 0x8447,
  /* U+3137 - U+3139 */
  0x9441, 0x9841, 0x9c41,
  /* U+313A - U+3140 */
  0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, 0x8450,
  /* U+3141 - U+3143 */
  0xa041, 0xa441, 0xa841,
  /* U+3144 */
  0x8454,
  /* U+3145 - U+3149 */
  0xac41, 0xb041, 0xb441, 0xb841, 0xbc41,
  /* U+314A - U+314E */
  0xc041, 0xc441, 0xc841, 0xcc41, 0xd041,
  /* U+314F - U+3163: the 21 vowels */
  0x8461, 0x8481, 0x84a1, 0x84c1, 0x84e1,
  0x8541, 0x8561, 0x8581, 0x85a1, 0x85c1, 0x85e1,
  0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
  0x8741, 0x8761, 0x8781, 0x87a1
};
134 static inline uint32_t
135 johab_sym_hanja_to_ucs (uint_fast32_t idx, uint_fast32_t c1, uint_fast32_t c2)
137 if (idx <= 0xdefe)
138 return (uint32_t) __ksc5601_sym_to_ucs[(c1 - 0xd9) * 188 + c2
139 - (c2 > 0x90 ? 0x43 : 0x31)];
140 else
141 return (uint32_t) __ksc5601_hanja_to_ucs[(c1 - 0xe0) * 188 + c2
142 - (c2 > 0x90 ? 0x43 : 0x31)];
/* Configuration macros for the generic conversion code included further
   down (<iconv/loop.c> and <iconv/skeleton.c>).

   FROM_LOOP and TO_LOOP name the two conversion loops; they are used
   below as the LOOPFCT of the JOHAB -> UCS4 and UCS4 -> JOHAB bodies.
   MIN_NEEDED_FROM / MAX_NEEDED_FROM give the size range of one JOHAB
   character (one or two bytes) and MIN_NEEDED_TO the size of one UCS4
   character (four bytes).
   NOTE(review): DEFINE_INIT and DEFINE_FINI are presumably consumed by
   skeleton.c to emit the init/fini entry points -- confirm there.  */
144 /* Definitions used in the body of the `gconv' function. */
145 #define CHARSET_NAME "JOHAB//"
146 #define FROM_LOOP from_johab
147 #define TO_LOOP to_johab
148 #define DEFINE_INIT 1
149 #define DEFINE_FINI 1
150 #define MIN_NEEDED_FROM 1
151 #define MAX_NEEDED_FROM 2
152 #define MIN_NEEDED_TO 4
/* JOHAB -> UCS4 loop body (instantiated by <iconv/loop.c> as from_johab).

   NOTE(review): the brace-only lines of this macro are not visible in
   this copy, so the nesting described here is inferred from the if/else
   keywords and should be confirmed against the real file.

   One character is handled per iteration:

   - A byte below 0x7f is taken as is and consumes one input byte.  The
     0x5c -> U+20A9 (WON SIGN) mapping is commented out.
   - A lead byte above 0xf9, equal to 0xdf, in 0x7f..0x83 or in 0xd4..0xd8
     is illegal.
   - Otherwise a second byte is required; if it is not available the loop
     stops with __GCONV_INCOMPLETE_INPUT.
   - Lead bytes up to 0xd3 are Hangul.  The 16-bit code is split into
     three 5-bit fields that are looked up in `init', `mid' and `final'.
     Any -1 is illegal.  Leading and medial both present gives a
     precomposed syllable starting at 0xac00; a lone leading consonant
     goes through init_to_ucs; a lone vowel becomes 0x314e + m; a lone
     trailing consonant goes through final_to_ucs[f - 1]; every other
     combination is illegal.
   - All other lead bytes are symbols or Hanja.  The trail byte must be
     in 0x31..0x7e or 0x91..0xfe, and lead byte 0xda with a trail byte in
     0xa1..0xd3 is rejected; the rest goes through johab_sym_hanja_to_ucs.
   - A resulting value of 0 is treated as an unmapped character.

   Every illegal case stops the loop with __GCONV_ILLEGAL_INPUT unless
   ignore_errors_p () is true; in that case the input is skipped (one
   byte, or two bytes for the unmapped case), *converted is incremented
   and the loop continues.  On success two input bytes are consumed and
   the UCS4 value is stored with put32, advancing outptr by four.  */
155 /* First define the conversion function from JOHAB to UCS4. */
156 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
157 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
158 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
159 #define LOOPFCT FROM_LOOP
160 #define BODY \
162 uint32_t ch = *inptr; \
164 /* half-width Korean Currency WON sign \
165 if (ch == 0x5c) \
166 ch = 0x20a9; \
167 else if (ch < 0x7f) \
168 ch = (uint32_t) ch; \
169 */ \
170 if (ch < 0x7f) \
171 /* Plain ASCII. */ \
172 ++inptr; \
173 /* Johab : 1. Hangul \
174 1st byte : 0x84-0xd3 \
175 2nd byte : 0x41-0x7e, 0x81-0xfe \
176 2. Hanja & Symbol : \
177 1st byte : 0xd8-0xde, 0xe0-0xf9 \
178 2nd byte : 0x31-0x7e, 0x91-0xfe \
179 0xd831-0xd87e and 0xd891-0xd8fe are user-defined area */ \
180 else \
182 if (__builtin_expect (ch, 0) > 0xf9 \
183 || __builtin_expect (ch, 0) == 0xdf \
184 || (__builtin_expect (ch, 0) > 0x7e && ch < 0x84) \
185 || (__builtin_expect (ch, 0) > 0xd3 && ch < 0xd9)) \
187 /* These are illegal. */ \
188 if (! ignore_errors_p ()) \
190 /* This is an illegal character. */ \
191 result = __GCONV_ILLEGAL_INPUT; \
192 break; \
195 ++inptr; \
196 ++*converted; \
197 continue; \
199 else \
201 /* Two-byte character. First test whether the next \
202 character is also available. */ \
203 uint32_t ch2; \
204 uint_fast32_t idx; \
206 if (NEED_LENGTH_TEST && __builtin_expect (inptr + 1 >= inend, 0)) \
208 /* The second character is not available. Store the \
209 intermediate result. */ \
210 result = __GCONV_INCOMPLETE_INPUT; \
211 break; \
214 ch2 = inptr[1]; \
215 idx = ch * 256 + ch2; \
216 if (__builtin_expect (ch, 0) <= 0xd3) \
218 /* Hangul */ \
219 uint_fast32_t i, m, f; \
221 i = init[(idx & 0x7c00) >> 10]; \
222 m = mid[(idx & 0x03e0) >> 5]; \
223 f = final[idx & 0x001f]; \
225 if (__builtin_expect (i, 0) == -1 \
226 || __builtin_expect (m, 0) == -1 \
227 || __builtin_expect (f, 0) == -1) \
229 /* This is illegal. */ \
230 if (! ignore_errors_p ()) \
232 /* This is an illegal character. */ \
233 result = __GCONV_ILLEGAL_INPUT; \
234 break; \
237 ++inptr; \
238 ++*converted; \
239 continue; \
241 else if (i > 0 && m > 0) \
242 ch = ((i - 1) * 21 + (m - 1)) * 28 + f + 0xac00; \
243 else if (i > 0 && m == 0 && f == 0) \
244 ch = init_to_ucs[i - 1]; \
245 else if (i == 0 && m > 0 && f == 0) \
246 ch = 0x314e + m; /* 0x314f + m - 1 */ \
247 else if (__builtin_expect (i | m, 0) == 0 \
248 && __builtin_expect (f, 1) > 0) \
249 ch = final_to_ucs[f - 1]; /* round trip?? */ \
250 else \
252 /* This is illegal. */ \
253 if (! ignore_errors_p ()) \
255 /* This is an illegal character. */ \
256 result = __GCONV_ILLEGAL_INPUT; \
257 break; \
260 ++inptr; \
261 ++*converted; \
262 continue; \
265 else \
267 if (__builtin_expect (ch2, 0x31) < 0x31 \
268 || (__builtin_expect (ch2, 0x7e) > 0x7e && ch2 < 0x91) \
269 || __builtin_expect (ch2, 0) == 0xff \
270 || (__builtin_expect (ch, 0) == 0xda \
271 && ch2 > 0xa0 && ch2 < 0xd4)) \
273 /* This is illegal. */ \
274 if (! ignore_errors_p ()) \
276 /* This is an illegal character. */ \
277 result = __GCONV_ILLEGAL_INPUT; \
278 break; \
281 ++inptr; \
282 ++*converted; \
283 continue; \
285 else \
287 ch = johab_sym_hanja_to_ucs (idx, ch, ch2); \
288 /* if (idx <= 0xdefe) \
289 ch = __ksc5601_sym_to_ucs[(ch - 0xd9) * 192 \
290 + ch2 - (ch2 > 0x90 \
291 ? 0x43 : 0x31)]; \
292 else \
293 ch = __ksc5601_hanja_to_ucs[(ch - 0xe0) *192 \
294 + ch2 - (ch2 > 0x90 \
295 ?0x43 : 0x31)];\
296 */ \
301 if (__builtin_expect (ch, 1) == 0) \
303 /* This is an illegal character. */ \
304 if (! ignore_errors_p ()) \
306 /* This is an illegal character. */ \
307 result = __GCONV_ILLEGAL_INPUT; \
308 break; \
311 inptr += 2; \
312 ++*converted; \
313 continue; \
316 inptr += 2; \
319 put32 (outptr, ch); \
320 outptr += 4; \
322 #include <iconv/loop.c>
/* UCS4 -> JOHAB loop body (instantiated by <iconv/loop.c> as to_johab).

   NOTE(review): the brace-only lines of this macro are not visible in
   this copy, so the nesting described here is inferred from the if/else
   keywords and should be confirmed against the real file.

   One UCS4 value is read with get32 per iteration:

   - Values below 0x7f are written as a single byte.
   - [0xac00,0xd7a3] (precomposed Hangul): the offset from 0xac00 is
     split into leading (/ 588), medial ((/ 28) % 21) and trailing (% 28)
     indices whose bit patterns from init_to_bit, mid_to_bit and
     final_to_bit are added up and written high byte first.
   - [0x3131,0x3163] (compatibility Jamo): looked up in
     jamo_from_ucs_table and written high byte first.
   - [0x4e00,0x9fa5] and [0xf900,0xfa0b] (Hanja): converted with
     ucs4_to_ksc5601_hanja and the two result bytes are then re-packed in
     place into the JOHAB layout with lead bytes from 0xe0 and 188 trail
     positions per lead byte.
   - Everything else is tried with ucs4_to_ksc5601_sym and re-packed in
     place in a similar way.

   Both two-byte table branches first check that two output bytes fit
   and otherwise stop with __GCONV_FULL_OUTPUT; the ksc5601 helpers
   signal the same condition by returning 0.  When a helper returns
   __UNKNOWN_10646_CHAR the loop stops with __GCONV_ILLEGAL_INPUT unless
   ignore_errors_p () is true, in which case the four input bytes are
   skipped and *converted is incremented.  On success inptr advances by
   four.  */
325 /* Next, define the other direction. */
326 #define MIN_NEEDED_INPUT MIN_NEEDED_TO
327 #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
328 #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
329 #define LOOPFCT TO_LOOP
330 #define BODY \
332 uint32_t ch = get32 (inptr); \
333 /* \
334 if (ch >= (sizeof (from_ucs4_lat1) / sizeof (from_ucs4_lat1[0]))) \
336 if (ch >= 0x0391 && ch <= 0x0451) \
337 cp = from_ucs4_greek[ch - 0x391]; \
338 else if (ch >= 0x2010 && ch <= 0x9fa0) \
339 cp = from_ucs4_cjk[ch - 0x02010]; \
340 else \
341 break; \
343 else \
344 cp = from_ucs4_lat1[ch]; \
345 */ \
347 if (ch < 0x7f) \
348 *outptr++ = ch; \
349 else \
351 if (ch >= 0xac00 && ch <= 0xd7a3) \
353 if (NEED_LENGTH_TEST && __builtin_expect (outptr + 2 > outend, 0))\
355 result = __GCONV_FULL_OUTPUT; \
356 break; \
359 ch -= 0xac00; \
361 ch = (init_to_bit[ch / 588] /* 21 * 28 = 588 */ \
362 + mid_to_bit[(ch / 28) % 21]/* (ch % (21 * 28)) / 28 */ \
363 + final_to_bit[ch % 28]); /* (ch % (21 * 28)) % 28 */ \
365 *outptr++ = ch / 256; \
366 *outptr++ = ch % 256; \
368 /* KS C 5601-1992 Annex 3 regards 0xA4DA(Hangul Filler : U3164) \
369 as symbol */ \
370 else if (ch >= 0x3131 && ch <= 0x3163) \
372 ch = jamo_from_ucs_table[ch - 0x3131]; \
374 if (NEED_LENGTH_TEST && __builtin_expect (outptr + 2 > outend, 0))\
376 result = __GCONV_FULL_OUTPUT; \
377 break; \
380 *outptr++ = ch / 256; \
381 *outptr++ = ch % 256; \
383 else if ((ch >= 0x4e00 && ch <= 0x9fa5) \
384 || (ch >= 0xf900 && ch <= 0xfa0b)) \
386 size_t written; \
387 uint32_t temp; \
389 written = ucs4_to_ksc5601_hanja (ch, outptr, \
390 (NEED_LENGTH_TEST \
391 ? outend - outptr : 2)); \
392 if (NEED_LENGTH_TEST && __builtin_expect (written, 1) == 0) \
394 result = __GCONV_FULL_OUTPUT; \
395 break; \
397 if (__builtin_expect (written, 0) == __UNKNOWN_10646_CHAR) \
399 if (! ignore_errors_p ()) \
401 /* This is an illegal character. */ \
402 result = __GCONV_ILLEGAL_INPUT; \
403 break; \
406 inptr += 4; \
407 ++*converted; \
408 continue; \
411 outptr[0] -= 0x4a; \
412 outptr[1] -= 0x21; \
414 temp = outptr[0] * 94 + outptr[1]; \
416 outptr[0] = 0xe0 + temp / 188; \
417 outptr[1] = temp % 188; \
418 outptr[1] += outptr[1] >= 78 ? 0x43 : 0x31; \
420 outptr += 2; \
422 else \
424 size_t written; \
426 written = ucs4_to_ksc5601_sym (ch, outptr, \
427 (NEED_LENGTH_TEST \
428 ? outend - outptr : 2)); \
429 if (NEED_LENGTH_TEST && __builtin_expect (written, 1) == 0) \
431 result = __GCONV_FULL_OUTPUT; \
432 break; \
434 if (__builtin_expect (written, 1) == __UNKNOWN_10646_CHAR) \
436 if (! ignore_errors_p ()) \
438 /* This is an illegal character. */ \
439 result = __GCONV_ILLEGAL_INPUT; \
440 break; \
443 inptr += 4; \
444 ++*converted; \
445 continue; \
448 outptr[0] -= 0x4a; \
449 outptr[1] += 0x80; \
451 outptr[1] += (outptr[0] % 2 \
452 ? 0 : (outptr[1] > 0xee ? 0x43 : 0x31)); \
453 outptr[1] -= 0xa1; \
454 outptr[0] /= 2; \
455 outptr[0] += 0xe0; \
457 outptr += 2; \
461 inptr += 4; \
463 #include <iconv/loop.c>
466 /* Now define the toplevel functions. */
467 #include <iconv/skeleton.c>