beta-0.89.2
[luatex.git] / source / texk / web2c / luatexdir / unilib / gwwiconv.c
blob3671116cf300cde25f8c505ec08a7e7e3ba83267
/* Copyright (C) 2004-2012 by George Williams */
/*
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * The name of the author may not be used to endorse or promote products
 * derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
28 #include <unibasics.h>
29 #include <gwwiconv.h>
30 #include <charset.h>
31 #include <chardata.h>
32 #include <string.h>
33 #include <ustring.h>
34 #include <stdio.h>
36 #ifndef HAVE_ICONV_H
/* This is a limited iconv replacement that converts either to or from */
/*  unichar_t (either UCS2 or UCS4). */
/* It will not convert latin1 to latin2 directly, but it will convert */
/*  latin1->UCS2 and then UCS2->latin2. */
/* It uses the encodings built into libgunicode, for systems with no iconv */
/*  (i.e. Macs before 10.3, perhaps others). */
44 struct gww_iconv_t {
45 enum encoding from;
46 enum encoding to;
49 enum extended_encoding { e_jisgbpk = e_encodingmax };
/* Byte order of the host; end_unknown until endian_detector() has run. */
static enum endian { end_big, end_little, end_unknown } endian = end_unknown;

/* Work out the host byte order by storing a known value in an integer and */
/*  looking at the byte with the lowest address. The low-order byte (0x02) */
/*  comes first only on a little-endian host, and that holds whatever the */
/*  width of short is. */
static void endian_detector(void) {
    union { unsigned short s; unsigned char c[sizeof(unsigned short)]; } u;

    u.s = 0x0102;
    if ( u.c[0]==0x02 )
	endian = end_little;
    else
	endian = end_big;
}
63 static enum encoding name_to_enc(const char *encname) {
64 struct { const char *name; enum encoding enc; } map[] = {
65 { "UCS-2-INTERNAL", e_unicode },
66 { "UCS2", e_unicode },
67 { "UCS-2", e_unicode },
68 { "UCS-2LE", e_unicode },
69 { "UCS-2BE", e_unicode },
70 { "UNICODELITTLE", e_unicode },
71 { "UNICODEBIG", e_unicode },
72 { "ISO-10646/UCS2", e_unicode },
73 { "ISO-10646/USC2", e_unicode }, /* Old typo */
74 { "UCS4", e_ucs4 },
75 { "UCS-4", e_ucs4 },
76 { "UCS-4LE", e_ucs4 },
77 { "UCS-4BE", e_ucs4 },
78 { "UCS-4-INTERNAL", e_ucs4 },
79 { "ISO-10646/UCS4", e_ucs4 },
80 { "iso8859-1", e_iso8859_1 },
81 { "iso8859-2", e_iso8859_2 },
82 { "iso8859-3", e_iso8859_3 },
83 { "iso8859-4", e_iso8859_4 },
84 { "iso8859-5", e_iso8859_5 },
85 { "iso8859-6", e_iso8859_6 },
86 { "iso8859-7", e_iso8859_7 },
87 { "iso8859-8", e_iso8859_8 },
88 { "iso8859-9", e_iso8859_9 },
89 { "iso8859-10", e_iso8859_10 },
90 { "iso8859-11", e_iso8859_11 },
91 { "iso8859-13", e_iso8859_13 },
92 { "iso8859-14", e_iso8859_14 },
93 { "iso8859-15", e_iso8859_15 },
94 { "iso-8859-1", e_iso8859_1 },
95 { "iso-8859-2", e_iso8859_2 },
96 { "iso-8859-3", e_iso8859_3 },
97 { "iso-8859-4", e_iso8859_4 },
98 { "iso-8859-5", e_iso8859_5 },
99 { "iso-8859-6", e_iso8859_6 },
100 { "iso-8859-7", e_iso8859_7 },
101 { "iso-8859-8", e_iso8859_8 },
102 { "iso-8859-9", e_iso8859_9 },
103 { "iso-8859-10", e_iso8859_10 },
104 { "iso-8859-11", e_iso8859_11 },
105 { "iso-8859-13", e_iso8859_13 },
106 { "iso-8859-14", e_iso8859_14 },
107 { "iso-8859-15", e_iso8859_15 },
108 { "koi8-r", e_koi8_r },
109 { "jis201", e_jis201 },
110 { "mac", e_mac },
111 { "Macintosh", e_mac },
112 { "MS-ANSI", e_win },
113 { "EUC-KR", e_wansung },
114 { "johab", e_johab },
115 { "ISO-2022-KR", e_jiskorean },
116 { "ISO-2022-CN", e_jisgb },
117 { "EUC-CN", e_jisgbpk },
118 { "big5", e_big5 },
119 { "big5hkscs", e_big5hkscs },
120 { "ISO-2022-JP", e_jis },
121 { "ISO-2022-JP-2", e_jis2 },
122 { "Sjis", e_sjis },
123 { "UTF-8", e_utf8 },
124 { "UTF8", e_utf8 },
125 { NULL }};
126 int i;
128 for ( i=0; map[i].name!=NULL; ++i )
129 if ( strmatch(map[i].name,encname)==0 )
130 return( map[i].enc );
132 return( -1 );
135 gww_iconv_t gww_iconv_open(const char *toenc,const char *fromenc) {
136 struct gww_iconv_t stuff, *ret;
138 if ( endian==end_unknown )
139 endian_detector();
141 stuff.from = name_to_enc(fromenc);
142 stuff.to = name_to_enc(toenc);
143 if ( stuff.from==(enum encoding) -1 || stuff.to==(enum encoding) -1 ) {
144 /*fprintf( stderr, "Unknown encoding\n" );*/
145 return( (iconv_t)(-1) );
146 } else if ( stuff.from!=e_ucs4 && stuff.to!=e_ucs4 ) {
147 fprintf( stderr, "Bad call to gww_iconv_open, neither arg is UCS4\n" );
148 return( (iconv_t)(-1) );
151 ret = xmalloc(sizeof(struct gww_iconv_t));
152 *ret = stuff;
153 return( ret );
156 void gww_iconv_close( gww_iconv_t cd) {
157 free(cd);
160 size_t gww_iconv( gww_iconv_t _cd,
161 char **inbuf, size_t *inlen,
162 char **outbuf, size_t *outlen) {
163 struct gww_iconv_t *cd = _cd;
164 int char_cnt = 0;
165 unsigned char *plane;
166 int ch;
168 if ( inbuf==NULL || outbuf==NULL || inlen==NULL || outlen==NULL ||
169 *inbuf==NULL || *outbuf==NULL )
170 return( 0 ); /* Legal, used to reset the state. As we don't do states, irrelevant */
172 if ( cd->from<0 || cd->from>e_encodingmax || cd->to<0 || cd->to>e_encodingmax ) {
173 fprintf( stderr, "Garbage encoding passed to gww_iconv()\n" );
174 return( (size_t) -1 );
177 if ( cd->from==e_unicode ) {
178 if ( cd->to==e_unicode ) {
179 int min = *inlen < *outlen ? *inlen : *outlen;
180 min &= ~1;
181 memcpy(*inbuf,*outbuf,min);
182 char_cnt = min/sizeof(short);
183 *inbuf += min; *outbuf += min;
184 *inlen -= min; *outlen -= min;
185 if ( *inlen==1 && *outlen>0 )
186 return( (size_t) -1 ); /* Incomplete multi-byte sequence */
187 } else if ( cd->to==e_ucs4 ) {
188 int min = *inlen/sizeof(short) < *outlen/sizeof(int32) ? *inlen/sizeof(short) : *outlen/sizeof(int32);
189 int highch, lowch;
190 if ( endian == end_little ) {
191 while ( *inlen>=sizeof(short) && *outlen>=sizeof(int32) ) {
192 highch = ((unsigned char *) *inbuf)[1], lowch = *(unsigned char *) *inbuf;
193 ((uint8 *) outbuf)[3] = 0; ((uint8 *) outbuf)[2] = 0;
194 ((uint8 *) outbuf)[1] = highch; ((uint8 *) outbuf)[0] = lowch;
195 outbuf += sizeof(int32); inbuf += sizeof(short);
196 *outlen -= sizeof(int32); *inlen -= sizeof(short);
198 } else {
199 while ( *inlen>=sizeof(short) && *outlen>=sizeof(int32) ) {
200 highch = ((unsigned char *) *inbuf)[0], lowch = ((unsigned char *) *inbuf)[1];
201 ((uint8 *) outbuf)[0] = 0; ((uint8 *) outbuf)[1] = 0;
202 ((uint8 *) outbuf)[2] = highch; ((uint8 *) outbuf)[3] = lowch;
203 outbuf += sizeof(int32); inbuf += sizeof(short);
204 *outlen -= sizeof(int32); *inlen -= sizeof(short);
207 char_cnt = min;
208 if ( *inlen==1 && *outlen>0 )
209 return( (size_t) -1 ); /* Incomplete multi-byte sequence */
210 } else if ( cd->to<e_first2byte ) {
211 struct charmap *table = NULL;
212 table = alphabets_from_unicode[cd->to];
213 while ( *inlen>1 && *outlen>0 ) {
214 int highch, lowch;
215 if ( endian == end_little ) {
216 highch = ((unsigned char *) *inbuf)[1], lowch = *(unsigned char *) *inbuf;
217 } else {
218 highch = *(unsigned char *) *inbuf, lowch = ((unsigned char *) *inbuf)[1];
220 if ( highch>=table->first && highch<=table->last &&
221 (plane = table->table[highch])!=NULL &&
222 (ch=plane[lowch])!=0 ) {
223 *((*outbuf)++) = ch;
224 -- *outlen;
225 *inlen -= 2;
226 *inbuf += 2;
227 ++char_cnt;
228 } else
229 return( (size_t) -1 );
231 } else if ( cd->to==e_utf8 ) {
232 while ( *inlen>1 && *outlen>0 ) {
233 unichar_t uch;
234 if ( endian == end_little ) {
235 uch = (((unsigned char *) *inbuf)[1]<<8) | (*((unsigned char *) *inbuf));
236 } else {
237 uch = (*((unsigned char *) *inbuf)<<8) | (((unsigned char *) *inbuf)[1]);
239 if ( uch < 0x80 ) {
240 *((*outbuf)++) = uch;
241 --*outlen;
242 } else if ( uch<0x800 ) {
243 if ( *outlen==1 )
244 return( (size_t) -1 );
245 *((*outbuf)++) = 0xc0 | (uch>>6);
246 *((*outbuf)++) = 0x80 | (uch&0x3f);
247 *outlen-=2;
248 } else { /* I'm not dealing with */
249 if ( *outlen<=2 )
250 return( (size_t) -1 );
251 *((*outbuf)++) = 0xe0 | (uch>>12);
252 *((*outbuf)++) = 0x80 | ((uch>>6)&0x3f);
253 *((*outbuf)++) = 0x80 | (uch&0x3f);
254 *outlen-=3;
256 *inbuf += 2;
257 *inlen -= 2;
258 ++char_cnt;
260 } else {
261 fprintf( stderr, "Unexpected encoding\n" );
262 return( (size_t) -1 );
264 } else if ( cd->from==e_ucs4 ) {
265 if ( cd->to==e_unicode ) {
266 int min = *inlen/sizeof(int32) < *outlen/sizeof(int16) ? *inlen/sizeof(int32) : *outlen/sizeof(int16);
267 int highch, lowch;
268 if ( endian == end_little ) {
269 while ( *inlen>=sizeof(short) && *outlen>=sizeof(int32) ) {
270 highch = ((unsigned char *) *inbuf)[1], lowch = *(unsigned char *) *inbuf;
271 ((uint8 *) outbuf)[1] = highch; ((uint8 *) outbuf)[0] = lowch;
272 outbuf += sizeof(int16); inbuf += sizeof(int32);
273 *outlen -= sizeof(int16); *inlen -= sizeof(int32);
275 } else {
276 while ( *inlen>=sizeof(short) && *outlen>=sizeof(int32) ) {
277 highch = ((unsigned char *) *inbuf)[2], lowch = ((unsigned char *) *inbuf)[3];
278 ((uint8 *) outbuf)[0] = highch; ((uint8 *) outbuf)[1] = lowch;
279 outbuf += sizeof(int16); inbuf += sizeof(int32);
280 *outlen -= sizeof(int16); *inlen -= sizeof(int32);
283 char_cnt = min;
284 if ( *inlen>0 && *outlen>0 )
285 return( (size_t) -1 ); /* Incomplete multi-byte sequence */
286 } else if ( cd->to<e_first2byte ) {
287 struct charmap *table = NULL;
288 table = alphabets_from_unicode[cd->to];
289 while ( *inlen>1 && *outlen>0 ) {
290 int highch, lowch;
291 if ( endian == end_little ) {
292 highch = ((unsigned char *) *inbuf)[1], lowch = *(unsigned char *) *inbuf;
293 } else {
294 highch = ((unsigned char *) *inbuf)[2], lowch = ((unsigned char *) *inbuf)[3];
296 if ( highch>=table->first && highch<=table->last &&
297 (plane = table->table[highch])!=NULL &&
298 (ch=plane[lowch])!=0 ) {
299 *((*outbuf)++) = ch;
300 -- *outlen;
301 *inlen -= 4;
302 *inbuf += 4;
303 ++char_cnt;
304 } else
305 return( (size_t) -1 );
307 } else if ( cd->to==e_utf8 ) {
308 while ( *inlen>1 && *outlen>0 ) {
309 int uch;
310 if ( endian == end_little ) {
311 uch = (((unsigned char *) *inbuf)[3]<<24) |
312 (((unsigned char *) *inbuf)[2]<<16) |
313 (((unsigned char *) *inbuf)[1]<<8) |
314 (*((unsigned char *) *inbuf));
315 } else {
316 uch = (*((unsigned char *) *inbuf)<<24) |
317 (((unsigned char *) *inbuf)[1]<<16) |
318 (((unsigned char *) *inbuf)[2]<<8) |
319 (((unsigned char *) *inbuf)[3]);
321 if ( uch < 0x80 ) {
322 *((*outbuf)++) = uch;
323 --*outlen;
324 } else if ( uch<0x800 ) {
325 if ( *outlen==1 )
326 return( (size_t) -1 );
327 *((*outbuf)++) = 0xc0 | (uch>>6);
328 *((*outbuf)++) = 0x80 | (uch&0x3f);
329 *outlen-=2;
330 } else if ( uch < 0x10000 ) {
331 if ( *outlen<=2 )
332 return( (size_t) -1 );
333 *((*outbuf)++) = 0xe0 | (uch>>12);
334 *((*outbuf)++) = 0x80 | ((uch>>6)&0x3f);
335 *((*outbuf)++) = 0x80 | (uch&0x3f);
336 *outlen-=3;
337 } else {
338 uint32 val = uch-0x10000;
339 int u = ((val&0xf0000)>>16)+1, z=(val&0x0f000)>>12, y=(val&0x00fc0)>>6, x=val&0x0003f;
340 if ( *outlen<=3 )
341 return( (size_t) -1 );
342 *(*outbuf)++ = 0xf0 | (u>>2);
343 *(*outbuf)++ = 0x80 | ((u&3)<<4) | z;
344 *(*outbuf)++ = 0x80 | y;
345 *(*outbuf)++ = 0x80 | x;
346 *outlen-=4;
348 *inbuf += 4;
349 *inlen -= 4;
350 ++char_cnt;
352 } else {
353 fprintf( stderr, "Unexpected encoding\n" );
354 return( (size_t) -1 );
356 } else if ( cd->to==e_unicode ) {
357 const unichar_t *table;
358 if ( cd->from<e_first2byte ) {
359 table = unicode_from_alphabets[cd->from];
360 while ( *inlen>0 && *outlen>1 ) {
361 unichar_t ch = table[ *(unsigned char *) ((*inbuf)++)];
362 --*inlen;
363 if ( endian==end_little ) {
364 *((*outbuf)++) = ch&0xff;
365 *((*outbuf)++) = ch>>8;
366 } else {
367 *((*outbuf)++) = ch>>8;
368 *((*outbuf)++) = ch&0xff;
370 *outlen -= sizeof(unichar_t);
371 ++char_cnt;
373 } else if ( cd->from==e_jis || cd->from==e_jis2 ||
374 cd->from==e_jiskorean || cd->from==e_jisgb ) {
375 table = cd->from==e_jisgb ? unicode_from_gb2312 :
376 cd->from==e_jiskorean ? unicode_from_ksc5601 :
377 cd->from==e_jis ? unicode_from_jis208 :
378 unicode_from_jis212;
379 while ( *inlen>1 && *outlen>1 ) {
380 unsigned char *ipt = (unsigned char *) *inbuf;
381 int ch;
382 if ( *ipt<0x21 || *ipt>0x7e || ipt[1]<0x21 || ipt[1]>0x7e )
383 return( (size_t) -1 );
384 ch = (*ipt-0x21)*94 + (ipt[1]-0x21);
385 ch = table[ch];
386 *inlen -= 2;
387 *inbuf = (char *) ipt+2;
388 if ( endian==end_little ) {
389 *((*outbuf)++) = ch&0xff;
390 *((*outbuf)++) = ch>>8;
391 } else {
392 *((*outbuf)++) = ch>>8;
393 *((*outbuf)++) = ch&0xff;
395 *outlen -= sizeof(unichar_t);
396 ++char_cnt;
398 if ( *inlen==1 && *outlen>0 )
399 return( (size_t) -1 ); /* Incomplete multi-byte sequence */
400 } else if ( cd->from==e_wansung || cd->from==e_jisgbpk ) {
401 table = cd->from==e_jisgbpk ? unicode_from_gb2312 :
402 unicode_from_ksc5601 ;
403 while ( *inlen>0 && *outlen>1 ) {
404 unsigned char *ipt = (unsigned char *) *inbuf;
405 int ch;
406 if ( *ipt<0x7f ) {
407 ch = *ipt;
408 --*inlen;
409 *inbuf = (char *) ipt+1;
410 } else {
411 if ( *ipt<0xa1 || *ipt>0xfe || ipt[1]<0xa1 || ipt[1]>0xfe ||
412 *inlen==1 )
413 return( (size_t) -1 );
414 ch = (*ipt-0xa1)*94 + (ipt[1]-0xa1);
415 ch = table[ch];
416 *inlen -= 2;;
417 *inbuf = (char *) ipt+2;
419 if ( endian==end_little ) {
420 *((*outbuf)++) = ch&0xff;
421 *((*outbuf)++) = ch>>8;
422 } else {
423 *((*outbuf)++) = ch>>8;
424 *((*outbuf)++) = ch&0xff;
426 *outlen -= sizeof(unichar_t);
427 ++char_cnt;
429 } else if ( cd->from==e_johab || cd->from==e_big5 || cd->from==e_big5hkscs ) {
430 int offset;
431 if ( cd->from==e_big5 ) {
432 offset = 0xa100;
433 table = unicode_from_big5;
434 } else if ( cd->from==e_big5hkscs ) {
435 offset = 0x8100;
436 table = unicode_from_big5hkscs;
437 } else {
438 offset = 0x8400;
439 table = unicode_from_johab;
441 while ( *inlen>0 && *outlen>1 ) {
442 unsigned char *ipt = (unsigned char *) *inbuf;
443 int ch;
444 if ( *ipt<0x7f ) {
445 ch = *ipt;
446 --*inlen;
447 *inbuf = (char *) ipt+1;
448 } else {
449 if ( *inlen==1 )
450 return( (size_t) -1 );
451 ch = (*ipt<<8) | ipt[1];
452 if ( ch<offset )
453 return( (size_t) -1 );
454 ch -= offset;
455 ch = table[ch];
456 *inlen -= 2;
457 *inbuf = (char *) ipt+2;
459 if ( endian==end_little ) {
460 *((*outbuf)++) = ch&0xff;
461 *((*outbuf)++) = ch>>8;
462 } else {
463 *((*outbuf)++) = ch>>8;
464 *((*outbuf)++) = ch&0xff;
466 *outlen -= sizeof(unichar_t);
467 ++char_cnt;
469 } else if ( cd->from==e_sjis ) {
470 while ( *inlen>0 && *outlen>1 ) {
471 unsigned char *ipt = (unsigned char *) *inbuf;
472 int ch1 = *ipt;
473 if ( ch1<127 || ( ch1>=161 && ch1<=223 )) {
474 ch = unicode_from_jis201[ch1];
475 *inbuf = (char *) ipt+1;
476 --*inlen;
477 } else if ( *inlen==1 )
478 return( (size_t) -1 );
479 else {
480 int ch2 = ipt[1];
481 if ( ch1 >= 129 && ch1<= 159 )
482 ch1 -= 112;
483 else
484 ch1 -= 176;
485 ch1 <<= 1;
486 if ( ch2>=159 )
487 ch2-= 126;
488 else if ( ch2>127 ) {
489 --ch1;
490 ch2 -= 32;
491 } else {
492 --ch1;
493 ch2 -= 31;
495 if ( ch1-0x21>=94 || ch2-0x21>=94 )
496 return( (size_t) -1 );
497 ch = unicode_from_jis208[(ch1-0x21)*94+(ch2-0x21)];
498 *inlen -= 2;
499 *inbuf = (char *) ipt+2;
501 if ( endian==end_little ) {
502 *((*outbuf)++) = ch&0xff;
503 *((*outbuf)++) = ch>>8;
504 } else {
505 *((*outbuf)++) = ch>>8;
506 *((*outbuf)++) = ch&0xff;
508 *outlen -= sizeof(unichar_t);
509 ++char_cnt;
511 } else if ( cd->from==e_utf8 ) {
512 while ( *inlen>0 && *outlen>sizeof(unichar_t) ) {
513 unsigned char *ipt = (unsigned char *) *inbuf;
514 int ch = *ipt;
515 if ( ch <= 127 ) {
516 *inbuf = (char *) ipt+1;
517 --*inlen;
518 } else if ( ch<=0xdf ) {
519 if ( *inlen<2 || ipt[1]<0x80 )
520 return( (size_t) -1 );
521 ch = ((ch&0x1f)<<6) | (ipt[1] &0x3f);
522 *inlen -= 2;
523 *inbuf = (char *) ipt+2;
524 } else if ( ch<=0xef ) {
525 if ( *inlen<3 || ipt[1]<0x80 || ipt[2]<0x80 )
526 return( (size_t) -1 );
527 ch = ((ch&0x1f)<<12) | ((ipt[1] &0x3f)<<6) | (ipt[2]&0x3f);
528 *inlen -= 3;
529 *inbuf = (char *) ipt+3;
530 } else {
531 int w;
532 if ( *inlen<4 || *outlen<4 || ipt[1]<0x80 || ipt[2]<0x80 || ipt[3]<0x80 )
533 return( (size_t) -1 );
534 w = ( ((ch&0x7)<<2) | ((ipt[1]&0x30)>>4) )-1;
535 ch = 0xd800 | (w<<6) | ((ipt[1]&0xf)<<2) | ((ipt[2]&0x30)>>4);
536 if ( endian==end_little ) {
537 *((*outbuf)++) = ch&0xff;
538 *((*outbuf)++) = ch>>8;
539 } else {
540 *((*outbuf)++) = ch>>8;
541 *((*outbuf)++) = ch&0xff;
543 *outlen -= 2;
544 ch = 0xdc00 | ((ipt[2]&0xf)<<6) | (ipt[3]&0x3f);
546 if ( endian==end_little ) {
547 *((*outbuf)++) = ch&0xff;
548 *((*outbuf)++) = ch>>8;
549 } else {
550 *((*outbuf)++) = ch>>8;
551 *((*outbuf)++) = ch&0xff;
553 *outlen -= sizeof(unichar_t);
554 ++char_cnt;
556 } else {
557 fprintf( stderr, "Unexpected encoding\n" );
558 return( (size_t) -1 );
560 } else if ( cd->to==e_ucs4 ) {
561 const unichar_t *table;
562 if ( cd->from<e_first2byte ) {
563 table = unicode_from_alphabets[cd->from];
564 while ( *inlen>0 && *outlen>1 ) {
565 unichar_t ch = table[ *(unsigned char *) ((*inbuf)++)];
566 --*inlen;
567 if ( endian==end_little ) {
568 *((*outbuf)++) = 0;
569 *((*outbuf)++) = 0;
570 *((*outbuf)++) = ch&0xff;
571 *((*outbuf)++) = ch>>8;
572 } else {
573 *((*outbuf)++) = ch>>8;
574 *((*outbuf)++) = ch&0xff;
575 *((*outbuf)++) = 0;
576 *((*outbuf)++) = 0;
578 *outlen -= sizeof(unichar_t);
579 ++char_cnt;
581 } else if ( cd->from==e_jis || cd->from==e_jis2 ||
582 cd->from==e_jiskorean || cd->from==e_jisgb ) {
583 table = cd->from==e_jisgb ? unicode_from_gb2312 :
584 cd->from==e_jiskorean ? unicode_from_ksc5601 :
585 cd->from==e_jis ? unicode_from_jis208 :
586 unicode_from_jis212;
587 while ( *inlen>1 && *outlen>1 ) {
588 unsigned char *ipt = (unsigned char *) *inbuf;
589 int ch;
590 if ( *ipt<0x21 || *ipt>0x7e || ipt[1]<0x21 || ipt[1]>0x7e )
591 return( (size_t) -1 );
592 ch = (*ipt-0x21)*94 + (ipt[1]-0x21);
593 ch = table[ch];
594 *inlen -= 2;
595 *inbuf = (char *) ipt+2;
596 if ( endian==end_little ) {
597 *((*outbuf)++) = 0;
598 *((*outbuf)++) = 0;
599 *((*outbuf)++) = ch&0xff;
600 *((*outbuf)++) = ch>>8;
601 } else {
602 *((*outbuf)++) = ch>>8;
603 *((*outbuf)++) = ch&0xff;
604 *((*outbuf)++) = 0;
605 *((*outbuf)++) = 0;
607 *outlen -= sizeof(unichar_t);
608 ++char_cnt;
610 if ( *inlen==1 && *outlen>0 )
611 return( (size_t) -1 ); /* Incomplete multi-byte sequence */
612 } else if ( cd->from==e_wansung || cd->from==e_jisgbpk ) {
613 table = cd->from==e_jisgbpk ? unicode_from_gb2312 :
614 unicode_from_ksc5601 ;
615 while ( *inlen>0 && *outlen>1 ) {
616 unsigned char *ipt = (unsigned char *) *inbuf;
617 int ch;
618 if ( *ipt<0x7f ) {
619 ch = *ipt;
620 --*inlen;
621 *inbuf = (char *) ipt+1;
622 } else {
623 if ( *ipt<0xa1 || *ipt>0xfe || ipt[1]<0xa1 || ipt[1]>0xfe ||
624 *inlen==1 )
625 return( (size_t) -1 );
626 ch = (*ipt-0xa1)*94 + (ipt[1]-0xa1);
627 ch = table[ch];
628 *inlen -= 2;;
629 *inbuf = (char *) ipt+2;
631 if ( endian==end_little ) {
632 *((*outbuf)++) = 0;
633 *((*outbuf)++) = 0;
634 *((*outbuf)++) = ch&0xff;
635 *((*outbuf)++) = ch>>8;
636 } else {
637 *((*outbuf)++) = ch>>8;
638 *((*outbuf)++) = ch&0xff;
639 *((*outbuf)++) = 0;
640 *((*outbuf)++) = 0;
642 *outlen -= sizeof(unichar_t);
643 ++char_cnt;
645 } else if ( cd->from==e_johab || cd->from==e_big5 || cd->from==e_big5hkscs ) {
646 int offset;
647 if ( cd->from==e_big5 ) {
648 offset = 0xa100;
649 table = unicode_from_big5;
650 } else if ( cd->from==e_big5hkscs ) {
651 offset = 0x8100;
652 table = unicode_from_big5hkscs;
653 } else {
654 offset = 0x8400;
655 table = unicode_from_johab;
657 while ( *inlen>0 && *outlen>1 ) {
658 unsigned char *ipt = (unsigned char *) *inbuf;
659 int ch;
660 if ( *ipt<0x7f ) {
661 ch = *ipt;
662 --*inlen;
663 *inbuf = (char *) ipt+1;
664 } else {
665 if ( *inlen==1 )
666 return( (size_t) -1 );
667 ch = (*ipt<<8) | ipt[1];
668 if ( ch<offset )
669 return( (size_t) -1 );
670 ch -= offset;
671 ch = table[ch];
672 *inlen -= 2;
673 *inbuf = (char *) ipt+2;
675 if ( endian==end_little ) {
676 *((*outbuf)++) = 0;
677 *((*outbuf)++) = 0;
678 *((*outbuf)++) = ch&0xff;
679 *((*outbuf)++) = ch>>8;
680 } else {
681 *((*outbuf)++) = ch>>8;
682 *((*outbuf)++) = ch&0xff;
683 *((*outbuf)++) = 0;
684 *((*outbuf)++) = 0;
686 *outlen -= sizeof(unichar_t);
687 ++char_cnt;
689 } else if ( cd->from==e_sjis ) {
690 while ( *inlen>0 && *outlen>1 ) {
691 unsigned char *ipt = (unsigned char *) *inbuf;
692 int ch1 = *ipt;
693 if ( ch1<127 || ( ch1>=161 && ch1<=223 )) {
694 ch = unicode_from_jis201[ch1];
695 *inbuf = (char *) ipt+1;
696 --*inlen;
697 } else if ( *inlen==1 )
698 return( (size_t) -1 );
699 else {
700 int ch2 = ipt[1];
701 if ( ch1 >= 129 && ch1<= 159 )
702 ch1 -= 112;
703 else
704 ch1 -= 176;
705 ch1 <<= 1;
706 if ( ch2>=159 )
707 ch2-= 126;
708 else if ( ch2>127 ) {
709 --ch1;
710 ch2 -= 32;
711 } else {
712 --ch1;
713 ch2 -= 31;
715 if ( ch1-0x21>=94 || ch2-0x21>=94 )
716 return( (size_t) -1 );
717 ch = unicode_from_jis208[(ch1-0x21)*94+(ch2-0x21)];
718 *inlen -= 2;
719 *inbuf = (char *) ipt+2;
721 if ( endian==end_little ) {
722 *((*outbuf)++) = 0;
723 *((*outbuf)++) = 0;
724 *((*outbuf)++) = ch&0xff;
725 *((*outbuf)++) = ch>>8;
726 } else {
727 *((*outbuf)++) = ch>>8;
728 *((*outbuf)++) = ch&0xff;
729 *((*outbuf)++) = 0;
730 *((*outbuf)++) = 0;
732 *outlen -= sizeof(unichar_t);
733 ++char_cnt;
735 } else if ( cd->from==e_utf8 ) {
736 while ( *inlen>0 && *outlen>sizeof(unichar_t) ) {
737 unsigned char *ipt = (unsigned char *) *inbuf;
738 int ch = *ipt;
739 if ( ch <= 127 ) {
740 *inbuf = (char *) ipt+1;
741 --*inlen;
742 } else if ( ch<=0xdf ) {
743 if ( *inlen<2 || ipt[1]<0x80 )
744 return( (size_t) -1 );
745 ch = ((ch&0x1f)<<6) | (ipt[1] &0x3f);
746 *inlen -= 2;
747 *inbuf = (char *) ipt+2;
748 } else if ( ch<=0xef ) {
749 if ( *inlen<3 || ipt[1]<0x80 || ipt[2]<0x80 )
750 return( (size_t) -1 );
751 ch = ((ch&0x1f)<<12) | ((ipt[1] &0x3f)<<6) | (ipt[2]&0x3f);
752 *inlen -= 3;
753 *inbuf = (char *) ipt+3;
754 } else {
755 int w,w2;
756 w = ( ((*ipt&0x7)<<2) | ((ipt[1]&0x30)>>4) )-1;
757 w = (w<<6) | ((ipt[1]&0xf)<<2) | ((ipt[2]&0x30)>>4);
758 w2 = ((ipt[2]&0xf)<<6) | (ipt[3]&0x3f);
759 ch = w*0x400 + w2 + 0x10000;
760 *inbuf = (char *) ipt+4;
762 if ( endian==end_little ) {
763 *((*outbuf)++) = ch&0xff;
764 *((*outbuf)++) = ch>>8;
765 *((*outbuf)++) = ch>>16;
766 *((*outbuf)++) = ch>>24;
767 } else {
768 *((*outbuf)++) = ch>>24;
769 *((*outbuf)++) = ch>>16;
770 *((*outbuf)++) = ch>>8;
771 *((*outbuf)++) = ch&0xff;
773 *outlen -= sizeof(unichar_t);
774 ++char_cnt;
776 } else {
777 fprintf( stderr, "Unexpected encoding\n" );
778 return( (size_t) -1 );
780 } else {
781 fprintf( stderr, "One of the two encodings must be UCS2 in gww_iconv()\n" );
782 return( (size_t) -1 );
785 if ( *outlen>=1 ) {
786 **outbuf = '\0';
787 if ( *outlen>1 )
788 (*outbuf)[1] = '\0';
789 if ( cd->to==e_ucs4 && *outlen>3 ) {
790 (*outbuf)[2] = '\0';
791 (*outbuf)[3] = '\0';
794 return( char_cnt );
796 #else
/* Placeholder so the translation unit is not empty when the system iconv is used. */
static const int a_file_must_define_something=1;
798 #endif /* HAVE_ICONV_H */