Update.
[glibc.git] / iconv / gconv_simple.c
blob74dbfc0356a98edc28e5ece2d7e4b8c4dadffe8a
1 /* Simple transformations functions.
2 Copyright (C) 1997, 1998, 1999 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
16 You should have received a copy of the GNU Library General Public
17 License along with the GNU C Library; see the file COPYING.LIB. If not,
18 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
21 #include <byteswap.h>
22 #include <endian.h>
23 #include <errno.h>
24 #include <gconv.h>
25 #include <stdint.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <wchar.h>
29 #include <sys/param.h>
/* Fall back to EINVAL when <errno.h> did not provide an EILSEQ value.  */
31 #ifndef EILSEQ
32 # define EILSEQ EINVAL
33 #endif
/* Tables used by the UTF-8 conversions below.

   encoding_mask[K] has every bit set that does not fit into a UTF-8
   sequence of K + 2 bytes (11, 16, 21 and 26 payload bits); a character
   WC fits into K + 2 bytes iff (WC & encoding_mask[K]) == 0.

   encoding_byte[K] is the marker stored in the first byte of a sequence
   of K + 2 bytes (2 up to 6 bytes).  */
static const uint32_t encoding_mask[] =
{
  ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
};

static const unsigned char encoding_byte[] =
{
  0xc0, 0xe0, 0xf0, 0xf8, 0xfc
};
49 /* Transform from the internal, UCS4-like format, to UCS4. The
50 difference between the internal ucs4 format and the real UCS4
51 format is, if any, the endianess. The Unicode/ISO 10646 says that
52 unless some higher protocol specifies it differently, the byte
53 order is big endian.*/
54 #define DEFINE_INIT 0
55 #define DEFINE_FINI 0
56 #define MIN_NEEDED_FROM 4
57 #define MIN_NEEDED_TO 4
58 #define FROM_DIRECTION 1
59 #define FROM_LOOP internal_ucs4_loop
60 #define TO_LOOP internal_ucs4_loop /* This is not used. */
61 #define FUNCTION_NAME __gconv_transform_internal_ucs4
64 static inline int
65 internal_ucs4_loop (const unsigned char **inptrp, const unsigned char *inend,
66 unsigned char **outptrp, unsigned char *outend,
67 mbstate_t *state, void *data, size_t *converted)
69 const unsigned char *inptr = *inptrp;
70 unsigned char *outptr = *outptrp;
71 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
72 int result;
74 #if __BYTE_ORDER == __LITTLE_ENDIAN
75 /* Sigh, we have to do some real work. */
76 size_t cnt;
78 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
79 *((uint32_t *) outptr)++ = bswap_32 (*(uint32_t *) inptr);
81 *inptrp = inptr;
82 *outptrp = outptr;
83 #elif __BYTE_ORDER == __BIG_ENDIAN
84 /* Simply copy the data. */
85 *inptrp = inptr + n_convert * 4;
86 *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
87 #else
88 # error "This endianess is not supported."
89 #endif
91 /* Determine the status. */
92 if (*outptrp == outend)
93 result = GCONV_FULL_OUTPUT;
94 else if (*inptrp == inend)
95 result = GCONV_EMPTY_INPUT;
96 else
97 result = GCONV_INCOMPLETE_INPUT;
99 if (converted != NULL)
100 converted += n_convert;
102 return result;
105 #include <iconv/skeleton.c>
108 /* Convert from ISO 646-IRV to the internal (UCS4-like) format. */
109 #define DEFINE_INIT 0
110 #define DEFINE_FINI 0
111 #define MIN_NEEDED_FROM 1
112 #define MIN_NEEDED_TO 4
113 #define FROM_DIRECTION 1
114 #define FROM_LOOP ascii_internal_loop
115 #define TO_LOOP ascii_internal_loop /* This is not used. */
116 #define FUNCTION_NAME __gconv_transform_ascii_internal
118 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
119 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
120 #define LOOPFCT FROM_LOOP
121 #define BODY \
123 if (*inptr > '\x7f') \
125 /* This is no correct ANSI_X3.4-1968 character. */ \
126 result = GCONV_ILLEGAL_INPUT; \
127 break; \
130 /* It's an one byte sequence. */ \
131 *((uint32_t *) outptr)++ = *inptr++; \
133 #include <iconv/loop.c>
134 #include <iconv/skeleton.c>
137 /* Convert from the internal (UCS4-like) format to ISO 646-IRV. */
138 #define DEFINE_INIT 0
139 #define DEFINE_FINI 0
140 #define MIN_NEEDED_FROM 4
141 #define MIN_NEEDED_TO 1
142 #define FROM_DIRECTION 1
143 #define FROM_LOOP internal_ascii_loop
144 #define TO_LOOP internal_ascii_loop /* This is not used. */
145 #define FUNCTION_NAME __gconv_transform_internal_ascii
147 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
148 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
149 #define LOOPFCT FROM_LOOP
150 #define BODY \
152 if (*((uint32_t *) inptr) > 0x7f) \
154 /* This is no correct ANSI_X3.4-1968 character. */ \
155 result = GCONV_ILLEGAL_INPUT; \
156 break; \
159 /* It's an one byte sequence. */ \
160 *outptr++ = *((uint32_t *) inptr)++; \
162 #include <iconv/loop.c>
163 #include <iconv/skeleton.c>
166 /* Convert from the internal (UCS4-like) format to UTF-8. */
167 #define DEFINE_INIT 0
168 #define DEFINE_FINI 0
169 #define MIN_NEEDED_FROM 4
170 #define MIN_NEEDED_TO 1
171 #define MAX_NEEDED_TO 6
172 #define FROM_DIRECTION 1
173 #define FROM_LOOP internal_utf8_loop
174 #define TO_LOOP internal_utf8_loop /* This is not used. */
175 #define FUNCTION_NAME __gconv_transform_internal_utf8
177 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
178 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
179 #define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
180 #define LOOPFCT FROM_LOOP
181 #define BODY \
183 uint32_t wc = *((uint32_t *) inptr); \
185 /* Since we control every character we read this cannot happen. */ \
186 assert (wc <= 0x7fffffff); \
188 if (wc < 0x80) \
189 /* It's an one byte sequence. */ \
190 *outptr++ = (unsigned char) wc; \
191 else \
193 size_t step; \
194 char *start; \
196 for (step = 2; step < 6; ++step) \
197 if ((wc & encoding_mask[step - 2]) == 0) \
198 break; \
200 if (outptr + step >= outend) \
202 /* Too long. */ \
203 result = GCONV_FULL_OUTPUT; \
204 break; \
207 start = outptr; \
208 *outptr = encoding_byte[step - 2]; \
209 outptr += step; \
210 --step; \
211 do \
213 start[step] = 0x80 | (wc & 0x3f); \
214 wc >>= 6; \
216 while (--step > 0); \
217 start[0] |= wc; \
220 inptr += 4; \
222 #include <iconv/loop.c>
223 #include <iconv/skeleton.c>
226 /* Convert from UTF-8 to the internal (UCS4-like) format. */
227 #define DEFINE_INIT 0
228 #define DEFINE_FINI 0
229 #define MIN_NEEDED_FROM 1
230 #define MAX_NEEDED_FROM 6
231 #define MIN_NEEDED_TO 4
232 #define FROM_DIRECTION 1
233 #define FROM_LOOP utf8_internal_loop
234 #define TO_LOOP utf8_internal_loop /* This is not used. */
235 #define FUNCTION_NAME __gconv_transform_utf8_internal
237 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
238 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
239 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
240 #define LOOPFCT FROM_LOOP
241 #define BODY \
243 uint32_t ch; \
244 uint_fast32_t cnt; \
245 uint_fast32_t i; \
247 /* Next input byte. */ \
248 ch = *inptr; \
250 if (ch < 0x80) \
252 /* One byte sequence. */ \
253 cnt = 1; \
254 ++inptr; \
256 else \
258 if ((ch & 0xe0) == 0xc0) \
260 cnt = 2; \
261 ch &= 0x1f; \
263 else if ((ch & 0xf0) == 0xe0) \
265 /* We expect three bytes. */ \
266 cnt = 3; \
267 ch &= 0x0f; \
269 else if ((ch & 0xf8) == 0xf0) \
271 /* We expect four bytes. */ \
272 cnt = 4; \
273 ch &= 0x07; \
275 else if ((ch & 0xfc) == 0xf8) \
277 /* We expect five bytes. */ \
278 cnt = 5; \
279 ch &= 0x03; \
281 else if ((ch & 0xfe) == 0xfc) \
283 /* We expect six bytes. */ \
284 cnt = 6; \
285 ch &= 0x01; \
287 else \
289 /* This is an illegal encoding. */ \
290 result = GCONV_ILLEGAL_INPUT; \
291 break; \
294 if (NEED_LENGTH_TEST && inptr + cnt > inend) \
296 /* We don't have enough input. */ \
297 result = GCONV_INCOMPLETE_INPUT; \
298 break; \
301 /* Read the possible remaining bytes. */ \
302 for (i = 1; i < cnt; ++i) \
304 uint32_t byte = inptr[i]; \
306 if ((byte & 0xc0) != 0x80) \
308 /* This is an illegal encoding. */ \
309 result = GCONV_ILLEGAL_INPUT; \
310 break; \
313 ch <<= 6; \
314 ch |= byte & 0x3f; \
316 inptr += cnt; \
319 /* Now adjust the pointers and store the result. */ \
320 *((uint32_t *) outptr)++ = ch; \
322 #include <iconv/loop.c>
323 #include <iconv/skeleton.c>
326 /* Convert from UCS2 to the internal (UCS4-like) format. */
327 #define DEFINE_INIT 0
328 #define DEFINE_FINI 0
329 #define MIN_NEEDED_FROM 2
330 #define MIN_NEEDED_TO 4
331 #define FROM_DIRECTION 1
332 #define FROM_LOOP ucs2_internal_loop
333 #define TO_LOOP ucs2_internal_loop /* This is not used. */
334 #define FUNCTION_NAME __gconv_transform_ucs2_internal
336 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
337 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
338 #define LOOPFCT FROM_LOOP
339 #if __BYTE_ORDER == __LITTLE_ENDIAN
340 # define BODY \
341 *((uint32_t *) outptr)++ = bswap_16 (*(uint16_t *) inptr); \
342 inptr += 2;
343 #else
344 # define BODY \
345 *((uint32_t *) outptr)++ = *((uint16_t *) inptr)++;
346 #endif
347 #include <iconv/loop.c>
348 #include <iconv/skeleton.c>
351 /* Convert from the internal (UCS4-like) format to UCS2. */
352 #define DEFINE_INIT 0
353 #define DEFINE_FINI 0
354 #define MIN_NEEDED_FROM 4
355 #define MIN_NEEDED_TO 2
356 #define FROM_DIRECTION 1
357 #define FROM_LOOP internal_ucs2_loop
358 #define TO_LOOP internal_ucs2_loop /* This is not used. */
359 #define FUNCTION_NAME __gconv_transform_internal_ucs2
361 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
362 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
363 #define LOOPFCT FROM_LOOP
364 #if __BYTE_ORDER == __LITTLE_ENDIAN
365 # define BODY \
367 if (*((uint32_t *) inptr) >= 0x10000) \
369 result = GCONV_ILLEGAL_INPUT; \
370 break; \
372 /* Please note that we use the `uint32_t' from-pointer as an `uint16_t' \
373 pointer which works since we are on a little endian machine. */ \
374 *((uint16_t *) outptr)++ = bswap_16 (*((uint16_t *) inptr)); \
375 inptr += 4; \
377 #else
378 # define BODY \
380 if (*((uint32_t *) inptr) >= 0x10000) \
382 result = GCONV_ILLEGAL_INPUT; \
383 break; \
385 *((uint16_t *) outptr)++ = *((uint32_t *) inptr)++; \
387 #endif
388 #include <iconv/loop.c>
389 #include <iconv/skeleton.c>
392 /* Convert from UCS2 in little endian to the internal (UCS4-like) format. */
393 #define DEFINE_INIT 0
394 #define DEFINE_FINI 0
395 #define MIN_NEEDED_FROM 2
396 #define MIN_NEEDED_TO 4
397 #define FROM_DIRECTION 1
398 #define FROM_LOOP ucs2little_internal_loop
399 #define TO_LOOP ucs2little_internal_loop /* This is not used.*/
400 #define FUNCTION_NAME __gconv_transform_ucs2little_internal
402 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
403 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
404 #define LOOPFCT FROM_LOOP
405 #if __BYTE_ORDER == __LITTLE_ENDIAN
406 # define BODY \
407 *((uint32_t *) outptr)++ = *((uint16_t *) inptr)++;
408 #else
409 # define BODY \
410 *((uint32_t *) outptr)++ = bswap_16 (*(uint16_t *) inptr); \
411 inptr += 2;
412 #endif
413 #include <iconv/loop.c>
414 #include <iconv/skeleton.c>
417 /* Convert from the internal (UCS4-like) format to UCS2 in little endian. */
418 #define DEFINE_INIT 0
419 #define DEFINE_FINI 0
420 #define MIN_NEEDED_FROM 4
421 #define MIN_NEEDED_TO 2
422 #define FROM_DIRECTION 1
423 #define FROM_LOOP internal_ucs2little_loop
424 #define TO_LOOP internal_ucs2little_loop /* This is not used.*/
425 #define FUNCTION_NAME __gconv_transform_internal_ucs2little
427 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
428 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
429 #define LOOPFCT FROM_LOOP
430 #if __BYTE_ORDER == __LITTLE_ENDIAN
431 # define BODY \
433 if (*((uint32_t *) inptr) >= 0x10000) \
435 result = GCONV_ILLEGAL_INPUT; \
436 break; \
438 *((uint16_t *) outptr)++ = *((uint32_t *) inptr)++; \
440 #else
441 # define BODY \
443 if (*((uint32_t *) inptr) >= 0x10000) \
445 result = GCONV_ILLEGAL_INPUT; \
446 break; \
448 *((uint16_t *) outptr)++ = bswap_16 (((uint16_t *) inptr)[1]); \
449 inptr += 4; \
451 #endif
452 #include <iconv/loop.c>
453 #include <iconv/skeleton.c>
456 /* Convert from the internal (UCS4-like) format to UTF-16. */
457 #define DEFINE_INIT 0
458 #define DEFINE_FINI 0
459 #define MIN_NEEDED_FROM 4
460 #define MIN_NEEDED_TO 2
461 #define MAX_NEEDED_TO 4
462 #define FROM_DIRECTION 1
463 #define FROM_LOOP internal_utf16_loop
464 #define TO_LOOP internal_utf16_loop /* This is not used. */
465 #define FUNCTION_NAME __gconv_transform_internal_utf16
467 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
468 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
469 #define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
470 #define LOOPFCT FROM_LOOP
471 #if __BYTE_ORDER == __LITTLE_ENDIAN
472 # define BODY \
474 if (*((uint32_t *) inptr) >= 0x10000) \
476 if (*((uint32_t *) inptr) >= 0x110000) \
478 result = GCONV_ILLEGAL_INPUT; \
479 break; \
482 /* Generate a surrogate character. */ \
483 if (NEED_LENGTH_TEST && outptr + 4 > outend) \
485 /* Overflow in the output buffer. */ \
486 result = GCONV_FULL_OUTPUT; \
487 break; \
490 *((uint16_t *) outptr)++ = bswap_16 (0xd7c0 \
491 + (*((uint32_t *) inptr) >> 10));\
492 *((uint16_t *) outptr)++ = bswap_16 (0xdc00 \
493 + (*((uint32_t *) inptr) \
494 & 0x3ff)); \
496 else \
497 /* Please note that we use the `uint32_t' from-pointer as an `uint16_t' \
498 pointer which works since we are on a little endian machine. */ \
499 *((uint16_t *) outptr)++ = bswap_16 (*((uint16_t *) inptr)); \
500 inptr += 4; \
502 #else
503 # define BODY \
505 if (*((uint32_t *) inptr) >= 0x10000) \
507 if (*((uint32_t *) inptr) >= 0x110000) \
509 result = GCONV_ILLEGAL_INPUT; \
510 break; \
513 /* Generate a surrogate character. */ \
514 if (NEED_LENGTH_TEST && outptr + 4 > outend) \
516 /* Overflow in the output buffer. */ \
517 result = GCONV_FULL_OUTPUT; \
518 break; \
521 *((uint16_t *) outptr)++ = 0xd7c0 + (*((uint32_t *) inptr) >> 10); \
522 *((uint16_t *) outptr)++ = 0xdc00 + (*((uint32_t *) inptr) & 0x3ff); \
524 else \
525 *((uint16_t *) outptr)++ = *((uint32_t *) inptr)++; \
527 #endif
528 #include <iconv/loop.c>
529 #include <iconv/skeleton.c>
532 /* Convert from UTF-16 to the internal (UCS4-like) format. */
533 #define DEFINE_INIT 0
534 #define DEFINE_FINI 0
535 #define MIN_NEEDED_FROM 2
536 #define MAX_NEEDED_FROM 4
537 #define MIN_NEEDED_TO 4
538 #define FROM_DIRECTION 1
539 #define FROM_LOOP utf16_internal_loop
540 #define TO_LOOP utf16_internal_loop /* This is not used.*/
541 #define FUNCTION_NAME __gconv_transform_utf16_internal
543 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
544 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
545 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
546 #define LOOPFCT FROM_LOOP
547 #if __BYTE_ORDER == __LITTLE_ENDIAN
548 # define BODY \
550 uint16_t u1 = bswap_16 (*(uint16_t *) inptr); \
552 if (u1 < 0xd800 || u1 > 0xdfff) \
554 /* No surrogate. */ \
555 *((uint32_t *) outptr)++ = u1; \
556 inptr += 2; \
558 else \
560 uint16_t u2; \
562 /* It's a surrogate character. At least the first word says \
563 it is. */ \
564 if (NEED_LENGTH_TEST && inptr + 4 > inend) \
566 /* We don't have enough input for another complete input \
567 character. */ \
568 result = GCONV_INCOMPLETE_INPUT; \
569 break; \
572 u2 = bswap_16 (((uint16_t *) inptr)[1]); \
573 if (u2 < 0xdc00 || u2 >= 0xdfff) \
575 /* This is no valid second word for a surrogate. */ \
576 result = GCONV_ILLEGAL_INPUT; \
577 break; \
580 *((uint32_t *) outptr)++ = ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00); \
581 inptr += 4; \
584 #else
585 # define BODY \
587 uint16_t u1 = *(uint16_t *) inptr; \
589 if (u1 < 0xd800 || u1 > 0xdfff) \
591 /* No surrogate. */ \
592 *((uint32_t *) outptr)++ = u1; \
593 inptr += 2; \
595 else \
597 uint16_t u2; \
599 /* It's a surrogate character. At least the first word says \
600 it is. */ \
601 if (NEED_LENGTH_TEST && inptr + 4 > inend) \
603 /* We don't have enough input for another complete input \
604 character. */ \
605 result = GCONV_INCOMPLETE_INPUT; \
606 break; \
609 u2 = ((uint16_t *) inptr)[1]; \
610 if (u2 < 0xdc00 || u2 >= 0xdfff) \
612 /* This is no valid second word for a surrogate. */ \
613 result = GCONV_ILLEGAL_INPUT; \
614 break; \
617 *((uint32_t *) outptr)++ = ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00); \
618 inptr += 4; \
621 #endif
622 #include <iconv/loop.c>
623 #include <iconv/skeleton.c>