exp2l: Work around a NetBSD 10.0/i386 bug.
[gnulib.git] / lib / striconveh.c
blobdb83a1ddca7e56b563900555de51f057999088eb
1 /* Character set conversion with error handling.
2 Copyright (C) 2001-2024 Free Software Foundation, Inc.
3 Written by Bruno Haible and Simon Josefsson.
5 This file is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation; either version 2.1 of the
8 License, or (at your option) any later version.
10 This file is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18 #include <config.h>
20 /* Specification. */
21 #include "striconveh.h"
23 #include <errno.h>
24 #include <stdlib.h>
25 #include <string.h>
27 #if HAVE_ICONV
28 # include <iconv.h>
29 # include "unistr.h"
30 #endif
32 #include "c-strcase.h"
33 #include "c-strcaseeq.h"
35 #ifndef SIZE_MAX
36 # define SIZE_MAX ((size_t) -1)
37 #endif
40 #if HAVE_ICONV
42 /* The caller must provide an iconveh_t, not just an iconv_t, because when a
43 conversion error occurs, we may have to determine the Unicode representation
44 of the inconvertible character. */
46 int
47 iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
49 iconv_t cd;
50 iconv_t cd1;
51 iconv_t cd2;
53 /* Avoid glibc-2.1 bug with EUC-KR. */
54 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
55 && !defined _LIBICONV_VERSION
56 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
57 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
59 errno = EINVAL;
60 return -1;
62 # endif
64 cd = iconv_open (to_codeset, from_codeset);
66 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
67 cd1 = (iconv_t)(-1);
68 else
70 cd1 = iconv_open ("UTF-8", from_codeset);
71 if (cd1 == (iconv_t)(-1))
73 int saved_errno = errno;
74 if (cd != (iconv_t)(-1))
75 iconv_close (cd);
76 errno = saved_errno;
77 return -1;
81 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
82 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
83 && !defined __UCLIBC__) \
84 || _LIBICONV_VERSION >= 0x0105 \
85 || defined ICONV_SET_TRANSLITERATE
86 || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
87 # endif
89 cd2 = (iconv_t)(-1);
90 else
92 cd2 = iconv_open (to_codeset, "UTF-8");
93 if (cd2 == (iconv_t)(-1))
95 int saved_errno = errno;
96 if (cd1 != (iconv_t)(-1))
97 iconv_close (cd1);
98 if (cd != (iconv_t)(-1))
99 iconv_close (cd);
100 errno = saved_errno;
101 return -1;
105 cdp->cd = cd;
106 cdp->cd1 = cd1;
107 cdp->cd2 = cd2;
108 return 0;
112 iconveh_close (const iconveh_t *cd)
114 if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
116 /* Return -1, but preserve the errno from iconv_close. */
117 int saved_errno = errno;
118 if (cd->cd1 != (iconv_t)(-1))
119 iconv_close (cd->cd1);
120 if (cd->cd != (iconv_t)(-1))
121 iconv_close (cd->cd);
122 errno = saved_errno;
123 return -1;
125 if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
127 /* Return -1, but preserve the errno from iconv_close. */
128 int saved_errno = errno;
129 if (cd->cd != (iconv_t)(-1))
130 iconv_close (cd->cd);
131 errno = saved_errno;
132 return -1;
134 if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
135 return -1;
136 return 0;
/* iconv_carefully is like iconv, except that it stops as soon as it encounters
   a conversion error, and it returns in *INCREMENTED a boolean telling whether
   it has incremented the input pointers past the error location.
   *INBUF and *INBYTESLEFT are always updated.  *OUTBUF and *OUTBYTESLEFT are
   updated only for the characters that were converted without loss.
   Return 0 if the entire input was converted; otherwise return (size_t)(-1)
   with errno set.  A lossy (irreversible) conversion reported by iconv is
   turned into an EILSEQ failure.  */
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
     && !(defined __GLIBC__ && !defined __UCLIBC__)
/* Irix iconv() inserts a NUL byte if it cannot convert.
   NetBSD iconv() inserts a question mark if it cannot convert.
   Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
   known to prefer to fail rather than doing a lossy conversion.  */
static size_t
iconv_carefully (iconv_t cd,
                 const char **inbuf, size_t *inbytesleft,
                 char **outbuf, size_t *outbytesleft,
                 bool *incremented)
{
  const char *inptr = *inbuf;
  const char *inptr_end = inptr + *inbytesleft;
  char *outptr = *outbuf;
  size_t outsize = *outbytesleft;
  const char *inptr_before;
  size_t res;

  /* Convert one character (or shift sequence) per iteration, so that a lossy
     conversion can be attributed to a single character.  */
  do
    {
      size_t insize;

      inptr_before = inptr;
      res = (size_t)(-1);

      /* Offer iconv 1, 2, 3, ... input bytes until it no longer complains
         about an incomplete input sequence.  */
      for (insize = 1; inptr + insize <= inptr_end; insize++)
        {
          res = iconv (cd,
                       (ICONV_CONST char **) &inptr, &insize,
                       &outptr, &outsize);
          if (!(res == (size_t)(-1) && errno == EINVAL))
            break;
          /* iconv can eat up a shift sequence but give EINVAL while attempting
             to convert the first character.  E.g. libiconv does this.  */
          if (inptr > inptr_before)
            {
              res = 0;
              break;
            }
        }

      /* Commit the output only if this character was converted losslessly.  */
      if (res == 0)
        {
          *outbuf = outptr;
          *outbytesleft = outsize;
        }
    }
  while (res == 0 && inptr < inptr_end);

  *inbuf = inptr;
  *inbytesleft = inptr_end - inptr;
  if (res != (size_t)(-1) && res > 0)
    {
      /* iconv() has already incremented INPTR.  We cannot go back to a
         previous INPTR, otherwise the state inside CD would become invalid,
         if FROM_CODESET is a stateful encoding.  So, tell the caller that
         *INBUF has already been incremented.  */
      *incremented = (inptr > inptr_before);
      errno = EILSEQ;
      return (size_t)(-1);
    }
  else
    {
      *incremented = false;
      return res;
    }
}
# else
/* This iconv fails rather than converting lossily, so a plain iconv call
   suffices; it never advances the input past an error location.  */
#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
     (*(incremented) = false, \
      iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
# endif
216 /* iconv_carefully_1 is like iconv_carefully, except that it stops after
217 converting one character or one shift sequence. */
218 static size_t
219 iconv_carefully_1 (iconv_t cd,
220 const char **inbuf, size_t *inbytesleft,
221 char **outbuf, size_t *outbytesleft,
222 bool *incremented)
224 const char *inptr_before = *inbuf;
225 const char *inptr = inptr_before;
226 const char *inptr_end = inptr_before + *inbytesleft;
227 char *outptr = *outbuf;
228 size_t outsize = *outbytesleft;
229 size_t res = (size_t)(-1);
230 size_t insize;
232 for (insize = 1; inptr_before + insize <= inptr_end; insize++)
234 inptr = inptr_before;
235 res = iconv (cd,
236 (ICONV_CONST char **) &inptr, &insize,
237 &outptr, &outsize);
238 if (!(res == (size_t)(-1) && errno == EINVAL))
239 break;
240 /* iconv can eat up a shift sequence but give EINVAL while attempting
241 to convert the first character. E.g. libiconv does this. */
242 if (inptr > inptr_before)
244 res = 0;
245 break;
249 *inbuf = inptr;
250 *inbytesleft = inptr_end - inptr;
251 # if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
252 && !(defined __GLIBC__ && !defined __UCLIBC__)
253 /* Irix iconv() inserts a NUL byte if it cannot convert.
254 NetBSD iconv() inserts a question mark if it cannot convert.
255 Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
256 known to prefer to fail rather than doing a lossy conversion. */
257 if (res != (size_t)(-1) && res > 0)
259 /* iconv() has already incremented INPTR. We cannot go back to a
260 previous INPTR, otherwise the state inside CD would become invalid,
261 if FROM_CODESET is a stateful encoding. So, tell the caller that
262 *INBUF has already been incremented. */
263 *incremented = (inptr > inptr_before);
264 errno = EILSEQ;
265 return (size_t)(-1);
267 # endif
269 if (res != (size_t)(-1))
271 *outbuf = outptr;
272 *outbytesleft = outsize;
274 *incremented = false;
275 return res;
278 /* utf8conv_carefully is like iconv, except that
279 - it converts from UTF-8 to UTF-8,
280 - it stops as soon as it encounters a conversion error, and it returns
281 in *INCREMENTED a boolean telling whether it has incremented the input
282 pointers past the error location,
283 - if one_character_only is true, it stops after converting one
284 character. */
285 static size_t
286 utf8conv_carefully (bool one_character_only,
287 const char **inbuf, size_t *inbytesleft,
288 char **outbuf, size_t *outbytesleft,
289 bool *incremented)
291 const char *inptr = *inbuf;
292 size_t insize = *inbytesleft;
293 char *outptr = *outbuf;
294 size_t outsize = *outbytesleft;
295 size_t res;
297 res = 0;
300 ucs4_t uc;
301 int n;
302 int m;
304 n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
305 if (n < 0)
307 errno = (n == -2 ? EINVAL : EILSEQ);
308 n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
309 inptr += n;
310 insize -= n;
311 res = (size_t)(-1);
312 *incremented = true;
313 break;
315 if (outsize == 0)
317 errno = E2BIG;
318 res = (size_t)(-1);
319 *incremented = false;
320 break;
322 m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
323 if (m == -2)
325 errno = E2BIG;
326 res = (size_t)(-1);
327 *incremented = false;
328 break;
330 inptr += n;
331 insize -= n;
332 if (m == -1)
334 errno = EILSEQ;
335 res = (size_t)(-1);
336 *incremented = true;
337 break;
339 outptr += m;
340 outsize -= m;
342 while (!one_character_only && insize > 0);
344 *inbuf = inptr;
345 *inbytesleft = insize;
346 *outbuf = outptr;
347 *outbytesleft = outsize;
348 return res;
/* Convert the SRCLEN bytes starting at SRC and store the converted data in
   *RESULTP / *LENGTHP.
   CD is tried first as a direct conversion.  If CD is (iconv_t)(-1), or if
   the direct conversion hits EILSEQ with a HANDLER other than iconveh_error
   while CD2 is open, the conversion is restarted from the beginning of SRC
   as a two-step conversion: CD1 (or a UTF-8 validation, if CD1 is
   (iconv_t)(-1)) produces UTF-8, and CD2 (or a UTF-8 copy, if CD2 is
   (iconv_t)(-1)) produces the final encoding.
   HANDLER decides what happens upon EILSEQ: with iconveh_error the function
   fails; otherwise a replacement ('?', U+FFFD, or in step 2 a \uxxxx or
   \Uxxxxxxxx escape for iconveh_escape_sequence) is emitted instead.
   EXTRA_ALLOC is a number of bytes kept free after the converted data.
   OFFSETS, if not NULL, is an array of SRCLEN elements.  All elements are
   first set to (size_t)(-1); then offsets[i] is set to the current output
   length each time input offset i is about to be converted and the output
   length has changed since the last entry that was stored.
   On entry, *RESULTP may be NULL or designate a buffer of *LENGTHP bytes,
   which is used directly if it is at least tmpbufsize bytes large and reused
   at the end if the result fits.
   Return 0 if successful, with *RESULTP and *LENGTHP set.  Otherwise return
   -1 with errno set; memory allocated by this function has then been freed
   and *RESULTP, *LENGTHP are unchanged.  */
351 static int
352 mem_cd_iconveh_internal (const char *src, size_t srclen,
353 iconv_t cd, iconv_t cd1, iconv_t cd2,
354 enum iconv_ilseq_handler handler,
355 size_t extra_alloc,
356 size_t *offsets,
357 char **resultp, size_t *lengthp)
359 /* When a conversion error occurs, we cannot start using CD1 and CD2 at
360 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
361 Instead, we have to start afresh from the beginning of SRC. */
362 /* Use a temporary buffer, so that for small strings, a single malloc()
363 call will be sufficient. */
364 # define tmpbufsize 4096
365 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
366 libiconv's UCS-4-INTERNAL encoding. */
367 union { unsigned int align; char buf[tmpbufsize]; } tmp;
368 # define tmpbuf tmp.buf
370 char *initial_result;
371 char *result;
372 size_t allocated;
373 size_t length;
374 size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
376 if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
378 initial_result = *resultp;
379 allocated = *lengthp;
381 else
383 initial_result = tmpbuf;
384 allocated = sizeof (tmpbuf);
386 result = initial_result;
388 /* Test whether a direct conversion is possible at all. */
389 if (cd == (iconv_t)(-1))
390 goto indirectly;
392 if (offsets != NULL)
394 size_t i;
396 for (i = 0; i < srclen; i++)
397 offsets[i] = (size_t)(-1);
399 last_length = (size_t)(-1);
401 length = 0;
403 /* First, try a direct conversion, and see whether a conversion error
404 occurs at all. */
406 const char *inptr = src;
407 size_t insize = srclen;
409 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
410 # if defined _LIBICONV_VERSION \
411 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
412 || defined __sun)
413 /* Set to the initial state. */
414 iconv (cd, NULL, NULL, NULL, NULL);
415 # endif
417 while (insize > 0)
419 char *outptr = result + length;
420 size_t outsize = allocated - extra_alloc - length;
421 bool incremented;
422 size_t res;
423 bool grow;
425 if (offsets != NULL)
427 if (length != last_length) /* ensure that offset[] be increasing */
429 offsets[inptr - src] = length;
430 last_length = length;
432 res = iconv_carefully_1 (cd,
433 &inptr, &insize,
434 &outptr, &outsize,
435 &incremented);
437 else
438 /* Use iconv_carefully instead of iconv here, because:
439 - If TO_CODESET is UTF-8, we can do the error handling in this
440 loop, no need for a second loop,
441 - With iconv() implementations other than GNU libiconv and GNU
442 libc, if we use iconv() in a big swoop, checking for an E2BIG
443 return, we lose the number of irreversible conversions. */
444 res = iconv_carefully (cd,
445 &inptr, &insize,
446 &outptr, &outsize,
447 &incremented);
449 length = outptr - result;
450 grow = (length + extra_alloc > allocated / 2);
451 if (res == (size_t)(-1))
453 if (errno == E2BIG)
454 grow = true;
455 else if (errno == EINVAL)
456 break;
457 else if (errno == EILSEQ && handler != iconveh_error)
459 if (cd2 == (iconv_t)(-1))
461 /* TO_CODESET is UTF-8. */
462 /* Error handling can produce up to 1 or 3 bytes of
463 output. */
464 size_t extra_need =
465 (handler == iconveh_replacement_character ? 3 : 1);
466 if (length + extra_need + extra_alloc > allocated)
468 char *memory;
470 allocated = 2 * allocated;
471 if (length + extra_need + extra_alloc > allocated)
472 allocated = 2 * allocated;
473 if (length + extra_need + extra_alloc > allocated)
474 abort ();
475 if (result == initial_result)
476 memory = (char *) malloc (allocated);
477 else
478 memory = (char *) realloc (result, allocated);
479 if (memory == NULL)
481 if (result != initial_result)
482 free (result);
483 errno = ENOMEM;
484 return -1;
486 if (result == initial_result)
487 memcpy (memory, initial_result, length);
488 result = memory;
489 grow = false;
491 /* The input is invalid in FROM_CODESET. Eat up one byte
492 and emit a replacement character or a question mark. */
493 if (!incremented)
495 if (insize == 0)
496 abort ();
497 inptr++;
498 insize--;
500 if (handler == iconveh_replacement_character)
502 /* U+FFFD in UTF-8 encoding. */
503 result[length+0] = '\357';
504 result[length+1] = '\277';
505 result[length+2] = '\275';
506 length += 3;
508 else
510 result[length] = '?';
511 length++;
514 else
515 goto indirectly;
517 else
519 if (result != initial_result)
520 free (result);
521 return -1;
524 if (insize == 0)
525 break;
526 if (grow)
528 char *memory;
530 allocated = 2 * allocated;
531 if (result == initial_result)
532 memory = (char *) malloc (allocated);
533 else
534 memory = (char *) realloc (result, allocated);
535 if (memory == NULL)
537 if (result != initial_result)
538 free (result);
539 errno = ENOMEM;
540 return -1;
542 if (result == initial_result)
543 memcpy (memory, initial_result, length);
544 result = memory;
549 /* Now get the conversion state back to the initial state.
550 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
551 #if defined _LIBICONV_VERSION \
552 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
553 || defined __sun)
554 for (;;)
556 char *outptr = result + length;
557 size_t outsize = allocated - extra_alloc - length;
558 size_t res;
560 res = iconv (cd, NULL, NULL, &outptr, &outsize);
561 length = outptr - result;
562 if (res == (size_t)(-1))
564 if (errno == E2BIG)
566 char *memory;
568 allocated = 2 * allocated;
569 if (result == initial_result)
570 memory = (char *) malloc (allocated);
571 else
572 memory = (char *) realloc (result, allocated);
573 if (memory == NULL)
575 if (result != initial_result)
576 free (result);
577 errno = ENOMEM;
578 return -1;
580 if (result == initial_result)
581 memcpy (memory, initial_result, length);
582 result = memory;
584 else
586 if (result != initial_result)
587 free (result);
588 return -1;
591 else
592 break;
594 #endif
596 /* The direct conversion succeeded. */
597 goto done;
599 indirectly:
600 /* The direct conversion failed.
601 Use a conversion through UTF-8. */
602 if (offsets != NULL)
604 size_t i;
606 for (i = 0; i < srclen; i++)
607 offsets[i] = (size_t)(-1);
609 last_length = (size_t)(-1);
611 length = 0;
613 const bool slowly = (offsets != NULL || handler == iconveh_error);
614 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
615 char utf8buf[utf8bufsize + 3];
616 size_t utf8len = 0;
617 const char *in1ptr = src;
618 size_t in1size = srclen;
619 bool do_final_flush1 = true;
620 bool do_final_flush2 = true;
622 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
623 # if defined _LIBICONV_VERSION \
624 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
625 || defined __sun)
626 /* Set to the initial state. */
627 if (cd1 != (iconv_t)(-1))
628 iconv (cd1, NULL, NULL, NULL, NULL);
629 if (cd2 != (iconv_t)(-1))
630 iconv (cd2, NULL, NULL, NULL, NULL);
631 # endif
633 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
635 char *out1ptr = utf8buf + utf8len;
636 size_t out1size = utf8bufsize - utf8len;
637 bool incremented1;
638 size_t res1;
639 int errno1;
641 /* Conversion step 1: from FROM_CODESET to UTF-8. */
642 if (in1size > 0)
644 if (offsets != NULL
645 && length != last_length) /* ensure that offset[] be increasing */
647 offsets[in1ptr - src] = length;
648 last_length = length;
650 if (cd1 != (iconv_t)(-1))
652 if (slowly)
653 res1 = iconv_carefully_1 (cd1,
654 &in1ptr, &in1size,
655 &out1ptr, &out1size,
656 &incremented1);
657 else
658 res1 = iconv_carefully (cd1,
659 &in1ptr, &in1size,
660 &out1ptr, &out1size,
661 &incremented1);
663 else
665 /* FROM_CODESET is UTF-8. */
666 res1 = utf8conv_carefully (slowly,
667 &in1ptr, &in1size,
668 &out1ptr, &out1size,
669 &incremented1);
672 else if (do_final_flush1)
674 /* Now get the conversion state of CD1 back to the initial state.
675 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
676 # if defined _LIBICONV_VERSION \
677 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
678 || defined __sun)
679 if (cd1 != (iconv_t)(-1))
680 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
681 else
682 # endif
683 res1 = 0;
684 do_final_flush1 = false;
685 incremented1 = true;
687 else
689 res1 = 0;
690 incremented1 = true;
692 if (res1 == (size_t)(-1)
693 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
695 if (result != initial_result)
696 free (result);
697 return -1;
699 if (res1 == (size_t)(-1)
700 && errno == EILSEQ && handler != iconveh_error)
702 /* The input is invalid in FROM_CODESET. Eat up one byte and
703 emit a U+FFFD character or a question mark. Room for this
704 character was allocated at the end of utf8buf. */
705 if (!incremented1)
707 if (in1size == 0)
708 abort ();
709 in1ptr++;
710 in1size--;
712 if (handler == iconveh_replacement_character)
714 /* U+FFFD in UTF-8 encoding. */
715 out1ptr[0] = '\357';
716 out1ptr[1] = '\277';
717 out1ptr[2] = '\275';
718 out1ptr += 3;
720 else
721 *out1ptr++ = '?';
722 res1 = 0;
724 errno1 = errno;
725 utf8len = out1ptr - utf8buf;
727 if (offsets != NULL
728 || in1size == 0
729 || utf8len > utf8bufsize / 2
730 || (res1 == (size_t)(-1) && errno1 == E2BIG))
732 /* Conversion step 2: from UTF-8 to TO_CODESET. */
733 const char *in2ptr = utf8buf;
734 size_t in2size = utf8len;
736 while (in2size > 0
737 || (in1size == 0 && !do_final_flush1 && do_final_flush2))
739 char *out2ptr = result + length;
740 size_t out2size = allocated - extra_alloc - length;
741 bool incremented2;
742 size_t res2;
743 bool grow;
745 if (in2size > 0)
747 if (cd2 != (iconv_t)(-1))
748 res2 = iconv_carefully (cd2,
749 &in2ptr, &in2size,
750 &out2ptr, &out2size,
751 &incremented2);
752 else
753 /* TO_CODESET is UTF-8. */
754 res2 = utf8conv_carefully (false,
755 &in2ptr, &in2size,
756 &out2ptr, &out2size,
757 &incremented2);
759 else /* in1size == 0 && !do_final_flush1
760 && in2size == 0 && do_final_flush2 */
762 /* Now get the conversion state of CD2 back to the initial
763 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
764 # if defined _LIBICONV_VERSION \
765 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
766 || defined __sun)
767 if (cd2 != (iconv_t)(-1))
768 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
769 else
770 # endif
771 res2 = 0;
772 do_final_flush2 = false;
773 incremented2 = true;
776 length = out2ptr - result;
777 grow = (length + extra_alloc > allocated / 2);
778 if (res2 == (size_t)(-1))
780 if (errno == E2BIG)
781 grow = true;
782 else if (errno == EINVAL)
783 break;
784 else if (errno == EILSEQ && handler != iconveh_error)
786 /* Error handling can produce up to 10 bytes of UTF-8
787 output. But TO_CODESET may be UCS-2, UTF-16 or
788 UCS-4, so use CD2 here as well. */
789 char scratchbuf[10];
790 size_t scratchlen;
791 ucs4_t uc;
792 const char *inptr;
793 size_t insize;
794 size_t res;
796 if (incremented2)
798 if (u8_prev (&uc, (const uint8_t *) in2ptr,
799 (const uint8_t *) utf8buf)
800 == NULL)
801 abort ();
803 else
805 int n;
806 if (in2size == 0)
807 abort ();
808 n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
809 in2size);
810 in2ptr += n;
811 in2size -= n;
814 if (handler == iconveh_escape_sequence)
816 static char const hex[16] = "0123456789ABCDEF";
817 scratchlen = 0;
818 scratchbuf[scratchlen++] = '\\';
819 if (uc < 0x10000)
820 scratchbuf[scratchlen++] = 'u';
821 else
823 scratchbuf[scratchlen++] = 'U';
824 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
825 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
826 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
827 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
829 scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
830 scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
831 scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
832 scratchbuf[scratchlen++] = hex[uc & 15];
834 else if (handler == iconveh_replacement_character)
836 /* U+FFFD in UTF-8 encoding. */
837 scratchbuf[0] = '\357';
838 scratchbuf[1] = '\277';
839 scratchbuf[2] = '\275';
840 scratchlen = 3;
842 else
844 scratchbuf[0] = '?';
845 scratchlen = 1;
848 inptr = scratchbuf;
849 insize = scratchlen;
850 if (cd2 != (iconv_t)(-1))
852 char *out2ptr_try = out2ptr;
853 size_t out2size_try = out2size;
854 res = iconv (cd2,
855 (ICONV_CONST char **) &inptr, &insize,
856 &out2ptr_try, &out2size_try);
857 if (handler == iconveh_replacement_character
858 && (res == (size_t)(-1)
859 ? errno == EILSEQ
860 /* FreeBSD iconv(), NetBSD iconv(), and
861 Solaris 11 iconv() insert a '?' if they
862 cannot convert. This is what we want.
863 But IRIX iconv() inserts a NUL byte if it
864 cannot convert.
865 And musl libc iconv() inserts a '*' if it
866 cannot convert. */
867 : (res > 0
868 && !(out2ptr_try - out2ptr == 1
869 && *out2ptr == '?'))))
871 /* The iconv() call failed.
872 U+FFFD can't be converted to TO_CODESET.
873 Use '?' instead. */
874 scratchbuf[0] = '?';
875 scratchlen = 1;
876 inptr = scratchbuf;
877 insize = scratchlen;
878 res = iconv (cd2,
879 (ICONV_CONST char **) &inptr, &insize,
880 &out2ptr, &out2size);
882 else
884 /* Accept the results of the iconv() call. */
885 out2ptr = out2ptr_try;
886 out2size = out2size_try;
887 res = 0;
890 else
892 /* TO_CODESET is UTF-8. */
893 if (out2size >= insize)
895 memcpy (out2ptr, inptr, insize);
896 out2ptr += insize;
897 out2size -= insize;
898 inptr += insize;
899 insize = 0;
900 res = 0;
902 else
904 errno = E2BIG;
905 res = (size_t)(-1);
908 length = out2ptr - result;
909 if (res == (size_t)(-1) && errno == E2BIG)
911 char *memory;
913 allocated = 2 * allocated;
914 if (length + 1 + extra_alloc > allocated)
915 abort ();
916 if (result == initial_result)
917 memory = (char *) malloc (allocated);
918 else
919 memory = (char *) realloc (result, allocated);
920 if (memory == NULL)
922 if (result != initial_result)
923 free (result);
924 errno = ENOMEM;
925 return -1;
927 if (result == initial_result)
928 memcpy (memory, initial_result, length);
929 result = memory;
930 grow = false;
932 out2ptr = result + length;
933 out2size = allocated - extra_alloc - length;
934 if (cd2 != (iconv_t)(-1))
935 res = iconv (cd2,
936 (ICONV_CONST char **) &inptr,
937 &insize,
938 &out2ptr, &out2size);
939 else
941 /* TO_CODESET is UTF-8. */
942 if (!(out2size >= insize))
943 abort ();
944 memcpy (out2ptr, inptr, insize);
945 out2ptr += insize;
946 out2size -= insize;
947 inptr += insize;
948 insize = 0;
949 res = 0;
951 length = out2ptr - result;
953 # if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
954 && !(defined __GLIBC__ && !defined __UCLIBC__)
955 /* IRIX iconv() inserts a NUL byte if it cannot convert.
956 FreeBSD iconv(), NetBSD iconv(), and Solaris 11
957 iconv() insert a '?' if they cannot convert.
958 musl libc iconv() inserts a '*' if it cannot convert.
959 Only GNU libiconv (excluding the bastard Apple iconv)
960 and GNU libc are known to prefer to fail rather than
961 doing a lossy conversion. */
962 if (res != (size_t)(-1) && res > 0)
964 errno = EILSEQ;
965 res = (size_t)(-1);
967 # endif
968 if (res == (size_t)(-1))
970 /* Failure converting the ASCII replacement. */
971 if (result != initial_result)
972 free (result);
973 return -1;
976 else
978 if (result != initial_result)
979 free (result);
980 return -1;
983 if (!(in2size > 0
984 || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
985 break;
986 if (grow)
988 char *memory;
990 allocated = 2 * allocated;
991 if (result == initial_result)
992 memory = (char *) malloc (allocated);
993 else
994 memory = (char *) realloc (result, allocated);
995 if (memory == NULL)
997 if (result != initial_result)
998 free (result);
999 errno = ENOMEM;
1000 return -1;
1002 if (result == initial_result)
1003 memcpy (memory, initial_result, length);
1004 result = memory;
1008 /* Move the remaining bytes to the beginning of utf8buf. */
1009 if (in2size > 0)
1010 memmove (utf8buf, in2ptr, in2size);
1011 utf8len = in2size;
1014 if (res1 == (size_t)(-1))
1016 if (errno1 == EINVAL)
1017 in1size = 0;
1018 else if (errno1 == EILSEQ)
1020 if (result != initial_result)
1021 free (result);
1022 errno = errno1;
1023 return -1;
1027 # undef utf8bufsize
1030 done:
1031 /* Now the final memory allocation. */
1032 if (result == tmpbuf)
1034 size_t memsize = length + extra_alloc;
1036 if (*resultp != NULL && *lengthp >= memsize)
1037 result = *resultp;
1038 else
1040 char *memory;
1042 memory = (char *) malloc (memsize > 0 ? memsize : 1);
1043 if (memory != NULL)
1044 result = memory;
1045 else
1047 errno = ENOMEM;
1048 return -1;
1051 memcpy (result, tmpbuf, length);
1053 else if (result != *resultp && length + extra_alloc < allocated)
1055 /* Shrink the allocated memory if possible. */
1056 size_t memsize = length + extra_alloc;
1057 char *memory;
1059 memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
1060 if (memory != NULL)
1061 result = memory;
1063 *resultp = result;
1064 *lengthp = length;
1065 return 0;
1066 # undef tmpbuf
1067 # undef tmpbufsize
1071 mem_cd_iconveh (const char *src, size_t srclen,
1072 const iconveh_t *cd,
1073 enum iconv_ilseq_handler handler,
1074 size_t *offsets,
1075 char **resultp, size_t *lengthp)
1077 return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1078 handler, 0, offsets, resultp, lengthp);
1081 char *
1082 str_cd_iconveh (const char *src,
1083 const iconveh_t *cd,
1084 enum iconv_ilseq_handler handler)
1086 /* For most encodings, a trailing NUL byte in the input will be converted
1087 to a trailing NUL byte in the output. But not for UTF-7. So that this
1088 function is usable for UTF-7, we have to exclude the NUL byte from the
1089 conversion and add it by hand afterwards. */
1090 char *result = NULL;
1091 size_t length = 0;
1092 int retval = mem_cd_iconveh_internal (src, strlen (src),
1093 cd->cd, cd->cd1, cd->cd2, handler, 1,
1094 NULL, &result, &length);
1096 if (retval < 0)
1098 free (result);
1099 return NULL;
1102 /* Add the terminating NUL byte. */
1103 result[length] = '\0';
1105 return result;
1108 #endif
1111 mem_iconveh (const char *src, size_t srclen,
1112 const char *from_codeset, const char *to_codeset,
1113 enum iconv_ilseq_handler handler,
1114 size_t *offsets,
1115 char **resultp, size_t *lengthp)
1117 if (srclen == 0)
1119 /* Nothing to convert. */
1120 *lengthp = 0;
1121 return 0;
1123 else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1125 char *result;
1127 if (*resultp != NULL && *lengthp >= srclen)
1128 result = *resultp;
1129 else
1131 result = (char *) malloc (srclen);
1132 if (result == NULL)
1134 errno = ENOMEM;
1135 return -1;
1138 memcpy (result, src, srclen);
1139 *resultp = result;
1140 *lengthp = srclen;
1141 return 0;
1143 else
1145 #if HAVE_ICONV
1146 iconveh_t cd;
1147 char *result;
1148 size_t length;
1149 int retval;
1151 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1152 return -1;
1154 result = *resultp;
1155 length = *lengthp;
1156 retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1157 &result, &length);
1159 if (retval < 0)
1161 /* Close cd, but preserve the errno from str_cd_iconv. */
1162 int saved_errno = errno;
1163 iconveh_close (&cd);
1164 errno = saved_errno;
1166 else
1168 if (iconveh_close (&cd) < 0)
1170 if (result != *resultp)
1171 free (result);
1172 return -1;
1174 *resultp = result;
1175 *lengthp = length;
1177 return retval;
1178 #else
1179 /* This is a different error code than if iconv_open existed but didn't
1180 support from_codeset and to_codeset, so that the caller can emit
1181 an error message such as
1182 "iconv() is not supported. Installing GNU libiconv and
1183 then reinstalling this package would fix this." */
1184 errno = ENOSYS;
1185 return -1;
1186 #endif
1190 char *
1191 str_iconveh (const char *src,
1192 const char *from_codeset, const char *to_codeset,
1193 enum iconv_ilseq_handler handler)
1195 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1197 char *result = strdup (src);
1199 if (result == NULL)
1200 errno = ENOMEM;
1201 return result;
1203 else
1205 #if HAVE_ICONV
1206 iconveh_t cd;
1207 char *result;
1209 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1210 return NULL;
1212 result = str_cd_iconveh (src, &cd, handler);
1214 if (result == NULL)
1216 /* Close cd, but preserve the errno from str_cd_iconv. */
1217 int saved_errno = errno;
1218 iconveh_close (&cd);
1219 errno = saved_errno;
1221 else
1223 if (iconveh_close (&cd) < 0)
1225 free (result);
1226 return NULL;
1229 return result;
1230 #else
1231 /* This is a different error code than if iconv_open existed but didn't
1232 support from_codeset and to_codeset, so that the caller can emit
1233 an error message such as
1234 "iconv() is not supported. Installing GNU libiconv and
1235 then reinstalling this package would fix this." */
1236 errno = ENOSYS;
1237 return NULL;
1238 #endif