lib/striconveh.c

   1 /* Character set conversion with error handling.
   2    Copyright (C) 2001-2024 Free Software Foundation, Inc.
   3    Written by Bruno Haible and Simon Josefsson.
   4
   5    This file is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU Lesser General Public License as
   7    published by the Free Software Foundation; either version 2.1 of the
   8    License, or (at your option) any later version.
   9
  10    This file is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include "striconveh.h"
  22
  23 #include <errno.h>
  24 #include <stdlib.h>
  25 #include <string.h>
  26
  27 #if HAVE_ICONV
  28 # include <iconv.h>
  29 # include "unistr.h"
  30 #endif
  31
  32 #include "c-strcase.h"
  33 #include "c-strcaseeq.h"
  34
  35 #ifndef SIZE_MAX
  36 # define SIZE_MAX ((size_t) -1)
  37 #endif
  38
  39
  40 #if HAVE_ICONV
  41
  42 /* The caller must provide an iconveh_t, not just an iconv_t, because when a
  43    conversion error occurs, we may have to determine the Unicode representation
  44    of the inconvertible character.  */
  45
  46 int
  47 iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
  48 {
  49   iconv_t cd;
  50   iconv_t cd1;
  51   iconv_t cd2;
  52
  53   /* Avoid glibc-2.1 bug with EUC-KR.  */
  54 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  55      && !defined _LIBICONV_VERSION
  56   if (c_strcasecmp (from_codeset, "EUC-KR") == 0
  57       || c_strcasecmp (to_codeset, "EUC-KR") == 0)
  58     {
  59       errno = EINVAL;
  60       return -1;
  61     }
  62 # endif
  63
  64   cd = iconv_open (to_codeset, from_codeset);
  65
  66   if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
  67     cd1 = (iconv_t)(-1);
  68   else
  69     {
  70       cd1 = iconv_open ("UTF-8", from_codeset);
  71       if (cd1 == (iconv_t)(-1))
  72         {
  73           int saved_errno = errno;
  74           if (cd != (iconv_t)(-1))
  75             iconv_close (cd);
  76           errno = saved_errno;
  77           return -1;
  78         }
  79     }
  80
  81   if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
  82 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
  83       && !defined __UCLIBC__) \
  84      || _LIBICONV_VERSION >= 0x0105 \
  85      || defined ICONV_SET_TRANSLITERATE
  86       || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
  87 # endif
  88      )
  89     cd2 = (iconv_t)(-1);
  90   else
  91     {
  92       cd2 = iconv_open (to_codeset, "UTF-8");
  93       if (cd2 == (iconv_t)(-1))
  94         {
  95           int saved_errno = errno;
  96           if (cd1 != (iconv_t)(-1))
  97             iconv_close (cd1);
  98           if (cd != (iconv_t)(-1))
  99             iconv_close (cd);
 100           errno = saved_errno;
 101           return -1;
 102         }
 103     }
 104
 105   cdp->cd = cd;
 106   cdp->cd1 = cd1;
 107   cdp->cd2 = cd2;
 108   return 0;
 109 }
 110
 111 int
 112 iconveh_close (const iconveh_t *cd)
 113 {
 114   if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
 115     {
 116       /* Return -1, but preserve the errno from iconv_close.  */
 117       int saved_errno = errno;
 118       if (cd->cd1 != (iconv_t)(-1))
 119         iconv_close (cd->cd1);
 120       if (cd->cd != (iconv_t)(-1))
 121         iconv_close (cd->cd);
 122       errno = saved_errno;
 123       return -1;
 124     }
 125   if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
 126     {
 127       /* Return -1, but preserve the errno from iconv_close.  */
 128       int saved_errno = errno;
 129       if (cd->cd != (iconv_t)(-1))
 130         iconv_close (cd->cd);
 131       errno = saved_errno;
 132       return -1;
 133     }
 134   if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
 135     return -1;
 136   return 0;
 137 }
 138
 139 /* iconv_carefully is like iconv, except that it stops as soon as it encounters
 140    a conversion error, and it returns in *INCREMENTED a boolean telling whether
 141    it has incremented the input pointers past the error location.  */
 142 # if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
 143      && !(defined __GLIBC__ && !defined __UCLIBC__)
 144 /* Irix iconv() inserts a NUL byte if it cannot convert.
 145    NetBSD iconv() inserts a question mark if it cannot convert.
 146    Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
 147    known to prefer to fail rather than doing a lossy conversion.  */
 148 static size_t
 149 iconv_carefully (iconv_t cd,
 150                  const char **inbuf, size_t *inbytesleft,
 151                  char **outbuf, size_t *outbytesleft,
 152                  bool *incremented)
 153 {
 154   const char *inptr = *inbuf;
 155   const char *inptr_end = inptr + *inbytesleft;
 156   char *outptr = *outbuf;
 157   size_t outsize = *outbytesleft;
 158   const char *inptr_before;
 159   size_t res;
 160
 161   do
 162     {
 163       size_t insize;
 164
 165       inptr_before = inptr;
 166       res = (size_t)(-1);
 167
 168       for (insize = 1; inptr + insize <= inptr_end; insize++)
 169         {
 170           res = iconv (cd,
 171                        (ICONV_CONST char **) &inptr, &insize,
 172                        &outptr, &outsize);
 173           if (!(res == (size_t)(-1) && errno == EINVAL))
 174             break;
 175           /* iconv can eat up a shift sequence but give EINVAL while attempting
 176              to convert the first character.  E.g. libiconv does this.  */
 177           if (inptr > inptr_before)
 178             {
 179               res = 0;
 180               break;
 181             }
 182         }
 183
 184       if (res == 0)
 185         {
 186           *outbuf = outptr;
 187           *outbytesleft = outsize;
 188         }
 189     }
 190   while (res == 0 && inptr < inptr_end);
 191
 192   *inbuf = inptr;
 193   *inbytesleft = inptr_end - inptr;
 194   if (res != (size_t)(-1) && res > 0)
 195     {
 196       /* iconv() has already incremented INPTR.  We cannot go back to a
 197          previous INPTR, otherwise the state inside CD would become invalid,
 198          if FROM_CODESET is a stateful encoding.  So, tell the caller that
 199          *INBUF has already been incremented.  */
 200       *incremented = (inptr > inptr_before);
 201       errno = EILSEQ;
 202       return (size_t)(-1);
 203     }
 204   else
 205     {
 206       *incremented = false;
 207       return res;
 208     }
 209 }
 210 # else
 211 #  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
 212      (*(incremented) = false, \
 213       iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
 214 # endif
 215
 216 /* iconv_carefully_1 is like iconv_carefully, except that it stops after
 217    converting one character or one shift sequence.  */
 218 static size_t
 219 iconv_carefully_1 (iconv_t cd,
 220                    const char **inbuf, size_t *inbytesleft,
 221                    char **outbuf, size_t *outbytesleft,
 222                    bool *incremented)
 223 {
 224   const char *inptr_before = *inbuf;
 225   const char *inptr = inptr_before;
 226   const char *inptr_end = inptr_before + *inbytesleft;
 227   char *outptr = *outbuf;
 228   size_t outsize = *outbytesleft;
 229   size_t res = (size_t)(-1);
 230   size_t insize;
 231
 232   for (insize = 1; inptr_before + insize <= inptr_end; insize++)
 233     {
 234       inptr = inptr_before;
 235       res = iconv (cd,
 236                    (ICONV_CONST char **) &inptr, &insize,
 237                    &outptr, &outsize);
 238       if (!(res == (size_t)(-1) && errno == EINVAL))
 239         break;
 240       /* iconv can eat up a shift sequence but give EINVAL while attempting
 241          to convert the first character.  E.g. libiconv does this.  */
 242       if (inptr > inptr_before)
 243         {
 244           res = 0;
 245           break;
 246         }
 247     }
 248
 249   *inbuf = inptr;
 250   *inbytesleft = inptr_end - inptr;
 251 # if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
 252      && !(defined __GLIBC__ && !defined __UCLIBC__)
 253   /* Irix iconv() inserts a NUL byte if it cannot convert.
 254      NetBSD iconv() inserts a question mark if it cannot convert.
 255      Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
 256      known to prefer to fail rather than doing a lossy conversion.  */
 257   if (res != (size_t)(-1) && res > 0)
 258     {
 259       /* iconv() has already incremented INPTR.  We cannot go back to a
 260          previous INPTR, otherwise the state inside CD would become invalid,
 261          if FROM_CODESET is a stateful encoding.  So, tell the caller that
 262          *INBUF has already been incremented.  */
 263       *incremented = (inptr > inptr_before);
 264       errno = EILSEQ;
 265       return (size_t)(-1);
 266     }
 267 # endif
 268
 269   if (res != (size_t)(-1))
 270     {
 271       *outbuf = outptr;
 272       *outbytesleft = outsize;
 273     }
 274   *incremented = false;
 275   return res;
 276 }
 277
 278 /* utf8conv_carefully is like iconv, except that
 279      - it converts from UTF-8 to UTF-8,
 280      - it stops as soon as it encounters a conversion error, and it returns
 281        in *INCREMENTED a boolean telling whether it has incremented the input
 282        pointers past the error location,
 283      - if one_character_only is true, it stops after converting one
 284        character.  */
 285 static size_t
 286 utf8conv_carefully (bool one_character_only,
 287                     const char **inbuf, size_t *inbytesleft,
 288                     char **outbuf, size_t *outbytesleft,
 289                     bool *incremented)
 290 {
 291   const char *inptr = *inbuf;
 292   size_t insize = *inbytesleft;
 293   char *outptr = *outbuf;
 294   size_t outsize = *outbytesleft;
 295   size_t res;
 296
 297   res = 0;
 298   do
 299     {
 300       ucs4_t uc;
 301       int n;
 302       int m;
 303
 304       n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
 305       if (n < 0)
 306         {
 307           errno = (n == -2 ? EINVAL : EILSEQ);
 308           n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
 309           inptr += n;
 310           insize -= n;
 311           res = (size_t)(-1);
 312           *incremented = true;
 313           break;
 314         }
 315       if (outsize == 0)
 316         {
 317           errno = E2BIG;
 318           res = (size_t)(-1);
 319           *incremented = false;
 320           break;
 321         }
 322       m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
 323       if (m == -2)
 324         {
 325           errno = E2BIG;
 326           res = (size_t)(-1);
 327           *incremented = false;
 328           break;
 329         }
 330       inptr += n;
 331       insize -= n;
 332       if (m == -1)
 333         {
 334           errno = EILSEQ;
 335           res = (size_t)(-1);
 336           *incremented = true;
 337           break;
 338         }
 339       outptr += m;
 340       outsize -= m;
 341     }
 342   while (!one_character_only && insize > 0);
 343
 344   *inbuf = inptr;
 345   *inbytesleft = insize;
 346   *outbuf = outptr;
 347   *outbytesleft = outsize;
 348   return res;
 349 }
 350
 351 static int
 352 mem_cd_iconveh_internal (const char *src, size_t srclen,
 353                          iconv_t cd, iconv_t cd1, iconv_t cd2,
 354                          enum iconv_ilseq_handler handler,
 355                          size_t extra_alloc,
 356                          size_t *offsets,
 357                          char **resultp, size_t *lengthp)
 358 {
 359   /* When a conversion error occurs, we cannot start using CD1 and CD2 at
 360      this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
 361      Instead, we have to start afresh from the beginning of SRC.  */
 362   /* Use a temporary buffer, so that for small strings, a single malloc()
 363      call will be sufficient.  */
 364 # define tmpbufsize 4096
 365   /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
 366      libiconv's UCS-4-INTERNAL encoding.  */
 367   union { unsigned int align; char buf[tmpbufsize]; } tmp;
 368 # define tmpbuf tmp.buf
 369
 370   char *initial_result;
 371   char *result;
 372   size_t allocated;
 373   size_t length;
 374   size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
 375
 376   if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
 377     {
 378       initial_result = *resultp;
 379       allocated = *lengthp;
 380     }
 381   else
 382     {
 383       initial_result = tmpbuf;
 384       allocated = sizeof (tmpbuf);
 385     }
 386   result = initial_result;
 387
 388   /* Test whether a direct conversion is possible at all.  */
 389   if (cd == (iconv_t)(-1))
 390     goto indirectly;
 391
 392   if (offsets != NULL)
 393     {
 394       size_t i;
 395
 396       for (i = 0; i < srclen; i++)
 397         offsets[i] = (size_t)(-1);
 398
 399       last_length = (size_t)(-1);
 400     }
 401   length = 0;
 402
 403   /* First, try a direct conversion, and see whether a conversion error
 404      occurs at all.  */
 405   {
 406     const char *inptr = src;
 407     size_t insize = srclen;
 408
 409     /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
 410 # if defined _LIBICONV_VERSION \
 411      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 412           || defined __sun)
 413     /* Set to the initial state.  */
 414     iconv (cd, NULL, NULL, NULL, NULL);
 415 # endif
 416
 417     while (insize > 0)
 418       {
 419         char *outptr = result + length;
 420         size_t outsize = allocated - extra_alloc - length;
 421         bool incremented;
 422         size_t res;
 423         bool grow;
 424
 425         if (offsets != NULL)
 426           {
 427             if (length != last_length) /* ensure that offset[] be increasing */
 428               {
 429                 offsets[inptr - src] = length;
 430                 last_length = length;
 431               }
 432             res = iconv_carefully_1 (cd,
 433                                      &inptr, &insize,
 434                                      &outptr, &outsize,
 435                                      &incremented);
 436           }
 437         else
 438           /* Use iconv_carefully instead of iconv here, because:
 439              - If TO_CODESET is UTF-8, we can do the error handling in this
 440                loop, no need for a second loop,
 441              - With iconv() implementations other than GNU libiconv and GNU
 442                libc, if we use iconv() in a big swoop, checking for an E2BIG
 443                return, we lose the number of irreversible conversions.  */
 444           res = iconv_carefully (cd,
 445                                  &inptr, &insize,
 446                                  &outptr, &outsize,
 447                                  &incremented);
 448
 449         length = outptr - result;
 450         grow = (length + extra_alloc > allocated / 2);
 451         if (res == (size_t)(-1))
 452           {
 453             if (errno == E2BIG)
 454               grow = true;
 455             else if (errno == EINVAL)
 456               break;
 457             else if (errno == EILSEQ && handler != iconveh_error)
 458               {
 459                 if (cd2 == (iconv_t)(-1))
 460                   {
 461                     /* TO_CODESET is UTF-8.  */
 462                     /* Error handling can produce up to 1 or 3 bytes of
 463                        output.  */
 464                     size_t extra_need =
 465                       (handler == iconveh_replacement_character ? 3 : 1);
 466                     if (length + extra_need + extra_alloc > allocated)
 467                       {
 468                         char *memory;
 469
 470                         allocated = 2 * allocated;
 471                         if (length + extra_need + extra_alloc > allocated)
 472                           allocated = 2 * allocated;
 473                         if (length + extra_need + extra_alloc > allocated)
 474                           abort ();
 475                         if (result == initial_result)
 476                           memory = (char *) malloc (allocated);
 477                         else
 478                           memory = (char *) realloc (result, allocated);
 479                         if (memory == NULL)
 480                           {
 481                             if (result != initial_result)
 482                               free (result);
 483                             errno = ENOMEM;
 484                             return -1;
 485                           }
 486                         if (result == initial_result)
 487                           memcpy (memory, initial_result, length);
 488                         result = memory;
 489                         grow = false;
 490                       }
 491                     /* The input is invalid in FROM_CODESET.  Eat up one byte
 492                        and emit a replacement character or a question mark.  */
 493                     if (!incremented)
 494                       {
 495                         if (insize == 0)
 496                           abort ();
 497                         inptr++;
 498                         insize--;
 499                       }
 500                     if (handler == iconveh_replacement_character)
 501                       {
 502                         /* U+FFFD in UTF-8 encoding.  */
 503                         result[length+0] = '\357';
 504                         result[length+1] = '\277';
 505                         result[length+2] = '\275';
 506                         length += 3;
 507                       }
 508                     else
 509                       {
 510                         result[length] = '?';
 511                         length++;
 512                       }
 513                   }
 514                 else
 515                   goto indirectly;
 516               }
 517             else
 518               {
 519                 if (result != initial_result)
 520                   free (result);
 521                 return -1;
 522               }
 523           }
 524         if (insize == 0)
 525           break;
 526         if (grow)
 527           {
 528             char *memory;
 529
 530             allocated = 2 * allocated;
 531             if (result == initial_result)
 532               memory = (char *) malloc (allocated);
 533             else
 534               memory = (char *) realloc (result, allocated);
 535             if (memory == NULL)
 536               {
 537                 if (result != initial_result)
 538                   free (result);
 539                 errno = ENOMEM;
 540                 return -1;
 541               }
 542             if (result == initial_result)
 543               memcpy (memory, initial_result, length);
 544             result = memory;
 545           }
 546       }
 547   }
 548
 549   /* Now get the conversion state back to the initial state.
 550      But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 551 #if defined _LIBICONV_VERSION \
 552     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 553          || defined __sun)
 554   for (;;)
 555     {
 556       char *outptr = result + length;
 557       size_t outsize = allocated - extra_alloc - length;
 558       size_t res;
 559
 560       res = iconv (cd, NULL, NULL, &outptr, &outsize);
 561       length = outptr - result;
 562       if (res == (size_t)(-1))
 563         {
 564           if (errno == E2BIG)
 565             {
 566               char *memory;
 567
 568               allocated = 2 * allocated;
 569               if (result == initial_result)
 570                 memory = (char *) malloc (allocated);
 571               else
 572                 memory = (char *) realloc (result, allocated);
 573               if (memory == NULL)
 574                 {
 575                   if (result != initial_result)
 576                     free (result);
 577                   errno = ENOMEM;
 578                   return -1;
 579                 }
 580               if (result == initial_result)
 581                 memcpy (memory, initial_result, length);
 582               result = memory;
 583             }
 584           else
 585             {
 586               if (result != initial_result)
 587                 free (result);
 588               return -1;
 589             }
 590         }
 591       else
 592         break;
 593     }
 594 #endif
 595
 596   /* The direct conversion succeeded.  */
 597   goto done;
 598
 599  indirectly:
 600   /* The direct conversion failed.
 601      Use a conversion through UTF-8.  */
 602   if (offsets != NULL)
 603     {
 604       size_t i;
 605
 606       for (i = 0; i < srclen; i++)
 607         offsets[i] = (size_t)(-1);
 608
 609       last_length = (size_t)(-1);
 610     }
 611   length = 0;
 612   {
 613     const bool slowly = (offsets != NULL || handler == iconveh_error);
 614 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
 615     char utf8buf[utf8bufsize + 3];
 616     size_t utf8len = 0;
 617     const char *in1ptr = src;
 618     size_t in1size = srclen;
 619     bool do_final_flush1 = true;
 620     bool do_final_flush2 = true;
 621
 622     /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
 623 # if defined _LIBICONV_VERSION \
 624      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 625           || defined __sun)
 626     /* Set to the initial state.  */
 627     if (cd1 != (iconv_t)(-1))
 628       iconv (cd1, NULL, NULL, NULL, NULL);
 629     if (cd2 != (iconv_t)(-1))
 630       iconv (cd2, NULL, NULL, NULL, NULL);
 631 # endif
 632
 633     while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
 634       {
 635         char *out1ptr = utf8buf + utf8len;
 636         size_t out1size = utf8bufsize - utf8len;
 637         bool incremented1;
 638         size_t res1;
 639         int errno1;
 640
 641         /* Conversion step 1: from FROM_CODESET to UTF-8.  */
 642         if (in1size > 0)
 643           {
 644             if (offsets != NULL
 645                 && length != last_length) /* ensure that offset[] be increasing */
 646               {
 647                 offsets[in1ptr - src] = length;
 648                 last_length = length;
 649               }
 650             if (cd1 != (iconv_t)(-1))
 651               {
 652                 if (slowly)
 653                   res1 = iconv_carefully_1 (cd1,
 654                                             &in1ptr, &in1size,
 655                                             &out1ptr, &out1size,
 656                                             &incremented1);
 657                 else
 658                   res1 = iconv_carefully (cd1,
 659                                           &in1ptr, &in1size,
 660                                           &out1ptr, &out1size,
 661                                           &incremented1);
 662               }
 663             else
 664               {
 665                 /* FROM_CODESET is UTF-8.  */
 666                 res1 = utf8conv_carefully (slowly,
 667                                            &in1ptr, &in1size,
 668                                            &out1ptr, &out1size,
 669                                            &incremented1);
 670               }
 671           }
 672         else if (do_final_flush1)
 673           {
 674             /* Now get the conversion state of CD1 back to the initial state.
 675                But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 676 # if defined _LIBICONV_VERSION \
 677      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 678           || defined __sun)
 679             if (cd1 != (iconv_t)(-1))
 680               res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
 681             else
 682 # endif
 683               res1 = 0;
 684             do_final_flush1 = false;
 685             incremented1 = true;
 686           }
 687         else
 688           {
 689             res1 = 0;
 690             incremented1 = true;
 691           }
 692         if (res1 == (size_t)(-1)
 693             && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
 694           {
 695             if (result != initial_result)
 696               free (result);
 697             return -1;
 698           }
 699         if (res1 == (size_t)(-1)
 700             && errno == EILSEQ && handler != iconveh_error)
 701           {
 702             /* The input is invalid in FROM_CODESET.  Eat up one byte and
 703                emit a U+FFFD character or a question mark.  Room for this
 704                character was allocated at the end of utf8buf.  */
 705             if (!incremented1)
 706               {
 707                 if (in1size == 0)
 708                   abort ();
 709                 in1ptr++;
 710                 in1size--;
 711               }
 712             if (handler == iconveh_replacement_character)
 713               {
 714                 /* U+FFFD in UTF-8 encoding.  */
 715                 out1ptr[0] = '\357';
 716                 out1ptr[1] = '\277';
 717                 out1ptr[2] = '\275';
 718                 out1ptr += 3;
 719               }
 720             else
 721               *out1ptr++ = '?';
 722             res1 = 0;
 723           }
 724         errno1 = errno;
 725         utf8len = out1ptr - utf8buf;
 726
 727         if (offsets != NULL
 728             || in1size == 0
 729             || utf8len > utf8bufsize / 2
 730             || (res1 == (size_t)(-1) && errno1 == E2BIG))
 731           {
 732             /* Conversion step 2: from UTF-8 to TO_CODESET.  */
 733             const char *in2ptr = utf8buf;
 734             size_t in2size = utf8len;
 735
 736             while (in2size > 0
 737                    || (in1size == 0 && !do_final_flush1 && do_final_flush2))
 738               {
 739                 char *out2ptr = result + length;
 740                 size_t out2size = allocated - extra_alloc - length;
 741                 bool incremented2;
 742                 size_t res2;
 743                 bool grow;
 744
 745                 if (in2size > 0)
 746                   {
 747                     if (cd2 != (iconv_t)(-1))
 748                       res2 = iconv_carefully (cd2,
 749                                               &in2ptr, &in2size,
 750                                               &out2ptr, &out2size,
 751                                               &incremented2);
 752                     else
 753                       /* TO_CODESET is UTF-8.  */
 754                       res2 = utf8conv_carefully (false,
 755                                                  &in2ptr, &in2size,
 756                                                  &out2ptr, &out2size,
 757                                                  &incremented2);
 758                   }
 759                 else /* in1size == 0 && !do_final_flush1
 760                         && in2size == 0 && do_final_flush2 */
 761                   {
 762                     /* Now get the conversion state of CD1 back to the initial
 763                        state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 764 # if defined _LIBICONV_VERSION \
 765      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 766           || defined __sun)
 767                     if (cd2 != (iconv_t)(-1))
 768                       res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
 769                     else
 770 # endif
 771                       res2 = 0;
 772                     do_final_flush2 = false;
 773                     incremented2 = true;
 774                   }
 775
 776                 length = out2ptr - result;
 777                 grow = (length + extra_alloc > allocated / 2);
 778                 if (res2 == (size_t)(-1))
 779                   {
 780                     if (errno == E2BIG)
 781                       grow = true;
 782                     else if (errno == EINVAL)
 783                       break;
 784                     else if (errno == EILSEQ && handler != iconveh_error)
 785                       {
 786                         /* Error handling can produce up to 10 bytes of UTF-8
 787                            output.  But TO_CODESET may be UCS-2, UTF-16 or
 788                            UCS-4, so use CD2 here as well.  */
 789                         char scratchbuf[10];
 790                         size_t scratchlen;
 791                         ucs4_t uc;
 792                         const char *inptr;
 793                         size_t insize;
 794                         size_t res;
 795
 796                         if (incremented2)
 797                           {
 798                             if (u8_prev (&uc, (const uint8_t *) in2ptr,
 799                                          (const uint8_t *) utf8buf)
 800                                 == NULL)
 801                               abort ();
 802                           }
 803                         else
 804                           {
 805                             int n;
 806                             if (in2size == 0)
 807                               abort ();
 808                             n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
 809                                                   in2size);
 810                             in2ptr += n;
 811                             in2size -= n;
 812                           }
 813
 814                         if (handler == iconveh_escape_sequence)
 815                           {
 816                             static char const hex[16] = "0123456789ABCDEF";
 817                             scratchlen = 0;
 818                             scratchbuf[scratchlen++] = '\\';
 819                             if (uc < 0x10000)
 820                               scratchbuf[scratchlen++] = 'u';
 821                             else
 822                               {
 823                                 scratchbuf[scratchlen++] = 'U';
 824                                 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
 825                                 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
 826                                 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
 827                                 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
 828                               }
 829                             scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
 830                             scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
 831                             scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
 832                             scratchbuf[scratchlen++] = hex[uc & 15];
 833                           }
 834                         else if (handler == iconveh_replacement_character)
 835                           {
 836                             /* U+FFFD in UTF-8 encoding.  */
 837                             scratchbuf[0] = '\357';
 838                             scratchbuf[1] = '\277';
 839                             scratchbuf[2] = '\275';
 840                             scratchlen = 3;
 841                           }
 842                         else
 843                           {
 844                             scratchbuf[0] = '?';
 845                             scratchlen = 1;
 846                           }
 847
 848                         inptr = scratchbuf;
 849                         insize = scratchlen;
 850                         if (cd2 != (iconv_t)(-1))
 851                           {
 852                             char *out2ptr_try = out2ptr;
 853                             size_t out2size_try = out2size;
 854                             res = iconv (cd2,
 855                                          (ICONV_CONST char **) &inptr, &insize,
 856                                          &out2ptr_try, &out2size_try);
 857                             if (handler == iconveh_replacement_character
 858                                 && (res == (size_t)(-1)
 859                                     ? errno == EILSEQ
 860                                     /* FreeBSD iconv(), NetBSD iconv(), and
 861                                        Solaris 11 iconv() insert a '?' if they
 862                                        cannot convert.  This is what we want.
 863                                        But IRIX iconv() inserts a NUL byte if it
 864                                        cannot convert.
 865                                        And musl libc iconv() inserts a '*' if it
 866                                        cannot convert.  */
 867                                     : (res > 0
 868                                        && !(out2ptr_try - out2ptr == 1
 869                                             && *out2ptr == '?'))))
 870                               {
 871                                 /* The iconv() call failed.
 872                                    U+FFFD can't be converted to TO_CODESET.
 873                                    Use '?' instead.  */
 874                                 scratchbuf[0] = '?';
 875                                 scratchlen = 1;
 876                                 inptr = scratchbuf;
 877                                 insize = scratchlen;
 878                                 res = iconv (cd2,
 879                                              (ICONV_CONST char **) &inptr, &insize,
 880                                              &out2ptr, &out2size);
 881                               }
 882                             else
 883                               {
 884                                 /* Accept the results of the iconv() call.  */
 885                                 out2ptr = out2ptr_try;
 886                                 out2size = out2size_try;
 887                                 res = 0;
 888                               }
 889                           }
 890                         else
 891                           {
 892                             /* TO_CODESET is UTF-8.  */
 893                             if (out2size >= insize)
 894                               {
 895                                 memcpy (out2ptr, inptr, insize);
 896                                 out2ptr += insize;
 897                                 out2size -= insize;
 898                                 inptr += insize;
 899                                 insize = 0;
 900                                 res = 0;
 901                               }
 902                             else
 903                               {
 904                                 errno = E2BIG;
 905                                 res = (size_t)(-1);
 906                               }
 907                           }
 908                         length = out2ptr - result;
 909                         if (res == (size_t)(-1) && errno == E2BIG)
 910                           {
 911                             char *memory;
 912
 913                             allocated = 2 * allocated;
 914                             if (length + 1 + extra_alloc > allocated)
 915                               abort ();
 916                             if (result == initial_result)
 917                               memory = (char *) malloc (allocated);
 918                             else
 919                               memory = (char *) realloc (result, allocated);
 920                             if (memory == NULL)
 921                               {
 922                                 if (result != initial_result)
 923                                   free (result);
 924                                 errno = ENOMEM;
 925                                 return -1;
 926                               }
 927                             if (result == initial_result)
 928                               memcpy (memory, initial_result, length);
 929                             result = memory;
 930                             grow = false;
 931
 932                             out2ptr = result + length;
 933                             out2size = allocated - extra_alloc - length;
 934                             if (cd2 != (iconv_t)(-1))
 935                               res = iconv (cd2,
 936                                            (ICONV_CONST char **) &inptr,
 937                                            &insize,
 938                                            &out2ptr, &out2size);
 939                             else
 940                               {
 941                                 /* TO_CODESET is UTF-8.  */
 942                                 if (!(out2size >= insize))
 943                                   abort ();
 944                                 memcpy (out2ptr, inptr, insize);
 945                                 out2ptr += insize;
 946                                 out2size -= insize;
 947                                 inptr += insize;
 948                                 insize = 0;
 949                                 res = 0;
 950                               }
 951                             length = out2ptr - result;
 952                           }
 953 # if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
 954      && !(defined __GLIBC__ && !defined __UCLIBC__)
 955                         /* IRIX iconv() inserts a NUL byte if it cannot convert.
 956                            FreeBSD iconv(), NetBSD iconv(), and Solaris 11
 957                            iconv() insert a '?' if they cannot convert.
 958                            musl libc iconv() inserts a '*' if it cannot convert.
 959                            Only GNU libiconv (excluding the bastard Apple iconv)
 960                            and GNU libc are known to prefer to fail rather than
 961                            doing a lossy conversion.  */
 962                         if (res != (size_t)(-1) && res > 0)
 963                           {
 964                             errno = EILSEQ;
 965                             res = (size_t)(-1);
 966                           }
 967 # endif
 968                         if (res == (size_t)(-1))
 969                           {
 970                             /* Failure converting the ASCII replacement.  */
 971                             if (result != initial_result)
 972                               free (result);
 973                             return -1;
 974                           }
 975                       }
 976                     else
 977                       {
 978                         if (result != initial_result)
 979                           free (result);
 980                         return -1;
 981                       }
 982                   }
 983                 if (!(in2size > 0
 984                       || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
 985                   break;
 986                 if (grow)
 987                   {
 988                     char *memory;
 989
 990                     allocated = 2 * allocated;
 991                     if (result == initial_result)
 992                       memory = (char *) malloc (allocated);
 993                     else
 994                       memory = (char *) realloc (result, allocated);
 995                     if (memory == NULL)
 996                       {
 997                         if (result != initial_result)
 998                           free (result);
 999                         errno = ENOMEM;
1000                         return -1;
1001                       }
1002                     if (result == initial_result)
1003                       memcpy (memory, initial_result, length);
1004                     result = memory;
1005                   }
1006               }
1007
1008             /* Move the remaining bytes to the beginning of utf8buf.  */
1009             if (in2size > 0)
1010               memmove (utf8buf, in2ptr, in2size);
1011             utf8len = in2size;
1012           }
1013
1014         if (res1 == (size_t)(-1))
1015           {
1016             if (errno1 == EINVAL)
1017               in1size = 0;
1018             else if (errno1 == EILSEQ)
1019               {
1020                 if (result != initial_result)
1021                   free (result);
1022                 errno = errno1;
1023                 return -1;
1024               }
1025           }
1026       }
1027 # undef utf8bufsize
1028   }
1029
1030  done:
1031   /* Now the final memory allocation.  */
1032   if (result == tmpbuf)
1033     {
1034       size_t memsize = length + extra_alloc;
1035
1036       if (*resultp != NULL && *lengthp >= memsize)
1037         result = *resultp;
1038       else
1039         {
1040           char *memory;
1041
1042           memory = (char *) malloc (memsize > 0 ? memsize : 1);
1043           if (memory != NULL)
1044             result = memory;
1045           else
1046             {
1047               errno = ENOMEM;
1048               return -1;
1049             }
1050         }
1051       memcpy (result, tmpbuf, length);
1052     }
1053   else if (result != *resultp && length + extra_alloc < allocated)
1054     {
1055       /* Shrink the allocated memory if possible.  */
1056       size_t memsize = length + extra_alloc;
1057       char *memory;
1058
1059       memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
1060       if (memory != NULL)
1061         result = memory;
1062     }
1063   *resultp = result;
1064   *lengthp = length;
1065   return 0;
1066 # undef tmpbuf
1067 # undef tmpbufsize
1068 }
1069
1070 int
1071 mem_cd_iconveh (const char *src, size_t srclen,
1072                 const iconveh_t *cd,
1073                 enum iconv_ilseq_handler handler,
1074                 size_t *offsets,
1075                 char **resultp, size_t *lengthp)
1076 {
1077   return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1078                                   handler, 0, offsets, resultp, lengthp);
1079 }
1080
1081 char *
1082 str_cd_iconveh (const char *src,
1083                 const iconveh_t *cd,
1084                 enum iconv_ilseq_handler handler)
1085 {
1086   /* For most encodings, a trailing NUL byte in the input will be converted
1087      to a trailing NUL byte in the output.  But not for UTF-7.  So that this
1088      function is usable for UTF-7, we have to exclude the NUL byte from the
1089      conversion and add it by hand afterwards.  */
1090   char *result = NULL;
1091   size_t length = 0;
1092   int retval = mem_cd_iconveh_internal (src, strlen (src),
1093                                         cd->cd, cd->cd1, cd->cd2, handler, 1,
1094                                         NULL, &result, &length);
1095
1096   if (retval < 0)
1097     {
1098       free (result);
1099       return NULL;
1100     }
1101
1102   /* Add the terminating NUL byte.  */
1103   result[length] = '\0';
1104
1105   return result;
1106 }
1107
1108 #endif
1109
1110 int
1111 mem_iconveh (const char *src, size_t srclen,
1112              const char *from_codeset, const char *to_codeset,
1113              enum iconv_ilseq_handler handler,
1114              size_t *offsets,
1115              char **resultp, size_t *lengthp)
1116 {
1117   if (srclen == 0)
1118     {
1119       /* Nothing to convert.  */
1120       *lengthp = 0;
1121       return 0;
1122     }
1123   else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1124     {
1125       char *result;
1126
1127       if (*resultp != NULL && *lengthp >= srclen)
1128         result = *resultp;
1129       else
1130         {
1131           result = (char *) malloc (srclen);
1132           if (result == NULL)
1133             {
1134               errno = ENOMEM;
1135               return -1;
1136             }
1137         }
1138       memcpy (result, src, srclen);
1139       *resultp = result;
1140       *lengthp = srclen;
1141       return 0;
1142     }
1143   else
1144     {
1145 #if HAVE_ICONV
1146       iconveh_t cd;
1147       char *result;
1148       size_t length;
1149       int retval;
1150
1151       if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1152         return -1;
1153
1154       result = *resultp;
1155       length = *lengthp;
1156       retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1157                                &result, &length);
1158
1159       if (retval < 0)
1160         {
1161           /* Close cd, but preserve the errno from str_cd_iconv.  */
1162           int saved_errno = errno;
1163           iconveh_close (&cd);
1164           errno = saved_errno;
1165         }
1166       else
1167         {
1168           if (iconveh_close (&cd) < 0)
1169             {
1170               if (result != *resultp)
1171                 free (result);
1172               return -1;
1173             }
1174           *resultp = result;
1175           *lengthp = length;
1176         }
1177       return retval;
1178 #else
1179       /* This is a different error code than if iconv_open existed but didn't
1180          support from_codeset and to_codeset, so that the caller can emit
1181          an error message such as
1182            "iconv() is not supported. Installing GNU libiconv and
1183             then reinstalling this package would fix this."  */
1184       errno = ENOSYS;
1185       return -1;
1186 #endif
1187     }
1188 }
1189
1190 char *
1191 str_iconveh (const char *src,
1192              const char *from_codeset, const char *to_codeset,
1193              enum iconv_ilseq_handler handler)
1194 {
1195   if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1196     {
1197       char *result = strdup (src);
1198
1199       if (result == NULL)
1200         errno = ENOMEM;
1201       return result;
1202     }
1203   else
1204     {
1205 #if HAVE_ICONV
1206       iconveh_t cd;
1207       char *result;
1208
1209       if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1210         return NULL;
1211
1212       result = str_cd_iconveh (src, &cd, handler);
1213
1214       if (result == NULL)
1215         {
1216           /* Close cd, but preserve the errno from str_cd_iconv.  */
1217           int saved_errno = errno;
1218           iconveh_close (&cd);
1219           errno = saved_errno;
1220         }
1221       else
1222         {
1223           if (iconveh_close (&cd) < 0)
1224             {
1225               free (result);
1226               return NULL;
1227             }
1228         }
1229       return result;
1230 #else
1231       /* This is a different error code than if iconv_open existed but didn't
1232          support from_codeset and to_codeset, so that the caller can emit
1233          an error message such as
1234            "iconv() is not supported. Installing GNU libiconv and
1235             then reinstalling this package would fix this."  */
1236       errno = ENOSYS;
1237       return NULL;
1238 #endif
1239     }
1240 }