lib/striconveh.c

   1 /* Character set conversion with error handling.
   2    Copyright (C) 2001-2017 Free Software Foundation, Inc.
   3    Written by Bruno Haible and Simon Josefsson.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include "striconveh.h"
  22
  23 #include <errno.h>
  24 #include <stdbool.h>
  25 #include <stdlib.h>
  26 #include <string.h>
  27
  28 #if HAVE_ICONV
  29 # include <iconv.h>
  30 # include "unistr.h"
  31 #endif
  32
  33 #include "c-strcase.h"
  34 #include "c-strcaseeq.h"
  35
  36 #ifndef SIZE_MAX
  37 # define SIZE_MAX ((size_t) -1)
  38 #endif
  39
  40
  41 #if HAVE_ICONV
  42
  43 /* The caller must provide an iconveh_t, not just an iconv_t, because when a
  44    conversion error occurs, we may have to determine the Unicode representation
  45    of the inconvertible character.  */
  46
  47 int
  48 iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
  49 {
  50   iconv_t cd;
  51   iconv_t cd1;
  52   iconv_t cd2;
  53
  54   /* Avoid glibc-2.1 bug with EUC-KR.  */
  55 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  56      && !defined _LIBICONV_VERSION
  57   if (c_strcasecmp (from_codeset, "EUC-KR") == 0
  58       || c_strcasecmp (to_codeset, "EUC-KR") == 0)
  59     {
  60       errno = EINVAL;
  61       return -1;
  62     }
  63 # endif
  64
  65   cd = iconv_open (to_codeset, from_codeset);
  66
  67   if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
  68     cd1 = (iconv_t)(-1);
  69   else
  70     {
  71       cd1 = iconv_open ("UTF-8", from_codeset);
  72       if (cd1 == (iconv_t)(-1))
  73         {
  74           int saved_errno = errno;
  75           if (cd != (iconv_t)(-1))
  76             iconv_close (cdp->cd);
  77           errno = saved_errno;
  78           return -1;
  79         }
  80     }
  81
  82   if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
  83 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
  84       && !defined __UCLIBC__) \
  85      || _LIBICONV_VERSION >= 0x0105
  86       || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
  87 # endif
  88      )
  89     cd2 = (iconv_t)(-1);
  90   else
  91     {
  92       cd2 = iconv_open (to_codeset, "UTF-8");
  93       if (cd2 == (iconv_t)(-1))
  94         {
  95           int saved_errno = errno;
  96           if (cd1 != (iconv_t)(-1))
  97             iconv_close (cd1);
  98           if (cd != (iconv_t)(-1))
  99             iconv_close (cd);
 100           errno = saved_errno;
 101           return -1;
 102         }
 103     }
 104
 105   cdp->cd = cd;
 106   cdp->cd1 = cd1;
 107   cdp->cd2 = cd2;
 108   return 0;
 109 }
 110
 111 int
 112 iconveh_close (const iconveh_t *cd)
 113 {
 114   if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
 115     {
 116       /* Return -1, but preserve the errno from iconv_close.  */
 117       int saved_errno = errno;
 118       if (cd->cd1 != (iconv_t)(-1))
 119         iconv_close (cd->cd1);
 120       if (cd->cd != (iconv_t)(-1))
 121         iconv_close (cd->cd);
 122       errno = saved_errno;
 123       return -1;
 124     }
 125   if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
 126     {
 127       /* Return -1, but preserve the errno from iconv_close.  */
 128       int saved_errno = errno;
 129       if (cd->cd != (iconv_t)(-1))
 130         iconv_close (cd->cd);
 131       errno = saved_errno;
 132       return -1;
 133     }
 134   if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
 135     return -1;
 136   return 0;
 137 }
 138
 139 /* iconv_carefully is like iconv, except that it stops as soon as it encounters
 140    a conversion error, and it returns in *INCREMENTED a boolean telling whether
 141    it has incremented the input pointers past the error location.  */
 142 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
 143 /* Irix iconv() inserts a NUL byte if it cannot convert.
 144    NetBSD iconv() inserts a question mark if it cannot convert.
 145    Only GNU libiconv and GNU libc are known to prefer to fail rather
 146    than doing a lossy conversion.  */
 147 static size_t
 148 iconv_carefully (iconv_t cd,
 149                  const char **inbuf, size_t *inbytesleft,
 150                  char **outbuf, size_t *outbytesleft,
 151                  bool *incremented)
 152 {
 153   const char *inptr = *inbuf;
 154   const char *inptr_end = inptr + *inbytesleft;
 155   char *outptr = *outbuf;
 156   size_t outsize = *outbytesleft;
 157   const char *inptr_before;
 158   size_t res;
 159
 160   do
 161     {
 162       size_t insize;
 163
 164       inptr_before = inptr;
 165       res = (size_t)(-1);
 166
 167       for (insize = 1; inptr + insize <= inptr_end; insize++)
 168         {
 169           res = iconv (cd,
 170                        (ICONV_CONST char **) &inptr, &insize,
 171                        &outptr, &outsize);
 172           if (!(res == (size_t)(-1) && errno == EINVAL))
 173             break;
 174           /* iconv can eat up a shift sequence but give EINVAL while attempting
 175              to convert the first character.  E.g. libiconv does this.  */
 176           if (inptr > inptr_before)
 177             {
 178               res = 0;
 179               break;
 180             }
 181         }
 182
 183       if (res == 0)
 184         {
 185           *outbuf = outptr;
 186           *outbytesleft = outsize;
 187         }
 188     }
 189   while (res == 0 && inptr < inptr_end);
 190
 191   *inbuf = inptr;
 192   *inbytesleft = inptr_end - inptr;
 193   if (res != (size_t)(-1) && res > 0)
 194     {
 195       /* iconv() has already incremented INPTR.  We cannot go back to a
 196          previous INPTR, otherwise the state inside CD would become invalid,
 197          if FROM_CODESET is a stateful encoding.  So, tell the caller that
 198          *INBUF has already been incremented.  */
 199       *incremented = (inptr > inptr_before);
 200       errno = EILSEQ;
 201       return (size_t)(-1);
 202     }
 203   else
 204     {
 205       *incremented = false;
 206       return res;
 207     }
 208 }
 209 # else
 210 #  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
 211      (*(incremented) = false, \
 212       iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
 213 # endif
 214
 215 /* iconv_carefully_1 is like iconv_carefully, except that it stops after
 216    converting one character or one shift sequence.  */
 217 static size_t
 218 iconv_carefully_1 (iconv_t cd,
 219                    const char **inbuf, size_t *inbytesleft,
 220                    char **outbuf, size_t *outbytesleft,
 221                    bool *incremented)
 222 {
 223   const char *inptr_before = *inbuf;
 224   const char *inptr = inptr_before;
 225   const char *inptr_end = inptr_before + *inbytesleft;
 226   char *outptr = *outbuf;
 227   size_t outsize = *outbytesleft;
 228   size_t res = (size_t)(-1);
 229   size_t insize;
 230
 231   for (insize = 1; inptr_before + insize <= inptr_end; insize++)
 232     {
 233       inptr = inptr_before;
 234       res = iconv (cd,
 235                    (ICONV_CONST char **) &inptr, &insize,
 236                    &outptr, &outsize);
 237       if (!(res == (size_t)(-1) && errno == EINVAL))
 238         break;
 239       /* iconv can eat up a shift sequence but give EINVAL while attempting
 240          to convert the first character.  E.g. libiconv does this.  */
 241       if (inptr > inptr_before)
 242         {
 243           res = 0;
 244           break;
 245         }
 246     }
 247
 248   *inbuf = inptr;
 249   *inbytesleft = inptr_end - inptr;
 250 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
 251   /* Irix iconv() inserts a NUL byte if it cannot convert.
 252      NetBSD iconv() inserts a question mark if it cannot convert.
 253      Only GNU libiconv and GNU libc are known to prefer to fail rather
 254      than doing a lossy conversion.  */
 255   if (res != (size_t)(-1) && res > 0)
 256     {
 257       /* iconv() has already incremented INPTR.  We cannot go back to a
 258          previous INPTR, otherwise the state inside CD would become invalid,
 259          if FROM_CODESET is a stateful encoding.  So, tell the caller that
 260          *INBUF has already been incremented.  */
 261       *incremented = (inptr > inptr_before);
 262       errno = EILSEQ;
 263       return (size_t)(-1);
 264     }
 265 # endif
 266
 267   if (res != (size_t)(-1))
 268     {
 269       *outbuf = outptr;
 270       *outbytesleft = outsize;
 271     }
 272   *incremented = false;
 273   return res;
 274 }
 275
 276 /* utf8conv_carefully is like iconv, except that
 277      - it converts from UTF-8 to UTF-8,
 278      - it stops as soon as it encounters a conversion error, and it returns
 279        in *INCREMENTED a boolean telling whether it has incremented the input
 280        pointers past the error location,
 281      - if one_character_only is true, it stops after converting one
 282        character.  */
 283 static size_t
 284 utf8conv_carefully (bool one_character_only,
 285                     const char **inbuf, size_t *inbytesleft,
 286                     char **outbuf, size_t *outbytesleft,
 287                     bool *incremented)
 288 {
 289   const char *inptr = *inbuf;
 290   size_t insize = *inbytesleft;
 291   char *outptr = *outbuf;
 292   size_t outsize = *outbytesleft;
 293   size_t res;
 294
 295   res = 0;
 296   do
 297     {
 298       ucs4_t uc;
 299       int n;
 300       int m;
 301
 302       n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
 303       if (n < 0)
 304         {
 305           errno = (n == -2 ? EINVAL : EILSEQ);
 306           n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
 307           inptr += n;
 308           insize -= n;
 309           res = (size_t)(-1);
 310           *incremented = true;
 311           break;
 312         }
 313       if (outsize == 0)
 314         {
 315           errno = E2BIG;
 316           res = (size_t)(-1);
 317           *incremented = false;
 318           break;
 319         }
 320       m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
 321       if (m == -2)
 322         {
 323           errno = E2BIG;
 324           res = (size_t)(-1);
 325           *incremented = false;
 326           break;
 327         }
 328       inptr += n;
 329       insize -= n;
 330       if (m == -1)
 331         {
 332           errno = EILSEQ;
 333           res = (size_t)(-1);
 334           *incremented = true;
 335           break;
 336         }
 337       outptr += m;
 338       outsize -= m;
 339     }
 340   while (!one_character_only && insize > 0);
 341
 342   *inbuf = inptr;
 343   *inbytesleft = insize;
 344   *outbuf = outptr;
 345   *outbytesleft = outsize;
 346   return res;
 347 }
 348
 349 static int
 350 mem_cd_iconveh_internal (const char *src, size_t srclen,
 351                          iconv_t cd, iconv_t cd1, iconv_t cd2,
 352                          enum iconv_ilseq_handler handler,
 353                          size_t extra_alloc,
 354                          size_t *offsets,
 355                          char **resultp, size_t *lengthp)
 356 {
 357   /* When a conversion error occurs, we cannot start using CD1 and CD2 at
 358      this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
 359      Instead, we have to start afresh from the beginning of SRC.  */
 360   /* Use a temporary buffer, so that for small strings, a single malloc()
 361      call will be sufficient.  */
 362 # define tmpbufsize 4096
 363   /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
 364      libiconv's UCS-4-INTERNAL encoding.  */
 365   union { unsigned int align; char buf[tmpbufsize]; } tmp;
 366 # define tmpbuf tmp.buf
 367
 368   char *initial_result;
 369   char *result;
 370   size_t allocated;
 371   size_t length;
 372   size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
 373
 374   if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
 375     {
 376       initial_result = *resultp;
 377       allocated = *lengthp;
 378     }
 379   else
 380     {
 381       initial_result = tmpbuf;
 382       allocated = sizeof (tmpbuf);
 383     }
 384   result = initial_result;
 385
 386   /* Test whether a direct conversion is possible at all.  */
 387   if (cd == (iconv_t)(-1))
 388     goto indirectly;
 389
 390   if (offsets != NULL)
 391     {
 392       size_t i;
 393
 394       for (i = 0; i < srclen; i++)
 395         offsets[i] = (size_t)(-1);
 396
 397       last_length = (size_t)(-1);
 398     }
 399   length = 0;
 400
 401   /* First, try a direct conversion, and see whether a conversion error
 402      occurs at all.  */
 403   {
 404     const char *inptr = src;
 405     size_t insize = srclen;
 406
 407     /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
 408 # if defined _LIBICONV_VERSION \
 409      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 410           || defined __sun)
 411     /* Set to the initial state.  */
 412     iconv (cd, NULL, NULL, NULL, NULL);
 413 # endif
 414
 415     while (insize > 0)
 416       {
 417         char *outptr = result + length;
 418         size_t outsize = allocated - extra_alloc - length;
 419         bool incremented;
 420         size_t res;
 421         bool grow;
 422
 423         if (offsets != NULL)
 424           {
 425             if (length != last_length) /* ensure that offset[] be increasing */
 426               {
 427                 offsets[inptr - src] = length;
 428                 last_length = length;
 429               }
 430             res = iconv_carefully_1 (cd,
 431                                      &inptr, &insize,
 432                                      &outptr, &outsize,
 433                                      &incremented);
 434           }
 435         else
 436           /* Use iconv_carefully instead of iconv here, because:
 437              - If TO_CODESET is UTF-8, we can do the error handling in this
 438                loop, no need for a second loop,
 439              - With iconv() implementations other than GNU libiconv and GNU
 440                libc, if we use iconv() in a big swoop, checking for an E2BIG
 441                return, we lose the number of irreversible conversions.  */
 442           res = iconv_carefully (cd,
 443                                  &inptr, &insize,
 444                                  &outptr, &outsize,
 445                                  &incremented);
 446
 447         length = outptr - result;
 448         grow = (length + extra_alloc > allocated / 2);
 449         if (res == (size_t)(-1))
 450           {
 451             if (errno == E2BIG)
 452               grow = true;
 453             else if (errno == EINVAL)
 454               break;
 455             else if (errno == EILSEQ && handler != iconveh_error)
 456               {
 457                 if (cd2 == (iconv_t)(-1))
 458                   {
 459                     /* TO_CODESET is UTF-8.  */
 460                     /* Error handling can produce up to 1 byte of output.  */
 461                     if (length + 1 + extra_alloc > allocated)
 462                       {
 463                         char *memory;
 464
 465                         allocated = 2 * allocated;
 466                         if (length + 1 + extra_alloc > allocated)
 467                           abort ();
 468                         if (result == initial_result)
 469                           memory = (char *) malloc (allocated);
 470                         else
 471                           memory = (char *) realloc (result, allocated);
 472                         if (memory == NULL)
 473                           {
 474                             if (result != initial_result)
 475                               free (result);
 476                             errno = ENOMEM;
 477                             return -1;
 478                           }
 479                         if (result == initial_result)
 480                           memcpy (memory, initial_result, length);
 481                         result = memory;
 482                         grow = false;
 483                       }
 484                     /* The input is invalid in FROM_CODESET.  Eat up one byte
 485                        and emit a question mark.  */
 486                     if (!incremented)
 487                       {
 488                         if (insize == 0)
 489                           abort ();
 490                         inptr++;
 491                         insize--;
 492                       }
 493                     result[length] = '?';
 494                     length++;
 495                   }
 496                 else
 497                   goto indirectly;
 498               }
 499             else
 500               {
 501                 if (result != initial_result)
 502                   {
 503                     int saved_errno = errno;
 504                     free (result);
 505                     errno = saved_errno;
 506                   }
 507                 return -1;
 508               }
 509           }
 510         if (insize == 0)
 511           break;
 512         if (grow)
 513           {
 514             char *memory;
 515
 516             allocated = 2 * allocated;
 517             if (result == initial_result)
 518               memory = (char *) malloc (allocated);
 519             else
 520               memory = (char *) realloc (result, allocated);
 521             if (memory == NULL)
 522               {
 523                 if (result != initial_result)
 524                   free (result);
 525                 errno = ENOMEM;
 526                 return -1;
 527               }
 528             if (result == initial_result)
 529               memcpy (memory, initial_result, length);
 530             result = memory;
 531           }
 532       }
 533   }
 534
 535   /* Now get the conversion state back to the initial state.
 536      But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 537 #if defined _LIBICONV_VERSION \
 538     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 539          || defined __sun)
 540   for (;;)
 541     {
 542       char *outptr = result + length;
 543       size_t outsize = allocated - extra_alloc - length;
 544       size_t res;
 545
 546       res = iconv (cd, NULL, NULL, &outptr, &outsize);
 547       length = outptr - result;
 548       if (res == (size_t)(-1))
 549         {
 550           if (errno == E2BIG)
 551             {
 552               char *memory;
 553
 554               allocated = 2 * allocated;
 555               if (result == initial_result)
 556                 memory = (char *) malloc (allocated);
 557               else
 558                 memory = (char *) realloc (result, allocated);
 559               if (memory == NULL)
 560                 {
 561                   if (result != initial_result)
 562                     free (result);
 563                   errno = ENOMEM;
 564                   return -1;
 565                 }
 566               if (result == initial_result)
 567                 memcpy (memory, initial_result, length);
 568               result = memory;
 569             }
 570           else
 571             {
 572               if (result != initial_result)
 573                 {
 574                   int saved_errno = errno;
 575                   free (result);
 576                   errno = saved_errno;
 577                 }
 578               return -1;
 579             }
 580         }
 581       else
 582         break;
 583     }
 584 #endif
 585
 586   /* The direct conversion succeeded.  */
 587   goto done;
 588
 589  indirectly:
 590   /* The direct conversion failed.
 591      Use a conversion through UTF-8.  */
 592   if (offsets != NULL)
 593     {
 594       size_t i;
 595
 596       for (i = 0; i < srclen; i++)
 597         offsets[i] = (size_t)(-1);
 598
 599       last_length = (size_t)(-1);
 600     }
 601   length = 0;
 602   {
 603     const bool slowly = (offsets != NULL || handler == iconveh_error);
 604 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
 605     char utf8buf[utf8bufsize + 1];
 606     size_t utf8len = 0;
 607     const char *in1ptr = src;
 608     size_t in1size = srclen;
 609     bool do_final_flush1 = true;
 610     bool do_final_flush2 = true;
 611
 612     /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
 613 # if defined _LIBICONV_VERSION \
 614      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 615           || defined __sun)
 616     /* Set to the initial state.  */
 617     if (cd1 != (iconv_t)(-1))
 618       iconv (cd1, NULL, NULL, NULL, NULL);
 619     if (cd2 != (iconv_t)(-1))
 620       iconv (cd2, NULL, NULL, NULL, NULL);
 621 # endif
 622
 623     while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
 624       {
 625         char *out1ptr = utf8buf + utf8len;
 626         size_t out1size = utf8bufsize - utf8len;
 627         bool incremented1;
 628         size_t res1;
 629         int errno1;
 630
 631         /* Conversion step 1: from FROM_CODESET to UTF-8.  */
 632         if (in1size > 0)
 633           {
 634             if (offsets != NULL
 635                 && length != last_length) /* ensure that offset[] be increasing */
 636               {
 637                 offsets[in1ptr - src] = length;
 638                 last_length = length;
 639               }
 640             if (cd1 != (iconv_t)(-1))
 641               {
 642                 if (slowly)
 643                   res1 = iconv_carefully_1 (cd1,
 644                                             &in1ptr, &in1size,
 645                                             &out1ptr, &out1size,
 646                                             &incremented1);
 647                 else
 648                   res1 = iconv_carefully (cd1,
 649                                           &in1ptr, &in1size,
 650                                           &out1ptr, &out1size,
 651                                           &incremented1);
 652               }
 653             else
 654               {
 655                 /* FROM_CODESET is UTF-8.  */
 656                 res1 = utf8conv_carefully (slowly,
 657                                            &in1ptr, &in1size,
 658                                            &out1ptr, &out1size,
 659                                            &incremented1);
 660               }
 661           }
 662         else if (do_final_flush1)
 663           {
 664             /* Now get the conversion state of CD1 back to the initial state.
 665                But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 666 # if defined _LIBICONV_VERSION \
 667      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 668           || defined __sun)
 669             if (cd1 != (iconv_t)(-1))
 670               res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
 671             else
 672 # endif
 673               res1 = 0;
 674             do_final_flush1 = false;
 675             incremented1 = true;
 676           }
 677         else
 678           {
 679             res1 = 0;
 680             incremented1 = true;
 681           }
 682         if (res1 == (size_t)(-1)
 683             && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
 684           {
 685             if (result != initial_result)
 686               {
 687                 int saved_errno = errno;
 688                 free (result);
 689                 errno = saved_errno;
 690               }
 691             return -1;
 692           }
 693         if (res1 == (size_t)(-1)
 694             && errno == EILSEQ && handler != iconveh_error)
 695           {
 696             /* The input is invalid in FROM_CODESET.  Eat up one byte and
 697                emit a question mark.  Room for the question mark was allocated
 698                at the end of utf8buf.  */
 699             if (!incremented1)
 700               {
 701                 if (in1size == 0)
 702                   abort ();
 703                 in1ptr++;
 704                 in1size--;
 705               }
 706             *out1ptr++ = '?';
 707             res1 = 0;
 708           }
 709         errno1 = errno;
 710         utf8len = out1ptr - utf8buf;
 711
 712         if (offsets != NULL
 713             || in1size == 0
 714             || utf8len > utf8bufsize / 2
 715             || (res1 == (size_t)(-1) && errno1 == E2BIG))
 716           {
 717             /* Conversion step 2: from UTF-8 to TO_CODESET.  */
 718             const char *in2ptr = utf8buf;
 719             size_t in2size = utf8len;
 720
 721             while (in2size > 0
 722                    || (in1size == 0 && !do_final_flush1 && do_final_flush2))
 723               {
 724                 char *out2ptr = result + length;
 725                 size_t out2size = allocated - extra_alloc - length;
 726                 bool incremented2;
 727                 size_t res2;
 728                 bool grow;
 729
 730                 if (in2size > 0)
 731                   {
 732                     if (cd2 != (iconv_t)(-1))
 733                       res2 = iconv_carefully (cd2,
 734                                               &in2ptr, &in2size,
 735                                               &out2ptr, &out2size,
 736                                               &incremented2);
 737                     else
 738                       /* TO_CODESET is UTF-8.  */
 739                       res2 = utf8conv_carefully (false,
 740                                                  &in2ptr, &in2size,
 741                                                  &out2ptr, &out2size,
 742                                                  &incremented2);
 743                   }
 744                 else /* in1size == 0 && !do_final_flush1
 745                         && in2size == 0 && do_final_flush2 */
 746                   {
 747                     /* Now get the conversion state of CD1 back to the initial
 748                        state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 749 # if defined _LIBICONV_VERSION \
 750      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 751           || defined __sun)
 752                     if (cd2 != (iconv_t)(-1))
 753                       res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
 754                     else
 755 # endif
 756                       res2 = 0;
 757                     do_final_flush2 = false;
 758                     incremented2 = true;
 759                   }
 760
 761                 length = out2ptr - result;
 762                 grow = (length + extra_alloc > allocated / 2);
 763                 if (res2 == (size_t)(-1))
 764                   {
 765                     if (errno == E2BIG)
 766                       grow = true;
 767                     else if (errno == EINVAL)
 768                       break;
 769                     else if (errno == EILSEQ && handler != iconveh_error)
 770                       {
 771                         /* Error handling can produce up to 10 bytes of ASCII
 772                            output.  But TO_CODESET may be UCS-2, UTF-16 or
 773                            UCS-4, so use CD2 here as well.  */
 774                         char scratchbuf[10];
 775                         size_t scratchlen;
 776                         ucs4_t uc;
 777                         const char *inptr;
 778                         size_t insize;
 779                         size_t res;
 780
 781                         if (incremented2)
 782                           {
 783                             if (u8_prev (&uc, (const uint8_t *) in2ptr,
 784                                          (const uint8_t *) utf8buf)
 785                                 == NULL)
 786                               abort ();
 787                           }
 788                         else
 789                           {
 790                             int n;
 791                             if (in2size == 0)
 792                               abort ();
 793                             n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
 794                                                   in2size);
 795                             in2ptr += n;
 796                             in2size -= n;
 797                           }
 798
 799                         if (handler == iconveh_escape_sequence)
 800                           {
 801                             static char hex[16] = "0123456789ABCDEF";
 802                             scratchlen = 0;
 803                             scratchbuf[scratchlen++] = '\\';
 804                             if (uc < 0x10000)
 805                               scratchbuf[scratchlen++] = 'u';
 806                             else
 807                               {
 808                                 scratchbuf[scratchlen++] = 'U';
 809                                 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
 810                                 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
 811                                 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
 812                                 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
 813                               }
 814                             scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
 815                             scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
 816                             scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
 817                             scratchbuf[scratchlen++] = hex[uc & 15];
 818                           }
 819                         else
 820                           {
 821                             scratchbuf[0] = '?';
 822                             scratchlen = 1;
 823                           }
 824
 825                         inptr = scratchbuf;
 826                         insize = scratchlen;
 827                         if (cd2 != (iconv_t)(-1))
 828                           res = iconv (cd2,
 829                                        (ICONV_CONST char **) &inptr, &insize,
 830                                        &out2ptr, &out2size);
 831                         else
 832                           {
 833                             /* TO_CODESET is UTF-8.  */
 834                             if (out2size >= insize)
 835                               {
 836                                 memcpy (out2ptr, inptr, insize);
 837                                 out2ptr += insize;
 838                                 out2size -= insize;
 839                                 inptr += insize;
 840                                 insize = 0;
 841                                 res = 0;
 842                               }
 843                             else
 844                               {
 845                                 errno = E2BIG;
 846                                 res = (size_t)(-1);
 847                               }
 848                           }
 849                         length = out2ptr - result;
 850                         if (res == (size_t)(-1) && errno == E2BIG)
 851                           {
 852                             char *memory;
 853
 854                             allocated = 2 * allocated;
 855                             if (length + 1 + extra_alloc > allocated)
 856                               abort ();
 857                             if (result == initial_result)
 858                               memory = (char *) malloc (allocated);
 859                             else
 860                               memory = (char *) realloc (result, allocated);
 861                             if (memory == NULL)
 862                               {
 863                                 if (result != initial_result)
 864                                   free (result);
 865                                 errno = ENOMEM;
 866                                 return -1;
 867                               }
 868                             if (result == initial_result)
 869                               memcpy (memory, initial_result, length);
 870                             result = memory;
 871                             grow = false;
 872
 873                             out2ptr = result + length;
 874                             out2size = allocated - extra_alloc - length;
 875                             if (cd2 != (iconv_t)(-1))
 876                               res = iconv (cd2,
 877                                            (ICONV_CONST char **) &inptr,
 878                                            &insize,
 879                                            &out2ptr, &out2size);
 880                             else
 881                               {
 882                                 /* TO_CODESET is UTF-8.  */
 883                                 if (!(out2size >= insize))
 884                                   abort ();
 885                                 memcpy (out2ptr, inptr, insize);
 886                                 out2ptr += insize;
 887                                 out2size -= insize;
 888                                 inptr += insize;
 889                                 insize = 0;
 890                                 res = 0;
 891                               }
 892                             length = out2ptr - result;
 893                           }
 894 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
 895                         /* Irix iconv() inserts a NUL byte if it cannot convert.
 896                            NetBSD iconv() inserts a question mark if it cannot
 897                            convert.
 898                            Only GNU libiconv and GNU libc are known to prefer
 899                            to fail rather than doing a lossy conversion.  */
 900                         if (res != (size_t)(-1) && res > 0)
 901                           {
 902                             errno = EILSEQ;
 903                             res = (size_t)(-1);
 904                           }
 905 # endif
 906                         if (res == (size_t)(-1))
 907                           {
 908                             /* Failure converting the ASCII replacement.  */
 909                             if (result != initial_result)
 910                               {
 911                                 int saved_errno = errno;
 912                                 free (result);
 913                                 errno = saved_errno;
 914                               }
 915                             return -1;
 916                           }
 917                       }
 918                     else
 919                       {
 920                         if (result != initial_result)
 921                           {
 922                             int saved_errno = errno;
 923                             free (result);
 924                             errno = saved_errno;
 925                           }
 926                         return -1;
 927                       }
 928                   }
 929                 if (!(in2size > 0
 930                       || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
 931                   break;
 932                 if (grow)
 933                   {
 934                     char *memory;
 935
 936                     allocated = 2 * allocated;
 937                     if (result == initial_result)
 938                       memory = (char *) malloc (allocated);
 939                     else
 940                       memory = (char *) realloc (result, allocated);
 941                     if (memory == NULL)
 942                       {
 943                         if (result != initial_result)
 944                           free (result);
 945                         errno = ENOMEM;
 946                         return -1;
 947                       }
 948                     if (result == initial_result)
 949                       memcpy (memory, initial_result, length);
 950                     result = memory;
 951                   }
 952               }
 953
 954             /* Move the remaining bytes to the beginning of utf8buf.  */
 955             if (in2size > 0)
 956               memmove (utf8buf, in2ptr, in2size);
 957             utf8len = in2size;
 958           }
 959
 960         if (res1 == (size_t)(-1))
 961           {
 962             if (errno1 == EINVAL)
 963               in1size = 0;
 964             else if (errno1 == EILSEQ)
 965               {
 966                 if (result != initial_result)
 967                   free (result);
 968                 errno = errno1;
 969                 return -1;
 970               }
 971           }
 972       }
 973 # undef utf8bufsize
 974   }
 975
 976  done:
 977   /* Now the final memory allocation.  */
 978   if (result == tmpbuf)
 979     {
 980       size_t memsize = length + extra_alloc;
 981
 982       if (*resultp != NULL && *lengthp >= memsize)
 983         result = *resultp;
 984       else
 985         {
 986           char *memory;
 987
 988           memory = (char *) malloc (memsize > 0 ? memsize : 1);
 989           if (memory != NULL)
 990             result = memory;
 991           else
 992             {
 993               errno = ENOMEM;
 994               return -1;
 995             }
 996         }
 997       memcpy (result, tmpbuf, length);
 998     }
 999   else if (result != *resultp && length + extra_alloc < allocated)
1000     {
1001       /* Shrink the allocated memory if possible.  */
1002       size_t memsize = length + extra_alloc;
1003       char *memory;
1004
1005       memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
1006       if (memory != NULL)
1007         result = memory;
1008     }
1009   *resultp = result;
1010   *lengthp = length;
1011   return 0;
1012 # undef tmpbuf
1013 # undef tmpbufsize
1014 }
1015
1016 int
1017 mem_cd_iconveh (const char *src, size_t srclen,
1018                 const iconveh_t *cd,
1019                 enum iconv_ilseq_handler handler,
1020                 size_t *offsets,
1021                 char **resultp, size_t *lengthp)
1022 {
1023   return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1024                                   handler, 0, offsets, resultp, lengthp);
1025 }
1026
1027 char *
1028 str_cd_iconveh (const char *src,
1029                 const iconveh_t *cd,
1030                 enum iconv_ilseq_handler handler)
1031 {
1032   /* For most encodings, a trailing NUL byte in the input will be converted
1033      to a trailing NUL byte in the output.  But not for UTF-7.  So that this
1034      function is usable for UTF-7, we have to exclude the NUL byte from the
1035      conversion and add it by hand afterwards.  */
1036   char *result = NULL;
1037   size_t length = 0;
1038   int retval = mem_cd_iconveh_internal (src, strlen (src),
1039                                         cd->cd, cd->cd1, cd->cd2, handler, 1,
1040                                         NULL, &result, &length);
1041
1042   if (retval < 0)
1043     {
1044       if (result != NULL)
1045         {
1046           int saved_errno = errno;
1047           free (result);
1048           errno = saved_errno;
1049         }
1050       return NULL;
1051     }
1052
1053   /* Add the terminating NUL byte.  */
1054   result[length] = '\0';
1055
1056   return result;
1057 }
1058
1059 #endif
1060
1061 int
1062 mem_iconveh (const char *src, size_t srclen,
1063              const char *from_codeset, const char *to_codeset,
1064              enum iconv_ilseq_handler handler,
1065              size_t *offsets,
1066              char **resultp, size_t *lengthp)
1067 {
1068   if (srclen == 0)
1069     {
1070       /* Nothing to convert.  */
1071       *lengthp = 0;
1072       return 0;
1073     }
1074   else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1075     {
1076       char *result;
1077
1078       if (*resultp != NULL && *lengthp >= srclen)
1079         result = *resultp;
1080       else
1081         {
1082           result = (char *) malloc (srclen);
1083           if (result == NULL)
1084             {
1085               errno = ENOMEM;
1086               return -1;
1087             }
1088         }
1089       memcpy (result, src, srclen);
1090       *resultp = result;
1091       *lengthp = srclen;
1092       return 0;
1093     }
1094   else
1095     {
1096 #if HAVE_ICONV
1097       iconveh_t cd;
1098       char *result;
1099       size_t length;
1100       int retval;
1101
1102       if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1103         return -1;
1104
1105       result = *resultp;
1106       length = *lengthp;
1107       retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1108                                &result, &length);
1109
1110       if (retval < 0)
1111         {
1112           /* Close cd, but preserve the errno from str_cd_iconv.  */
1113           int saved_errno = errno;
1114           iconveh_close (&cd);
1115           errno = saved_errno;
1116         }
1117       else
1118         {
1119           if (iconveh_close (&cd) < 0)
1120             {
1121               /* Return -1, but free the allocated memory, and while doing
1122                  that, preserve the errno from iconveh_close.  */
1123               int saved_errno = errno;
1124               if (result != *resultp && result != NULL)
1125                 free (result);
1126               errno = saved_errno;
1127               return -1;
1128             }
1129           *resultp = result;
1130           *lengthp = length;
1131         }
1132       return retval;
1133 #else
1134       /* This is a different error code than if iconv_open existed but didn't
1135          support from_codeset and to_codeset, so that the caller can emit
1136          an error message such as
1137            "iconv() is not supported. Installing GNU libiconv and
1138             then reinstalling this package would fix this."  */
1139       errno = ENOSYS;
1140       return -1;
1141 #endif
1142     }
1143 }
1144
1145 char *
1146 str_iconveh (const char *src,
1147              const char *from_codeset, const char *to_codeset,
1148              enum iconv_ilseq_handler handler)
1149 {
1150   if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1151     {
1152       char *result = strdup (src);
1153
1154       if (result == NULL)
1155         errno = ENOMEM;
1156       return result;
1157     }
1158   else
1159     {
1160 #if HAVE_ICONV
1161       iconveh_t cd;
1162       char *result;
1163
1164       if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1165         return NULL;
1166
1167       result = str_cd_iconveh (src, &cd, handler);
1168
1169       if (result == NULL)
1170         {
1171           /* Close cd, but preserve the errno from str_cd_iconv.  */
1172           int saved_errno = errno;
1173           iconveh_close (&cd);
1174           errno = saved_errno;
1175         }
1176       else
1177         {
1178           if (iconveh_close (&cd) < 0)
1179             {
1180               /* Return NULL, but free the allocated memory, and while doing
1181                  that, preserve the errno from iconveh_close.  */
1182               int saved_errno = errno;
1183               free (result);
1184               errno = saved_errno;
1185               return NULL;
1186             }
1187         }
1188       return result;
1189 #else
1190       /* This is a different error code than if iconv_open existed but didn't
1191          support from_codeset and to_codeset, so that the caller can emit
1192          an error message such as
1193            "iconv() is not supported. Installing GNU libiconv and
1194             then reinstalling this package would fix this."  */
1195       errno = ENOSYS;
1196       return NULL;
1197 #endif
1198     }
1199 }