libcpp/charset.c

   1 /* CPP Library - charsets
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
   3    Free Software Foundation, Inc.
   4
   5    Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
   6
   7 This program is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 This program is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with this program; if not, write to the Free Software
  19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "cpplib.h"
  24 #include "internal.h"
  25 #include "ucnid.h"
  26
  27 /* Character set handling for C-family languages.
  28
  29    Terminological note: In what follows, "charset" or "character set"
  30    will be taken to mean both an abstract set of characters and an
  31    encoding for that set.
  32
  33    The C99 standard discusses two character sets: source and execution.
  34    The source character set is used for internal processing in translation
  35    phases 1 through 4; the execution character set is used thereafter.
  36    Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
  37    character encodings (see 3.7.2, 3.7.3 for the standardese meanings
  38    of these terms).  Furthermore, the "basic character set" (listed in
  39    5.2.1p3) is to be encoded in each with values one byte wide, and is
  40    to appear in the initial shift state.
  41
  42    It is not explicitly mentioned, but there is also a "wide execution
  43    character set" used to encode wide character constants and wide
  44    string literals; this is supposed to be the result of applying the
  45    standard library function mbstowcs() to an equivalent narrow string
  46    (6.4.5p5).  However, the behavior of hexadecimal and octal
  47    \-escapes is at odds with this; they are supposed to be translated
  48    directly to wchar_t values (6.4.4.4p5,6).
  49
  50    The source character set is not necessarily the character set used
  51    to encode physical source files on disk; translation phase 1 converts
  52    from whatever that encoding is to the source character set.
  53
  54    The presence of universal character names in C99 (6.4.3 et seq.)
  55    forces the source character set to be isomorphic to ISO 10646,
  56    that is, Unicode.  There is no such constraint on the execution
  57    character set; note also that the conversion from source to
  58    execution character set does not occur for identifiers (5.1.1.2p1#5).
  59
  60    For convenience of implementation, the source character set's
  61    encoding of the basic character set should be identical to the
  62    execution character set OF THE HOST SYSTEM's encoding of the basic
  63    character set, and it should not be a state-dependent encoding.
  64
  65    cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
  66    depending on whether the host is based on ASCII or EBCDIC (see
  67    respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
  68    Technical Report #16).  With limited exceptions, it relies on the
  69    system library's iconv() primitive to do charset conversion
  70    (specified in SUSv2).  */
  71
  72 #if !HAVE_ICONV
  73 /* Make certain that the uses of iconv(), iconv_open(), iconv_close()
  74    below, which are guarded only by if statements with compile-time
  75    constant conditions, do not cause link errors.  */
  76 #define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
  77 #define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
  78 #define iconv_close(x)   (void)0
  79 #define ICONV_CONST
  80 #endif
  81
  82 #if HOST_CHARSET == HOST_CHARSET_ASCII
  83 #define SOURCE_CHARSET "UTF-8"
  84 #define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
  85 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
  86 #define SOURCE_CHARSET "UTF-EBCDIC"
  87 #define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
  88 #else
  89 #error "Unrecognized basic host character set"
  90 #endif
  91
  92 #ifndef EILSEQ
  93 #define EILSEQ EINVAL
  94 #endif
  95
  96 /* This structure is used for a resizable string buffer throughout.  */
  97 /* Don't call it strbuf, as that conflicts with unistd.h on systems
  98    such as DYNIX/ptx where unistd.h includes stropts.h.  */
  99 struct _cpp_strbuf
 100 {
 101   uchar *text;
 102   size_t asize;
 103   size_t len;
 104 };
 105
 106 /* This is enough to hold any string that fits on a single 80-column
 107    line, even if iconv quadruples its size (e.g. conversion from
 108    ASCII to UTF-32) rounded up to a power of two.  */
 109 #define OUTBUF_BLOCK_SIZE 256
 110
 111 /* Conversions between UTF-8 and UTF-16/32 are implemented by custom
 112    logic.  This is because a depressing number of systems lack iconv,
  113    or have iconv libraries that do not do these conversions, so
 114    we need a fallback implementation for them.  To ensure the fallback
 115    doesn't break due to neglect, it is used on all systems.
 116
 117    UTF-32 encoding is nice and simple: a four-byte binary number,
 118    constrained to the range 00000000-7FFFFFFF to avoid questions of
 119    signedness.  We do have to cope with big- and little-endian
 120    variants.
 121
 122    UTF-16 encoding uses two-byte binary numbers, again in big- and
 123    little-endian variants, for all values in the 00000000-0000FFFF
 124    range.  Values in the 00010000-0010FFFF range are encoded as pairs
 125    of two-byte numbers, called "surrogate pairs": given a number S in
 126    this range, it is mapped to a pair (H, L) as follows:
 127
 128      H = (S - 0x10000) / 0x400 + 0xD800
 129      L = (S - 0x10000) % 0x400 + 0xDC00
 130
 131    Two-byte values in the D800...DFFF range are ill-formed except as a
 132    component of a surrogate pair.  Even if the encoding within a
 133    two-byte value is little-endian, the H member of the surrogate pair
 134    comes first.
 135
 136    There is no way to encode values in the 00110000-7FFFFFFF range,
 137    which is not currently a problem as there are no assigned code
 138    points in that range; however, the author expects that it will
 139    eventually become necessary to abandon UTF-16 due to this
 140    limitation.  Note also that, because of these pairs, UTF-16 does
 141    not meet the requirements of the C standard for a wide character
 142    encoding (see 3.7.3 and 6.4.4.4p11).
 143
 144    UTF-8 encoding looks like this:
 145
 146    value range         encoded as
 147    00000000-0000007F   0xxxxxxx
 148    00000080-000007FF   110xxxxx 10xxxxxx
 149    00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
 150    00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 151    00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 152    04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 153
 154    Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
 155    which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
 156    never occur.  Note also that any value that can be encoded by a
 157    given row of the table can also be encoded by all successive rows,
 158    but this is not done; only the shortest possible encoding for any
 159    given value is valid.  For instance, the character 07C0 could be
 160    encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
 161    FC 80 80 80 9F 80.  Only the first is valid.
 162
 163    An implementation note: the transformation from UTF-16 to UTF-8, or
 164    vice versa, is easiest done by using UTF-32 as an intermediary.  */
 165
 166 /* Internal primitives which go from an UTF-8 byte stream to native-endian
 167    UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
 168    operation in several places below.  */
 169 static inline int
 170 one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
 171                      cppchar_t *cp)
 172 {
 173   static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
 174   static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 175
 176   cppchar_t c;
 177   const uchar *inbuf = *inbufp;
 178   size_t nbytes, i;
 179
 180   if (*inbytesleftp < 1)
 181     return EINVAL;
 182
 183   c = *inbuf;
 184   if (c < 0x80)
 185     {
 186       *cp = c;
 187       *inbytesleftp -= 1;
 188       *inbufp += 1;
 189       return 0;
 190     }
 191
 192   /* The number of leading 1-bits in the first byte indicates how many
 193      bytes follow.  */
 194   for (nbytes = 2; nbytes < 7; nbytes++)
 195     if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
 196       goto found;
 197   return EILSEQ;
 198  found:
 199
 200   if (*inbytesleftp < nbytes)
 201     return EINVAL;
 202
 203   c = (c & masks[nbytes-1]);
 204   inbuf++;
 205   for (i = 1; i < nbytes; i++)
 206     {
 207       cppchar_t n = *inbuf++;
 208       if ((n & 0xC0) != 0x80)
 209         return EILSEQ;
 210       c = ((c << 6) + (n & 0x3F));
 211     }
 212
 213   /* Make sure the shortest possible encoding was used.  */
 214   if (c <=      0x7F && nbytes > 1) return EILSEQ;
 215   if (c <=     0x7FF && nbytes > 2) return EILSEQ;
 216   if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
 217   if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
 218   if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
 219
 220   /* Make sure the character is valid.  */
 221   if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
 222
 223   *cp = c;
 224   *inbufp = inbuf;
 225   *inbytesleftp -= nbytes;
 226   return 0;
 227 }
 228
 229 static inline int
 230 one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
 231 {
 232   static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 233   static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
 234   size_t nbytes;
 235   uchar buf[6], *p = &buf[6];
 236   uchar *outbuf = *outbufp;
 237
 238   nbytes = 1;
 239   if (c < 0x80)
 240     *--p = c;
 241   else
 242     {
 243       do
 244         {
 245           *--p = ((c & 0x3F) | 0x80);
 246           c >>= 6;
 247           nbytes++;
 248         }
 249       while (c >= 0x3F || (c & limits[nbytes-1]));
 250       *--p = (c | masks[nbytes-1]);
 251     }
 252
 253   if (*outbytesleftp < nbytes)
 254     return E2BIG;
 255
 256   while (p < &buf[6])
 257     *outbuf++ = *p++;
 258   *outbytesleftp -= nbytes;
 259   *outbufp = outbuf;
 260   return 0;
 261 }
 262
 263 /* The following four functions transform one character between the two
 264    encodings named in the function name.  All have the signature
 265    int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
 266            uchar **outbufp, size_t *outbytesleftp)
 267
 268    BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
 269    interpreted as a boolean indicating whether big-endian or
 270    little-endian encoding is to be used for the member of the pair
 271    that is not UTF-8.
 272
 273    INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
 274    do for iconv.
 275
 276    The return value is either 0 for success, or an errno value for
 277    failure, which may be E2BIG (need more space), EILSEQ (ill-formed
  278    input sequence), or EINVAL (incomplete input sequence).  */
 279
 280 static inline int
 281 one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
 282                    uchar **outbufp, size_t *outbytesleftp)
 283 {
 284   uchar *outbuf;
 285   cppchar_t s = 0;
 286   int rval;
 287
 288   /* Check for space first, since we know exactly how much we need.  */
 289   if (*outbytesleftp < 4)
 290     return E2BIG;
 291
 292   rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
 293   if (rval)
 294     return rval;
 295
 296   outbuf = *outbufp;
 297   outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
 298   outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
 299   outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
 300   outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
 301
 302   *outbufp += 4;
 303   *outbytesleftp -= 4;
 304   return 0;
 305 }
 306
 307 static inline int
 308 one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
 309                    uchar **outbufp, size_t *outbytesleftp)
 310 {
 311   cppchar_t s;
 312   int rval;
 313   const uchar *inbuf;
 314
 315   if (*inbytesleftp < 4)
 316     return EINVAL;
 317
 318   inbuf = *inbufp;
 319
 320   s  = inbuf[bigend ? 0 : 3] << 24;
 321   s += inbuf[bigend ? 1 : 2] << 16;
 322   s += inbuf[bigend ? 2 : 1] << 8;
 323   s += inbuf[bigend ? 3 : 0];
 324
 325   if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
 326     return EILSEQ;
 327
 328   rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
 329   if (rval)
 330     return rval;
 331
 332   *inbufp += 4;
 333   *inbytesleftp -= 4;
 334   return 0;
 335 }
 336
 337 static inline int
 338 one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
 339                    uchar **outbufp, size_t *outbytesleftp)
 340 {
 341   int rval;
 342   cppchar_t s = 0;
 343   const uchar *save_inbuf = *inbufp;
 344   size_t save_inbytesleft = *inbytesleftp;
 345   uchar *outbuf = *outbufp;
 346
 347   rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
 348   if (rval)
 349     return rval;
 350
 351   if (s > 0x0010FFFF)
 352     {
 353       *inbufp = save_inbuf;
 354       *inbytesleftp = save_inbytesleft;
 355       return EILSEQ;
 356     }
 357
 358   if (s < 0xFFFF)
 359     {
 360       if (*outbytesleftp < 2)
 361         {
 362           *inbufp = save_inbuf;
 363           *inbytesleftp = save_inbytesleft;
 364           return E2BIG;
 365         }
 366       outbuf[bigend ? 1 : 0] = (s & 0x00FF);
 367       outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
 368
 369       *outbufp += 2;
 370       *outbytesleftp -= 2;
 371       return 0;
 372     }
 373   else
 374     {
 375       cppchar_t hi, lo;
 376
 377       if (*outbytesleftp < 4)
 378         {
 379           *inbufp = save_inbuf;
 380           *inbytesleftp = save_inbytesleft;
 381           return E2BIG;
 382         }
 383
 384       hi = (s - 0x10000) / 0x400 + 0xD800;
 385       lo = (s - 0x10000) % 0x400 + 0xDC00;
 386
 387       /* Even if we are little-endian, put the high surrogate first.
 388          ??? Matches practice?  */
 389       outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
 390       outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
 391       outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
 392       outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
 393
 394       *outbufp += 4;
 395       *outbytesleftp -= 4;
 396       return 0;
 397     }
 398 }
 399
 400 static inline int
 401 one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
 402                    uchar **outbufp, size_t *outbytesleftp)
 403 {
 404   cppchar_t s;
 405   const uchar *inbuf = *inbufp;
 406   int rval;
 407
 408   if (*inbytesleftp < 2)
 409     return EINVAL;
 410   s  = inbuf[bigend ? 0 : 1] << 8;
 411   s += inbuf[bigend ? 1 : 0];
 412
 413   /* Low surrogate without immediately preceding high surrogate is invalid.  */
 414   if (s >= 0xDC00 && s <= 0xDFFF)
 415     return EILSEQ;
 416   /* High surrogate must have a following low surrogate.  */
 417   else if (s >= 0xD800 && s <= 0xDBFF)
 418     {
 419       cppchar_t hi = s, lo;
 420       if (*inbytesleftp < 4)
 421         return EINVAL;
 422
 423       lo  = inbuf[bigend ? 2 : 3] << 8;
 424       lo += inbuf[bigend ? 3 : 2];
 425
 426       if (lo < 0xDC00 || lo > 0xDFFF)
 427         return EILSEQ;
 428
 429       s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
 430     }
 431
 432   rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
 433   if (rval)
 434     return rval;
 435
 436   /* Success - update the input pointers (one_cppchar_to_utf8 has done
 437      the output pointers for us).  */
 438   if (s <= 0xFFFF)
 439     {
 440       *inbufp += 2;
 441       *inbytesleftp -= 2;
 442     }
 443   else
 444     {
 445       *inbufp += 4;
 446       *inbytesleftp -= 4;
 447     }
 448   return 0;
 449 }
 450
 451 /* Helper routine for the next few functions.  The 'const' on
 452    one_conversion means that we promise not to modify what function is
 453    pointed to, which lets the inliner see through it.  */
 454
 455 static inline bool
 456 conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
 457                                              uchar **, size_t *),
 458                  iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
 459 {
 460   const uchar *inbuf;
 461   uchar *outbuf;
 462   size_t inbytesleft, outbytesleft;
 463   int rval;
 464
 465   inbuf = from;
 466   inbytesleft = flen;
 467   outbuf = to->text + to->len;
 468   outbytesleft = to->asize - to->len;
 469
 470   for (;;)
 471     {
 472       do
 473         rval = one_conversion (cd, &inbuf, &inbytesleft,
 474                                &outbuf, &outbytesleft);
 475       while (inbytesleft && !rval);
 476
 477       if (__builtin_expect (inbytesleft == 0, 1))
 478         {
 479           to->len = to->asize - outbytesleft;
 480           return true;
 481         }
 482       if (rval != E2BIG)
 483         {
 484           errno = rval;
 485           return false;
 486         }
 487
 488       outbytesleft += OUTBUF_BLOCK_SIZE;
 489       to->asize += OUTBUF_BLOCK_SIZE;
 490       to->text = xrealloc (to->text, to->asize);
 491       outbuf = to->text + to->asize - outbytesleft;
 492     }
 493 }
 494
 495
 496 /* These functions convert entire strings between character sets.
 497    They all have the signature
 498
 499    bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
 500
 501    The input string FROM is converted as specified by the function
 502    name plus the iconv descriptor CD (which may be fake), and the
 503    result appended to TO.  On any error, false is returned, otherwise true.  */
 504
 505 /* These four use the custom conversion code above.  */
 506 static bool
 507 convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
 508                     struct _cpp_strbuf *to)
 509 {
 510   return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
 511 }
 512
 513 static bool
 514 convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
 515                     struct _cpp_strbuf *to)
 516 {
 517   return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
 518 }
 519
 520 static bool
 521 convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
 522                     struct _cpp_strbuf *to)
 523 {
 524   return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
 525 }
 526
 527 static bool
 528 convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
 529                     struct _cpp_strbuf *to)
 530 {
 531   return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
 532 }
 533
 534 /* Identity conversion, used when we have no alternative.  */
 535 static bool
 536 convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
 537                        const uchar *from, size_t flen, struct _cpp_strbuf *to)
 538 {
 539   if (to->len + flen > to->asize)
 540     {
 541       to->asize = to->len + flen;
 542       to->text = xrealloc (to->text, to->asize);
 543     }
 544   memcpy (to->text + to->len, from, flen);
 545   to->len += flen;
 546   return true;
 547 }
 548
 549 /* And this one uses the system iconv primitive.  It's a little
 550    different, since iconv's interface is a little different.  */
 551 #if HAVE_ICONV
 552 static bool
 553 convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
 554                      struct _cpp_strbuf *to)
 555 {
 556   ICONV_CONST char *inbuf;
 557   char *outbuf;
 558   size_t inbytesleft, outbytesleft;
 559
 560   /* Reset conversion descriptor and check that it is valid.  */
 561   if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
 562     return false;
 563
 564   inbuf = (ICONV_CONST char *)from;
 565   inbytesleft = flen;
 566   outbuf = (char *)to->text + to->len;
 567   outbytesleft = to->asize - to->len;
 568
 569   for (;;)
 570     {
 571       iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
 572       if (__builtin_expect (inbytesleft == 0, 1))
 573         {
 574           to->len = to->asize - outbytesleft;
 575           return true;
 576         }
 577       if (errno != E2BIG)
 578         return false;
 579
 580       outbytesleft += OUTBUF_BLOCK_SIZE;
 581       to->asize += OUTBUF_BLOCK_SIZE;
 582       to->text = xrealloc (to->text, to->asize);
 583       outbuf = (char *)to->text + to->asize - outbytesleft;
 584     }
 585 }
 586 #else
 587 #define convert_using_iconv 0 /* prevent undefined symbol error below */
 588 #endif
 589
 590 /* Arrange for the above custom conversion logic to be used automatically
 591    when conversion between a suitable pair of character sets is requested.  */
 592
 593 #define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
 594    CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
 595
 596 struct conversion
 597 {
 598   const char *pair;
 599   convert_f func;
 600   iconv_t fake_cd;
 601 };
 602 static const struct conversion conversion_tab[] = {
 603   { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
 604   { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
 605   { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
 606   { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
 607   { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
 608   { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
 609   { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
 610   { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
 611 };
 612
 613 /* Subroutine of cpp_init_iconv: initialize and return a
 614    cset_converter structure for conversion from FROM to TO.  If
 615    iconv_open() fails, issue an error and return an identity
 616    converter.  Silently return an identity converter if FROM and TO
 617    are identical.  */
 618 static struct cset_converter
 619 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
 620 {
 621   struct cset_converter ret;
 622   char *pair;
 623   size_t i;
 624
 625   if (!strcasecmp (to, from))
 626     {
 627       ret.func = convert_no_conversion;
 628       ret.cd = (iconv_t) -1;
 629       return ret;
 630     }
 631
 632   pair = alloca(strlen(to) + strlen(from) + 2);
 633
 634   strcpy(pair, from);
 635   strcat(pair, "/");
 636   strcat(pair, to);
 637   for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
 638     if (!strcasecmp (pair, conversion_tab[i].pair))
 639       {
 640         ret.func = conversion_tab[i].func;
 641         ret.cd = conversion_tab[i].fake_cd;
 642         return ret;
 643       }
 644
 645   /* No custom converter - try iconv.  */
 646   if (HAVE_ICONV)
 647     {
 648       ret.func = convert_using_iconv;
 649       ret.cd = iconv_open (to, from);
 650
 651       if (ret.cd == (iconv_t) -1)
 652         {
 653           if (errno == EINVAL)
 654             cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
 655                        "conversion from %s to %s not supported by iconv",
 656                        from, to);
 657           else
 658             cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
 659
 660           ret.func = convert_no_conversion;
 661         }
 662     }
 663   else
 664     {
 665       cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
 666                  "no iconv implementation, cannot convert from %s to %s",
 667                  from, to);
 668       ret.func = convert_no_conversion;
 669       ret.cd = (iconv_t) -1;
 670     }
 671   return ret;
 672 }
 673
 674 /* If charset conversion is requested, initialize iconv(3) descriptors
 675    for conversion from the source character set to the execution
 676    character sets.  If iconv is not present in the C library, and
 677    conversion is requested, issue an error.  */
 678
 679 void
 680 cpp_init_iconv (cpp_reader *pfile)
 681 {
 682   const char *ncset = CPP_OPTION (pfile, narrow_charset);
 683   const char *wcset = CPP_OPTION (pfile, wide_charset);
 684   const char *default_wcset;
 685
 686   bool be = CPP_OPTION (pfile, bytes_big_endian);
 687
 688   if (CPP_OPTION (pfile, wchar_precision) >= 32)
 689     default_wcset = be ? "UTF-32BE" : "UTF-32LE";
 690   else if (CPP_OPTION (pfile, wchar_precision) >= 16)
 691     default_wcset = be ? "UTF-16BE" : "UTF-16LE";
 692   else
 693     /* This effectively means that wide strings are not supported,
 694        so don't do any conversion at all.  */
 695    default_wcset = SOURCE_CHARSET;
 696
 697   if (!ncset)
 698     ncset = SOURCE_CHARSET;
 699   if (!wcset)
 700     wcset = default_wcset;
 701
 702   pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
 703   pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
 704 }
 705
 706 /* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary.  */
 707 void
 708 _cpp_destroy_iconv (cpp_reader *pfile)
 709 {
 710   if (HAVE_ICONV)
 711     {
 712       if (pfile->narrow_cset_desc.func == convert_using_iconv)
 713         iconv_close (pfile->narrow_cset_desc.cd);
 714       if (pfile->wide_cset_desc.func == convert_using_iconv)
 715         iconv_close (pfile->wide_cset_desc.cd);
 716     }
 717 }
 718
 719 /* Utility routine for use by a full compiler.  C is a character taken
 720    from the *basic* source character set, encoded in the host's
 721    execution encoding.  Convert it to (the target's) execution
 722    encoding, and return that value.
 723
 724    Issues an internal error if C's representation in the narrow
 725    execution character set fails to be a single-byte value (C99
 726    5.2.1p3: "The representation of each member of the source and
 727    execution character sets shall fit in a byte.")  May also issue an
 728    internal error if C fails to be a member of the basic source
 729    character set (testing this exactly is too hard, especially when
 730    the host character set is EBCDIC).  */
 731 cppchar_t
 732 cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
 733 {
 734   uchar sbuf[1];
 735   struct _cpp_strbuf tbuf;
 736
 737   /* This test is merely an approximation, but it suffices to catch
 738      the most important thing, which is that we don't get handed a
 739      character outside the unibyte range of the host character set.  */
 740   if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
 741     {
 742       cpp_error (pfile, CPP_DL_ICE,
 743                  "character 0x%lx is not in the basic source character set\n",
 744                  (unsigned long)c);
 745       return 0;
 746     }
 747
 748   /* Being a character in the unibyte range of the host character set,
 749      we can safely splat it into a one-byte buffer and trust that that
 750      is a well-formed string.  */
 751   sbuf[0] = c;
 752
 753   /* This should never need to reallocate, but just in case... */
 754   tbuf.asize = 1;
 755   tbuf.text = xmalloc (tbuf.asize);
 756   tbuf.len = 0;
 757
 758   if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
 759     {
 760       cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
 761       return 0;
 762     }
 763   if (tbuf.len != 1)
 764     {
 765       cpp_error (pfile, CPP_DL_ICE,
 766                  "character 0x%lx is not unibyte in execution character set",
 767                  (unsigned long)c);
 768       return 0;
 769     }
 770   c = tbuf.text[0];
 771   free(tbuf.text);
 772   return c;
 773 }
 774
 775 \f
 776
 777 /* Utility routine that computes a mask of the form 0000...111... with
 778    WIDTH 1-bits.  */
 779 static inline size_t
 780 width_to_mask (size_t width)
 781 {
 782   width = MIN (width, BITS_PER_CPPCHAR_T);
 783   if (width >= CHAR_BIT * sizeof (size_t))
 784     return ~(size_t) 0;
 785   else
 786     return ((size_t) 1 << width) - 1;
 787 }
 788
 789 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
 790    the start of an identifier, and 0 if C is not valid in an
 791    identifier.  We assume C has already gone through the checks of
 792    _cpp_valid_ucn.  The algorithm is a simple binary search on the
 793    table defined in cppucnid.h.  */
 794
 795 static int
 796 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
 797 {
 798   int mn, mx, md;
 799
 800   mn = -1;
 801   mx = ARRAY_SIZE (ucnranges);
 802   while (mx - mn > 1)
 803     {
 804       md = (mn + mx) / 2;
 805       if (c < ucnranges[md].lo)
 806         mx = md;
 807       else if (c > ucnranges[md].hi)
 808         mn = md;
 809       else
 810         goto found;
 811     }
 812   return 0;
 813
 814  found:
 815   /* When -pedantic, we require the character to have been listed by
 816      the standard for the current language.  Otherwise, we accept the
 817      union of the acceptable sets for C++98 and C99.  */
 818   if (CPP_PEDANTIC (pfile)
 819       && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
 820           || (CPP_OPTION (pfile, cplusplus)
 821               && !(ucnranges[md].flags & CXX))))
 822     return 0;
 823
 824   /* In C99, UCN digits may not begin identifiers.  */
 825   if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
 826     return 2;
 827
 828   return 1;
 829 }
 830
 831 /* [lex.charset]: The character designated by the universal character
 832    name \UNNNNNNNN is that character whose character short name in
 833    ISO/IEC 10646 is NNNNNNNN; the character designated by the
 834    universal character name \uNNNN is that character whose character
 835    short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
 836    for a universal character name is less than 0x20 or in the range
 837    0x7F-0x9F (inclusive), or if the universal character name
 838    designates a character in the basic source character set, then the
 839    program is ill-formed.
 840
 841    *PSTR must be preceded by "\u" or "\U"; it is assumed that the
 842    buffer end is delimited by a non-hex digit.  Returns zero if UCNs
 843    are not part of the relevant standard, or if the string beginning
 844    at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
 845
 846    Otherwise the nonzero value of the UCN, whether valid or invalid,
 847    is returned.  Diagnostics are emitted for invalid values.  PSTR
 848    is updated to point one beyond the UCN, or to the syntactically
 849    invalid character.
 850
 851    IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
 852    an identifier, or 2 otherwise.  */
 853
 854 cppchar_t
 855 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
 856                 const uchar *limit, int identifier_pos)
 857 {
 858   cppchar_t result, c;
 859   unsigned int length;
 860   const uchar *str = *pstr;
 861   const uchar *base = str - 2;
 862
 863   if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
 864     cpp_error (pfile, CPP_DL_WARNING,
 865                "universal character names are only valid in C++ and C99");
 866   else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
 867     cpp_error (pfile, CPP_DL_WARNING,
 868                "the meaning of '\\%c' is different in traditional C",
 869                (int) str[-1]);
 870
 871   if (str[-1] == 'u')
 872     length = 4;
 873   else if (str[-1] == 'U')
 874     length = 8;
 875   else
 876     abort();
 877
 878   result = 0;
 879   do
 880     {
 881       c = *str;
 882       if (!ISXDIGIT (c))
 883         break;
 884       str++;
 885       result = (result << 4) + hex_value (c);
 886     }
 887   while (--length && str < limit);
 888
 889   *pstr = str;
 890   if (length)
 891     {
 892       /* We'll error when we try it out as the start of an identifier.  */
 893       cpp_error (pfile, CPP_DL_ERROR,
 894                  "incomplete universal character name %.*s",
 895                  (int) (str - base), base);
 896       result = 1;
 897     }
 898   /* The standard permits $, @ and ` to be specified as UCNs.  We use
 899      hex escapes so that this also works with EBCDIC hosts.  */
 900   else if ((result < 0xa0
 901             && (result != 0x24 && result != 0x40 && result != 0x60))
 902            || (result & 0x80000000)
 903            || (result >= 0xD800 && result <= 0xDFFF))
 904     {
 905       cpp_error (pfile, CPP_DL_ERROR,
 906                  "%.*s is not a valid universal character",
 907                  (int) (str - base), base);
 908       result = 1;
 909     }
 910   else if (identifier_pos)
 911     {
 912       int validity = ucn_valid_in_identifier (pfile, result);
 913
 914       if (validity == 0)
 915         cpp_error (pfile, CPP_DL_ERROR,
 916                    "universal character %.*s is not valid in an identifier",
 917                    (int) (str - base), base);
 918       else if (validity == 2 && identifier_pos == 1)
 919         cpp_error (pfile, CPP_DL_ERROR,
 920    "universal character %.*s is not valid at the start of an identifier",
 921                    (int) (str - base), base);
 922     }
 923
 924   if (result == 0)
 925     result = 1;
 926
 927   return result;
 928 }
 929
 930 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
 931    it to the execution character set and write the result into TBUF.
 932    An advanced pointer is returned.  Issues all relevant diagnostics.  */
 933 static const uchar *
 934 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
 935              struct _cpp_strbuf *tbuf, bool wide)
 936 {
 937   cppchar_t ucn;
 938   uchar buf[6];
 939   uchar *bufp = buf;
 940   size_t bytesleft = 6;
 941   int rval;
 942   struct cset_converter cvt
 943     = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
 944
 945   from++;  /* Skip u/U.  */
 946   ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
 947
 948   rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
 949   if (rval)
 950     {
 951       errno = rval;
 952       cpp_errno (pfile, CPP_DL_ERROR,
 953                  "converting UCN to source character set");
 954     }
 955   else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
 956     cpp_errno (pfile, CPP_DL_ERROR,
 957                "converting UCN to execution character set");
 958
 959   return from;
 960 }
 961
 962 /* Subroutine of convert_hex and convert_oct.  N is the representation
 963    in the execution character set of a numeric escape; write it into the
 964    string buffer TBUF and update the end-of-string pointer therein.  WIDE
 965    is true if it's a wide string that's being assembled in TBUF.  This
 966    function issues no diagnostics and never fails.  */
 967 static void
 968 emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
 969                      struct _cpp_strbuf *tbuf, bool wide)
 970 {
 971   if (wide)
 972     {
 973       /* We have to render this into the target byte order, which may not
 974          be our byte order.  */
 975       bool bigend = CPP_OPTION (pfile, bytes_big_endian);
 976       size_t width = CPP_OPTION (pfile, wchar_precision);
 977       size_t cwidth = CPP_OPTION (pfile, char_precision);
 978       size_t cmask = width_to_mask (cwidth);
 979       size_t nbwc = width / cwidth;
 980       size_t i;
 981       size_t off = tbuf->len;
 982       cppchar_t c;
 983
 984       if (tbuf->len + nbwc > tbuf->asize)
 985         {
 986           tbuf->asize += OUTBUF_BLOCK_SIZE;
 987           tbuf->text = xrealloc (tbuf->text, tbuf->asize);
 988         }
 989
 990       for (i = 0; i < nbwc; i++)
 991         {
 992           c = n & cmask;
 993           n >>= cwidth;
 994           tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
 995         }
 996       tbuf->len += nbwc;
 997     }
 998   else
 999     {
1000       /* Note: this code does not handle the case where the target
1001          and host have a different number of bits in a byte.  */
1002       if (tbuf->len + 1 > tbuf->asize)
1003         {
1004           tbuf->asize += OUTBUF_BLOCK_SIZE;
1005           tbuf->text = xrealloc (tbuf->text, tbuf->asize);
1006         }
1007       tbuf->text[tbuf->len++] = n;
1008     }
1009 }
1010
1011 /* Convert a hexadecimal escape, pointed to by FROM, to the execution
1012    character set and write it into the string buffer TBUF.  Returns an
1013    advanced pointer, and issues diagnostics as necessary.
1014    No character set translation occurs; this routine always produces the
1015    execution-set character with numeric value equal to the given hex
1016    number.  You can, e.g. generate surrogate pairs this way.  */
1017 static const uchar *
1018 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
1019              struct _cpp_strbuf *tbuf, bool wide)
1020 {
1021   cppchar_t c, n = 0, overflow = 0;
1022   int digits_found = 0;
1023   size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1024                   : CPP_OPTION (pfile, char_precision));
1025   size_t mask = width_to_mask (width);
1026
1027   if (CPP_WTRADITIONAL (pfile))
1028     cpp_error (pfile, CPP_DL_WARNING,
1029                "the meaning of '\\x' is different in traditional C");
1030
1031   from++;  /* Skip 'x'.  */
1032   while (from < limit)
1033     {
1034       c = *from;
1035       if (! hex_p (c))
1036         break;
1037       from++;
1038       overflow |= n ^ (n << 4 >> 4);
1039       n = (n << 4) + hex_value (c);
1040       digits_found = 1;
1041     }
1042
1043   if (!digits_found)
1044     {
1045       cpp_error (pfile, CPP_DL_ERROR,
1046                  "\\x used with no following hex digits");
1047       return from;
1048     }
1049
1050   if (overflow | (n != (n & mask)))
1051     {
1052       cpp_error (pfile, CPP_DL_PEDWARN,
1053                  "hex escape sequence out of range");
1054       n &= mask;
1055     }
1056
1057   emit_numeric_escape (pfile, n, tbuf, wide);
1058
1059   return from;
1060 }
1061
1062 /* Convert an octal escape, pointed to by FROM, to the execution
1063    character set and write it into the string buffer TBUF.  Returns an
1064    advanced pointer, and issues diagnostics as necessary.
1065    No character set translation occurs; this routine always produces the
1066    execution-set character with numeric value equal to the given octal
1067    number.  */
1068 static const uchar *
1069 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
1070              struct _cpp_strbuf *tbuf, bool wide)
1071 {
1072   size_t count = 0;
1073   cppchar_t c, n = 0;
1074   size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1075                   : CPP_OPTION (pfile, char_precision));
1076   size_t mask = width_to_mask (width);
1077   bool overflow = false;
1078
1079   while (from < limit && count++ < 3)
1080     {
1081       c = *from;
1082       if (c < '0' || c > '7')
1083         break;
1084       from++;
1085       overflow |= n ^ (n << 3 >> 3);
1086       n = (n << 3) + c - '0';
1087     }
1088
1089   if (n != (n & mask))
1090     {
1091       cpp_error (pfile, CPP_DL_PEDWARN,
1092                  "octal escape sequence out of range");
1093       n &= mask;
1094     }
1095
1096   emit_numeric_escape (pfile, n, tbuf, wide);
1097
1098   return from;
1099 }
1100
1101 /* Convert an escape sequence (pointed to by FROM) to its value on
1102    the target, and to the execution character set.  Do not scan past
1103    LIMIT.  Write the converted value into TBUF.  Returns an advanced
1104    pointer.  Handles all relevant diagnostics.  */
1105 static const uchar *
1106 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
1107                 struct _cpp_strbuf *tbuf, bool wide)
1108 {
1109   /* Values of \a \b \e \f \n \r \t \v respectively.  */
1110 #if HOST_CHARSET == HOST_CHARSET_ASCII
1111   static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
1112 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1113   static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
1114 #else
1115 #error "unknown host character set"
1116 #endif
1117
1118   uchar c;
1119   struct cset_converter cvt
1120     = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1121
1122   c = *from;
1123   switch (c)
1124     {
1125       /* UCNs, hex escapes, and octal escapes are processed separately.  */
1126     case 'u': case 'U':
1127       return convert_ucn (pfile, from, limit, tbuf, wide);
1128
1129     case 'x':
1130       return convert_hex (pfile, from, limit, tbuf, wide);
1131       break;
1132
1133     case '0':  case '1':  case '2':  case '3':
1134     case '4':  case '5':  case '6':  case '7':
1135       return convert_oct (pfile, from, limit, tbuf, wide);
1136
1137       /* Various letter escapes.  Get the appropriate host-charset
1138          value into C.  */
1139     case '\\': case '\'': case '"': case '?': break;
1140
1141     case '(': case '{': case '[': case '%':
1142       /* '\(', etc, can be used at the beginning of a line in a long
1143          string split onto multiple lines with \-newline, to prevent
1144          Emacs or other text editors from getting confused.  '\%' can
1145          be used to prevent SCCS from mangling printf format strings.  */
1146       if (CPP_PEDANTIC (pfile))
1147         goto unknown;
1148       break;
1149
1150     case 'b': c = charconsts[1];  break;
1151     case 'f': c = charconsts[3];  break;
1152     case 'n': c = charconsts[4];  break;
1153     case 'r': c = charconsts[5];  break;
1154     case 't': c = charconsts[6];  break;
1155     case 'v': c = charconsts[7];  break;
1156
1157     case 'a':
1158       if (CPP_WTRADITIONAL (pfile))
1159         cpp_error (pfile, CPP_DL_WARNING,
1160                    "the meaning of '\\a' is different in traditional C");
1161       c = charconsts[0];
1162       break;
1163
1164     case 'e': case 'E':
1165       if (CPP_PEDANTIC (pfile))
1166         cpp_error (pfile, CPP_DL_PEDWARN,
1167                    "non-ISO-standard escape sequence, '\\%c'", (int) c);
1168       c = charconsts[2];
1169       break;
1170
1171     default:
1172     unknown:
1173       if (ISGRAPH (c))
1174         cpp_error (pfile, CPP_DL_PEDWARN,
1175                    "unknown escape sequence '\\%c'", (int) c);
1176       else
1177         cpp_error (pfile, CPP_DL_PEDWARN,
1178                    "unknown escape sequence: '\\%03o'", (int) c);
1179     }
1180
1181   /* Now convert what we have to the execution character set.  */
1182   if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1183     cpp_errno (pfile, CPP_DL_ERROR,
1184                "converting escape sequence to execution character set");
1185
1186   return from + 1;
1187 }
1188 \f
1189 /* FROM is an array of cpp_string structures of length COUNT.  These
1190    are to be converted from the source to the execution character set,
1191    escape sequences translated, and finally all are to be
1192    concatenated.  WIDE indicates whether or not to produce a wide
1193    string.  The result is written into TO.  Returns true for success,
1194    false for failure.  */
1195 bool
1196 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1197                       cpp_string *to, bool wide)
1198 {
1199   struct _cpp_strbuf tbuf;
1200   const uchar *p, *base, *limit;
1201   size_t i;
1202   struct cset_converter cvt
1203     = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1204
1205   tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1206   tbuf.text = xmalloc (tbuf.asize);
1207   tbuf.len = 0;
1208
1209   for (i = 0; i < count; i++)
1210     {
1211       p = from[i].text;
1212       if (*p == 'L') p++;
1213       p++; /* Skip leading quote.  */
1214       limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */
1215
1216       for (;;)
1217         {
1218           base = p;
1219           while (p < limit && *p != '\\')
1220             p++;
1221           if (p > base)
1222             {
1223               /* We have a run of normal characters; these can be fed
1224                  directly to convert_cset.  */
1225               if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1226                 goto fail;
1227             }
1228           if (p == limit)
1229             break;
1230
1231           p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
1232         }
1233     }
1234   /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1235      structure.  */
1236   emit_numeric_escape (pfile, 0, &tbuf, wide);
1237   tbuf.text = xrealloc (tbuf.text, tbuf.len);
1238   to->text = tbuf.text;
1239   to->len = tbuf.len;
1240   return true;
1241
1242  fail:
1243   cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1244   free (tbuf.text);
1245   return false;
1246 }
1247
1248 /* Subroutine of do_line and do_linemarker.  Convert escape sequences
1249    in a string, but do not perform character set conversion.  */
1250 bool
1251 cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1252                                   size_t count, cpp_string *to, bool wide)
1253 {
1254   struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
1255   bool retval;
1256
1257   pfile->narrow_cset_desc.func = convert_no_conversion;
1258   pfile->narrow_cset_desc.cd = (iconv_t) -1;
1259
1260   retval = cpp_interpret_string (pfile, from, count, to, wide);
1261
1262   pfile->narrow_cset_desc = save_narrow_cset_desc;
1263   return retval;
1264 }
1265
1266 \f
1267 /* Subroutine of cpp_interpret_charconst which performs the conversion
1268    to a number, for narrow strings.  STR is the string structure returned
1269    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1270    cpp_interpret_charconst.  */
1271 static cppchar_t
1272 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1273                          unsigned int *pchars_seen, int *unsignedp)
1274 {
1275   size_t width = CPP_OPTION (pfile, char_precision);
1276   size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1277   size_t mask = width_to_mask (width);
1278   size_t i;
1279   cppchar_t result, c;
1280   bool unsigned_p;
1281
1282   /* The value of a multi-character character constant, or a
1283      single-character character constant whose representation in the
1284      execution character set is more than one byte long, is
1285      implementation defined.  This implementation defines it to be the
1286      number formed by interpreting the byte sequence in memory as a
1287      big-endian binary number.  If overflow occurs, the high bytes are
1288      lost, and a warning is issued.
1289
1290      We don't want to process the NUL terminator handed back by
1291      cpp_interpret_string.  */
1292   result = 0;
1293   for (i = 0; i < str.len - 1; i++)
1294     {
1295       c = str.text[i] & mask;
1296       if (width < BITS_PER_CPPCHAR_T)
1297         result = (result << width) | c;
1298       else
1299         result = c;
1300     }
1301
1302   if (i > max_chars)
1303     {
1304       i = max_chars;
1305       cpp_error (pfile, CPP_DL_WARNING,
1306                  "character constant too long for its type");
1307     }
1308   else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
1309     cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
1310
1311   /* Multichar constants are of type int and therefore signed.  */
1312   if (i > 1)
1313     unsigned_p = 0;
1314   else
1315     unsigned_p = CPP_OPTION (pfile, unsigned_char);
1316
1317   /* Truncate the constant to its natural width, and simultaneously
1318      sign- or zero-extend to the full width of cppchar_t.
1319      For single-character constants, the value is WIDTH bits wide.
1320      For multi-character constants, the value is INT_PRECISION bits wide.  */
1321   if (i > 1)
1322     width = CPP_OPTION (pfile, int_precision);
1323   if (width < BITS_PER_CPPCHAR_T)
1324     {
1325       mask = ((cppchar_t) 1 << width) - 1;
1326       if (unsigned_p || !(result & (1 << (width - 1))))
1327         result &= mask;
1328       else
1329         result |= ~mask;
1330     }
1331   *pchars_seen = i;
1332   *unsignedp = unsigned_p;
1333   return result;
1334 }
1335
1336 /* Subroutine of cpp_interpret_charconst which performs the conversion
1337    to a number, for wide strings.  STR is the string structure returned
1338    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1339    cpp_interpret_charconst.  */
1340 static cppchar_t
1341 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1342                        unsigned int *pchars_seen, int *unsignedp)
1343 {
1344   bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1345   size_t width = CPP_OPTION (pfile, wchar_precision);
1346   size_t cwidth = CPP_OPTION (pfile, char_precision);
1347   size_t mask = width_to_mask (width);
1348   size_t cmask = width_to_mask (cwidth);
1349   size_t nbwc = width / cwidth;
1350   size_t off, i;
1351   cppchar_t result = 0, c;
1352
1353   /* This is finicky because the string is in the target's byte order,
1354      which may not be our byte order.  Only the last character, ignoring
1355      the NUL terminator, is relevant.  */
1356   off = str.len - (nbwc * 2);
1357   result = 0;
1358   for (i = 0; i < nbwc; i++)
1359     {
1360       c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
1361       result = (result << cwidth) | (c & cmask);
1362     }
1363
1364   /* Wide character constants have type wchar_t, and a single
1365      character exactly fills a wchar_t, so a multi-character wide
1366      character constant is guaranteed to overflow.  */
1367   if (off > 0)
1368     cpp_error (pfile, CPP_DL_WARNING,
1369                "character constant too long for its type");
1370
1371   /* Truncate the constant to its natural width, and simultaneously
1372      sign- or zero-extend to the full width of cppchar_t.  */
1373   if (width < BITS_PER_CPPCHAR_T)
1374     {
1375       if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
1376         result &= mask;
1377       else
1378         result |= ~mask;
1379     }
1380
1381   *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
1382   *pchars_seen = 1;
1383   return result;
1384 }
1385
1386 /* Interpret a (possibly wide) character constant in TOKEN.
1387    PCHARS_SEEN points to a variable that is filled in with the number
1388    of characters seen, and UNSIGNEDP to a variable that indicates
1389    whether the result has signed type.  */
1390 cppchar_t
1391 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
1392                          unsigned int *pchars_seen, int *unsignedp)
1393 {
1394   cpp_string str = { 0, 0 };
1395   bool wide = (token->type == CPP_WCHAR);
1396   cppchar_t result;
1397
1398   /* an empty constant will appear as L'' or '' */
1399   if (token->val.str.len == (size_t) (2 + wide))
1400     {
1401       cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1402       return 0;
1403     }
1404   else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
1405     return 0;
1406
1407   if (wide)
1408     result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1409   else
1410     result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1411
1412   if (str.text != token->val.str.text)
1413     free ((void *)str.text);
1414
1415   return result;
1416 }
1417
1418 /* Convert an input buffer (containing the complete contents of one
1419    source file) from INPUT_CHARSET to the source character set.  INPUT
1420    points to the input buffer, SIZE is its allocated size, and LEN is
1421    the length of the meaningful data within the buffer.  The
1422    translated buffer is returned, and *ST_SIZE is set to the length of
1423    the meaningful data within the translated buffer.
1424
1425    INPUT is expected to have been allocated with xmalloc.  This function
1426    will either return INPUT, or free it and return a pointer to another
1427    xmalloc-allocated block of memory.  */
1428 uchar *
1429 _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
1430                     uchar *input, size_t size, size_t len, off_t *st_size)
1431 {
1432   struct cset_converter input_cset;
1433   struct _cpp_strbuf to;
1434
1435   input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
1436   if (input_cset.func == convert_no_conversion)
1437     {
1438       to.text = input;
1439       to.asize = size;
1440       to.len = len;
1441     }
1442   else
1443     {
1444       to.asize = MAX (65536, len);
1445       to.text = xmalloc (to.asize);
1446       to.len = 0;
1447
1448       if (!APPLY_CONVERSION (input_cset, input, len, &to))
1449         cpp_error (pfile, CPP_DL_ERROR,
1450                    "failure to convert %s to %s",
1451                    CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
1452
1453       free (input);
1454     }
1455
1456   /* Clean up the mess.  */
1457   if (input_cset.func == convert_using_iconv)
1458     iconv_close (input_cset.cd);
1459
1460   /* Resize buffer if we allocated substantially too much, or if we
1461      haven't enough space for the \n-terminator.  */
1462   if (to.len + 4096 < to.asize || to.len >= to.asize)
1463     to.text = xrealloc (to.text, to.len + 1);
1464
1465   /* If the file is using old-school Mac line endings (\r only),
1466      terminate with another \r, not an \n, so that we do not mistake
1467      the \r\n sequence for a single DOS line ending and erroneously
1468      issue the "No newline at end of file" diagnostic.  */
1469   if (to.text[to.len - 1] == '\r')
1470     to.text[to.len] = '\r';
1471   else
1472     to.text[to.len] = '\n';
1473
1474   *st_size = to.len;
1475   return to.text;
1476 }
1477
1478 /* Decide on the default encoding to assume for input files.  */
1479 const char *
1480 _cpp_default_encoding (void)
1481 {
1482   const char *current_encoding = NULL;
1483
1484   /* We disable this because the default codeset is 7-bit ASCII on
1485      most platforms, and this causes conversion failures on every
1486      file in GCC that happens to have one of the upper 128 characters
1487      in it -- most likely, as part of the name of a contributor.
1488      We should definitely recognize in-band markers of file encoding,
1489      like:
1490      - the appropriate Unicode byte-order mark (FE FF) to recognize
1491        UTF16 and UCS4 (in both big-endian and little-endian flavors)
1492        and UTF8
1493      - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
1494        distinguish ASCII and EBCDIC.
1495      - now we can parse something like "#pragma GCC encoding <xyz>
1496        on the first line, or even Emacs/VIM's mode line tags (there's
1497        a problem here in that VIM uses the last line, and Emacs has
1498        its more elaborate "local variables" convention).
1499      - investigate whether Java has another common convention, which
1500        would be friendly to support.
1501      (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
1502 #if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
1503   setlocale (LC_CTYPE, "");
1504   current_encoding = nl_langinfo (CODESET);
1505 #endif
1506   if (current_encoding == NULL || *current_encoding == '\0')
1507     current_encoding = SOURCE_CHARSET;
1508
1509   return current_encoding;
1510 }