libcpp/charset.c

   1 /* CPP Library - charsets
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
   3    Free Software Foundation, Inc.
   4
   5    Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
   6
   7 This program is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 This program is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with this program; if not, write to the Free Software
  19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "cpplib.h"
  24 #include "internal.h"
  25 #include "ucnid.h"
  26
  27 /* Character set handling for C-family languages.
  28
  29    Terminological note: In what follows, "charset" or "character set"
  30    will be taken to mean both an abstract set of characters and an
  31    encoding for that set.
  32
  33    The C99 standard discusses two character sets: source and execution.
  34    The source character set is used for internal processing in translation
  35    phases 1 through 4; the execution character set is used thereafter.
  36    Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
  37    character encodings (see 3.7.2, 3.7.3 for the standardese meanings
  38    of these terms).  Furthermore, the "basic character set" (listed in
  39    5.2.1p3) is to be encoded in each with values one byte wide, and is
  40    to appear in the initial shift state.
  41
  42    It is not explicitly mentioned, but there is also a "wide execution
  43    character set" used to encode wide character constants and wide
  44    string literals; this is supposed to be the result of applying the
  45    standard library function mbstowcs() to an equivalent narrow string
  46    (6.4.5p5).  However, the behavior of hexadecimal and octal
  47    \-escapes is at odds with this; they are supposed to be translated
  48    directly to wchar_t values (6.4.4.4p5,6).
  49
  50    The source character set is not necessarily the character set used
  51    to encode physical source files on disk; translation phase 1 converts
  52    from whatever that encoding is to the source character set.
  53
  54    The presence of universal character names in C99 (6.4.3 et seq.)
  55    forces the source character set to be isomorphic to ISO 10646,
  56    that is, Unicode.  There is no such constraint on the execution
  57    character set; note also that the conversion from source to
  58    execution character set does not occur for identifiers (5.1.1.2p1#5).
  59
  60    For convenience of implementation, the source character set's
  61    encoding of the basic character set should be identical to the
  62    execution character set OF THE HOST SYSTEM's encoding of the basic
  63    character set, and it should not be a state-dependent encoding.
  64
  65    cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
  66    depending on whether the host is based on ASCII or EBCDIC (see
  67    respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
  68    Technical Report #16).  With limited exceptions, it relies on the
  69    system library's iconv() primitive to do charset conversion
  70    (specified in SUSv2).  */
  71
#if !HAVE_ICONV
/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
   below, which are guarded only by if statements with compile-time
   constant conditions, do not cause link errors.

   The iconv_open and iconv stand-ins set errno to EINVAL and yield
   the corresponding failure value ((iconv_t)-1 and (size_t)-1
   respectively); iconv_close does nothing, and ICONV_CONST expands
   to nothing.  */
#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
#define iconv_close(x)   (void)0
#define ICONV_CONST
#endif
  81
/* SOURCE_CHARSET names the encoding used for the source character
   set on this host.  LAST_POSSIBLY_BASIC_SOURCE_CHAR is the largest
   value cpp_host_to_exec_charset will accept as a candidate member
   of the basic source character set.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
#error "Unrecognized basic host character set"
#endif

/* Hosts whose errno.h lacks EILSEQ report ill-formed input as EINVAL
   instead, which makes it indistinguishable from truncated input.  */
#ifndef EILSEQ
#define EILSEQ EINVAL
#endif
  95
/* This structure is used for a resizable string buffer throughout.  */
/* Don't call it strbuf, as that conflicts with unistd.h on systems
   such as DYNIX/ptx where unistd.h includes stropts.h.  */
struct _cpp_strbuf
{
  /* Start of the storage; the converters below grow it with xrealloc.  */
  uchar *text;
  /* Number of bytes currently allocated at TEXT.  */
  size_t asize;
  /* Number of bytes of TEXT already filled; output is appended at
     TEXT + LEN.  */
  size_t len;
};
 105
/* This is enough to hold any string that fits on a single 80-column
   line, even if iconv quadruples its size (e.g. conversion from
   ASCII to UTF-32) rounded up to a power of two.

   NOTE(review): 80 * 4 is 320, which is more than 256, so the claim
   above does not hold literally.  This is harmless: the conversion
   loops below enlarge the output buffer by this many bytes each time
   a converter reports E2BIG.  */
#define OUTBUF_BLOCK_SIZE 256
 110
 111 /* Conversions between UTF-8 and UTF-16/32 are implemented by custom
 112    logic.  This is because a depressing number of systems lack iconv,
   or have iconv libraries that do not do these conversions, so
 114    we need a fallback implementation for them.  To ensure the fallback
 115    doesn't break due to neglect, it is used on all systems.
 116
 117    UTF-32 encoding is nice and simple: a four-byte binary number,
 118    constrained to the range 00000000-7FFFFFFF to avoid questions of
 119    signedness.  We do have to cope with big- and little-endian
 120    variants.
 121
 122    UTF-16 encoding uses two-byte binary numbers, again in big- and
 123    little-endian variants, for all values in the 00000000-0000FFFF
 124    range.  Values in the 00010000-0010FFFF range are encoded as pairs
 125    of two-byte numbers, called "surrogate pairs": given a number S in
 126    this range, it is mapped to a pair (H, L) as follows:
 127
 128      H = (S - 0x10000) / 0x400 + 0xD800
 129      L = (S - 0x10000) % 0x400 + 0xDC00
 130
 131    Two-byte values in the D800...DFFF range are ill-formed except as a
 132    component of a surrogate pair.  Even if the encoding within a
 133    two-byte value is little-endian, the H member of the surrogate pair
 134    comes first.
 135
 136    There is no way to encode values in the 00110000-7FFFFFFF range,
 137    which is not currently a problem as there are no assigned code
 138    points in that range; however, the author expects that it will
 139    eventually become necessary to abandon UTF-16 due to this
 140    limitation.  Note also that, because of these pairs, UTF-16 does
 141    not meet the requirements of the C standard for a wide character
 142    encoding (see 3.7.3 and 6.4.4.4p11).
 143
 144    UTF-8 encoding looks like this:
 145
 146    value range         encoded as
 147    00000000-0000007F   0xxxxxxx
 148    00000080-000007FF   110xxxxx 10xxxxxx
 149    00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
 150    00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 151    00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 152    04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 153
 154    Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
 155    which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
 156    never occur.  Note also that any value that can be encoded by a
 157    given row of the table can also be encoded by all successive rows,
 158    but this is not done; only the shortest possible encoding for any
 159    given value is valid.  For instance, the character 07C0 could be
 160    encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
 161    FC 80 80 80 9F 80.  Only the first is valid.
 162
 163    An implementation note: the transformation from UTF-16 to UTF-8, or
 164    vice versa, is easiest done by using UTF-32 as an intermediary.  */
 165
 166 /* Internal primitives which go from an UTF-8 byte stream to native-endian
 167    UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
 168    operation in several places below.  */
 169 static inline int
 170 one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
 171                      cppchar_t *cp)
 172 {
 173   static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
 174   static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 175
 176   cppchar_t c;
 177   const uchar *inbuf = *inbufp;
 178   size_t nbytes, i;
 179
 180   if (*inbytesleftp < 1)
 181     return EINVAL;
 182
 183   c = *inbuf;
 184   if (c < 0x80)
 185     {
 186       *cp = c;
 187       *inbytesleftp -= 1;
 188       *inbufp += 1;
 189       return 0;
 190     }
 191
 192   /* The number of leading 1-bits in the first byte indicates how many
 193      bytes follow.  */
 194   for (nbytes = 2; nbytes < 7; nbytes++)
 195     if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
 196       goto found;
 197   return EILSEQ;
 198  found:
 199
 200   if (*inbytesleftp < nbytes)
 201     return EINVAL;
 202
 203   c = (c & masks[nbytes-1]);
 204   inbuf++;
 205   for (i = 1; i < nbytes; i++)
 206     {
 207       cppchar_t n = *inbuf++;
 208       if ((n & 0xC0) != 0x80)
 209         return EILSEQ;
 210       c = ((c << 6) + (n & 0x3F));
 211     }
 212
 213   /* Make sure the shortest possible encoding was used.  */
 214   if (c <=      0x7F && nbytes > 1) return EILSEQ;
 215   if (c <=     0x7FF && nbytes > 2) return EILSEQ;
 216   if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
 217   if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
 218   if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
 219
 220   /* Make sure the character is valid.  */
 221   if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
 222
 223   *cp = c;
 224   *inbufp = inbuf;
 225   *inbytesleftp -= nbytes;
 226   return 0;
 227 }
 228
 229 static inline int
 230 one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
 231 {
 232   static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 233   static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
 234   size_t nbytes;
 235   uchar buf[6], *p = &buf[6];
 236   uchar *outbuf = *outbufp;
 237
 238   nbytes = 1;
 239   if (c < 0x80)
 240     *--p = c;
 241   else
 242     {
 243       do
 244         {
 245           *--p = ((c & 0x3F) | 0x80);
 246           c >>= 6;
 247           nbytes++;
 248         }
 249       while (c >= 0x3F || (c & limits[nbytes-1]));
 250       *--p = (c | masks[nbytes-1]);
 251     }
 252
 253   if (*outbytesleftp < nbytes)
 254     return E2BIG;
 255
 256   while (p < &buf[6])
 257     *outbuf++ = *p++;
 258   *outbytesleftp -= nbytes;
 259   *outbufp = outbuf;
 260   return 0;
 261 }
 262
 263 /* The following four functions transform one character between the two
 264    encodings named in the function name.  All have the signature
 265    int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
 266            uchar **outbufp, size_t *outbytesleftp)
 267
 268    BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
 269    interpreted as a boolean indicating whether big-endian or
 270    little-endian encoding is to be used for the member of the pair
 271    that is not UTF-8.
 272
 273    INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
 274    do for iconv.
 275
 276    The return value is either 0 for success, or an errno value for
 277    failure, which may be E2BIG (need more space), EILSEQ (ill-formed
   input sequence), or EINVAL (incomplete input sequence).  */
 279
 280 static inline int
 281 one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
 282                    uchar **outbufp, size_t *outbytesleftp)
 283 {
 284   uchar *outbuf;
 285   cppchar_t s = 0;
 286   int rval;
 287
 288   /* Check for space first, since we know exactly how much we need.  */
 289   if (*outbytesleftp < 4)
 290     return E2BIG;
 291
 292   rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
 293   if (rval)
 294     return rval;
 295
 296   outbuf = *outbufp;
 297   outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
 298   outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
 299   outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
 300   outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
 301
 302   *outbufp += 4;
 303   *outbytesleftp -= 4;
 304   return 0;
 305 }
 306
 307 static inline int
 308 one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
 309                    uchar **outbufp, size_t *outbytesleftp)
 310 {
 311   cppchar_t s;
 312   int rval;
 313   const uchar *inbuf;
 314
 315   if (*inbytesleftp < 4)
 316     return EINVAL;
 317
 318   inbuf = *inbufp;
 319
 320   s  = inbuf[bigend ? 0 : 3] << 24;
 321   s += inbuf[bigend ? 1 : 2] << 16;
 322   s += inbuf[bigend ? 2 : 1] << 8;
 323   s += inbuf[bigend ? 3 : 0];
 324
 325   if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
 326     return EILSEQ;
 327
 328   rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
 329   if (rval)
 330     return rval;
 331
 332   *inbufp += 4;
 333   *inbytesleftp -= 4;
 334   return 0;
 335 }
 336
 337 static inline int
 338 one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
 339                    uchar **outbufp, size_t *outbytesleftp)
 340 {
 341   int rval;
 342   cppchar_t s = 0;
 343   const uchar *save_inbuf = *inbufp;
 344   size_t save_inbytesleft = *inbytesleftp;
 345   uchar *outbuf = *outbufp;
 346
 347   rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
 348   if (rval)
 349     return rval;
 350
 351   if (s > 0x0010FFFF)
 352     {
 353       *inbufp = save_inbuf;
 354       *inbytesleftp = save_inbytesleft;
 355       return EILSEQ;
 356     }
 357
 358   if (s < 0xFFFF)
 359     {
 360       if (*outbytesleftp < 2)
 361         {
 362           *inbufp = save_inbuf;
 363           *inbytesleftp = save_inbytesleft;
 364           return E2BIG;
 365         }
 366       outbuf[bigend ? 1 : 0] = (s & 0x00FF);
 367       outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
 368
 369       *outbufp += 2;
 370       *outbytesleftp -= 2;
 371       return 0;
 372     }
 373   else
 374     {
 375       cppchar_t hi, lo;
 376
 377       if (*outbytesleftp < 4)
 378         {
 379           *inbufp = save_inbuf;
 380           *inbytesleftp = save_inbytesleft;
 381           return E2BIG;
 382         }
 383
 384       hi = (s - 0x10000) / 0x400 + 0xD800;
 385       lo = (s - 0x10000) % 0x400 + 0xDC00;
 386
 387       /* Even if we are little-endian, put the high surrogate first.
 388          ??? Matches practice?  */
 389       outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
 390       outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
 391       outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
 392       outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
 393
 394       *outbufp += 4;
 395       *outbytesleftp -= 4;
 396       return 0;
 397     }
 398 }
 399
 400 static inline int
 401 one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
 402                    uchar **outbufp, size_t *outbytesleftp)
 403 {
 404   cppchar_t s;
 405   const uchar *inbuf = *inbufp;
 406   int rval;
 407
 408   if (*inbytesleftp < 2)
 409     return EINVAL;
 410   s  = inbuf[bigend ? 0 : 1] << 8;
 411   s += inbuf[bigend ? 1 : 0];
 412
 413   /* Low surrogate without immediately preceding high surrogate is invalid.  */
 414   if (s >= 0xDC00 && s <= 0xDFFF)
 415     return EILSEQ;
 416   /* High surrogate must have a following low surrogate.  */
 417   else if (s >= 0xD800 && s <= 0xDBFF)
 418     {
 419       cppchar_t hi = s, lo;
 420       if (*inbytesleftp < 4)
 421         return EINVAL;
 422
 423       lo  = inbuf[bigend ? 2 : 3] << 8;
 424       lo += inbuf[bigend ? 3 : 2];
 425
 426       if (lo < 0xDC00 || lo > 0xDFFF)
 427         return EILSEQ;
 428
 429       s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
 430     }
 431
 432   rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
 433   if (rval)
 434     return rval;
 435
 436   /* Success - update the input pointers (one_cppchar_to_utf8 has done
 437      the output pointers for us).  */
 438   if (s <= 0xFFFF)
 439     {
 440       *inbufp += 2;
 441       *inbytesleftp -= 2;
 442     }
 443   else
 444     {
 445       *inbufp += 4;
 446       *inbytesleftp -= 4;
 447     }
 448   return 0;
 449 }
 450
 451 /* Helper routine for the next few functions.  The 'const' on
 452    one_conversion means that we promise not to modify what function is
 453    pointed to, which lets the inliner see through it.  */
 454
 455 static inline bool
 456 conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
 457                                              uchar **, size_t *),
 458                  iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
 459 {
 460   const uchar *inbuf;
 461   uchar *outbuf;
 462   size_t inbytesleft, outbytesleft;
 463   int rval;
 464
 465   inbuf = from;
 466   inbytesleft = flen;
 467   outbuf = to->text + to->len;
 468   outbytesleft = to->asize - to->len;
 469
 470   for (;;)
 471     {
 472       do
 473         rval = one_conversion (cd, &inbuf, &inbytesleft,
 474                                &outbuf, &outbytesleft);
 475       while (inbytesleft && !rval);
 476
 477       if (__builtin_expect (inbytesleft == 0, 1))
 478         {
 479           to->len = to->asize - outbytesleft;
 480           return true;
 481         }
 482       if (rval != E2BIG)
 483         {
 484           errno = rval;
 485           return false;
 486         }
 487
 488       outbytesleft += OUTBUF_BLOCK_SIZE;
 489       to->asize += OUTBUF_BLOCK_SIZE;
 490       to->text = xrealloc (to->text, to->asize);
 491       outbuf = to->text + to->asize - outbytesleft;
 492     }
 493 }
 494
 495
 496 /* These functions convert entire strings between character sets.
 497    They all have the signature
 498
 499    bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
 500
 501    The input string FROM is converted as specified by the function
 502    name plus the iconv descriptor CD (which may be fake), and the
 503    result appended to TO.  On any error, false is returned, otherwise true.  */
 504
 505 /* These four use the custom conversion code above.  */
/* Convert the UTF-8 string FROM (FLEN bytes) to UTF-16 and append it
   to TO.  CD is passed through to one_utf8_to_utf16, which reads it
   as the big-endian flag.  */
static bool
convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
                    struct _cpp_strbuf *to)
{
  return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
}
 512
/* Convert the UTF-8 string FROM (FLEN bytes) to UTF-32 and append it
   to TO.  CD is passed through to one_utf8_to_utf32, which reads it
   as the big-endian flag.  */
static bool
convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
                    struct _cpp_strbuf *to)
{
  return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
}
 519
/* Convert the UTF-16 string FROM (FLEN bytes) to UTF-8 and append it
   to TO.  CD is passed through to one_utf16_to_utf8, which reads it
   as the big-endian flag.  */
static bool
convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
                    struct _cpp_strbuf *to)
{
  return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
}
 526
/* Convert the UTF-32 string FROM (FLEN bytes) to UTF-8 and append it
   to TO.  CD is passed through to one_utf32_to_utf8, which reads it
   as the big-endian flag.  */
static bool
convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
                    struct _cpp_strbuf *to)
{
  return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
}
 533
 534 /* Identity conversion, used when we have no alternative.  */
 535 static bool
 536 convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
 537                        const uchar *from, size_t flen, struct _cpp_strbuf *to)
 538 {
 539   if (to->len + flen > to->asize)
 540     {
 541       to->asize = to->len + flen;
 542       to->text = xrealloc (to->text, to->asize);
 543     }
 544   memcpy (to->text + to->len, from, flen);
 545   to->len += flen;
 546   return true;
 547 }
 548
/* And this one uses the system iconv primitive.  It's a little
   different, since iconv's interface is a little different.

   Converts the FLEN bytes at FROM using descriptor CD and appends
   the result to TO, enlarging TO by OUTBUF_BLOCK_SIZE each time
   iconv stops with E2BIG.  Returns true once all input has been
   consumed.  Returns false if resetting the descriptor fails or if
   iconv stops with input remaining for any reason other than
   E2BIG.  */
#if HAVE_ICONV
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
                     struct _cpp_strbuf *to)
{
  ICONV_CONST char *inbuf;
  char *outbuf;
  size_t inbytesleft, outbytesleft;

  /* Reset conversion descriptor and check that it is valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  inbuf = (ICONV_CONST char *)from;
  inbytesleft = flen;
  outbuf = (char *)to->text + to->len;
  outbytesleft = to->asize - to->len;

  for (;;)
    {
      /* The return value is not examined; success is judged by
         whether any input remains, and errno is consulted only when
         some does.  */
      iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
      if (__builtin_expect (inbytesleft == 0, 1))
        {
          to->len = to->asize - outbytesleft;
          return true;
        }
      if (errno != E2BIG)
        return false;

      /* Out of room: enlarge the buffer and recompute the write
         position from the new base address.  */
      outbytesleft += OUTBUF_BLOCK_SIZE;
      to->asize += OUTBUF_BLOCK_SIZE;
      to->text = xrealloc (to->text, to->asize);
      outbuf = (char *)to->text + to->asize - outbytesleft;
    }
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
 589
 590 /* Arrange for the above custom conversion logic to be used automatically
 591    when conversion between a suitable pair of character sets is requested.  */
 592
 593 #define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
 594    CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
 595
/* One entry in the table of built-in conversions.  */
struct conversion
{
  /* Charset pair in the form "FROM/TO"; init_iconv_desc compares it
     without regard to case.  */
  const char *pair;
  /* String converter implementing this pair.  */
  convert_f func;
  /* Value stored as the converter's descriptor in place of a real
     iconv descriptor.  */
  iconv_t fake_cd;
};
/* The conversions handled by the custom code above.  The fake
   descriptor is 0 for the little-endian variants and 1 for the
   big-endian ones; the one_* routines read it as their BIGEND
   flag.  */
static const struct conversion conversion_tab[] = {
  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
};
 612
 613 /* Subroutine of cpp_init_iconv: initialize and return a
 614    cset_converter structure for conversion from FROM to TO.  If
 615    iconv_open() fails, issue an error and return an identity
 616    converter.  Silently return an identity converter if FROM and TO
 617    are identical.  */
 618 static struct cset_converter
 619 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
 620 {
 621   struct cset_converter ret;
 622   char *pair;
 623   size_t i;
 624
 625   if (!strcasecmp (to, from))
 626     {
 627       ret.func = convert_no_conversion;
 628       ret.cd = (iconv_t) -1;
 629       return ret;
 630     }
 631
 632   pair = alloca(strlen(to) + strlen(from) + 2);
 633
 634   strcpy(pair, from);
 635   strcat(pair, "/");
 636   strcat(pair, to);
 637   for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
 638     if (!strcasecmp (pair, conversion_tab[i].pair))
 639       {
 640         ret.func = conversion_tab[i].func;
 641         ret.cd = conversion_tab[i].fake_cd;
 642         return ret;
 643       }
 644
 645   /* No custom converter - try iconv.  */
 646   if (HAVE_ICONV)
 647     {
 648       ret.func = convert_using_iconv;
 649       ret.cd = iconv_open (to, from);
 650
 651       if (ret.cd == (iconv_t) -1)
 652         {
 653           if (errno == EINVAL)
 654             cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
 655                        "conversion from %s to %s not supported by iconv",
 656                        from, to);
 657           else
 658             cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
 659
 660           ret.func = convert_no_conversion;
 661         }
 662     }
 663   else
 664     {
 665       cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
 666                  "no iconv implementation, cannot convert from %s to %s",
 667                  from, to);
 668       ret.func = convert_no_conversion;
 669       ret.cd = (iconv_t) -1;
 670     }
 671   return ret;
 672 }
 673
 674 /* If charset conversion is requested, initialize iconv(3) descriptors
 675    for conversion from the source character set to the execution
 676    character sets.  If iconv is not present in the C library, and
 677    conversion is requested, issue an error.  */
 678
 679 void
 680 cpp_init_iconv (cpp_reader *pfile)
 681 {
 682   const char *ncset = CPP_OPTION (pfile, narrow_charset);
 683   const char *wcset = CPP_OPTION (pfile, wide_charset);
 684   const char *default_wcset;
 685
 686   bool be = CPP_OPTION (pfile, bytes_big_endian);
 687
 688   if (CPP_OPTION (pfile, wchar_precision) >= 32)
 689     default_wcset = be ? "UTF-32BE" : "UTF-32LE";
 690   else if (CPP_OPTION (pfile, wchar_precision) >= 16)
 691     default_wcset = be ? "UTF-16BE" : "UTF-16LE";
 692   else
 693     /* This effectively means that wide strings are not supported,
 694        so don't do any conversion at all.  */
 695    default_wcset = SOURCE_CHARSET;
 696
 697   if (!ncset)
 698     ncset = SOURCE_CHARSET;
 699   if (!wcset)
 700     wcset = default_wcset;
 701
 702   pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
 703   pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
 704 }
 705
 706 /* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary.  */
 707 void
 708 _cpp_destroy_iconv (cpp_reader *pfile)
 709 {
 710   if (HAVE_ICONV)
 711     {
 712       if (pfile->narrow_cset_desc.func == convert_using_iconv)
 713         iconv_close (pfile->narrow_cset_desc.cd);
 714       if (pfile->wide_cset_desc.func == convert_using_iconv)
 715         iconv_close (pfile->wide_cset_desc.cd);
 716     }
 717 }
 718
 719 /* Utility routine for use by a full compiler.  C is a character taken
 720    from the *basic* source character set, encoded in the host's
 721    execution encoding.  Convert it to (the target's) execution
 722    encoding, and return that value.
 723
 724    Issues an internal error if C's representation in the narrow
 725    execution character set fails to be a single-byte value (C99
 726    5.2.1p3: "The representation of each member of the source and
 727    execution character sets shall fit in a byte.")  May also issue an
 728    internal error if C fails to be a member of the basic source
 729    character set (testing this exactly is too hard, especially when
 730    the host character set is EBCDIC).  */
 731 cppchar_t
 732 cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
 733 {
 734   uchar sbuf[1];
 735   struct _cpp_strbuf tbuf;
 736
 737   /* This test is merely an approximation, but it suffices to catch
 738      the most important thing, which is that we don't get handed a
 739      character outside the unibyte range of the host character set.  */
 740   if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
 741     {
 742       cpp_error (pfile, CPP_DL_ICE,
 743                  "character 0x%lx is not in the basic source character set\n",
 744                  (unsigned long)c);
 745       return 0;
 746     }
 747
 748   /* Being a character in the unibyte range of the host character set,
 749      we can safely splat it into a one-byte buffer and trust that that
 750      is a well-formed string.  */
 751   sbuf[0] = c;
 752
 753   /* This should never need to reallocate, but just in case... */
 754   tbuf.asize = 1;
 755   tbuf.text = xmalloc (tbuf.asize);
 756   tbuf.len = 0;
 757
 758   if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
 759     {
 760       cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
 761       return 0;
 762     }
 763   if (tbuf.len != 1)
 764     {
 765       cpp_error (pfile, CPP_DL_ICE,
 766                  "character 0x%lx is not unibyte in execution character set",
 767                  (unsigned long)c);
 768       return 0;
 769     }
 770   c = tbuf.text[0];
 771   free(tbuf.text);
 772   return c;
 773 }
 774
 775 \f
 776
 777 /* Utility routine that computes a mask of the form 0000...111... with
 778    WIDTH 1-bits.  */
 779 static inline size_t
 780 width_to_mask (size_t width)
 781 {
 782   width = MIN (width, BITS_PER_CPPCHAR_T);
 783   if (width >= CHAR_BIT * sizeof (size_t))
 784     return ~(size_t) 0;
 785   else
 786     return ((size_t) 1 << width) - 1;
 787 }
 788
 789 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
 790    the start of an identifier, and 0 if C is not valid in an
 791    identifier.  We assume C has already gone through the checks of
 792    _cpp_valid_ucn.  The algorithm is a simple binary search on the
 793    table defined in cppucnid.h.  */
 794
 795 static int
 796 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
 797 {
 798   int mn, mx, md;
 799
 800   mn = -1;
 801   mx = ARRAY_SIZE (ucnranges);
 802   while (mx - mn > 1)
 803     {
 804       md = (mn + mx) / 2;
 805       if (c < ucnranges[md].lo)
 806         mx = md;
 807       else if (c > ucnranges[md].hi)
 808         mn = md;
 809       else
 810         goto found;
 811     }
 812   return 0;
 813
 814  found:
 815   /* When -pedantic, we require the character to have been listed by
 816      the standard for the current language.  Otherwise, we accept the
 817      union of the acceptable sets for C++98 and C99.  */
 818   if (CPP_PEDANTIC (pfile)
 819       && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
 820           || (CPP_OPTION (pfile, cplusplus)
 821               && !(ucnranges[md].flags & CXX))))
 822     return 0;
 823
 824   /* In C99, UCN digits may not begin identifiers.  */
 825   if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
 826     return 2;
 827
 828   return 1;
 829 }
 830
 831 /* [lex.charset]: The character designated by the universal character
 832    name \UNNNNNNNN is that character whose character short name in
 833    ISO/IEC 10646 is NNNNNNNN; the character designated by the
 834    universal character name \uNNNN is that character whose character
 835    short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
 836    for a universal character name is less than 0x20 or in the range
 837    0x7F-0x9F (inclusive), or if the universal character name
 838    designates a character in the basic source character set, then the
 839    program is ill-formed.
 840
 841    *PSTR must be preceded by "\u" or "\U"; it is assumed that the
 842    buffer end is delimited by a non-hex digit.  Returns zero if UCNs
 843    are not part of the relevant standard, or if the string beginning
 844    at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
 845
 846    Otherwise the nonzero value of the UCN, whether valid or invalid,
 847    is returned.  Diagnostics are emitted for invalid values.  PSTR
 848    is updated to point one beyond the UCN, or to the syntactically
 849    invalid character.
 850
 851    IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
 852    an identifier, or 2 otherwise.  */
 853
 854 cppchar_t
 855 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
 856                 const uchar *limit, int identifier_pos)
 857 {
 858   cppchar_t result, c;
 859   unsigned int length;
 860   const uchar *str = *pstr;
 861   const uchar *base = str - 2;
 862
 863   if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
 864     cpp_error (pfile, CPP_DL_WARNING,
 865                "universal character names are only valid in C++ and C99");
 866   else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
 867     cpp_error (pfile, CPP_DL_WARNING,
 868                "the meaning of '\\%c' is different in traditional C",
 869                (int) str[-1]);
 870
 871   if (str[-1] == 'u')
 872     length = 4;
 873   else if (str[-1] == 'U')
 874     length = 8;
 875   else
 876     abort();
 877
 878   result = 0;
 879   do
 880     {
 881       c = *str;
 882       if (!ISXDIGIT (c))
 883         break;
 884       str++;
 885       result = (result << 4) + hex_value (c);
 886     }
 887   while (--length && str < limit);
 888
 889   *pstr = str;
 890   if (length)
 891     {
 892       /* We'll error when we try it out as the start of an identifier.  */
 893       cpp_error (pfile, CPP_DL_ERROR,
 894                  "incomplete universal character name %.*s",
 895                  (int) (str - base), base);
 896       result = 1;
 897     }
 898   /* The standard permits $, @ and ` to be specified as UCNs.  We use
 899      hex escapes so that this also works with EBCDIC hosts.  */
 900   else if ((result < 0xa0
 901             && (result != 0x24 && result != 0x40 && result != 0x60))
 902            || (result & 0x80000000)
 903            || (result >= 0xD800 && result <= 0xDFFF))
 904     {
 905       cpp_error (pfile, CPP_DL_ERROR,
 906                  "%.*s is not a valid universal character",
 907                  (int) (str - base), base);
 908       result = 1;
 909     }
 910   else if (identifier_pos && result == 0x24
 911            && CPP_OPTION (pfile, dollars_in_ident))
 912     {
 913       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
 914         {
 915           CPP_OPTION (pfile, warn_dollars) = 0;
 916           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 917         }
 918     }
 919   else if (identifier_pos)
 920     {
 921       int validity = ucn_valid_in_identifier (pfile, result);
 922
 923       if (validity == 0)
 924         cpp_error (pfile, CPP_DL_ERROR,
 925                    "universal character %.*s is not valid in an identifier",
 926                    (int) (str - base), base);
 927       else if (validity == 2 && identifier_pos == 1)
 928         cpp_error (pfile, CPP_DL_ERROR,
 929    "universal character %.*s is not valid at the start of an identifier",
 930                    (int) (str - base), base);
 931     }
 932
 933   if (result == 0)
 934     result = 1;
 935
 936   return result;
 937 }
 938
 939 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
 940    it to the execution character set and write the result into TBUF.
 941    An advanced pointer is returned.  Issues all relevant diagnostics.  */
 942 static const uchar *
 943 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
 944              struct _cpp_strbuf *tbuf, bool wide)
 945 {
 946   cppchar_t ucn;
 947   uchar buf[6];
 948   uchar *bufp = buf;
 949   size_t bytesleft = 6;
 950   int rval;
 951   struct cset_converter cvt
 952     = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
 953
 954   from++;  /* Skip u/U.  */
 955   ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
 956
 957   rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
 958   if (rval)
 959     {
 960       errno = rval;
 961       cpp_errno (pfile, CPP_DL_ERROR,
 962                  "converting UCN to source character set");
 963     }
 964   else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
 965     cpp_errno (pfile, CPP_DL_ERROR,
 966                "converting UCN to execution character set");
 967
 968   return from;
 969 }
 970
 971 /* Subroutine of convert_hex and convert_oct.  N is the representation
 972    in the execution character set of a numeric escape; write it into the
 973    string buffer TBUF and update the end-of-string pointer therein.  WIDE
 974    is true if it's a wide string that's being assembled in TBUF.  This
 975    function issues no diagnostics and never fails.  */
 976 static void
 977 emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
 978                      struct _cpp_strbuf *tbuf, bool wide)
 979 {
 980   if (wide)
 981     {
 982       /* We have to render this into the target byte order, which may not
 983          be our byte order.  */
 984       bool bigend = CPP_OPTION (pfile, bytes_big_endian);
 985       size_t width = CPP_OPTION (pfile, wchar_precision);
 986       size_t cwidth = CPP_OPTION (pfile, char_precision);
 987       size_t cmask = width_to_mask (cwidth);
 988       size_t nbwc = width / cwidth;
 989       size_t i;
 990       size_t off = tbuf->len;
 991       cppchar_t c;
 992
 993       if (tbuf->len + nbwc > tbuf->asize)
 994         {
 995           tbuf->asize += OUTBUF_BLOCK_SIZE;
 996           tbuf->text = xrealloc (tbuf->text, tbuf->asize);
 997         }
 998
 999       for (i = 0; i < nbwc; i++)
1000         {
1001           c = n & cmask;
1002           n >>= cwidth;
1003           tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
1004         }
1005       tbuf->len += nbwc;
1006     }
1007   else
1008     {
1009       /* Note: this code does not handle the case where the target
1010          and host have a different number of bits in a byte.  */
1011       if (tbuf->len + 1 > tbuf->asize)
1012         {
1013           tbuf->asize += OUTBUF_BLOCK_SIZE;
1014           tbuf->text = xrealloc (tbuf->text, tbuf->asize);
1015         }
1016       tbuf->text[tbuf->len++] = n;
1017     }
1018 }
1019
1020 /* Convert a hexadecimal escape, pointed to by FROM, to the execution
1021    character set and write it into the string buffer TBUF.  Returns an
1022    advanced pointer, and issues diagnostics as necessary.
1023    No character set translation occurs; this routine always produces the
1024    execution-set character with numeric value equal to the given hex
1025    number.  You can, e.g. generate surrogate pairs this way.  */
1026 static const uchar *
1027 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
1028              struct _cpp_strbuf *tbuf, bool wide)
1029 {
1030   cppchar_t c, n = 0, overflow = 0;
1031   int digits_found = 0;
1032   size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1033                   : CPP_OPTION (pfile, char_precision));
1034   size_t mask = width_to_mask (width);
1035
1036   if (CPP_WTRADITIONAL (pfile))
1037     cpp_error (pfile, CPP_DL_WARNING,
1038                "the meaning of '\\x' is different in traditional C");
1039
1040   from++;  /* Skip 'x'.  */
1041   while (from < limit)
1042     {
1043       c = *from;
1044       if (! hex_p (c))
1045         break;
1046       from++;
1047       overflow |= n ^ (n << 4 >> 4);
1048       n = (n << 4) + hex_value (c);
1049       digits_found = 1;
1050     }
1051
1052   if (!digits_found)
1053     {
1054       cpp_error (pfile, CPP_DL_ERROR,
1055                  "\\x used with no following hex digits");
1056       return from;
1057     }
1058
1059   if (overflow | (n != (n & mask)))
1060     {
1061       cpp_error (pfile, CPP_DL_PEDWARN,
1062                  "hex escape sequence out of range");
1063       n &= mask;
1064     }
1065
1066   emit_numeric_escape (pfile, n, tbuf, wide);
1067
1068   return from;
1069 }
1070
1071 /* Convert an octal escape, pointed to by FROM, to the execution
1072    character set and write it into the string buffer TBUF.  Returns an
1073    advanced pointer, and issues diagnostics as necessary.
1074    No character set translation occurs; this routine always produces the
1075    execution-set character with numeric value equal to the given octal
1076    number.  */
1077 static const uchar *
1078 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
1079              struct _cpp_strbuf *tbuf, bool wide)
1080 {
1081   size_t count = 0;
1082   cppchar_t c, n = 0;
1083   size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1084                   : CPP_OPTION (pfile, char_precision));
1085   size_t mask = width_to_mask (width);
1086   bool overflow = false;
1087
1088   while (from < limit && count++ < 3)
1089     {
1090       c = *from;
1091       if (c < '0' || c > '7')
1092         break;
1093       from++;
1094       overflow |= n ^ (n << 3 >> 3);
1095       n = (n << 3) + c - '0';
1096     }
1097
1098   if (n != (n & mask))
1099     {
1100       cpp_error (pfile, CPP_DL_PEDWARN,
1101                  "octal escape sequence out of range");
1102       n &= mask;
1103     }
1104
1105   emit_numeric_escape (pfile, n, tbuf, wide);
1106
1107   return from;
1108 }
1109
1110 /* Convert an escape sequence (pointed to by FROM) to its value on
1111    the target, and to the execution character set.  Do not scan past
1112    LIMIT.  Write the converted value into TBUF.  Returns an advanced
1113    pointer.  Handles all relevant diagnostics.  */
1114 static const uchar *
1115 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
1116                 struct _cpp_strbuf *tbuf, bool wide)
1117 {
1118   /* Values of \a \b \e \f \n \r \t \v respectively.  */
1119 #if HOST_CHARSET == HOST_CHARSET_ASCII
1120   static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
1121 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1122   static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
1123 #else
1124 #error "unknown host character set"
1125 #endif
1126
1127   uchar c;
1128   struct cset_converter cvt
1129     = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1130
1131   c = *from;
1132   switch (c)
1133     {
1134       /* UCNs, hex escapes, and octal escapes are processed separately.  */
1135     case 'u': case 'U':
1136       return convert_ucn (pfile, from, limit, tbuf, wide);
1137
1138     case 'x':
1139       return convert_hex (pfile, from, limit, tbuf, wide);
1140       break;
1141
1142     case '0':  case '1':  case '2':  case '3':
1143     case '4':  case '5':  case '6':  case '7':
1144       return convert_oct (pfile, from, limit, tbuf, wide);
1145
1146       /* Various letter escapes.  Get the appropriate host-charset
1147          value into C.  */
1148     case '\\': case '\'': case '"': case '?': break;
1149
1150     case '(': case '{': case '[': case '%':
1151       /* '\(', etc, can be used at the beginning of a line in a long
1152          string split onto multiple lines with \-newline, to prevent
1153          Emacs or other text editors from getting confused.  '\%' can
1154          be used to prevent SCCS from mangling printf format strings.  */
1155       if (CPP_PEDANTIC (pfile))
1156         goto unknown;
1157       break;
1158
1159     case 'b': c = charconsts[1];  break;
1160     case 'f': c = charconsts[3];  break;
1161     case 'n': c = charconsts[4];  break;
1162     case 'r': c = charconsts[5];  break;
1163     case 't': c = charconsts[6];  break;
1164     case 'v': c = charconsts[7];  break;
1165
1166     case 'a':
1167       if (CPP_WTRADITIONAL (pfile))
1168         cpp_error (pfile, CPP_DL_WARNING,
1169                    "the meaning of '\\a' is different in traditional C");
1170       c = charconsts[0];
1171       break;
1172
1173     case 'e': case 'E':
1174       if (CPP_PEDANTIC (pfile))
1175         cpp_error (pfile, CPP_DL_PEDWARN,
1176                    "non-ISO-standard escape sequence, '\\%c'", (int) c);
1177       c = charconsts[2];
1178       break;
1179
1180     default:
1181     unknown:
1182       if (ISGRAPH (c))
1183         cpp_error (pfile, CPP_DL_PEDWARN,
1184                    "unknown escape sequence '\\%c'", (int) c);
1185       else
1186         cpp_error (pfile, CPP_DL_PEDWARN,
1187                    "unknown escape sequence: '\\%03o'", (int) c);
1188     }
1189
1190   /* Now convert what we have to the execution character set.  */
1191   if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1192     cpp_errno (pfile, CPP_DL_ERROR,
1193                "converting escape sequence to execution character set");
1194
1195   return from + 1;
1196 }
1197 \f
1198 /* FROM is an array of cpp_string structures of length COUNT.  These
1199    are to be converted from the source to the execution character set,
1200    escape sequences translated, and finally all are to be
1201    concatenated.  WIDE indicates whether or not to produce a wide
1202    string.  The result is written into TO.  Returns true for success,
1203    false for failure.  */
1204 bool
1205 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1206                       cpp_string *to, bool wide)
1207 {
1208   struct _cpp_strbuf tbuf;
1209   const uchar *p, *base, *limit;
1210   size_t i;
1211   struct cset_converter cvt
1212     = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1213
1214   tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1215   tbuf.text = xmalloc (tbuf.asize);
1216   tbuf.len = 0;
1217
1218   for (i = 0; i < count; i++)
1219     {
1220       p = from[i].text;
1221       if (*p == 'L') p++;
1222       p++; /* Skip leading quote.  */
1223       limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */
1224
1225       for (;;)
1226         {
1227           base = p;
1228           while (p < limit && *p != '\\')
1229             p++;
1230           if (p > base)
1231             {
1232               /* We have a run of normal characters; these can be fed
1233                  directly to convert_cset.  */
1234               if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1235                 goto fail;
1236             }
1237           if (p == limit)
1238             break;
1239
1240           p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
1241         }
1242     }
1243   /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1244      structure.  */
1245   emit_numeric_escape (pfile, 0, &tbuf, wide);
1246   tbuf.text = xrealloc (tbuf.text, tbuf.len);
1247   to->text = tbuf.text;
1248   to->len = tbuf.len;
1249   return true;
1250
1251  fail:
1252   cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1253   free (tbuf.text);
1254   return false;
1255 }
1256
1257 /* Subroutine of do_line and do_linemarker.  Convert escape sequences
1258    in a string, but do not perform character set conversion.  */
1259 bool
1260 cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1261                                   size_t count, cpp_string *to, bool wide)
1262 {
1263   struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
1264   bool retval;
1265
1266   pfile->narrow_cset_desc.func = convert_no_conversion;
1267   pfile->narrow_cset_desc.cd = (iconv_t) -1;
1268
1269   retval = cpp_interpret_string (pfile, from, count, to, wide);
1270
1271   pfile->narrow_cset_desc = save_narrow_cset_desc;
1272   return retval;
1273 }
1274
1275 \f
1276 /* Subroutine of cpp_interpret_charconst which performs the conversion
1277    to a number, for narrow strings.  STR is the string structure returned
1278    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1279    cpp_interpret_charconst.  */
1280 static cppchar_t
1281 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1282                          unsigned int *pchars_seen, int *unsignedp)
1283 {
1284   size_t width = CPP_OPTION (pfile, char_precision);
1285   size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1286   size_t mask = width_to_mask (width);
1287   size_t i;
1288   cppchar_t result, c;
1289   bool unsigned_p;
1290
1291   /* The value of a multi-character character constant, or a
1292      single-character character constant whose representation in the
1293      execution character set is more than one byte long, is
1294      implementation defined.  This implementation defines it to be the
1295      number formed by interpreting the byte sequence in memory as a
1296      big-endian binary number.  If overflow occurs, the high bytes are
1297      lost, and a warning is issued.
1298
1299      We don't want to process the NUL terminator handed back by
1300      cpp_interpret_string.  */
1301   result = 0;
1302   for (i = 0; i < str.len - 1; i++)
1303     {
1304       c = str.text[i] & mask;
1305       if (width < BITS_PER_CPPCHAR_T)
1306         result = (result << width) | c;
1307       else
1308         result = c;
1309     }
1310
1311   if (i > max_chars)
1312     {
1313       i = max_chars;
1314       cpp_error (pfile, CPP_DL_WARNING,
1315                  "character constant too long for its type");
1316     }
1317   else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
1318     cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
1319
1320   /* Multichar constants are of type int and therefore signed.  */
1321   if (i > 1)
1322     unsigned_p = 0;
1323   else
1324     unsigned_p = CPP_OPTION (pfile, unsigned_char);
1325
1326   /* Truncate the constant to its natural width, and simultaneously
1327      sign- or zero-extend to the full width of cppchar_t.
1328      For single-character constants, the value is WIDTH bits wide.
1329      For multi-character constants, the value is INT_PRECISION bits wide.  */
1330   if (i > 1)
1331     width = CPP_OPTION (pfile, int_precision);
1332   if (width < BITS_PER_CPPCHAR_T)
1333     {
1334       mask = ((cppchar_t) 1 << width) - 1;
1335       if (unsigned_p || !(result & (1 << (width - 1))))
1336         result &= mask;
1337       else
1338         result |= ~mask;
1339     }
1340   *pchars_seen = i;
1341   *unsignedp = unsigned_p;
1342   return result;
1343 }
1344
1345 /* Subroutine of cpp_interpret_charconst which performs the conversion
1346    to a number, for wide strings.  STR is the string structure returned
1347    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1348    cpp_interpret_charconst.  */
1349 static cppchar_t
1350 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1351                        unsigned int *pchars_seen, int *unsignedp)
1352 {
1353   bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1354   size_t width = CPP_OPTION (pfile, wchar_precision);
1355   size_t cwidth = CPP_OPTION (pfile, char_precision);
1356   size_t mask = width_to_mask (width);
1357   size_t cmask = width_to_mask (cwidth);
1358   size_t nbwc = width / cwidth;
1359   size_t off, i;
1360   cppchar_t result = 0, c;
1361
1362   /* This is finicky because the string is in the target's byte order,
1363      which may not be our byte order.  Only the last character, ignoring
1364      the NUL terminator, is relevant.  */
1365   off = str.len - (nbwc * 2);
1366   result = 0;
1367   for (i = 0; i < nbwc; i++)
1368     {
1369       c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
1370       result = (result << cwidth) | (c & cmask);
1371     }
1372
1373   /* Wide character constants have type wchar_t, and a single
1374      character exactly fills a wchar_t, so a multi-character wide
1375      character constant is guaranteed to overflow.  */
1376   if (off > 0)
1377     cpp_error (pfile, CPP_DL_WARNING,
1378                "character constant too long for its type");
1379
1380   /* Truncate the constant to its natural width, and simultaneously
1381      sign- or zero-extend to the full width of cppchar_t.  */
1382   if (width < BITS_PER_CPPCHAR_T)
1383     {
1384       if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
1385         result &= mask;
1386       else
1387         result |= ~mask;
1388     }
1389
1390   *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
1391   *pchars_seen = 1;
1392   return result;
1393 }
1394
1395 /* Interpret a (possibly wide) character constant in TOKEN.
1396    PCHARS_SEEN points to a variable that is filled in with the number
1397    of characters seen, and UNSIGNEDP to a variable that indicates
1398    whether the result has signed type.  */
1399 cppchar_t
1400 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
1401                          unsigned int *pchars_seen, int *unsignedp)
1402 {
1403   cpp_string str = { 0, 0 };
1404   bool wide = (token->type == CPP_WCHAR);
1405   cppchar_t result;
1406
1407   /* an empty constant will appear as L'' or '' */
1408   if (token->val.str.len == (size_t) (2 + wide))
1409     {
1410       cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1411       return 0;
1412     }
1413   else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
1414     return 0;
1415
1416   if (wide)
1417     result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1418   else
1419     result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1420
1421   if (str.text != token->val.str.text)
1422     free ((void *)str.text);
1423
1424   return result;
1425 }
1426 \f
1427 /* Convert an identifier denoted by ID and LEN, which might contain
1428    UCN escapes, to the source character set, either UTF-8 or
1429    UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.  */
1430 cpp_hashnode *
1431 _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
1432 {
1433   /* It turns out that a UCN escape always turns into fewer characters
1434      than the escape itself, so we can allocate a temporary in advance.  */
1435   uchar * buf = alloca (len + 1);
1436   uchar * bufp = buf;
1437   size_t idp;
1438
1439   for (idp = 0; idp < len; idp++)
1440     if (id[idp] != '\\')
1441       *bufp++ = id[idp];
1442     else
1443       {
1444         unsigned length = id[idp+1] == 'u' ? 4 : 8;
1445         cppchar_t value = 0;
1446         size_t bufleft = len - (bufp - buf);
1447         int rval;
1448
1449         idp += 2;
1450         while (length && idp < len && ISXDIGIT (id[idp]))
1451           {
1452             value = (value << 4) + hex_value (id[idp]);
1453             idp++;
1454             length--;
1455           }
1456         idp--;
1457
1458         /* Special case for EBCDIC: if the identifier contains
1459            a '$' specified using a UCN, translate it to EBCDIC.  */
1460         if (value == 0x24)
1461           {
1462             *bufp++ = '$';
1463             continue;
1464           }
1465
1466         rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
1467         if (rval)
1468           {
1469             errno = rval;
1470             cpp_errno (pfile, CPP_DL_ERROR,
1471                        "converting UCN to source character set");
1472             break;
1473           }
1474       }
1475
1476   return CPP_HASHNODE (ht_lookup (pfile->hash_table,
1477                                   buf, bufp - buf, HT_ALLOC));
1478 }
1479 \f
1480 /* Convert an input buffer (containing the complete contents of one
1481    source file) from INPUT_CHARSET to the source character set.  INPUT
1482    points to the input buffer, SIZE is its allocated size, and LEN is
1483    the length of the meaningful data within the buffer.  The
1484    translated buffer is returned, and *ST_SIZE is set to the length of
1485    the meaningful data within the translated buffer.
1486
1487    INPUT is expected to have been allocated with xmalloc.  This function
1488    will either return INPUT, or free it and return a pointer to another
1489    xmalloc-allocated block of memory.  */
1490 uchar *
1491 _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
1492                     uchar *input, size_t size, size_t len, off_t *st_size)
1493 {
1494   struct cset_converter input_cset;
1495   struct _cpp_strbuf to;
1496
1497   input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
1498   if (input_cset.func == convert_no_conversion)
1499     {
1500       to.text = input;
1501       to.asize = size;
1502       to.len = len;
1503     }
1504   else
1505     {
1506       to.asize = MAX (65536, len);
1507       to.text = xmalloc (to.asize);
1508       to.len = 0;
1509
1510       if (!APPLY_CONVERSION (input_cset, input, len, &to))
1511         cpp_error (pfile, CPP_DL_ERROR,
1512                    "failure to convert %s to %s",
1513                    CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
1514
1515       free (input);
1516     }
1517
1518   /* Clean up the mess.  */
1519   if (input_cset.func == convert_using_iconv)
1520     iconv_close (input_cset.cd);
1521
1522   /* Resize buffer if we allocated substantially too much, or if we
1523      haven't enough space for the \n-terminator.  */
1524   if (to.len + 4096 < to.asize || to.len >= to.asize)
1525     to.text = xrealloc (to.text, to.len + 1);
1526
1527   /* If the file is using old-school Mac line endings (\r only),
1528      terminate with another \r, not an \n, so that we do not mistake
1529      the \r\n sequence for a single DOS line ending and erroneously
1530      issue the "No newline at end of file" diagnostic.  */
1531   if (to.text[to.len - 1] == '\r')
1532     to.text[to.len] = '\r';
1533   else
1534     to.text[to.len] = '\n';
1535
1536   *st_size = to.len;
1537   return to.text;
1538 }
1539
1540 /* Decide on the default encoding to assume for input files.  */
1541 const char *
1542 _cpp_default_encoding (void)
1543 {
1544   const char *current_encoding = NULL;
1545
1546   /* We disable this because the default codeset is 7-bit ASCII on
1547      most platforms, and this causes conversion failures on every
1548      file in GCC that happens to have one of the upper 128 characters
1549      in it -- most likely, as part of the name of a contributor.
1550      We should definitely recognize in-band markers of file encoding,
1551      like:
1552      - the appropriate Unicode byte-order mark (FE FF) to recognize
1553        UTF16 and UCS4 (in both big-endian and little-endian flavors)
1554        and UTF8
1555      - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
1556        distinguish ASCII and EBCDIC.
1557      - now we can parse something like "#pragma GCC encoding <xyz>
1558        on the first line, or even Emacs/VIM's mode line tags (there's
1559        a problem here in that VIM uses the last line, and Emacs has
1560        its more elaborate "local variables" convention).
1561      - investigate whether Java has another common convention, which
1562        would be friendly to support.
1563      (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
1564 #if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
1565   setlocale (LC_CTYPE, "");
1566   current_encoding = nl_langinfo (CODESET);
1567 #endif
1568   if (current_encoding == NULL || *current_encoding == '\0')
1569     current_encoding = SOURCE_CHARSET;
1570
1571   return current_encoding;
1572 }