Index: libcpp/ChangeLog
[official-gcc.git] / libcpp / charset.c
blobcd25f10a2e69fae2f969a6e2fc0a54d37547a2c4
1 /* CPP Library - charsets
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
3 Free Software Foundation, Inc.
5 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
7 This program is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
10 later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
21 #include "config.h"
22 #include "system.h"
23 #include "cpplib.h"
24 #include "internal.h"
25 #include "ucnid.h"
27 /* Character set handling for C-family languages.
29 Terminological note: In what follows, "charset" or "character set"
30 will be taken to mean both an abstract set of characters and an
31 encoding for that set.
33 The C99 standard discusses two character sets: source and execution.
34 The source character set is used for internal processing in translation
35 phases 1 through 4; the execution character set is used thereafter.
36 Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
37 character encodings (see 3.7.2, 3.7.3 for the standardese meanings
38 of these terms). Furthermore, the "basic character set" (listed in
39 5.2.1p3) is to be encoded in each with values one byte wide, and is
40 to appear in the initial shift state.
42 It is not explicitly mentioned, but there is also a "wide execution
43 character set" used to encode wide character constants and wide
44 string literals; this is supposed to be the result of applying the
45 standard library function mbstowcs() to an equivalent narrow string
46 (6.4.5p5). However, the behavior of hexadecimal and octal
47 \-escapes is at odds with this; they are supposed to be translated
48 directly to wchar_t values (6.4.4.4p5,6).
50 The source character set is not necessarily the character set used
51 to encode physical source files on disk; translation phase 1 converts
52 from whatever that encoding is to the source character set.
54 The presence of universal character names in C99 (6.4.3 et seq.)
55 forces the source character set to be isomorphic to ISO 10646,
56 that is, Unicode. There is no such constraint on the execution
57 character set; note also that the conversion from source to
58 execution character set does not occur for identifiers (5.1.1.2p1#5).
60 For convenience of implementation, the source character set's
61 encoding of the basic character set should be identical to the
62 execution character set OF THE HOST SYSTEM's encoding of the basic
63 character set, and it should not be a state-dependent encoding.
65 cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
66 depending on whether the host is based on ASCII or EBCDIC (see
67 respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
68 Technical Report #16). With limited exceptions, it relies on the
69 system library's iconv() primitive to do charset conversion
70 (specified in SUSv2). */
72 #if !HAVE_ICONV
73 /* Make certain that the uses of iconv(), iconv_open(), iconv_close()
74 below, which are guarded only by if statements with compile-time
75 constant conditions, do not cause link errors. */
76 #define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
77 #define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
78 #define iconv_close(x) (void)0
79 #define ICONV_CONST
80 #endif
82 #if HOST_CHARSET == HOST_CHARSET_ASCII
83 #define SOURCE_CHARSET "UTF-8"
84 #define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
85 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
86 #define SOURCE_CHARSET "UTF-EBCDIC"
87 #define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
88 #else
89 #error "Unrecognized basic host character set"
90 #endif
92 #ifndef EILSEQ
93 #define EILSEQ EINVAL
94 #endif
96 /* This structure is used for a resizable string buffer throughout. */
97 /* Don't call it strbuf, as that conflicts with unistd.h on systems
98 such as DYNIX/ptx where unistd.h includes stropts.h. */
99 struct _cpp_strbuf
101 uchar *text;
102 size_t asize;
103 size_t len;
106 /* This is enough to hold any string that fits on a single 80-column
107 line, even if iconv quadruples its size (e.g. conversion from
108 ASCII to UTF-32) rounded up to a power of two. */
109 #define OUTBUF_BLOCK_SIZE 256
111 /* Conversions between UTF-8 and UTF-16/32 are implemented by custom
112 logic. This is because a depressing number of systems lack iconv,
113 or have iconv libraries that do not do these conversions, so
114 we need a fallback implementation for them. To ensure the fallback
115 doesn't break due to neglect, it is used on all systems.
117 UTF-32 encoding is nice and simple: a four-byte binary number,
118 constrained to the range 00000000-7FFFFFFF to avoid questions of
119 signedness. We do have to cope with big- and little-endian
120 variants.
122 UTF-16 encoding uses two-byte binary numbers, again in big- and
123 little-endian variants, for all values in the 00000000-0000FFFF
124 range. Values in the 00010000-0010FFFF range are encoded as pairs
125 of two-byte numbers, called "surrogate pairs": given a number S in
126 this range, it is mapped to a pair (H, L) as follows:
128 H = (S - 0x10000) / 0x400 + 0xD800
129 L = (S - 0x10000) % 0x400 + 0xDC00
131 Two-byte values in the D800...DFFF range are ill-formed except as a
132 component of a surrogate pair. Even if the encoding within a
133 two-byte value is little-endian, the H member of the surrogate pair
134 comes first.
136 There is no way to encode values in the 00110000-7FFFFFFF range,
137 which is not currently a problem as there are no assigned code
138 points in that range; however, the author expects that it will
139 eventually become necessary to abandon UTF-16 due to this
140 limitation. Note also that, because of these pairs, UTF-16 does
141 not meet the requirements of the C standard for a wide character
142 encoding (see 3.7.3 and 6.4.4.4p11).
144 UTF-8 encoding looks like this:
146 value range encoded as
147 00000000-0000007F 0xxxxxxx
148 00000080-000007FF 110xxxxx 10xxxxxx
149 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
150 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
151 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
152 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
154 Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
155 which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
156 never occur. Note also that any value that can be encoded by a
157 given row of the table can also be encoded by all successive rows,
158 but this is not done; only the shortest possible encoding for any
159 given value is valid. For instance, the character 07C0 could be
160 encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
161 FC 80 80 80 9F 80. Only the first is valid.
163 An implementation note: the transformation from UTF-16 to UTF-8, or
164 vice versa, is easiest done by using UTF-32 as an intermediary. */
166 /* Internal primitives which go from an UTF-8 byte stream to native-endian
167 UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
168 operation in several places below. */
169 static inline int
170 one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
171 cppchar_t *cp)
173 static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
174 static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
176 cppchar_t c;
177 const uchar *inbuf = *inbufp;
178 size_t nbytes, i;
180 if (*inbytesleftp < 1)
181 return EINVAL;
183 c = *inbuf;
184 if (c < 0x80)
186 *cp = c;
187 *inbytesleftp -= 1;
188 *inbufp += 1;
189 return 0;
192 /* The number of leading 1-bits in the first byte indicates how many
193 bytes follow. */
194 for (nbytes = 2; nbytes < 7; nbytes++)
195 if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
196 goto found;
197 return EILSEQ;
198 found:
200 if (*inbytesleftp < nbytes)
201 return EINVAL;
203 c = (c & masks[nbytes-1]);
204 inbuf++;
205 for (i = 1; i < nbytes; i++)
207 cppchar_t n = *inbuf++;
208 if ((n & 0xC0) != 0x80)
209 return EILSEQ;
210 c = ((c << 6) + (n & 0x3F));
213 /* Make sure the shortest possible encoding was used. */
214 if (c <= 0x7F && nbytes > 1) return EILSEQ;
215 if (c <= 0x7FF && nbytes > 2) return EILSEQ;
216 if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
217 if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
218 if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
220 /* Make sure the character is valid. */
221 if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
223 *cp = c;
224 *inbufp = inbuf;
225 *inbytesleftp -= nbytes;
226 return 0;
229 static inline int
230 one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
232 static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
233 static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
234 size_t nbytes;
235 uchar buf[6], *p = &buf[6];
236 uchar *outbuf = *outbufp;
238 nbytes = 1;
239 if (c < 0x80)
240 *--p = c;
241 else
245 *--p = ((c & 0x3F) | 0x80);
246 c >>= 6;
247 nbytes++;
249 while (c >= 0x3F || (c & limits[nbytes-1]));
250 *--p = (c | masks[nbytes-1]);
253 if (*outbytesleftp < nbytes)
254 return E2BIG;
256 while (p < &buf[6])
257 *outbuf++ = *p++;
258 *outbytesleftp -= nbytes;
259 *outbufp = outbuf;
260 return 0;
263 /* The following four functions transform one character between the two
264 encodings named in the function name. All have the signature
265 int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
266 uchar **outbufp, size_t *outbytesleftp)
268 BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
269 interpreted as a boolean indicating whether big-endian or
270 little-endian encoding is to be used for the member of the pair
271 that is not UTF-8.
273 INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
274 do for iconv.
276 The return value is either 0 for success, or an errno value for
277 failure, which may be E2BIG (need more space), EILSEQ (ill-formed
278 input sequence), or EINVAL (incomplete input sequence). */
280 static inline int
281 one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
282 uchar **outbufp, size_t *outbytesleftp)
284 uchar *outbuf;
285 cppchar_t s = 0;
286 int rval;
288 /* Check for space first, since we know exactly how much we need. */
289 if (*outbytesleftp < 4)
290 return E2BIG;
292 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
293 if (rval)
294 return rval;
296 outbuf = *outbufp;
297 outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
298 outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
299 outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
300 outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
302 *outbufp += 4;
303 *outbytesleftp -= 4;
304 return 0;
307 static inline int
308 one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
309 uchar **outbufp, size_t *outbytesleftp)
311 cppchar_t s;
312 int rval;
313 const uchar *inbuf;
315 if (*inbytesleftp < 4)
316 return EINVAL;
318 inbuf = *inbufp;
320 s = inbuf[bigend ? 0 : 3] << 24;
321 s += inbuf[bigend ? 1 : 2] << 16;
322 s += inbuf[bigend ? 2 : 1] << 8;
323 s += inbuf[bigend ? 3 : 0];
325 if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
326 return EILSEQ;
328 rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
329 if (rval)
330 return rval;
332 *inbufp += 4;
333 *inbytesleftp -= 4;
334 return 0;
337 static inline int
338 one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
339 uchar **outbufp, size_t *outbytesleftp)
341 int rval;
342 cppchar_t s = 0;
343 const uchar *save_inbuf = *inbufp;
344 size_t save_inbytesleft = *inbytesleftp;
345 uchar *outbuf = *outbufp;
347 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
348 if (rval)
349 return rval;
351 if (s > 0x0010FFFF)
353 *inbufp = save_inbuf;
354 *inbytesleftp = save_inbytesleft;
355 return EILSEQ;
358 if (s < 0xFFFF)
360 if (*outbytesleftp < 2)
362 *inbufp = save_inbuf;
363 *inbytesleftp = save_inbytesleft;
364 return E2BIG;
366 outbuf[bigend ? 1 : 0] = (s & 0x00FF);
367 outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
369 *outbufp += 2;
370 *outbytesleftp -= 2;
371 return 0;
373 else
375 cppchar_t hi, lo;
377 if (*outbytesleftp < 4)
379 *inbufp = save_inbuf;
380 *inbytesleftp = save_inbytesleft;
381 return E2BIG;
384 hi = (s - 0x10000) / 0x400 + 0xD800;
385 lo = (s - 0x10000) % 0x400 + 0xDC00;
387 /* Even if we are little-endian, put the high surrogate first.
388 ??? Matches practice? */
389 outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
390 outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
391 outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
392 outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
394 *outbufp += 4;
395 *outbytesleftp -= 4;
396 return 0;
400 static inline int
401 one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
402 uchar **outbufp, size_t *outbytesleftp)
404 cppchar_t s;
405 const uchar *inbuf = *inbufp;
406 int rval;
408 if (*inbytesleftp < 2)
409 return EINVAL;
410 s = inbuf[bigend ? 0 : 1] << 8;
411 s += inbuf[bigend ? 1 : 0];
413 /* Low surrogate without immediately preceding high surrogate is invalid. */
414 if (s >= 0xDC00 && s <= 0xDFFF)
415 return EILSEQ;
416 /* High surrogate must have a following low surrogate. */
417 else if (s >= 0xD800 && s <= 0xDBFF)
419 cppchar_t hi = s, lo;
420 if (*inbytesleftp < 4)
421 return EINVAL;
423 lo = inbuf[bigend ? 2 : 3] << 8;
424 lo += inbuf[bigend ? 3 : 2];
426 if (lo < 0xDC00 || lo > 0xDFFF)
427 return EILSEQ;
429 s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
432 rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
433 if (rval)
434 return rval;
436 /* Success - update the input pointers (one_cppchar_to_utf8 has done
437 the output pointers for us). */
438 if (s <= 0xFFFF)
440 *inbufp += 2;
441 *inbytesleftp -= 2;
443 else
445 *inbufp += 4;
446 *inbytesleftp -= 4;
448 return 0;
451 /* Helper routine for the next few functions. The 'const' on
452 one_conversion means that we promise not to modify what function is
453 pointed to, which lets the inliner see through it. */
455 static inline bool
456 conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
457 uchar **, size_t *),
458 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
460 const uchar *inbuf;
461 uchar *outbuf;
462 size_t inbytesleft, outbytesleft;
463 int rval;
465 inbuf = from;
466 inbytesleft = flen;
467 outbuf = to->text + to->len;
468 outbytesleft = to->asize - to->len;
470 for (;;)
473 rval = one_conversion (cd, &inbuf, &inbytesleft,
474 &outbuf, &outbytesleft);
475 while (inbytesleft && !rval);
477 if (__builtin_expect (inbytesleft == 0, 1))
479 to->len = to->asize - outbytesleft;
480 return true;
482 if (rval != E2BIG)
484 errno = rval;
485 return false;
488 outbytesleft += OUTBUF_BLOCK_SIZE;
489 to->asize += OUTBUF_BLOCK_SIZE;
490 to->text = xrealloc (to->text, to->asize);
491 outbuf = to->text + to->asize - outbytesleft;
496 /* These functions convert entire strings between character sets.
497 They all have the signature
499 bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
501 The input string FROM is converted as specified by the function
502 name plus the iconv descriptor CD (which may be fake), and the
503 result appended to TO. On any error, false is returned, otherwise true. */
505 /* These four use the custom conversion code above. */
506 static bool
507 convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
508 struct _cpp_strbuf *to)
510 return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
513 static bool
514 convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
515 struct _cpp_strbuf *to)
517 return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
520 static bool
521 convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
522 struct _cpp_strbuf *to)
524 return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
527 static bool
528 convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
529 struct _cpp_strbuf *to)
531 return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
534 /* Identity conversion, used when we have no alternative. */
535 static bool
536 convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
537 const uchar *from, size_t flen, struct _cpp_strbuf *to)
539 if (to->len + flen > to->asize)
541 to->asize = to->len + flen;
542 to->text = xrealloc (to->text, to->asize);
544 memcpy (to->text + to->len, from, flen);
545 to->len += flen;
546 return true;
/* Converter backed by the system iconv primitive.  It cannot share
   conversion_loop because iconv reports failure through errno and
   works on whole buffers rather than single characters.  Appends the
   converted form of the FLEN bytes at FROM to TO; returns false if CD
   cannot be reset or iconv fails with anything other than E2BIG.  */
#if HAVE_ICONV
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
                     struct _cpp_strbuf *to)
{
  ICONV_CONST char *src = (ICONV_CONST char *) from;
  size_t src_left = flen;
  char *dst;
  size_t dst_left;

  /* Reset conversion descriptor and check that it is valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t) -1)
    return false;

  dst = (char *) to->text + to->len;
  dst_left = to->asize - to->len;

  while (1)
    {
      iconv (cd, &src, &src_left, &dst, &dst_left);
      if (__builtin_expect (src_left == 0, 1))
        {
          to->len = to->asize - dst_left;
          return true;
        }

      /* Only "output buffer full" is recoverable.  */
      if (errno != E2BIG)
        return false;

      /* Add one block and recompute the write position, since xrealloc
         may have moved the buffer.  */
      dst_left += OUTBUF_BLOCK_SIZE;
      to->asize += OUTBUF_BLOCK_SIZE;
      to->text = xrealloc (to->text, to->asize);
      dst = (char *) to->text + to->asize - dst_left;
    }
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
590 /* Arrange for the above custom conversion logic to be used automatically
591 when conversion between a suitable pair of character sets is requested. */
593 #define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
594 CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
596 struct conversion
598 const char *pair;
599 convert_f func;
600 iconv_t fake_cd;
602 static const struct conversion conversion_tab[] = {
603 { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
604 { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
605 { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
606 { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
607 { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
608 { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
609 { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
610 { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
613 /* Subroutine of cpp_init_iconv: initialize and return a
614 cset_converter structure for conversion from FROM to TO. If
615 iconv_open() fails, issue an error and return an identity
616 converter. Silently return an identity converter if FROM and TO
617 are identical. */
618 static struct cset_converter
619 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
621 struct cset_converter ret;
622 char *pair;
623 size_t i;
625 if (!strcasecmp (to, from))
627 ret.func = convert_no_conversion;
628 ret.cd = (iconv_t) -1;
629 return ret;
632 pair = alloca(strlen(to) + strlen(from) + 2);
634 strcpy(pair, from);
635 strcat(pair, "/");
636 strcat(pair, to);
637 for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
638 if (!strcasecmp (pair, conversion_tab[i].pair))
640 ret.func = conversion_tab[i].func;
641 ret.cd = conversion_tab[i].fake_cd;
642 return ret;
645 /* No custom converter - try iconv. */
646 if (HAVE_ICONV)
648 ret.func = convert_using_iconv;
649 ret.cd = iconv_open (to, from);
651 if (ret.cd == (iconv_t) -1)
653 if (errno == EINVAL)
654 cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
655 "conversion from %s to %s not supported by iconv",
656 from, to);
657 else
658 cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
660 ret.func = convert_no_conversion;
663 else
665 cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
666 "no iconv implementation, cannot convert from %s to %s",
667 from, to);
668 ret.func = convert_no_conversion;
669 ret.cd = (iconv_t) -1;
671 return ret;
674 /* If charset conversion is requested, initialize iconv(3) descriptors
675 for conversion from the source character set to the execution
676 character sets. If iconv is not present in the C library, and
677 conversion is requested, issue an error. */
679 void
680 cpp_init_iconv (cpp_reader *pfile)
682 const char *ncset = CPP_OPTION (pfile, narrow_charset);
683 const char *wcset = CPP_OPTION (pfile, wide_charset);
684 const char *default_wcset;
686 bool be = CPP_OPTION (pfile, bytes_big_endian);
688 if (CPP_OPTION (pfile, wchar_precision) >= 32)
689 default_wcset = be ? "UTF-32BE" : "UTF-32LE";
690 else if (CPP_OPTION (pfile, wchar_precision) >= 16)
691 default_wcset = be ? "UTF-16BE" : "UTF-16LE";
692 else
693 /* This effectively means that wide strings are not supported,
694 so don't do any conversion at all. */
695 default_wcset = SOURCE_CHARSET;
697 if (!ncset)
698 ncset = SOURCE_CHARSET;
699 if (!wcset)
700 wcset = default_wcset;
702 pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
703 pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
706 /* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */
707 void
708 _cpp_destroy_iconv (cpp_reader *pfile)
710 if (HAVE_ICONV)
712 if (pfile->narrow_cset_desc.func == convert_using_iconv)
713 iconv_close (pfile->narrow_cset_desc.cd);
714 if (pfile->wide_cset_desc.func == convert_using_iconv)
715 iconv_close (pfile->wide_cset_desc.cd);
719 /* Utility routine for use by a full compiler. C is a character taken
720 from the *basic* source character set, encoded in the host's
721 execution encoding. Convert it to (the target's) execution
722 encoding, and return that value.
724 Issues an internal error if C's representation in the narrow
725 execution character set fails to be a single-byte value (C99
726 5.2.1p3: "The representation of each member of the source and
727 execution character sets shall fit in a byte.") May also issue an
728 internal error if C fails to be a member of the basic source
729 character set (testing this exactly is too hard, especially when
730 the host character set is EBCDIC). */
731 cppchar_t
732 cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
734 uchar sbuf[1];
735 struct _cpp_strbuf tbuf;
737 /* This test is merely an approximation, but it suffices to catch
738 the most important thing, which is that we don't get handed a
739 character outside the unibyte range of the host character set. */
740 if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
742 cpp_error (pfile, CPP_DL_ICE,
743 "character 0x%lx is not in the basic source character set\n",
744 (unsigned long)c);
745 return 0;
748 /* Being a character in the unibyte range of the host character set,
749 we can safely splat it into a one-byte buffer and trust that that
750 is a well-formed string. */
751 sbuf[0] = c;
753 /* This should never need to reallocate, but just in case... */
754 tbuf.asize = 1;
755 tbuf.text = xmalloc (tbuf.asize);
756 tbuf.len = 0;
758 if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
760 cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
761 return 0;
763 if (tbuf.len != 1)
765 cpp_error (pfile, CPP_DL_ICE,
766 "character 0x%lx is not unibyte in execution character set",
767 (unsigned long)c);
768 return 0;
770 c = tbuf.text[0];
771 free(tbuf.text);
772 return c;
777 /* Utility routine that computes a mask of the form 0000...111... with
778 WIDTH 1-bits. */
779 static inline size_t
780 width_to_mask (size_t width)
782 width = MIN (width, BITS_PER_CPPCHAR_T);
783 if (width >= CHAR_BIT * sizeof (size_t))
784 return ~(size_t) 0;
785 else
786 return ((size_t) 1 << width) - 1;
789 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
790 the start of an identifier, and 0 if C is not valid in an
791 identifier. We assume C has already gone through the checks of
792 _cpp_valid_ucn. The algorithm is a simple binary search on the
793 table defined in cppucnid.h. */
795 static int
796 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
798 int mn, mx, md;
800 mn = -1;
801 mx = ARRAY_SIZE (ucnranges);
802 while (mx - mn > 1)
804 md = (mn + mx) / 2;
805 if (c < ucnranges[md].lo)
806 mx = md;
807 else if (c > ucnranges[md].hi)
808 mn = md;
809 else
810 goto found;
812 return 0;
814 found:
815 /* When -pedantic, we require the character to have been listed by
816 the standard for the current language. Otherwise, we accept the
817 union of the acceptable sets for C++98 and C99. */
818 if (CPP_PEDANTIC (pfile)
819 && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
820 || (CPP_OPTION (pfile, cplusplus)
821 && !(ucnranges[md].flags & CXX))))
822 return 0;
824 /* In C99, UCN digits may not begin identifiers. */
825 if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
826 return 2;
828 return 1;
831 /* [lex.charset]: The character designated by the universal character
832 name \UNNNNNNNN is that character whose character short name in
833 ISO/IEC 10646 is NNNNNNNN; the character designated by the
834 universal character name \uNNNN is that character whose character
835 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
836 for a universal character name is less than 0x20 or in the range
837 0x7F-0x9F (inclusive), or if the universal character name
838 designates a character in the basic source character set, then the
839 program is ill-formed.
841 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
842 buffer end is delimited by a non-hex digit. Returns zero if UCNs
843 are not part of the relevant standard, or if the string beginning
844 at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
846 Otherwise the nonzero value of the UCN, whether valid or invalid,
847 is returned. Diagnostics are emitted for invalid values. PSTR
848 is updated to point one beyond the UCN, or to the syntactically
849 invalid character.
851 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
852 an identifier, or 2 otherwise. */
854 cppchar_t
855 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
856 const uchar *limit, int identifier_pos)
858 cppchar_t result, c;
859 unsigned int length;
860 const uchar *str = *pstr;
861 const uchar *base = str - 2;
863 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
864 cpp_error (pfile, CPP_DL_WARNING,
865 "universal character names are only valid in C++ and C99");
866 else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
867 cpp_error (pfile, CPP_DL_WARNING,
868 "the meaning of '\\%c' is different in traditional C",
869 (int) str[-1]);
871 if (str[-1] == 'u')
872 length = 4;
873 else if (str[-1] == 'U')
874 length = 8;
875 else
876 abort();
878 result = 0;
881 c = *str;
882 if (!ISXDIGIT (c))
883 break;
884 str++;
885 result = (result << 4) + hex_value (c);
887 while (--length && str < limit);
889 *pstr = str;
890 if (length)
892 /* We'll error when we try it out as the start of an identifier. */
893 cpp_error (pfile, CPP_DL_ERROR,
894 "incomplete universal character name %.*s",
895 (int) (str - base), base);
896 result = 1;
898 /* The standard permits $, @ and ` to be specified as UCNs. We use
899 hex escapes so that this also works with EBCDIC hosts. */
900 else if ((result < 0xa0
901 && (result != 0x24 && result != 0x40 && result != 0x60))
902 || (result & 0x80000000)
903 || (result >= 0xD800 && result <= 0xDFFF))
905 cpp_error (pfile, CPP_DL_ERROR,
906 "%.*s is not a valid universal character",
907 (int) (str - base), base);
908 result = 1;
910 else if (identifier_pos && result == 0x24
911 && CPP_OPTION (pfile, dollars_in_ident))
913 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
915 CPP_OPTION (pfile, warn_dollars) = 0;
916 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
919 else if (identifier_pos)
921 int validity = ucn_valid_in_identifier (pfile, result);
923 if (validity == 0)
924 cpp_error (pfile, CPP_DL_ERROR,
925 "universal character %.*s is not valid in an identifier",
926 (int) (str - base), base);
927 else if (validity == 2 && identifier_pos == 1)
928 cpp_error (pfile, CPP_DL_ERROR,
929 "universal character %.*s is not valid at the start of an identifier",
930 (int) (str - base), base);
933 if (result == 0)
934 result = 1;
936 return result;
939 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
940 it to the execution character set and write the result into TBUF.
941 An advanced pointer is returned. Issues all relevant diagnostics. */
942 static const uchar *
943 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
944 struct _cpp_strbuf *tbuf, bool wide)
946 cppchar_t ucn;
947 uchar buf[6];
948 uchar *bufp = buf;
949 size_t bytesleft = 6;
950 int rval;
951 struct cset_converter cvt
952 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
954 from++; /* Skip u/U. */
955 ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
957 rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
958 if (rval)
960 errno = rval;
961 cpp_errno (pfile, CPP_DL_ERROR,
962 "converting UCN to source character set");
964 else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
965 cpp_errno (pfile, CPP_DL_ERROR,
966 "converting UCN to execution character set");
968 return from;
971 /* Subroutine of convert_hex and convert_oct. N is the representation
972 in the execution character set of a numeric escape; write it into the
973 string buffer TBUF and update the end-of-string pointer therein. WIDE
974 is true if it's a wide string that's being assembled in TBUF. This
975 function issues no diagnostics and never fails. */
976 static void
977 emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
978 struct _cpp_strbuf *tbuf, bool wide)
980 if (wide)
982 /* We have to render this into the target byte order, which may not
983 be our byte order. */
984 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
985 size_t width = CPP_OPTION (pfile, wchar_precision);
986 size_t cwidth = CPP_OPTION (pfile, char_precision);
987 size_t cmask = width_to_mask (cwidth);
988 size_t nbwc = width / cwidth;
989 size_t i;
990 size_t off = tbuf->len;
991 cppchar_t c;
993 if (tbuf->len + nbwc > tbuf->asize)
995 tbuf->asize += OUTBUF_BLOCK_SIZE;
996 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
999 for (i = 0; i < nbwc; i++)
1001 c = n & cmask;
1002 n >>= cwidth;
1003 tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
1005 tbuf->len += nbwc;
1007 else
1009 /* Note: this code does not handle the case where the target
1010 and host have a different number of bits in a byte. */
1011 if (tbuf->len + 1 > tbuf->asize)
1013 tbuf->asize += OUTBUF_BLOCK_SIZE;
1014 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
1016 tbuf->text[tbuf->len++] = n;
1020 /* Convert a hexadecimal escape, pointed to by FROM, to the execution
1021 character set and write it into the string buffer TBUF. Returns an
1022 advanced pointer, and issues diagnostics as necessary.
1023 No character set translation occurs; this routine always produces the
1024 execution-set character with numeric value equal to the given hex
1025 number. You can, e.g. generate surrogate pairs this way. */
1026 static const uchar *
1027 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
1028 struct _cpp_strbuf *tbuf, bool wide)
1030 cppchar_t c, n = 0, overflow = 0;
1031 int digits_found = 0;
1032 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1033 : CPP_OPTION (pfile, char_precision));
1034 size_t mask = width_to_mask (width);
1036 if (CPP_WTRADITIONAL (pfile))
1037 cpp_error (pfile, CPP_DL_WARNING,
1038 "the meaning of '\\x' is different in traditional C");
1040 from++; /* Skip 'x'. */
1041 while (from < limit)
1043 c = *from;
1044 if (! hex_p (c))
1045 break;
1046 from++;
1047 overflow |= n ^ (n << 4 >> 4);
1048 n = (n << 4) + hex_value (c);
1049 digits_found = 1;
1052 if (!digits_found)
1054 cpp_error (pfile, CPP_DL_ERROR,
1055 "\\x used with no following hex digits");
1056 return from;
1059 if (overflow | (n != (n & mask)))
1061 cpp_error (pfile, CPP_DL_PEDWARN,
1062 "hex escape sequence out of range");
1063 n &= mask;
1066 emit_numeric_escape (pfile, n, tbuf, wide);
1068 return from;
1071 /* Convert an octal escape, pointed to by FROM, to the execution
1072 character set and write it into the string buffer TBUF. Returns an
1073 advanced pointer, and issues diagnostics as necessary.
1074 No character set translation occurs; this routine always produces the
1075 execution-set character with numeric value equal to the given octal
1076 number. */
1077 static const uchar *
1078 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
1079 struct _cpp_strbuf *tbuf, bool wide)
1081 size_t count = 0;
1082 cppchar_t c, n = 0;
1083 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1084 : CPP_OPTION (pfile, char_precision));
1085 size_t mask = width_to_mask (width);
1086 bool overflow = false;
1088 while (from < limit && count++ < 3)
1090 c = *from;
1091 if (c < '0' || c > '7')
1092 break;
1093 from++;
1094 overflow |= n ^ (n << 3 >> 3);
1095 n = (n << 3) + c - '0';
1098 if (n != (n & mask))
1100 cpp_error (pfile, CPP_DL_PEDWARN,
1101 "octal escape sequence out of range");
1102 n &= mask;
1105 emit_numeric_escape (pfile, n, tbuf, wide);
1107 return from;
1110 /* Convert an escape sequence (pointed to by FROM) to its value on
1111 the target, and to the execution character set. Do not scan past
1112 LIMIT. Write the converted value into TBUF. Returns an advanced
1113 pointer. Handles all relevant diagnostics. */
1114 static const uchar *
1115 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
1116 struct _cpp_strbuf *tbuf, bool wide)
1118 /* Values of \a \b \e \f \n \r \t \v respectively. */
1119 #if HOST_CHARSET == HOST_CHARSET_ASCII
1120 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
1121 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1122 static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
1123 #else
1124 #error "unknown host character set"
1125 #endif
1127 uchar c;
1128 struct cset_converter cvt
1129 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1131 c = *from;
1132 switch (c)
1134 /* UCNs, hex escapes, and octal escapes are processed separately. */
1135 case 'u': case 'U':
1136 return convert_ucn (pfile, from, limit, tbuf, wide);
1138 case 'x':
1139 return convert_hex (pfile, from, limit, tbuf, wide);
1140 break;
1142 case '0': case '1': case '2': case '3':
1143 case '4': case '5': case '6': case '7':
1144 return convert_oct (pfile, from, limit, tbuf, wide);
1146 /* Various letter escapes. Get the appropriate host-charset
1147 value into C. */
1148 case '\\': case '\'': case '"': case '?': break;
1150 case '(': case '{': case '[': case '%':
1151 /* '\(', etc, can be used at the beginning of a line in a long
1152 string split onto multiple lines with \-newline, to prevent
1153 Emacs or other text editors from getting confused. '\%' can
1154 be used to prevent SCCS from mangling printf format strings. */
1155 if (CPP_PEDANTIC (pfile))
1156 goto unknown;
1157 break;
1159 case 'b': c = charconsts[1]; break;
1160 case 'f': c = charconsts[3]; break;
1161 case 'n': c = charconsts[4]; break;
1162 case 'r': c = charconsts[5]; break;
1163 case 't': c = charconsts[6]; break;
1164 case 'v': c = charconsts[7]; break;
1166 case 'a':
1167 if (CPP_WTRADITIONAL (pfile))
1168 cpp_error (pfile, CPP_DL_WARNING,
1169 "the meaning of '\\a' is different in traditional C");
1170 c = charconsts[0];
1171 break;
1173 case 'e': case 'E':
1174 if (CPP_PEDANTIC (pfile))
1175 cpp_error (pfile, CPP_DL_PEDWARN,
1176 "non-ISO-standard escape sequence, '\\%c'", (int) c);
1177 c = charconsts[2];
1178 break;
1180 default:
1181 unknown:
1182 if (ISGRAPH (c))
1183 cpp_error (pfile, CPP_DL_PEDWARN,
1184 "unknown escape sequence '\\%c'", (int) c);
1185 else
1186 cpp_error (pfile, CPP_DL_PEDWARN,
1187 "unknown escape sequence: '\\%03o'", (int) c);
1190 /* Now convert what we have to the execution character set. */
1191 if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1192 cpp_errno (pfile, CPP_DL_ERROR,
1193 "converting escape sequence to execution character set");
1195 return from + 1;
1198 /* FROM is an array of cpp_string structures of length COUNT. These
1199 are to be converted from the source to the execution character set,
1200 escape sequences translated, and finally all are to be
1201 concatenated. WIDE indicates whether or not to produce a wide
1202 string. The result is written into TO. Returns true for success,
1203 false for failure. */
1204 bool
1205 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1206 cpp_string *to, bool wide)
1208 struct _cpp_strbuf tbuf;
1209 const uchar *p, *base, *limit;
1210 size_t i;
1211 struct cset_converter cvt
1212 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1214 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1215 tbuf.text = xmalloc (tbuf.asize);
1216 tbuf.len = 0;
1218 for (i = 0; i < count; i++)
1220 p = from[i].text;
1221 if (*p == 'L') p++;
1222 p++; /* Skip leading quote. */
1223 limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
1225 for (;;)
1227 base = p;
1228 while (p < limit && *p != '\\')
1229 p++;
1230 if (p > base)
1232 /* We have a run of normal characters; these can be fed
1233 directly to convert_cset. */
1234 if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1235 goto fail;
1237 if (p == limit)
1238 break;
1240 p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
1243 /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1244 structure. */
1245 emit_numeric_escape (pfile, 0, &tbuf, wide);
1246 tbuf.text = xrealloc (tbuf.text, tbuf.len);
1247 to->text = tbuf.text;
1248 to->len = tbuf.len;
1249 return true;
1251 fail:
1252 cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1253 free (tbuf.text);
1254 return false;
1257 /* Subroutine of do_line and do_linemarker. Convert escape sequences
1258 in a string, but do not perform character set conversion. */
1259 bool
1260 cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1261 size_t count, cpp_string *to, bool wide)
1263 struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
1264 bool retval;
1266 pfile->narrow_cset_desc.func = convert_no_conversion;
1267 pfile->narrow_cset_desc.cd = (iconv_t) -1;
1269 retval = cpp_interpret_string (pfile, from, count, to, wide);
1271 pfile->narrow_cset_desc = save_narrow_cset_desc;
1272 return retval;
1276 /* Subroutine of cpp_interpret_charconst which performs the conversion
1277 to a number, for narrow strings. STR is the string structure returned
1278 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
1279 cpp_interpret_charconst. */
1280 static cppchar_t
1281 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1282 unsigned int *pchars_seen, int *unsignedp)
1284 size_t width = CPP_OPTION (pfile, char_precision);
1285 size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1286 size_t mask = width_to_mask (width);
1287 size_t i;
1288 cppchar_t result, c;
1289 bool unsigned_p;
1291 /* The value of a multi-character character constant, or a
1292 single-character character constant whose representation in the
1293 execution character set is more than one byte long, is
1294 implementation defined. This implementation defines it to be the
1295 number formed by interpreting the byte sequence in memory as a
1296 big-endian binary number. If overflow occurs, the high bytes are
1297 lost, and a warning is issued.
1299 We don't want to process the NUL terminator handed back by
1300 cpp_interpret_string. */
1301 result = 0;
1302 for (i = 0; i < str.len - 1; i++)
1304 c = str.text[i] & mask;
1305 if (width < BITS_PER_CPPCHAR_T)
1306 result = (result << width) | c;
1307 else
1308 result = c;
1311 if (i > max_chars)
1313 i = max_chars;
1314 cpp_error (pfile, CPP_DL_WARNING,
1315 "character constant too long for its type");
1317 else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
1318 cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
1320 /* Multichar constants are of type int and therefore signed. */
1321 if (i > 1)
1322 unsigned_p = 0;
1323 else
1324 unsigned_p = CPP_OPTION (pfile, unsigned_char);
1326 /* Truncate the constant to its natural width, and simultaneously
1327 sign- or zero-extend to the full width of cppchar_t.
1328 For single-character constants, the value is WIDTH bits wide.
1329 For multi-character constants, the value is INT_PRECISION bits wide. */
1330 if (i > 1)
1331 width = CPP_OPTION (pfile, int_precision);
1332 if (width < BITS_PER_CPPCHAR_T)
1334 mask = ((cppchar_t) 1 << width) - 1;
1335 if (unsigned_p || !(result & (1 << (width - 1))))
1336 result &= mask;
1337 else
1338 result |= ~mask;
1340 *pchars_seen = i;
1341 *unsignedp = unsigned_p;
1342 return result;
1345 /* Subroutine of cpp_interpret_charconst which performs the conversion
1346 to a number, for wide strings. STR is the string structure returned
1347 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
1348 cpp_interpret_charconst. */
1349 static cppchar_t
1350 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1351 unsigned int *pchars_seen, int *unsignedp)
1353 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1354 size_t width = CPP_OPTION (pfile, wchar_precision);
1355 size_t cwidth = CPP_OPTION (pfile, char_precision);
1356 size_t mask = width_to_mask (width);
1357 size_t cmask = width_to_mask (cwidth);
1358 size_t nbwc = width / cwidth;
1359 size_t off, i;
1360 cppchar_t result = 0, c;
1362 /* This is finicky because the string is in the target's byte order,
1363 which may not be our byte order. Only the last character, ignoring
1364 the NUL terminator, is relevant. */
1365 off = str.len - (nbwc * 2);
1366 result = 0;
1367 for (i = 0; i < nbwc; i++)
1369 c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
1370 result = (result << cwidth) | (c & cmask);
1373 /* Wide character constants have type wchar_t, and a single
1374 character exactly fills a wchar_t, so a multi-character wide
1375 character constant is guaranteed to overflow. */
1376 if (off > 0)
1377 cpp_error (pfile, CPP_DL_WARNING,
1378 "character constant too long for its type");
1380 /* Truncate the constant to its natural width, and simultaneously
1381 sign- or zero-extend to the full width of cppchar_t. */
1382 if (width < BITS_PER_CPPCHAR_T)
1384 if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
1385 result &= mask;
1386 else
1387 result |= ~mask;
1390 *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
1391 *pchars_seen = 1;
1392 return result;
1395 /* Interpret a (possibly wide) character constant in TOKEN.
1396 PCHARS_SEEN points to a variable that is filled in with the number
1397 of characters seen, and UNSIGNEDP to a variable that indicates
1398 whether the result has signed type. */
1399 cppchar_t
1400 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
1401 unsigned int *pchars_seen, int *unsignedp)
1403 cpp_string str = { 0, 0 };
1404 bool wide = (token->type == CPP_WCHAR);
1405 cppchar_t result;
1407 /* an empty constant will appear as L'' or '' */
1408 if (token->val.str.len == (size_t) (2 + wide))
1410 cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1411 return 0;
1413 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
1414 return 0;
1416 if (wide)
1417 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1418 else
1419 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1421 if (str.text != token->val.str.text)
1422 free ((void *)str.text);
1424 return result;
1427 /* Convert an identifier denoted by ID and LEN, which might contain
1428 UCN escapes, to the source character set, either UTF-8 or
1429 UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
1430 cpp_hashnode *
1431 _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
1433 /* It turns out that a UCN escape always turns into fewer characters
1434 than the escape itself, so we can allocate a temporary in advance. */
1435 uchar * buf = alloca (len + 1);
1436 uchar * bufp = buf;
1437 size_t idp;
1439 for (idp = 0; idp < len; idp++)
1440 if (id[idp] != '\\')
1441 *bufp++ = id[idp];
1442 else
1444 unsigned length = id[idp+1] == 'u' ? 4 : 8;
1445 cppchar_t value = 0;
1446 size_t bufleft = len - (bufp - buf);
1447 int rval;
1449 idp += 2;
1450 while (length && idp < len && ISXDIGIT (id[idp]))
1452 value = (value << 4) + hex_value (id[idp]);
1453 idp++;
1454 length--;
1456 idp--;
1458 /* Special case for EBCDIC: if the identifier contains
1459 a '$' specified using a UCN, translate it to EBCDIC. */
1460 if (value == 0x24)
1462 *bufp++ = '$';
1463 continue;
1466 rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
1467 if (rval)
1469 errno = rval;
1470 cpp_errno (pfile, CPP_DL_ERROR,
1471 "converting UCN to source character set");
1472 break;
1476 return CPP_HASHNODE (ht_lookup (pfile->hash_table,
1477 buf, bufp - buf, HT_ALLOC));
1480 /* Convert an input buffer (containing the complete contents of one
1481 source file) from INPUT_CHARSET to the source character set. INPUT
1482 points to the input buffer, SIZE is its allocated size, and LEN is
1483 the length of the meaningful data within the buffer. The
1484 translated buffer is returned, and *ST_SIZE is set to the length of
1485 the meaningful data within the translated buffer.
1487 INPUT is expected to have been allocated with xmalloc. This function
1488 will either return INPUT, or free it and return a pointer to another
1489 xmalloc-allocated block of memory. */
1490 uchar *
1491 _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
1492 uchar *input, size_t size, size_t len, off_t *st_size)
1494 struct cset_converter input_cset;
1495 struct _cpp_strbuf to;
1497 input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
1498 if (input_cset.func == convert_no_conversion)
1500 to.text = input;
1501 to.asize = size;
1502 to.len = len;
1504 else
1506 to.asize = MAX (65536, len);
1507 to.text = xmalloc (to.asize);
1508 to.len = 0;
1510 if (!APPLY_CONVERSION (input_cset, input, len, &to))
1511 cpp_error (pfile, CPP_DL_ERROR,
1512 "failure to convert %s to %s",
1513 CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
1515 free (input);
1518 /* Clean up the mess. */
1519 if (input_cset.func == convert_using_iconv)
1520 iconv_close (input_cset.cd);
1522 /* Resize buffer if we allocated substantially too much, or if we
1523 haven't enough space for the \n-terminator. */
1524 if (to.len + 4096 < to.asize || to.len >= to.asize)
1525 to.text = xrealloc (to.text, to.len + 1);
1527 /* If the file is using old-school Mac line endings (\r only),
1528 terminate with another \r, not an \n, so that we do not mistake
1529 the \r\n sequence for a single DOS line ending and erroneously
1530 issue the "No newline at end of file" diagnostic. */
1531 if (to.text[to.len - 1] == '\r')
1532 to.text[to.len] = '\r';
1533 else
1534 to.text[to.len] = '\n';
1536 *st_size = to.len;
1537 return to.text;
1540 /* Decide on the default encoding to assume for input files. */
1541 const char *
1542 _cpp_default_encoding (void)
1544 const char *current_encoding = NULL;
1546 /* We disable this because the default codeset is 7-bit ASCII on
1547 most platforms, and this causes conversion failures on every
1548 file in GCC that happens to have one of the upper 128 characters
1549 in it -- most likely, as part of the name of a contributor.
1550 We should definitely recognize in-band markers of file encoding,
1551 like:
1552 - the appropriate Unicode byte-order mark (FE FF) to recognize
1553 UTF16 and UCS4 (in both big-endian and little-endian flavors)
1554 and UTF8
1555 - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
1556 distinguish ASCII and EBCDIC.
1557 - now we can parse something like "#pragma GCC encoding <xyz>
1558 on the first line, or even Emacs/VIM's mode line tags (there's
1559 a problem here in that VIM uses the last line, and Emacs has
1560 its more elaborate "local variables" convention).
1561 - investigate whether Java has another common convention, which
1562 would be friendly to support.
1563 (Zack Weinberg and Paolo Bonzini, May 20th 2004) */
1564 #if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
1565 setlocale (LC_CTYPE, "");
1566 current_encoding = nl_langinfo (CODESET);
1567 #endif
1568 if (current_encoding == NULL || *current_encoding == '\0')
1569 current_encoding = SOURCE_CHARSET;
1571 return current_encoding;