* treelang.texi: Fix a typo.
[official-gcc.git] / libcpp / charset.c
blob6b6c360f73d5702fc0442c3fc82b67fa3fcfea11
1 /* CPP Library - charsets
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
3 Free Software Foundation, Inc.
5 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
7 This program is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
10 later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
21 #include "config.h"
22 #include "system.h"
23 #include "cpplib.h"
24 #include "internal.h"
25 #include "ucnid.h"
27 /* Character set handling for C-family languages.
29 Terminological note: In what follows, "charset" or "character set"
30 will be taken to mean both an abstract set of characters and an
31 encoding for that set.
33 The C99 standard discusses two character sets: source and execution.
34 The source character set is used for internal processing in translation
35 phases 1 through 4; the execution character set is used thereafter.
36 Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
37 character encodings (see 3.7.2, 3.7.3 for the standardese meanings
38 of these terms). Furthermore, the "basic character set" (listed in
39 5.2.1p3) is to be encoded in each with values one byte wide, and is
40 to appear in the initial shift state.
42 It is not explicitly mentioned, but there is also a "wide execution
43 character set" used to encode wide character constants and wide
44 string literals; this is supposed to be the result of applying the
45 standard library function mbstowcs() to an equivalent narrow string
46 (6.4.5p5). However, the behavior of hexadecimal and octal
47 \-escapes is at odds with this; they are supposed to be translated
48 directly to wchar_t values (6.4.4.4p5,6).
50 The source character set is not necessarily the character set used
51 to encode physical source files on disk; translation phase 1 converts
52 from whatever that encoding is to the source character set.
54 The presence of universal character names in C99 (6.4.3 et seq.)
55 forces the source character set to be isomorphic to ISO 10646,
56 that is, Unicode. There is no such constraint on the execution
57 character set; note also that the conversion from source to
58 execution character set does not occur for identifiers (5.1.1.2p1#5).
60 For convenience of implementation, the source character set's
61 encoding of the basic character set should be identical to the
62 execution character set OF THE HOST SYSTEM's encoding of the basic
63 character set, and it should not be a state-dependent encoding.
65 cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
66 depending on whether the host is based on ASCII or EBCDIC (see
67 respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
68 Technical Report #16). With limited exceptions, it relies on the
69 system library's iconv() primitive to do charset conversion
70 (specified in SUSv2). */
#if !HAVE_ICONV
/* No system iconv: supply stand-ins for iconv(), iconv_open() and
   iconv_close().  Their uses below are guarded only by if statements
   whose conditions are compile-time constants, so the names must
   still resolve or the link would fail.  The stand-ins for
   iconv_open() and iconv() set errno to EINVAL and yield the usual
   failure value.  */
#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
#define iconv_close(x) (void)0
#define ICONV_CONST
#endif

/* Pick the name of the source character set, and the upper bound used
   when checking that a value could be a basic source character,
   according to the host's basic character set.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
#error "Unrecognized basic host character set"
#endif

/* Hosts whose errno.h lacks EILSEQ report ill-formed input as EINVAL.  */
#ifndef EILSEQ
#define EILSEQ EINVAL
#endif
96 /* This structure is used for a resizable string buffer throughout. */
97 /* Don't call it strbuf, as that conflicts with unistd.h on systems
98 such as DYNIX/ptx where unistd.h includes stropts.h. */
99 struct _cpp_strbuf
101 uchar *text;
102 size_t asize;
103 size_t len;
/* Increment, in bytes, by which an output buffer is enlarged when a
   conversion runs out of room.  The value is meant to hold any string
   that fits on a single 80-column line, even if iconv quadruples its
   size (e.g. conversion from ASCII to UTF-32), rounded up to a power
   of two.  */
#define OUTBUF_BLOCK_SIZE 256
111 /* Conversions between UTF-8 and UTF-16/32 are implemented by custom
112 logic. This is because a depressing number of systems lack iconv,
113 or have iconv libraries that do not do these conversions, so
114 we need a fallback implementation for them. To ensure the fallback
115 doesn't break due to neglect, it is used on all systems.
117 UTF-32 encoding is nice and simple: a four-byte binary number,
118 constrained to the range 00000000-7FFFFFFF to avoid questions of
119 signedness. We do have to cope with big- and little-endian
120 variants.
122 UTF-16 encoding uses two-byte binary numbers, again in big- and
123 little-endian variants, for all values in the 00000000-0000FFFF
124 range. Values in the 00010000-0010FFFF range are encoded as pairs
125 of two-byte numbers, called "surrogate pairs": given a number S in
126 this range, it is mapped to a pair (H, L) as follows:
128 H = (S - 0x10000) / 0x400 + 0xD800
129 L = (S - 0x10000) % 0x400 + 0xDC00
131 Two-byte values in the D800...DFFF range are ill-formed except as a
132 component of a surrogate pair. Even if the encoding within a
133 two-byte value is little-endian, the H member of the surrogate pair
134 comes first.
136 There is no way to encode values in the 00110000-7FFFFFFF range,
137 which is not currently a problem as there are no assigned code
138 points in that range; however, the author expects that it will
139 eventually become necessary to abandon UTF-16 due to this
140 limitation. Note also that, because of these pairs, UTF-16 does
141 not meet the requirements of the C standard for a wide character
142 encoding (see 3.7.3 and 6.4.4.4p11).
144 UTF-8 encoding looks like this:
146 value range encoded as
147 00000000-0000007F 0xxxxxxx
148 00000080-000007FF 110xxxxx 10xxxxxx
149 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
150 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
151 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
152 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
154 Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
155 which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
156 never occur. Note also that any value that can be encoded by a
157 given row of the table can also be encoded by all successive rows,
158 but this is not done; only the shortest possible encoding for any
159 given value is valid. For instance, the character 07C0 could be
160 encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
161 FC 80 80 80 9F 80. Only the first is valid.
163 An implementation note: the transformation from UTF-16 to UTF-8, or
164 vice versa, is easiest done by using UTF-32 as an intermediary. */
166 /* Internal primitives which go from an UTF-8 byte stream to native-endian
167 UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
168 operation in several places below. */
169 static inline int
170 one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
171 cppchar_t *cp)
173 static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
174 static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
176 cppchar_t c;
177 const uchar *inbuf = *inbufp;
178 size_t nbytes, i;
180 if (*inbytesleftp < 1)
181 return EINVAL;
183 c = *inbuf;
184 if (c < 0x80)
186 *cp = c;
187 *inbytesleftp -= 1;
188 *inbufp += 1;
189 return 0;
192 /* The number of leading 1-bits in the first byte indicates how many
193 bytes follow. */
194 for (nbytes = 2; nbytes < 7; nbytes++)
195 if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
196 goto found;
197 return EILSEQ;
198 found:
200 if (*inbytesleftp < nbytes)
201 return EINVAL;
203 c = (c & masks[nbytes-1]);
204 inbuf++;
205 for (i = 1; i < nbytes; i++)
207 cppchar_t n = *inbuf++;
208 if ((n & 0xC0) != 0x80)
209 return EILSEQ;
210 c = ((c << 6) + (n & 0x3F));
213 /* Make sure the shortest possible encoding was used. */
214 if (c <= 0x7F && nbytes > 1) return EILSEQ;
215 if (c <= 0x7FF && nbytes > 2) return EILSEQ;
216 if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
217 if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
218 if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
220 /* Make sure the character is valid. */
221 if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
223 *cp = c;
224 *inbufp = inbuf;
225 *inbytesleftp -= nbytes;
226 return 0;
229 static inline int
230 one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
232 static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
233 static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
234 size_t nbytes;
235 uchar buf[6], *p = &buf[6];
236 uchar *outbuf = *outbufp;
238 nbytes = 1;
239 if (c < 0x80)
240 *--p = c;
241 else
245 *--p = ((c & 0x3F) | 0x80);
246 c >>= 6;
247 nbytes++;
249 while (c >= 0x3F || (c & limits[nbytes-1]));
250 *--p = (c | masks[nbytes-1]);
253 if (*outbytesleftp < nbytes)
254 return E2BIG;
256 while (p < &buf[6])
257 *outbuf++ = *p++;
258 *outbytesleftp -= nbytes;
259 *outbufp = outbuf;
260 return 0;
263 /* The following four functions transform one character between the two
264 encodings named in the function name. All have the signature
265 int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
266 uchar **outbufp, size_t *outbytesleftp)
268 BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
269 interpreted as a boolean indicating whether big-endian or
270 little-endian encoding is to be used for the member of the pair
271 that is not UTF-8.
273 INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
274 do for iconv.
276 The return value is either 0 for success, or an errno value for
277 failure, which may be E2BIG (need more space), EILSEQ (ill-formed
278 input sequence), or EINVAL (incomplete input sequence). */
280 static inline int
281 one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
282 uchar **outbufp, size_t *outbytesleftp)
284 uchar *outbuf;
285 cppchar_t s = 0;
286 int rval;
288 /* Check for space first, since we know exactly how much we need. */
289 if (*outbytesleftp < 4)
290 return E2BIG;
292 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
293 if (rval)
294 return rval;
296 outbuf = *outbufp;
297 outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
298 outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
299 outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
300 outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
302 *outbufp += 4;
303 *outbytesleftp -= 4;
304 return 0;
307 static inline int
308 one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
309 uchar **outbufp, size_t *outbytesleftp)
311 cppchar_t s;
312 int rval;
313 const uchar *inbuf;
315 if (*inbytesleftp < 4)
316 return EINVAL;
318 inbuf = *inbufp;
320 s = inbuf[bigend ? 0 : 3] << 24;
321 s += inbuf[bigend ? 1 : 2] << 16;
322 s += inbuf[bigend ? 2 : 1] << 8;
323 s += inbuf[bigend ? 3 : 0];
325 if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
326 return EILSEQ;
328 rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
329 if (rval)
330 return rval;
332 *inbufp += 4;
333 *inbytesleftp -= 4;
334 return 0;
337 static inline int
338 one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
339 uchar **outbufp, size_t *outbytesleftp)
341 int rval;
342 cppchar_t s = 0;
343 const uchar *save_inbuf = *inbufp;
344 size_t save_inbytesleft = *inbytesleftp;
345 uchar *outbuf = *outbufp;
347 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
348 if (rval)
349 return rval;
351 if (s > 0x0010FFFF)
353 *inbufp = save_inbuf;
354 *inbytesleftp = save_inbytesleft;
355 return EILSEQ;
358 if (s < 0xFFFF)
360 if (*outbytesleftp < 2)
362 *inbufp = save_inbuf;
363 *inbytesleftp = save_inbytesleft;
364 return E2BIG;
366 outbuf[bigend ? 1 : 0] = (s & 0x00FF);
367 outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
369 *outbufp += 2;
370 *outbytesleftp -= 2;
371 return 0;
373 else
375 cppchar_t hi, lo;
377 if (*outbytesleftp < 4)
379 *inbufp = save_inbuf;
380 *inbytesleftp = save_inbytesleft;
381 return E2BIG;
384 hi = (s - 0x10000) / 0x400 + 0xD800;
385 lo = (s - 0x10000) % 0x400 + 0xDC00;
387 /* Even if we are little-endian, put the high surrogate first.
388 ??? Matches practice? */
389 outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
390 outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
391 outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
392 outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
394 *outbufp += 4;
395 *outbytesleftp -= 4;
396 return 0;
400 static inline int
401 one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
402 uchar **outbufp, size_t *outbytesleftp)
404 cppchar_t s;
405 const uchar *inbuf = *inbufp;
406 int rval;
408 if (*inbytesleftp < 2)
409 return EINVAL;
410 s = inbuf[bigend ? 0 : 1] << 8;
411 s += inbuf[bigend ? 1 : 0];
413 /* Low surrogate without immediately preceding high surrogate is invalid. */
414 if (s >= 0xDC00 && s <= 0xDFFF)
415 return EILSEQ;
416 /* High surrogate must have a following low surrogate. */
417 else if (s >= 0xD800 && s <= 0xDBFF)
419 cppchar_t hi = s, lo;
420 if (*inbytesleftp < 4)
421 return EINVAL;
423 lo = inbuf[bigend ? 2 : 3] << 8;
424 lo += inbuf[bigend ? 3 : 2];
426 if (lo < 0xDC00 || lo > 0xDFFF)
427 return EILSEQ;
429 s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
432 rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
433 if (rval)
434 return rval;
436 /* Success - update the input pointers (one_cppchar_to_utf8 has done
437 the output pointers for us). */
438 if (s <= 0xFFFF)
440 *inbufp += 2;
441 *inbytesleftp -= 2;
443 else
445 *inbufp += 4;
446 *inbytesleftp -= 4;
448 return 0;
451 /* Helper routine for the next few functions. The 'const' on
452 one_conversion means that we promise not to modify what function is
453 pointed to, which lets the inliner see through it. */
455 static inline bool
456 conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
457 uchar **, size_t *),
458 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
460 const uchar *inbuf;
461 uchar *outbuf;
462 size_t inbytesleft, outbytesleft;
463 int rval;
465 inbuf = from;
466 inbytesleft = flen;
467 outbuf = to->text + to->len;
468 outbytesleft = to->asize - to->len;
470 for (;;)
473 rval = one_conversion (cd, &inbuf, &inbytesleft,
474 &outbuf, &outbytesleft);
475 while (inbytesleft && !rval);
477 if (__builtin_expect (inbytesleft == 0, 1))
479 to->len = to->asize - outbytesleft;
480 return true;
482 if (rval != E2BIG)
484 errno = rval;
485 return false;
488 outbytesleft += OUTBUF_BLOCK_SIZE;
489 to->asize += OUTBUF_BLOCK_SIZE;
490 to->text = xrealloc (to->text, to->asize);
491 outbuf = to->text + to->asize - outbytesleft;
496 /* These functions convert entire strings between character sets.
497 They all have the signature
499 bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
501 The input string FROM is converted as specified by the function
502 name plus the iconv descriptor CD (which may be fake), and the
503 result appended to TO. On any error, false is returned, otherwise true. */
505 /* These four use the custom conversion code above. */
506 static bool
507 convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
508 struct _cpp_strbuf *to)
510 return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
513 static bool
514 convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
515 struct _cpp_strbuf *to)
517 return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
520 static bool
521 convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
522 struct _cpp_strbuf *to)
524 return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
527 static bool
528 convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
529 struct _cpp_strbuf *to)
531 return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
534 /* Identity conversion, used when we have no alternative. */
535 static bool
536 convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
537 const uchar *from, size_t flen, struct _cpp_strbuf *to)
539 if (to->len + flen > to->asize)
541 to->asize = to->len + flen;
542 to->text = xrealloc (to->text, to->asize);
544 memcpy (to->text + to->len, from, flen);
545 to->len += flen;
546 return true;
/* The converter built on the system iconv primitive.  It has its own
   loop rather than using conversion_loop because iconv's interface is
   a little different.  Appends the conversion of the FLEN bytes at
   FROM to TO, enlarging TO by OUTBUF_BLOCK_SIZE each time iconv
   reports E2BIG; returns false on any other failure.  */
#if HAVE_ICONV
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
                     struct _cpp_strbuf *to)
{
  ICONV_CONST char *src;
  char *dst;
  size_t src_left, dst_left;

  /* Reset the conversion descriptor; this also checks that it is
     valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  src = (ICONV_CONST char *)from;
  src_left = flen;
  dst = (char *)to->text + to->len;
  dst_left = to->asize - to->len;

  for (;;)
    {
      iconv (cd, &src, &src_left, &dst, &dst_left);
      if (__builtin_expect (src_left == 0, 1))
        {
          to->len = to->asize - dst_left;
          return true;
        }

      /* Input remains, so iconv stopped early; only a full output
         buffer is recoverable.  */
      if (errno != E2BIG)
        return false;

      dst_left += OUTBUF_BLOCK_SIZE;
      to->asize += OUTBUF_BLOCK_SIZE;
      to->text = xrealloc (to->text, to->asize);
      dst = (char *)to->text + to->asize - dst_left;
    }
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
590 /* Arrange for the above custom conversion logic to be used automatically
591 when conversion between a suitable pair of character sets is requested. */
/* Run CONVERTER's conversion function, with CONVERTER's own
   descriptor, on the FLEN bytes at FROM, appending to TO.  Evaluates
   to the function's result.  CONVERTER is expanded twice, so it must
   not have side effects.  Arguments and the whole expansion are
   parenthesized so that any expression (e.g. *ptr) may be passed.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))
596 struct conversion
598 const char *pair;
599 convert_f func;
600 iconv_t fake_cd;
602 static const struct conversion conversion_tab[] = {
603 { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
604 { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
605 { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
606 { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
607 { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
608 { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
609 { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
610 { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
613 /* Subroutine of cpp_init_iconv: initialize and return a
614 cset_converter structure for conversion from FROM to TO. If
615 iconv_open() fails, issue an error and return an identity
616 converter. Silently return an identity converter if FROM and TO
617 are identical. */
618 static struct cset_converter
619 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
621 struct cset_converter ret;
622 char *pair;
623 size_t i;
625 if (!strcasecmp (to, from))
627 ret.func = convert_no_conversion;
628 ret.cd = (iconv_t) -1;
629 return ret;
632 pair = alloca(strlen(to) + strlen(from) + 2);
634 strcpy(pair, from);
635 strcat(pair, "/");
636 strcat(pair, to);
637 for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
638 if (!strcasecmp (pair, conversion_tab[i].pair))
640 ret.func = conversion_tab[i].func;
641 ret.cd = conversion_tab[i].fake_cd;
642 return ret;
645 /* No custom converter - try iconv. */
646 if (HAVE_ICONV)
648 ret.func = convert_using_iconv;
649 ret.cd = iconv_open (to, from);
651 if (ret.cd == (iconv_t) -1)
653 if (errno == EINVAL)
654 cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
655 "conversion from %s to %s not supported by iconv",
656 from, to);
657 else
658 cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
660 ret.func = convert_no_conversion;
663 else
665 cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
666 "no iconv implementation, cannot convert from %s to %s",
667 from, to);
668 ret.func = convert_no_conversion;
669 ret.cd = (iconv_t) -1;
671 return ret;
674 /* If charset conversion is requested, initialize iconv(3) descriptors
675 for conversion from the source character set to the execution
676 character sets. If iconv is not present in the C library, and
677 conversion is requested, issue an error. */
679 void
680 cpp_init_iconv (cpp_reader *pfile)
682 const char *ncset = CPP_OPTION (pfile, narrow_charset);
683 const char *wcset = CPP_OPTION (pfile, wide_charset);
684 const char *default_wcset;
686 bool be = CPP_OPTION (pfile, bytes_big_endian);
688 if (CPP_OPTION (pfile, wchar_precision) >= 32)
689 default_wcset = be ? "UTF-32BE" : "UTF-32LE";
690 else if (CPP_OPTION (pfile, wchar_precision) >= 16)
691 default_wcset = be ? "UTF-16BE" : "UTF-16LE";
692 else
693 /* This effectively means that wide strings are not supported,
694 so don't do any conversion at all. */
695 default_wcset = SOURCE_CHARSET;
697 if (!ncset)
698 ncset = SOURCE_CHARSET;
699 if (!wcset)
700 wcset = default_wcset;
702 pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
703 pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
706 /* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */
707 void
708 _cpp_destroy_iconv (cpp_reader *pfile)
710 if (HAVE_ICONV)
712 if (pfile->narrow_cset_desc.func == convert_using_iconv)
713 iconv_close (pfile->narrow_cset_desc.cd);
714 if (pfile->wide_cset_desc.func == convert_using_iconv)
715 iconv_close (pfile->wide_cset_desc.cd);
719 /* Utility routine for use by a full compiler. C is a character taken
720 from the *basic* source character set, encoded in the host's
721 execution encoding. Convert it to (the target's) execution
722 encoding, and return that value.
724 Issues an internal error if C's representation in the narrow
725 execution character set fails to be a single-byte value (C99
726 5.2.1p3: "The representation of each member of the source and
727 execution character sets shall fit in a byte.") May also issue an
728 internal error if C fails to be a member of the basic source
729 character set (testing this exactly is too hard, especially when
730 the host character set is EBCDIC). */
731 cppchar_t
732 cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
734 uchar sbuf[1];
735 struct _cpp_strbuf tbuf;
737 /* This test is merely an approximation, but it suffices to catch
738 the most important thing, which is that we don't get handed a
739 character outside the unibyte range of the host character set. */
740 if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
742 cpp_error (pfile, CPP_DL_ICE,
743 "character 0x%lx is not in the basic source character set\n",
744 (unsigned long)c);
745 return 0;
748 /* Being a character in the unibyte range of the host character set,
749 we can safely splat it into a one-byte buffer and trust that that
750 is a well-formed string. */
751 sbuf[0] = c;
753 /* This should never need to reallocate, but just in case... */
754 tbuf.asize = 1;
755 tbuf.text = xmalloc (tbuf.asize);
756 tbuf.len = 0;
758 if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
760 cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
761 return 0;
763 if (tbuf.len != 1)
765 cpp_error (pfile, CPP_DL_ICE,
766 "character 0x%lx is not unibyte in execution character set",
767 (unsigned long)c);
768 return 0;
770 c = tbuf.text[0];
771 free(tbuf.text);
772 return c;
777 /* Utility routine that computes a mask of the form 0000...111... with
778 WIDTH 1-bits. */
779 static inline size_t
780 width_to_mask (size_t width)
782 width = MIN (width, BITS_PER_CPPCHAR_T);
783 if (width >= CHAR_BIT * sizeof (size_t))
784 return ~(size_t) 0;
785 else
786 return ((size_t) 1 << width) - 1;
789 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
790 the start of an identifier, and 0 if C is not valid in an
791 identifier. We assume C has already gone through the checks of
792 _cpp_valid_ucn. The algorithm is a simple binary search on the
793 table defined in cppucnid.h. */
795 static int
796 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
798 int mn, mx, md;
800 mn = -1;
801 mx = ARRAY_SIZE (ucnranges);
802 while (mx - mn > 1)
804 md = (mn + mx) / 2;
805 if (c < ucnranges[md].lo)
806 mx = md;
807 else if (c > ucnranges[md].hi)
808 mn = md;
809 else
810 goto found;
812 return 0;
814 found:
815 /* When -pedantic, we require the character to have been listed by
816 the standard for the current language. Otherwise, we accept the
817 union of the acceptable sets for C++98 and C99. */
818 if (CPP_PEDANTIC (pfile)
819 && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
820 || (CPP_OPTION (pfile, cplusplus)
821 && !(ucnranges[md].flags & CXX))))
822 return 0;
824 /* In C99, UCN digits may not begin identifiers. */
825 if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
826 return 2;
828 return 1;
831 /* [lex.charset]: The character designated by the universal character
832 name \UNNNNNNNN is that character whose character short name in
833 ISO/IEC 10646 is NNNNNNNN; the character designated by the
834 universal character name \uNNNN is that character whose character
835 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
836 for a universal character name is less than 0x20 or in the range
837 0x7F-0x9F (inclusive), or if the universal character name
838 designates a character in the basic source character set, then the
839 program is ill-formed.
841 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
842 buffer end is delimited by a non-hex digit. Returns zero if UCNs
843 are not part of the relevant standard, or if the string beginning
844 at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
846 Otherwise the nonzero value of the UCN, whether valid or invalid,
847 is returned. Diagnostics are emitted for invalid values. PSTR
848 is updated to point one beyond the UCN, or to the syntactically
849 invalid character.
851 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
852 an identifier, or 2 otherwise. */
854 cppchar_t
855 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
856 const uchar *limit, int identifier_pos)
858 cppchar_t result, c;
859 unsigned int length;
860 const uchar *str = *pstr;
861 const uchar *base = str - 2;
863 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
864 cpp_error (pfile, CPP_DL_WARNING,
865 "universal character names are only valid in C++ and C99");
866 else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
867 cpp_error (pfile, CPP_DL_WARNING,
868 "the meaning of '\\%c' is different in traditional C",
869 (int) str[-1]);
871 if (str[-1] == 'u')
872 length = 4;
873 else if (str[-1] == 'U')
874 length = 8;
875 else
876 abort();
878 result = 0;
881 c = *str;
882 if (!ISXDIGIT (c))
883 break;
884 str++;
885 result = (result << 4) + hex_value (c);
887 while (--length && str < limit);
889 *pstr = str;
890 if (length)
892 /* We'll error when we try it out as the start of an identifier. */
893 cpp_error (pfile, CPP_DL_ERROR,
894 "incomplete universal character name %.*s",
895 (int) (str - base), base);
896 result = 1;
898 /* The standard permits $, @ and ` to be specified as UCNs. We use
899 hex escapes so that this also works with EBCDIC hosts. */
900 else if ((result < 0xa0
901 && (result != 0x24 && result != 0x40 && result != 0x60))
902 || (result & 0x80000000)
903 || (result >= 0xD800 && result <= 0xDFFF))
905 cpp_error (pfile, CPP_DL_ERROR,
906 "%.*s is not a valid universal character",
907 (int) (str - base), base);
908 result = 1;
910 else if (identifier_pos)
912 int validity = ucn_valid_in_identifier (pfile, result);
914 if (validity == 0)
915 cpp_error (pfile, CPP_DL_ERROR,
916 "universal character %.*s is not valid in an identifier",
917 (int) (str - base), base);
918 else if (validity == 2 && identifier_pos == 1)
919 cpp_error (pfile, CPP_DL_ERROR,
920 "universal character %.*s is not valid at the start of an identifier",
921 (int) (str - base), base);
924 if (result == 0)
925 result = 1;
927 return result;
930 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
931 it to the execution character set and write the result into TBUF.
932 An advanced pointer is returned. Issues all relevant diagnostics. */
933 static const uchar *
934 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
935 struct _cpp_strbuf *tbuf, bool wide)
937 cppchar_t ucn;
938 uchar buf[6];
939 uchar *bufp = buf;
940 size_t bytesleft = 6;
941 int rval;
942 struct cset_converter cvt
943 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
945 from++; /* Skip u/U. */
946 ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
948 rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
949 if (rval)
951 errno = rval;
952 cpp_errno (pfile, CPP_DL_ERROR,
953 "converting UCN to source character set");
955 else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
956 cpp_errno (pfile, CPP_DL_ERROR,
957 "converting UCN to execution character set");
959 return from;
962 /* Subroutine of convert_hex and convert_oct. N is the representation
963 in the execution character set of a numeric escape; write it into the
964 string buffer TBUF and update the end-of-string pointer therein. WIDE
965 is true if it's a wide string that's being assembled in TBUF. This
966 function issues no diagnostics and never fails. */
967 static void
968 emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
969 struct _cpp_strbuf *tbuf, bool wide)
971 if (wide)
973 /* We have to render this into the target byte order, which may not
974 be our byte order. */
975 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
976 size_t width = CPP_OPTION (pfile, wchar_precision);
977 size_t cwidth = CPP_OPTION (pfile, char_precision);
978 size_t cmask = width_to_mask (cwidth);
979 size_t nbwc = width / cwidth;
980 size_t i;
981 size_t off = tbuf->len;
982 cppchar_t c;
984 if (tbuf->len + nbwc > tbuf->asize)
986 tbuf->asize += OUTBUF_BLOCK_SIZE;
987 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
990 for (i = 0; i < nbwc; i++)
992 c = n & cmask;
993 n >>= cwidth;
994 tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
996 tbuf->len += nbwc;
998 else
1000 /* Note: this code does not handle the case where the target
1001 and host have a different number of bits in a byte. */
1002 if (tbuf->len + 1 > tbuf->asize)
1004 tbuf->asize += OUTBUF_BLOCK_SIZE;
1005 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
1007 tbuf->text[tbuf->len++] = n;
1011 /* Convert a hexadecimal escape, pointed to by FROM, to the execution
1012 character set and write it into the string buffer TBUF. Returns an
1013 advanced pointer, and issues diagnostics as necessary.
1014 No character set translation occurs; this routine always produces the
1015 execution-set character with numeric value equal to the given hex
1016 number. You can, e.g. generate surrogate pairs this way. */
1017 static const uchar *
1018 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
1019 struct _cpp_strbuf *tbuf, bool wide)
1021 cppchar_t c, n = 0, overflow = 0;
1022 int digits_found = 0;
1023 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1024 : CPP_OPTION (pfile, char_precision));
1025 size_t mask = width_to_mask (width);
1027 if (CPP_WTRADITIONAL (pfile))
1028 cpp_error (pfile, CPP_DL_WARNING,
1029 "the meaning of '\\x' is different in traditional C");
1031 from++; /* Skip 'x'. */
1032 while (from < limit)
1034 c = *from;
1035 if (! hex_p (c))
1036 break;
1037 from++;
1038 overflow |= n ^ (n << 4 >> 4);
1039 n = (n << 4) + hex_value (c);
1040 digits_found = 1;
1043 if (!digits_found)
1045 cpp_error (pfile, CPP_DL_ERROR,
1046 "\\x used with no following hex digits");
1047 return from;
1050 if (overflow | (n != (n & mask)))
1052 cpp_error (pfile, CPP_DL_PEDWARN,
1053 "hex escape sequence out of range");
1054 n &= mask;
1057 emit_numeric_escape (pfile, n, tbuf, wide);
1059 return from;
1062 /* Convert an octal escape, pointed to by FROM, to the execution
1063 character set and write it into the string buffer TBUF. Returns an
1064 advanced pointer, and issues diagnostics as necessary.
1065 No character set translation occurs; this routine always produces the
1066 execution-set character with numeric value equal to the given octal
1067 number. */
1068 static const uchar *
1069 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
1070 struct _cpp_strbuf *tbuf, bool wide)
1072 size_t count = 0;
1073 cppchar_t c, n = 0;
1074 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1075 : CPP_OPTION (pfile, char_precision));
1076 size_t mask = width_to_mask (width);
1077 bool overflow = false;
1079 while (from < limit && count++ < 3)
1081 c = *from;
1082 if (c < '0' || c > '7')
1083 break;
1084 from++;
1085 overflow |= n ^ (n << 3 >> 3);
1086 n = (n << 3) + c - '0';
1089 if (n != (n & mask))
1091 cpp_error (pfile, CPP_DL_PEDWARN,
1092 "octal escape sequence out of range");
1093 n &= mask;
1096 emit_numeric_escape (pfile, n, tbuf, wide);
1098 return from;
1101 /* Convert an escape sequence (pointed to by FROM) to its value on
1102 the target, and to the execution character set. Do not scan past
1103 LIMIT. Write the converted value into TBUF. Returns an advanced
1104 pointer. Handles all relevant diagnostics. */
1105 static const uchar *
1106 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
1107 struct _cpp_strbuf *tbuf, bool wide)
1109 /* Values of \a \b \e \f \n \r \t \v respectively. */
1110 #if HOST_CHARSET == HOST_CHARSET_ASCII
1111 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
1112 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1113 static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
1114 #else
1115 #error "unknown host character set"
1116 #endif
1118 uchar c;
1119 struct cset_converter cvt
1120 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1122 c = *from;
1123 switch (c)
1125 /* UCNs, hex escapes, and octal escapes are processed separately. */
1126 case 'u': case 'U':
1127 return convert_ucn (pfile, from, limit, tbuf, wide);
1129 case 'x':
1130 return convert_hex (pfile, from, limit, tbuf, wide);
1131 break;
1133 case '0': case '1': case '2': case '3':
1134 case '4': case '5': case '6': case '7':
1135 return convert_oct (pfile, from, limit, tbuf, wide);
1137 /* Various letter escapes. Get the appropriate host-charset
1138 value into C. */
1139 case '\\': case '\'': case '"': case '?': break;
1141 case '(': case '{': case '[': case '%':
1142 /* '\(', etc, can be used at the beginning of a line in a long
1143 string split onto multiple lines with \-newline, to prevent
1144 Emacs or other text editors from getting confused. '\%' can
1145 be used to prevent SCCS from mangling printf format strings. */
1146 if (CPP_PEDANTIC (pfile))
1147 goto unknown;
1148 break;
1150 case 'b': c = charconsts[1]; break;
1151 case 'f': c = charconsts[3]; break;
1152 case 'n': c = charconsts[4]; break;
1153 case 'r': c = charconsts[5]; break;
1154 case 't': c = charconsts[6]; break;
1155 case 'v': c = charconsts[7]; break;
1157 case 'a':
1158 if (CPP_WTRADITIONAL (pfile))
1159 cpp_error (pfile, CPP_DL_WARNING,
1160 "the meaning of '\\a' is different in traditional C");
1161 c = charconsts[0];
1162 break;
1164 case 'e': case 'E':
1165 if (CPP_PEDANTIC (pfile))
1166 cpp_error (pfile, CPP_DL_PEDWARN,
1167 "non-ISO-standard escape sequence, '\\%c'", (int) c);
1168 c = charconsts[2];
1169 break;
1171 default:
1172 unknown:
1173 if (ISGRAPH (c))
1174 cpp_error (pfile, CPP_DL_PEDWARN,
1175 "unknown escape sequence '\\%c'", (int) c);
1176 else
1177 cpp_error (pfile, CPP_DL_PEDWARN,
1178 "unknown escape sequence: '\\%03o'", (int) c);
1181 /* Now convert what we have to the execution character set. */
1182 if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1183 cpp_errno (pfile, CPP_DL_ERROR,
1184 "converting escape sequence to execution character set");
1186 return from + 1;
1189 /* FROM is an array of cpp_string structures of length COUNT. These
1190 are to be converted from the source to the execution character set,
1191 escape sequences translated, and finally all are to be
1192 concatenated. WIDE indicates whether or not to produce a wide
1193 string. The result is written into TO. Returns true for success,
1194 false for failure. */
1195 bool
1196 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1197 cpp_string *to, bool wide)
1199 struct _cpp_strbuf tbuf;
1200 const uchar *p, *base, *limit;
1201 size_t i;
1202 struct cset_converter cvt
1203 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1205 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1206 tbuf.text = xmalloc (tbuf.asize);
1207 tbuf.len = 0;
1209 for (i = 0; i < count; i++)
1211 p = from[i].text;
1212 if (*p == 'L') p++;
1213 p++; /* Skip leading quote. */
1214 limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
1216 for (;;)
1218 base = p;
1219 while (p < limit && *p != '\\')
1220 p++;
1221 if (p > base)
1223 /* We have a run of normal characters; these can be fed
1224 directly to convert_cset. */
1225 if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1226 goto fail;
1228 if (p == limit)
1229 break;
1231 p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
1234 /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1235 structure. */
1236 emit_numeric_escape (pfile, 0, &tbuf, wide);
1237 tbuf.text = xrealloc (tbuf.text, tbuf.len);
1238 to->text = tbuf.text;
1239 to->len = tbuf.len;
1240 return true;
1242 fail:
1243 cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1244 free (tbuf.text);
1245 return false;
1248 /* Subroutine of do_line and do_linemarker. Convert escape sequences
1249 in a string, but do not perform character set conversion. */
1250 bool
1251 cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1252 size_t count, cpp_string *to, bool wide)
1254 struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
1255 bool retval;
1257 pfile->narrow_cset_desc.func = convert_no_conversion;
1258 pfile->narrow_cset_desc.cd = (iconv_t) -1;
1260 retval = cpp_interpret_string (pfile, from, count, to, wide);
1262 pfile->narrow_cset_desc = save_narrow_cset_desc;
1263 return retval;
1267 /* Subroutine of cpp_interpret_charconst which performs the conversion
1268 to a number, for narrow strings. STR is the string structure returned
1269 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
1270 cpp_interpret_charconst. */
1271 static cppchar_t
1272 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1273 unsigned int *pchars_seen, int *unsignedp)
1275 size_t width = CPP_OPTION (pfile, char_precision);
1276 size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1277 size_t mask = width_to_mask (width);
1278 size_t i;
1279 cppchar_t result, c;
1280 bool unsigned_p;
1282 /* The value of a multi-character character constant, or a
1283 single-character character constant whose representation in the
1284 execution character set is more than one byte long, is
1285 implementation defined. This implementation defines it to be the
1286 number formed by interpreting the byte sequence in memory as a
1287 big-endian binary number. If overflow occurs, the high bytes are
1288 lost, and a warning is issued.
1290 We don't want to process the NUL terminator handed back by
1291 cpp_interpret_string. */
1292 result = 0;
1293 for (i = 0; i < str.len - 1; i++)
1295 c = str.text[i] & mask;
1296 if (width < BITS_PER_CPPCHAR_T)
1297 result = (result << width) | c;
1298 else
1299 result = c;
1302 if (i > max_chars)
1304 i = max_chars;
1305 cpp_error (pfile, CPP_DL_WARNING,
1306 "character constant too long for its type");
1308 else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
1309 cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
1311 /* Multichar constants are of type int and therefore signed. */
1312 if (i > 1)
1313 unsigned_p = 0;
1314 else
1315 unsigned_p = CPP_OPTION (pfile, unsigned_char);
1317 /* Truncate the constant to its natural width, and simultaneously
1318 sign- or zero-extend to the full width of cppchar_t.
1319 For single-character constants, the value is WIDTH bits wide.
1320 For multi-character constants, the value is INT_PRECISION bits wide. */
1321 if (i > 1)
1322 width = CPP_OPTION (pfile, int_precision);
1323 if (width < BITS_PER_CPPCHAR_T)
1325 mask = ((cppchar_t) 1 << width) - 1;
1326 if (unsigned_p || !(result & (1 << (width - 1))))
1327 result &= mask;
1328 else
1329 result |= ~mask;
1331 *pchars_seen = i;
1332 *unsignedp = unsigned_p;
1333 return result;
1336 /* Subroutine of cpp_interpret_charconst which performs the conversion
1337 to a number, for wide strings. STR is the string structure returned
1338 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
1339 cpp_interpret_charconst. */
1340 static cppchar_t
1341 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1342 unsigned int *pchars_seen, int *unsignedp)
1344 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1345 size_t width = CPP_OPTION (pfile, wchar_precision);
1346 size_t cwidth = CPP_OPTION (pfile, char_precision);
1347 size_t mask = width_to_mask (width);
1348 size_t cmask = width_to_mask (cwidth);
1349 size_t nbwc = width / cwidth;
1350 size_t off, i;
1351 cppchar_t result = 0, c;
1353 /* This is finicky because the string is in the target's byte order,
1354 which may not be our byte order. Only the last character, ignoring
1355 the NUL terminator, is relevant. */
1356 off = str.len - (nbwc * 2);
1357 result = 0;
1358 for (i = 0; i < nbwc; i++)
1360 c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
1361 result = (result << cwidth) | (c & cmask);
1364 /* Wide character constants have type wchar_t, and a single
1365 character exactly fills a wchar_t, so a multi-character wide
1366 character constant is guaranteed to overflow. */
1367 if (off > 0)
1368 cpp_error (pfile, CPP_DL_WARNING,
1369 "character constant too long for its type");
1371 /* Truncate the constant to its natural width, and simultaneously
1372 sign- or zero-extend to the full width of cppchar_t. */
1373 if (width < BITS_PER_CPPCHAR_T)
1375 if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
1376 result &= mask;
1377 else
1378 result |= ~mask;
1381 *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
1382 *pchars_seen = 1;
1383 return result;
1386 /* Interpret a (possibly wide) character constant in TOKEN.
1387 PCHARS_SEEN points to a variable that is filled in with the number
1388 of characters seen, and UNSIGNEDP to a variable that indicates
1389 whether the result has signed type. */
1390 cppchar_t
1391 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
1392 unsigned int *pchars_seen, int *unsignedp)
1394 cpp_string str = { 0, 0 };
1395 bool wide = (token->type == CPP_WCHAR);
1396 cppchar_t result;
1398 /* an empty constant will appear as L'' or '' */
1399 if (token->val.str.len == (size_t) (2 + wide))
1401 cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1402 return 0;
1404 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
1405 return 0;
1407 if (wide)
1408 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1409 else
1410 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1412 if (str.text != token->val.str.text)
1413 free ((void *)str.text);
1415 return result;
1418 /* Convert an input buffer (containing the complete contents of one
1419 source file) from INPUT_CHARSET to the source character set. INPUT
1420 points to the input buffer, SIZE is its allocated size, and LEN is
1421 the length of the meaningful data within the buffer. The
1422 translated buffer is returned, and *ST_SIZE is set to the length of
1423 the meaningful data within the translated buffer.
1425 INPUT is expected to have been allocated with xmalloc. This function
1426 will either return INPUT, or free it and return a pointer to another
1427 xmalloc-allocated block of memory. */
1428 uchar *
1429 _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
1430 uchar *input, size_t size, size_t len, off_t *st_size)
1432 struct cset_converter input_cset;
1433 struct _cpp_strbuf to;
1435 input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
1436 if (input_cset.func == convert_no_conversion)
1438 to.text = input;
1439 to.asize = size;
1440 to.len = len;
1442 else
1444 to.asize = MAX (65536, len);
1445 to.text = xmalloc (to.asize);
1446 to.len = 0;
1448 if (!APPLY_CONVERSION (input_cset, input, len, &to))
1449 cpp_error (pfile, CPP_DL_ERROR,
1450 "failure to convert %s to %s",
1451 CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
1453 free (input);
1456 /* Clean up the mess. */
1457 if (input_cset.func == convert_using_iconv)
1458 iconv_close (input_cset.cd);
1460 /* Resize buffer if we allocated substantially too much, or if we
1461 haven't enough space for the \n-terminator. */
1462 if (to.len + 4096 < to.asize || to.len >= to.asize)
1463 to.text = xrealloc (to.text, to.len + 1);
1465 /* If the file is using old-school Mac line endings (\r only),
1466 terminate with another \r, not an \n, so that we do not mistake
1467 the \r\n sequence for a single DOS line ending and erroneously
1468 issue the "No newline at end of file" diagnostic. */
1469 if (to.text[to.len - 1] == '\r')
1470 to.text[to.len] = '\r';
1471 else
1472 to.text[to.len] = '\n';
1474 *st_size = to.len;
1475 return to.text;
1478 /* Decide on the default encoding to assume for input files. */
1479 const char *
1480 _cpp_default_encoding (void)
1482 const char *current_encoding = NULL;
1484 /* We disable this because the default codeset is 7-bit ASCII on
1485 most platforms, and this causes conversion failures on every
1486 file in GCC that happens to have one of the upper 128 characters
1487 in it -- most likely, as part of the name of a contributor.
1488 We should definitely recognize in-band markers of file encoding,
1489 like:
1490 - the appropriate Unicode byte-order mark (FE FF) to recognize
1491 UTF16 and UCS4 (in both big-endian and little-endian flavors)
1492 and UTF8
1493 - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
1494 distinguish ASCII and EBCDIC.
1495 - now we can parse something like "#pragma GCC encoding <xyz>
1496 on the first line, or even Emacs/VIM's mode line tags (there's
1497 a problem here in that VIM uses the last line, and Emacs has
1498 its more elaborate "local variables" convention).
1499 - investigate whether Java has another common convention, which
1500 would be friendly to support.
1501 (Zack Weinberg and Paolo Bonzini, May 20th 2004) */
1502 #if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
1503 setlocale (LC_CTYPE, "");
1504 current_encoding = nl_langinfo (CODESET);
1505 #endif
1506 if (current_encoding == NULL || *current_encoding == '\0')
1507 current_encoding = SOURCE_CHARSET;
1509 return current_encoding;