include/ChangeLog:
[official-gcc.git] / gcc / cppcharset.c
blobf506ba2bc1bcb9347bae4a65bed2c43a8698694d
1 /* CPP Library - charsets
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
3 Free Software Foundation, Inc.
5 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
7 This program is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
10 later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "cpplib.h"
26 #include "cpphash.h"
28 static int ucn_valid_in_identifier (cpp_reader *, cppchar_t);
30 /* [lex.charset]: The character designated by the universal character
31 name \UNNNNNNNN is that character whose character short name in
32 ISO/IEC 10646 is NNNNNNNN; the character designated by the
33 universal character name \uNNNN is that character whose character
34 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
35 for a universal character name is less than 0x20 or in the range
36 0x7F-0x9F (inclusive), or if the universal character name
37 designates a character in the basic source character set, then the
38 program is ill-formed.
40 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
41 buffer end is delimited by a non-hex digit. Returns zero if UCNs
42 are not part of the relevant standard, or if the string beginning
43 at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
45 Otherwise the nonzero value of the UCN, whether valid or invalid,
46 is returned. Diagnostics are emitted for invalid values. PSTR
47 is updated to point one beyond the UCN, or to the syntactically
48 invalid character.
50 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
51 an identifier, or 2 otherwise.
54 cppchar_t
55 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, int identifier_pos)
57 cppchar_t result, c;
58 unsigned int length;
59 const uchar *str = *pstr;
60 const uchar *base = str - 2;
62 /* Only attempt to interpret a UCS for C++ and C99. */
63 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
64 return 0;
66 /* We don't accept UCNs for an EBCDIC target. */
67 if (CPP_OPTION (pfile, EBCDIC))
68 return 0;
70 if (str[-1] == 'u')
71 length = 4;
72 else if (str[-1] == 'U')
73 length = 8;
74 else
75 abort();
77 result = 0;
80 c = *str;
81 if (!ISXDIGIT (c))
82 break;
83 str++;
84 result = (result << 4) + hex_value (c);
86 while (--length);
88 *pstr = str;
89 if (length)
90 /* We'll error when we try it out as the start of an identifier. */
91 cpp_error (pfile, DL_ERROR, "incomplete universal character name %.*s",
92 (int) (str - base), base);
93 /* The standard permits $, @ and ` to be specified as UCNs. We use
94 hex escapes so that this also works with EBCDIC hosts. */
95 else if ((result < 0xa0
96 && (result != 0x24 && result != 0x40 && result != 0x60))
97 || (result & 0x80000000)
98 || (result >= 0xD800 && result <= 0xDFFF))
100 cpp_error (pfile, DL_ERROR, "%.*s is not a valid universal character",
101 (int) (str - base), base);
103 else if (identifier_pos)
105 int validity = ucn_valid_in_identifier (pfile, result);
107 if (validity == 0)
108 cpp_error (pfile, DL_ERROR,
109 "universal character %.*s is not valid in an identifier",
110 (int) (str - base), base);
111 else if (validity == 2 && identifier_pos == 1)
112 cpp_error (pfile, DL_ERROR,
113 "universal character %.*s is not valid at the start of an identifier",
114 (int) (str - base), base);
117 if (result == 0)
118 result = 1;
120 return result;
123 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
124 the start of an identifier, and 0 if C is not valid in an
125 identifier. We assume C has already gone through the checks of
126 _cpp_valid_ucn. */
127 static int
128 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
130 /* None of the valid chars are outside the Basic Multilingual Plane (the
131 low 16 bits). */
132 if (c > 0xffff)
133 return 0;
135 if (CPP_OPTION (pfile, c99) || !CPP_PEDANTIC (pfile))
137 /* Latin. */
138 if (c == 0x0aa || c == 0x00ba || c == 0x207f || c == 0x1e9b)
139 return 1;
141 /* Greek. */
142 if (c == 0x0386)
143 return 1;
145 /* Cyrillic. */
146 if (c == 0x040c)
147 return 1;
149 /* Hebrew. */
150 if ((c >= 0x05b0 && c <= 0x05b9)
151 || (c >= 0x05bb && c <= 0x005bd)
152 || c == 0x05bf
153 || (c >= 0x05c1 && c <= 0x05c2))
154 return 1;
156 /* Arabic. */
157 if ((c >= 0x06d0 && c <= 0x06dc)
158 || c == 0x06e8
159 || (c >= 0x06ea && c <= 0x06ed))
160 return 1;
162 /* Devanagari */
163 if ((c >= 0x0901 && c <= 0x0903)
164 || (c >= 0x093e && c <= 0x094d)
165 || (c >= 0x0950 && c <= 0x0952)
166 || c == 0x0963)
167 return 1;
169 /* Bengali */
170 if ((c >= 0x0981 && c <= 0x0983)
171 || (c >= 0x09be && c <= 0x09c4)
172 || (c >= 0x09c7 && c <= 0x09c8)
173 || (c >= 0x09cb && c <= 0x09cd)
174 || (c >= 0x09e2 && c <= 0x09e3))
175 return 1;
177 /* Gurmukhi */
178 if (c == 0x0a02
179 || (c >= 0x0a3e && c <= 0x0a42)
180 || (c >= 0x0a47 && c <= 0x0a48)
181 || (c >= 0x0a4b && c <= 0x0a4d)
182 || (c == 0x0a74))
183 return 1;
185 /* Gujarati */
186 if ((c >= 0x0a81 && c <= 0x0a83)
187 || (c >= 0x0abd && c <= 0x0ac5)
188 || (c >= 0x0ac7 && c <= 0x0ac9)
189 || (c >= 0x0acb && c <= 0x0acd)
190 || (c == 0x0ad0))
191 return 1;
193 /* Oriya */
194 if ((c >= 0x0b01 && c <= 0x0b03)
195 || (c >= 0x0b3e && c <= 0x0b43)
196 || (c >= 0x0b47 && c <= 0x0b48)
197 || (c >= 0x0b4b && c <= 0x0b4d))
198 return 1;
200 /* Tamil */
201 if ((c >= 0x0b82 && c <= 0x0b83)
202 || (c >= 0x0bbe && c <= 0x0bc2)
203 || (c >= 0x0bc6 && c <= 0x0bc8)
204 || (c >= 0x0bc8 && c <= 0x0bcd))
205 return 1;
207 /* Telugu */
208 if ((c >= 0x0c01 && c <= 0x0c03)
209 || (c >= 0x0c3e && c <= 0x0c44)
210 || (c >= 0x0c46 && c <= 0x0c48)
211 || (c >= 0x0c4a && c <= 0x0c4d))
212 return 1;
214 /* Kannada */
215 if ((c >= 0x0c82 && c <= 0x0c83)
216 || (c >= 0x0cbe && c <= 0x0cc4)
217 || (c >= 0x0cc6 && c <= 0x0cc8)
218 || (c >= 0x0cca && c <= 0x0ccd)
219 || c == 0x0cde)
220 return 1;
222 /* Malayalam */
223 if ((c >= 0x0d02 && c <= 0x0d03)
224 || (c >= 0x0d3e && c <= 0x0d43)
225 || (c >= 0x0d46 && c <= 0x0d48)
226 || (c >= 0x0d4a && c <= 0x0d4d))
227 return 1;
229 /* Thai */
230 if ((c >= 0x0e01 && c <= 0x0e3a)
231 || (c >= 0x0e40 && c <= 0x0e5b))
232 return 1;
234 /* Lao */
235 if ((c >= 0x0ead && c <= 0x0eae)
236 || (c >= 0x0eb0 && c <= 0x0eb9)
237 || (c >= 0x0ebb && c <= 0x0ebd)
238 || (c >= 0x0ec0 && c <= 0x0ec4)
239 || c == 0x0ec6
240 || (c >= 0x0ec8 && c <= 0x0ecd)
241 || (c >= 0x0edc && c <= 0x0ed))
242 return 1;
244 /* Tibetan. */
245 if (c == 0x0f00
246 || (c >= 0x0f18 && c <= 0x0f19)
247 || c == 0x0f35
248 || c == 0x0f37
249 || c == 0x0f39
250 || (c >= 0x0f3e && c <= 0x0f47)
251 || (c >= 0x0f49 && c <= 0x0f69)
252 || (c >= 0x0f71 && c <= 0x0f84)
253 || (c >= 0x0f86 && c <= 0x0f8b)
254 || (c >= 0x0f90 && c <= 0x0f95)
255 || c == 0x0f97
256 || (c >= 0x0f99 && c <= 0x0fad)
257 || (c >= 0x0fb1 && c <= 0x0fb7)
258 || c == 0x0fb9)
259 return 1;
261 /* Katakana */
262 if ((c >= 0x30a1 && c <= 0x30f6)
263 || (c >= 0x30fb && c <= 0x30fc))
264 return 1;
266 /* CJK Unified Ideographs. */
267 if (c >= 0x4e00 && c <= 0x9fa5)
268 return 1;
270 /* Hangul. */
271 if (c >= 0xac00 && c <= 0xd7a3)
272 return 1;
274 /* Digits. */
275 if ((c >= 0x0660 && c <= 0x0669)
276 || (c >= 0x06f0 && c <= 0x06f9)
277 || (c >= 0x0966 && c <= 0x096f)
278 || (c >= 0x09e6 && c <= 0x09ef)
279 || (c >= 0x0a66 && c <= 0x0a6f)
280 || (c >= 0x0ae6 && c <= 0x0aef)
281 || (c >= 0x0b66 && c <= 0x0b6f)
282 || (c >= 0x0be7 && c <= 0x0bef)
283 || (c >= 0x0c66 && c <= 0x0c6f)
284 || (c >= 0x0ce6 && c <= 0x0cef)
285 || (c >= 0x0d66 && c <= 0x0d6f)
286 || (c >= 0x0e50 && c <= 0x0e59)
287 || (c >= 0x0ed0 && c <= 0x0ed9)
288 || (c >= 0x0f20 && c <= 0x0f33))
289 return 2;
291 /* Special characters. */
292 if (c == 0x00b5
293 || c == 0x00b7
294 || (c >= 0x02b0 && c <= 0x02b8)
295 || c == 0x02bb
296 || (c >= 0x02bd && c <= 0x02c1)
297 || (c >= 0x02d0 && c <= 0x02d1)
298 || (c >= 0x02e0 && c <= 0x02e4)
299 || c == 0x037a
300 || c == 0x0559
301 || c == 0x093d
302 || c == 0x0b3d
303 || c == 0x1fbe
304 || (c >= 0x203f && c <= 0x2040)
305 || c == 0x2102
306 || c == 0x2107
307 || (c >= 0x210a && c <= 0x2113)
308 || c == 0x2115
309 || (c >= 0x2118 && c <= 0x211d)
310 || c == 0x2124
311 || c == 0x2126
312 || c == 0x2128
313 || (c >= 0x212a && c <= 0x2131)
314 || (c >= 0x2133 && c <= 0x2138)
315 || (c >= 0x2160 && c <= 0x2182)
316 || (c >= 0x3005 && c <= 0x3007)
317 || (c >= 0x3021 && c <= 0x3029))
318 return 1;
321 if (CPP_OPTION (pfile, cplusplus) || !CPP_PEDANTIC (pfile))
323 /* Greek. */
324 if (c == 0x0384)
325 return 1;
327 /* Cyrillic. */
328 if (c == 0x040d)
329 return 1;
331 /* Hebrew. */
332 if (c >= 0x05f3 && c <= 0x05f4)
333 return 1;
335 /* Lao. */
336 if ((c >= 0x0ead && c <= 0x0eb0)
337 || (c == 0x0eb2)
338 || (c == 0x0eb3)
339 || (c == 0x0ebd)
340 || (c >= 0x0ec0 && c <= 0x0ec4)
341 || (c == 0x0ec6))
342 return 1;
344 /* Hiragana */
345 if (c == 0x3094
346 || (c >= 0x309d && c <= 0x309e))
347 return 1;
349 /* Katakana */
350 if ((c >= 0x30a1 && c <= 0x30fe))
351 return 1;
353 /* Hangul */
354 if ((c >= 0x1100 && c <= 0x1159)
355 || (c >= 0x1161 && c <= 0x11a2)
356 || (c >= 0x11a8 && c <= 0x11f9))
357 return 1;
359 /* CJK Unified Ideographs */
360 if ((c >= 0xf900 && c <= 0xfa2d)
361 || (c >= 0xfb1f && c <= 0xfb36)
362 || (c >= 0xfb38 && c <= 0xfb3c)
363 || (c == 0xfb3e)
364 || (c >= 0xfb40 && c <= 0xfb41)
365 || (c >= 0xfb42 && c <= 0xfb44)
366 || (c >= 0xfb46 && c <= 0xfbb1)
367 || (c >= 0xfbd3 && c <= 0xfd3f)
368 || (c >= 0xfd50 && c <= 0xfd8f)
369 || (c >= 0xfd92 && c <= 0xfdc7)
370 || (c >= 0xfdf0 && c <= 0xfdfb)
371 || (c >= 0xfe70 && c <= 0xfe72)
372 || (c == 0xfe74)
373 || (c >= 0xfe76 && c <= 0xfefc)
374 || (c >= 0xff21 && c <= 0xff3a)
375 || (c >= 0xff41 && c <= 0xff5a)
376 || (c >= 0xff66 && c <= 0xffbe)
377 || (c >= 0xffc2 && c <= 0xffc7)
378 || (c >= 0xffca && c <= 0xffcf)
379 || (c >= 0xffd2 && c <= 0xffd7)
380 || (c >= 0xffda && c <= 0xffdc)
381 || (c >= 0x4e00 && c <= 0x9fa5))
382 return 1;
385 /* Latin */
386 if ((c >= 0x00c0 && c <= 0x00d6)
387 || (c >= 0x00d8 && c <= 0x00f6)
388 || (c >= 0x00f8 && c <= 0x01f5)
389 || (c >= 0x01fa && c <= 0x0217)
390 || (c >= 0x0250 && c <= 0x02a8)
391 || (c >= 0x1e00 && c <= 0x1e9a)
392 || (c >= 0x1ea0 && c <= 0x1ef9))
393 return 1;
395 /* Greek */
396 if ((c >= 0x0388 && c <= 0x038a)
397 || (c == 0x038c)
398 || (c >= 0x038e && c <= 0x03a1)
399 || (c >= 0x03a3 && c <= 0x03ce)
400 || (c >= 0x03d0 && c <= 0x03d6)
401 || (c == 0x03da)
402 || (c == 0x03dc)
403 || (c == 0x03de)
404 || (c == 0x03e0)
405 || (c >= 0x03e2 && c <= 0x03f3)
406 || (c >= 0x1f00 && c <= 0x1f15)
407 || (c >= 0x1f18 && c <= 0x1f1d)
408 || (c >= 0x1f20 && c <= 0x1f45)
409 || (c >= 0x1f48 && c <= 0x1f4d)
410 || (c >= 0x1f50 && c <= 0x1f57)
411 || (c == 0x1f59)
412 || (c == 0x1f5b)
413 || (c == 0x1f5d)
414 || (c >= 0x1f5f && c <= 0x1f7d)
415 || (c >= 0x1f80 && c <= 0x1fb4)
416 || (c >= 0x1fb6 && c <= 0x1fbc)
417 || (c >= 0x1fc2 && c <= 0x1fc4)
418 || (c >= 0x1fc6 && c <= 0x1fcc)
419 || (c >= 0x1fd0 && c <= 0x1fd3)
420 || (c >= 0x1fd6 && c <= 0x1fdb)
421 || (c >= 0x1fe0 && c <= 0x1fec)
422 || (c >= 0x1ff2 && c <= 0x1ff4)
423 || (c >= 0x1ff6 && c <= 0x1ffc))
424 return 1;
426 /* Cyrillic */
427 if ((c >= 0x0401 && c <= 0x040c)
428 || (c >= 0x040f && c <= 0x044f)
429 || (c >= 0x0451 && c <= 0x045c)
430 || (c >= 0x045e && c <= 0x0481)
431 || (c >= 0x0490 && c <= 0x04c4)
432 || (c >= 0x04c7 && c <= 0x04c8)
433 || (c >= 0x04cb && c <= 0x04cc)
434 || (c >= 0x04d0 && c <= 0x04eb)
435 || (c >= 0x04ee && c <= 0x04f5)
436 || (c >= 0x04f8 && c <= 0x04f9))
437 return 1;
439 /* Armenian */
440 if ((c >= 0x0531 && c <= 0x0556)
441 || (c >= 0x0561 && c <= 0x0587))
442 return 1;
444 /* Hebrew */
445 if ((c >= 0x05d0 && c <= 0x05ea)
446 || (c >= 0x05f0 && c <= 0x05f2))
447 return 1;
449 /* Arabic */
450 if ((c >= 0x0621 && c <= 0x063a)
451 || (c >= 0x0640 && c <= 0x0652)
452 || (c >= 0x0670 && c <= 0x06b7)
453 || (c >= 0x06ba && c <= 0x06be)
454 || (c >= 0x06c0 && c <= 0x06ce)
455 || (c >= 0x06e5 && c <= 0x06e7))
456 return 1;
458 /* Devanagari */
459 if ((c >= 0x0905 && c <= 0x0939)
460 || (c >= 0x0958 && c <= 0x0962))
461 return 1;
463 /* Bengali */
464 if ((c >= 0x0985 && c <= 0x098c)
465 || (c >= 0x098f && c <= 0x0990)
466 || (c >= 0x0993 && c <= 0x09a8)
467 || (c >= 0x09aa && c <= 0x09b0)
468 || (c == 0x09b2)
469 || (c >= 0x09b6 && c <= 0x09b9)
470 || (c >= 0x09dc && c <= 0x09dd)
471 || (c >= 0x09df && c <= 0x09e1)
472 || (c >= 0x09f0 && c <= 0x09f1))
473 return 1;
475 /* Gurmukhi */
476 if ((c >= 0x0a05 && c <= 0x0a0a)
477 || (c >= 0x0a0f && c <= 0x0a10)
478 || (c >= 0x0a13 && c <= 0x0a28)
479 || (c >= 0x0a2a && c <= 0x0a30)
480 || (c >= 0x0a32 && c <= 0x0a33)
481 || (c >= 0x0a35 && c <= 0x0a36)
482 || (c >= 0x0a38 && c <= 0x0a39)
483 || (c >= 0x0a59 && c <= 0x0a5c)
484 || (c == 0x0a5e))
485 return 1;
487 /* Gujarati */
488 if ((c >= 0x0a85 && c <= 0x0a8b)
489 || (c == 0x0a8d)
490 || (c >= 0x0a8f && c <= 0x0a91)
491 || (c >= 0x0a93 && c <= 0x0aa8)
492 || (c >= 0x0aaa && c <= 0x0ab0)
493 || (c >= 0x0ab2 && c <= 0x0ab3)
494 || (c >= 0x0ab5 && c <= 0x0ab9)
495 || (c == 0x0ae0))
496 return 1;
498 /* Oriya */
499 if ((c >= 0x0b05 && c <= 0x0b0c)
500 || (c >= 0x0b0f && c <= 0x0b10)
501 || (c >= 0x0b13 && c <= 0x0b28)
502 || (c >= 0x0b2a && c <= 0x0b30)
503 || (c >= 0x0b32 && c <= 0x0b33)
504 || (c >= 0x0b36 && c <= 0x0b39)
505 || (c >= 0x0b5c && c <= 0x0b5d)
506 || (c >= 0x0b5f && c <= 0x0b61))
507 return 1;
509 /* Tamil */
510 if ((c >= 0x0b85 && c <= 0x0b8a)
511 || (c >= 0x0b8e && c <= 0x0b90)
512 || (c >= 0x0b92 && c <= 0x0b95)
513 || (c >= 0x0b99 && c <= 0x0b9a)
514 || (c == 0x0b9c)
515 || (c >= 0x0b9e && c <= 0x0b9f)
516 || (c >= 0x0ba3 && c <= 0x0ba4)
517 || (c >= 0x0ba8 && c <= 0x0baa)
518 || (c >= 0x0bae && c <= 0x0bb5)
519 || (c >= 0x0bb7 && c <= 0x0bb9))
520 return 1;
522 /* Telugu */
523 if ((c >= 0x0c05 && c <= 0x0c0c)
524 || (c >= 0x0c0e && c <= 0x0c10)
525 || (c >= 0x0c12 && c <= 0x0c28)
526 || (c >= 0x0c2a && c <= 0x0c33)
527 || (c >= 0x0c35 && c <= 0x0c39)
528 || (c >= 0x0c60 && c <= 0x0c61))
529 return 1;
531 /* Kannada */
532 if ((c >= 0x0c85 && c <= 0x0c8c)
533 || (c >= 0x0c8e && c <= 0x0c90)
534 || (c >= 0x0c92 && c <= 0x0ca8)
535 || (c >= 0x0caa && c <= 0x0cb3)
536 || (c >= 0x0cb5 && c <= 0x0cb9)
537 || (c >= 0x0ce0 && c <= 0x0ce1))
538 return 1;
540 /* Malayalam */
541 if ((c >= 0x0d05 && c <= 0x0d0c)
542 || (c >= 0x0d0e && c <= 0x0d10)
543 || (c >= 0x0d12 && c <= 0x0d28)
544 || (c >= 0x0d2a && c <= 0x0d39)
545 || (c >= 0x0d60 && c <= 0x0d61))
546 return 1;
548 /* Thai */
549 if ((c >= 0x0e01 && c <= 0x0e30)
550 || (c >= 0x0e32 && c <= 0x0e33)
551 || (c >= 0x0e40 && c <= 0x0e46)
552 || (c >= 0x0e4f && c <= 0x0e5b))
553 return 1;
555 /* Lao */
556 if ((c >= 0x0e81 && c <= 0x0e82)
557 || (c == 0x0e84)
558 || (c == 0x0e87)
559 || (c == 0x0e88)
560 || (c == 0x0e8a)
561 || (c == 0x0e8d)
562 || (c >= 0x0e94 && c <= 0x0e97)
563 || (c >= 0x0e99 && c <= 0x0e9f)
564 || (c >= 0x0ea1 && c <= 0x0ea3)
565 || (c == 0x0ea5)
566 || (c == 0x0ea7)
567 || (c == 0x0eaa)
568 || (c == 0x0eab))
569 return 1;
571 /* Georgian */
572 if ((c >= 0x10a0 && c <= 0x10c5)
573 || (c >= 0x10d0 && c <= 0x10f6))
574 return 1;
576 /* Hiragana */
577 if ((c >= 0x3041 && c <= 0x3093)
578 || (c >= 0x309b && c <= 0x309c))
579 return 1;
581 /* Bopmofo */
582 if ((c >= 0x3105 && c <= 0x312c))
583 return 1;
585 return 0;