PR target/11183
[official-gcc.git] / gcc / cppcharset.c
blobb11e6424ad80c71ae6f30d65a510c6480d1b1c80
1 /* CPP Library - charsets
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
3 Free Software Foundation, Inc.
5 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
7 This program is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
10 later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "cpplib.h"
26 #include "cpphash.h"
28 static int ucn_valid_in_identifier PARAMS ((cpp_reader *, cppchar_t));
30 /* [lex.charset]: The character designated by the universal character
31 name \UNNNNNNNN is that character whose character short name in
32 ISO/IEC 10646 is NNNNNNNN; the character designated by the
33 universal character name \uNNNN is that character whose character
34 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
35 for a universal character name is less than 0x20 or in the range
36 0x7F-0x9F (inclusive), or if the universal character name
37 designates a character in the basic source character set, then the
38 program is ill-formed.
40 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
41 buffer end is delimited by a non-hex digit. Returns zero if UCNs
42 are not part of the relevant standard, or if the string beginning
43 at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
45 Otherwise the nonzero value of the UCN, whether valid or invalid,
46 is returned. Diagnostics are emitted for invalid values. PSTR
47 is updated to point one beyond the UCN, or to the syntactically
48 invalid character.
50 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
51 an identifier, or 2 otherwise.
54 cppchar_t
55 _cpp_valid_ucn (pfile, pstr, identifier_pos)
56 cpp_reader *pfile;
57 const uchar **pstr;
58 int identifier_pos;
60 cppchar_t result, c;
61 unsigned int length;
62 const uchar *str = *pstr;
63 const uchar *base = str - 2;
65 /* Only attempt to interpret a UCS for C++ and C99. */
66 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
67 return 0;
69 /* We don't accept UCNs for an EBCDIC target. */
70 if (CPP_OPTION (pfile, EBCDIC))
71 return 0;
73 if (str[-1] == 'u')
74 length = 4;
75 else if (str[-1] == 'U')
76 length = 8;
77 else
78 abort();
80 result = 0;
83 c = *str;
84 if (!ISXDIGIT (c))
85 break;
86 str++;
87 result = (result << 4) + hex_value (c);
89 while (--length);
91 *pstr = str;
92 if (length)
93 /* We'll error when we try it out as the start of an identifier. */
94 cpp_error (pfile, DL_ERROR, "incomplete universal character name %.*s",
95 (int) (str - base), base);
96 /* The standard permits $, @ and ` to be specified as UCNs. We use
97 hex escapes so that this also works with EBCDIC hosts. */
98 else if ((result < 0xa0
99 && (result != 0x24 && result != 0x40 && result != 0x60))
100 || (result & 0x80000000)
101 || (result >= 0xD800 && result <= 0xDFFF))
103 cpp_error (pfile, DL_ERROR, "%.*s is not a valid universal character",
104 (int) (str - base), base);
106 else if (identifier_pos)
108 int validity = ucn_valid_in_identifier (pfile, result);
110 if (validity == 0)
111 cpp_error (pfile, DL_ERROR,
112 "universal character %.*s is not valid in an identifier",
113 (int) (str - base), base);
114 else if (validity == 2 && identifier_pos == 1)
115 cpp_error (pfile, DL_ERROR,
116 "universal character %.*s is not valid at the start of an identifier",
117 (int) (str - base), base);
120 if (result == 0)
121 result = 1;
123 return result;
126 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
127 the start of an identifier, and 0 if C is not valid in an
128 identifier. We assume C has already gone through the checks of
129 _cpp_valid_ucn. */
130 static int
131 ucn_valid_in_identifier (pfile, c)
132 cpp_reader *pfile;
133 cppchar_t c;
135 /* None of the valid chars are outside the Basic Multilingual Plane (the
136 low 16 bits). */
137 if (c > 0xffff)
138 return 0;
140 if (CPP_OPTION (pfile, c99) || !CPP_PEDANTIC (pfile))
142 /* Latin. */
143 if (c == 0x0aa || c == 0x00ba || c == 0x207f || c == 0x1e9b)
144 return 1;
146 /* Greek. */
147 if (c == 0x0386)
148 return 1;
150 /* Cyrillic. */
151 if (c == 0x040c)
152 return 1;
154 /* Hebrew. */
155 if ((c >= 0x05b0 && c <= 0x05b9)
156 || (c >= 0x05bb && c <= 0x005bd)
157 || c == 0x05bf
158 || (c >= 0x05c1 && c <= 0x05c2))
159 return 1;
161 /* Arabic. */
162 if ((c >= 0x06d0 && c <= 0x06dc)
163 || c == 0x06e8
164 || (c >= 0x06ea && c <= 0x06ed))
165 return 1;
167 /* Devanagari */
168 if ((c >= 0x0901 && c <= 0x0903)
169 || (c >= 0x093e && c <= 0x094d)
170 || (c >= 0x0950 && c <= 0x0952)
171 || c == 0x0963)
172 return 1;
174 /* Bengali */
175 if ((c >= 0x0981 && c <= 0x0983)
176 || (c >= 0x09be && c <= 0x09c4)
177 || (c >= 0x09c7 && c <= 0x09c8)
178 || (c >= 0x09cb && c <= 0x09cd)
179 || (c >= 0x09e2 && c <= 0x09e3))
180 return 1;
182 /* Gurmukhi */
183 if (c == 0x0a02
184 || (c >= 0x0a3e && c <= 0x0a42)
185 || (c >= 0x0a47 && c <= 0x0a48)
186 || (c >= 0x0a4b && c <= 0x0a4d)
187 || (c == 0x0a74))
188 return 1;
190 /* Gujarati */
191 if ((c >= 0x0a81 && c <= 0x0a83)
192 || (c >= 0x0abd && c <= 0x0ac5)
193 || (c >= 0x0ac7 && c <= 0x0ac9)
194 || (c >= 0x0acb && c <= 0x0acd)
195 || (c == 0x0ad0))
196 return 1;
198 /* Oriya */
199 if ((c >= 0x0b01 && c <= 0x0b03)
200 || (c >= 0x0b3e && c <= 0x0b43)
201 || (c >= 0x0b47 && c <= 0x0b48)
202 || (c >= 0x0b4b && c <= 0x0b4d))
203 return 1;
205 /* Tamil */
206 if ((c >= 0x0b82 && c <= 0x0b83)
207 || (c >= 0x0bbe && c <= 0x0bc2)
208 || (c >= 0x0bc6 && c <= 0x0bc8)
209 || (c >= 0x0bc8 && c <= 0x0bcd))
210 return 1;
212 /* Telugu */
213 if ((c >= 0x0c01 && c <= 0x0c03)
214 || (c >= 0x0c3e && c <= 0x0c44)
215 || (c >= 0x0c46 && c <= 0x0c48)
216 || (c >= 0x0c4a && c <= 0x0c4d))
217 return 1;
219 /* Kannada */
220 if ((c >= 0x0c82 && c <= 0x0c83)
221 || (c >= 0x0cbe && c <= 0x0cc4)
222 || (c >= 0x0cc6 && c <= 0x0cc8)
223 || (c >= 0x0cca && c <= 0x0ccd)
224 || c == 0x0cde)
225 return 1;
227 /* Malayalam */
228 if ((c >= 0x0d02 && c <= 0x0d03)
229 || (c >= 0x0d3e && c <= 0x0d43)
230 || (c >= 0x0d46 && c <= 0x0d48)
231 || (c >= 0x0d4a && c <= 0x0d4d))
232 return 1;
234 /* Thai */
235 if ((c >= 0x0e01 && c <= 0x0e3a)
236 || (c >= 0x0e40 && c <= 0x0e5b))
237 return 1;
239 /* Lao */
240 if ((c >= 0x0ead && c <= 0x0eae)
241 || (c >= 0x0eb0 && c <= 0x0eb9)
242 || (c >= 0x0ebb && c <= 0x0ebd)
243 || (c >= 0x0ec0 && c <= 0x0ec4)
244 || c == 0x0ec6
245 || (c >= 0x0ec8 && c <= 0x0ecd)
246 || (c >= 0x0edc && c <= 0x0ed))
247 return 1;
249 /* Tibetan. */
250 if (c == 0x0f00
251 || (c >= 0x0f18 && c <= 0x0f19)
252 || c == 0x0f35
253 || c == 0x0f37
254 || c == 0x0f39
255 || (c >= 0x0f3e && c <= 0x0f47)
256 || (c >= 0x0f49 && c <= 0x0f69)
257 || (c >= 0x0f71 && c <= 0x0f84)
258 || (c >= 0x0f86 && c <= 0x0f8b)
259 || (c >= 0x0f90 && c <= 0x0f95)
260 || c == 0x0f97
261 || (c >= 0x0f99 && c <= 0x0fad)
262 || (c >= 0x0fb1 && c <= 0x0fb7)
263 || c == 0x0fb9)
264 return 1;
266 /* Katakana */
267 if ((c >= 0x30a1 && c <= 0x30f6)
268 || (c >= 0x30fb && c <= 0x30fc))
269 return 1;
271 /* CJK Unified Ideographs. */
272 if (c >= 0x4e00 && c <= 0x9fa5)
273 return 1;
275 /* Hangul. */
276 if (c >= 0xac00 && c <= 0xd7a3)
277 return 1;
279 /* Digits. */
280 if ((c >= 0x0660 && c <= 0x0669)
281 || (c >= 0x06f0 && c <= 0x06f9)
282 || (c >= 0x0966 && c <= 0x096f)
283 || (c >= 0x09e6 && c <= 0x09ef)
284 || (c >= 0x0a66 && c <= 0x0a6f)
285 || (c >= 0x0ae6 && c <= 0x0aef)
286 || (c >= 0x0b66 && c <= 0x0b6f)
287 || (c >= 0x0be7 && c <= 0x0bef)
288 || (c >= 0x0c66 && c <= 0x0c6f)
289 || (c >= 0x0ce6 && c <= 0x0cef)
290 || (c >= 0x0d66 && c <= 0x0d6f)
291 || (c >= 0x0e50 && c <= 0x0e59)
292 || (c >= 0x0ed0 && c <= 0x0ed9)
293 || (c >= 0x0f20 && c <= 0x0f33))
294 return 2;
296 /* Special characters. */
297 if (c == 0x00b5
298 || c == 0x00b7
299 || (c >= 0x02b0 && c <= 0x02b8)
300 || c == 0x02bb
301 || (c >= 0x02bd && c <= 0x02c1)
302 || (c >= 0x02d0 && c <= 0x02d1)
303 || (c >= 0x02e0 && c <= 0x02e4)
304 || c == 0x037a
305 || c == 0x0559
306 || c == 0x093d
307 || c == 0x0b3d
308 || c == 0x1fbe
309 || (c >= 0x203f && c <= 0x2040)
310 || c == 0x2102
311 || c == 0x2107
312 || (c >= 0x210a && c <= 0x2113)
313 || c == 0x2115
314 || (c >= 0x2118 && c <= 0x211d)
315 || c == 0x2124
316 || c == 0x2126
317 || c == 0x2128
318 || (c >= 0x212a && c <= 0x2131)
319 || (c >= 0x2133 && c <= 0x2138)
320 || (c >= 0x2160 && c <= 0x2182)
321 || (c >= 0x3005 && c <= 0x3007)
322 || (c >= 0x3021 && c <= 0x3029))
323 return 1;
326 if (CPP_OPTION (pfile, cplusplus) || !CPP_PEDANTIC (pfile))
328 /* Greek. */
329 if (c == 0x0384)
330 return 1;
332 /* Cyrillic. */
333 if (c == 0x040d)
334 return 1;
336 /* Hebrew. */
337 if (c >= 0x05f3 && c <= 0x05f4)
338 return 1;
340 /* Lao. */
341 if ((c >= 0x0ead && c <= 0x0eb0)
342 || (c == 0x0eb2)
343 || (c == 0x0eb3)
344 || (c == 0x0ebd)
345 || (c >= 0x0ec0 && c <= 0x0ec4)
346 || (c == 0x0ec6))
347 return 1;
349 /* Hiragana */
350 if (c == 0x3094
351 || (c >= 0x309d && c <= 0x309e))
352 return 1;
354 /* Katakana */
355 if ((c >= 0x30a1 && c <= 0x30fe))
356 return 1;
358 /* Hangul */
359 if ((c >= 0x1100 && c <= 0x1159)
360 || (c >= 0x1161 && c <= 0x11a2)
361 || (c >= 0x11a8 && c <= 0x11f9))
362 return 1;
364 /* CJK Unified Ideographs */
365 if ((c >= 0xf900 && c <= 0xfa2d)
366 || (c >= 0xfb1f && c <= 0xfb36)
367 || (c >= 0xfb38 && c <= 0xfb3c)
368 || (c == 0xfb3e)
369 || (c >= 0xfb40 && c <= 0xfb41)
370 || (c >= 0xfb42 && c <= 0xfb44)
371 || (c >= 0xfb46 && c <= 0xfbb1)
372 || (c >= 0xfbd3 && c <= 0xfd3f)
373 || (c >= 0xfd50 && c <= 0xfd8f)
374 || (c >= 0xfd92 && c <= 0xfdc7)
375 || (c >= 0xfdf0 && c <= 0xfdfb)
376 || (c >= 0xfe70 && c <= 0xfe72)
377 || (c == 0xfe74)
378 || (c >= 0xfe76 && c <= 0xfefc)
379 || (c >= 0xff21 && c <= 0xff3a)
380 || (c >= 0xff41 && c <= 0xff5a)
381 || (c >= 0xff66 && c <= 0xffbe)
382 || (c >= 0xffc2 && c <= 0xffc7)
383 || (c >= 0xffca && c <= 0xffcf)
384 || (c >= 0xffd2 && c <= 0xffd7)
385 || (c >= 0xffda && c <= 0xffdc)
386 || (c >= 0x4e00 && c <= 0x9fa5))
387 return 1;
390 /* Latin */
391 if ((c >= 0x00c0 && c <= 0x00d6)
392 || (c >= 0x00d8 && c <= 0x00f6)
393 || (c >= 0x00f8 && c <= 0x01f5)
394 || (c >= 0x01fa && c <= 0x0217)
395 || (c >= 0x0250 && c <= 0x02a8)
396 || (c >= 0x1e00 && c <= 0x1e9a)
397 || (c >= 0x1ea0 && c <= 0x1ef9))
398 return 1;
400 /* Greek */
401 if ((c >= 0x0388 && c <= 0x038a)
402 || (c == 0x038c)
403 || (c >= 0x038e && c <= 0x03a1)
404 || (c >= 0x03a3 && c <= 0x03ce)
405 || (c >= 0x03d0 && c <= 0x03d6)
406 || (c == 0x03da)
407 || (c == 0x03dc)
408 || (c == 0x03de)
409 || (c == 0x03e0)
410 || (c >= 0x03e2 && c <= 0x03f3)
411 || (c >= 0x1f00 && c <= 0x1f15)
412 || (c >= 0x1f18 && c <= 0x1f1d)
413 || (c >= 0x1f20 && c <= 0x1f45)
414 || (c >= 0x1f48 && c <= 0x1f4d)
415 || (c >= 0x1f50 && c <= 0x1f57)
416 || (c == 0x1f59)
417 || (c == 0x1f5b)
418 || (c == 0x1f5d)
419 || (c >= 0x1f5f && c <= 0x1f7d)
420 || (c >= 0x1f80 && c <= 0x1fb4)
421 || (c >= 0x1fb6 && c <= 0x1fbc)
422 || (c >= 0x1fc2 && c <= 0x1fc4)
423 || (c >= 0x1fc6 && c <= 0x1fcc)
424 || (c >= 0x1fd0 && c <= 0x1fd3)
425 || (c >= 0x1fd6 && c <= 0x1fdb)
426 || (c >= 0x1fe0 && c <= 0x1fec)
427 || (c >= 0x1ff2 && c <= 0x1ff4)
428 || (c >= 0x1ff6 && c <= 0x1ffc))
429 return 1;
431 /* Cyrillic */
432 if ((c >= 0x0401 && c <= 0x040c)
433 || (c >= 0x040f && c <= 0x044f)
434 || (c >= 0x0451 && c <= 0x045c)
435 || (c >= 0x045e && c <= 0x0481)
436 || (c >= 0x0490 && c <= 0x04c4)
437 || (c >= 0x04c7 && c <= 0x04c8)
438 || (c >= 0x04cb && c <= 0x04cc)
439 || (c >= 0x04d0 && c <= 0x04eb)
440 || (c >= 0x04ee && c <= 0x04f5)
441 || (c >= 0x04f8 && c <= 0x04f9))
442 return 1;
444 /* Armenian */
445 if ((c >= 0x0531 && c <= 0x0556)
446 || (c >= 0x0561 && c <= 0x0587))
447 return 1;
449 /* Hebrew */
450 if ((c >= 0x05d0 && c <= 0x05ea)
451 || (c >= 0x05f0 && c <= 0x05f2))
452 return 1;
454 /* Arabic */
455 if ((c >= 0x0621 && c <= 0x063a)
456 || (c >= 0x0640 && c <= 0x0652)
457 || (c >= 0x0670 && c <= 0x06b7)
458 || (c >= 0x06ba && c <= 0x06be)
459 || (c >= 0x06c0 && c <= 0x06ce)
460 || (c >= 0x06e5 && c <= 0x06e7))
461 return 1;
463 /* Devanagari */
464 if ((c >= 0x0905 && c <= 0x0939)
465 || (c >= 0x0958 && c <= 0x0962))
466 return 1;
468 /* Bengali */
469 if ((c >= 0x0985 && c <= 0x098c)
470 || (c >= 0x098f && c <= 0x0990)
471 || (c >= 0x0993 && c <= 0x09a8)
472 || (c >= 0x09aa && c <= 0x09b0)
473 || (c == 0x09b2)
474 || (c >= 0x09b6 && c <= 0x09b9)
475 || (c >= 0x09dc && c <= 0x09dd)
476 || (c >= 0x09df && c <= 0x09e1)
477 || (c >= 0x09f0 && c <= 0x09f1))
478 return 1;
480 /* Gurmukhi */
481 if ((c >= 0x0a05 && c <= 0x0a0a)
482 || (c >= 0x0a0f && c <= 0x0a10)
483 || (c >= 0x0a13 && c <= 0x0a28)
484 || (c >= 0x0a2a && c <= 0x0a30)
485 || (c >= 0x0a32 && c <= 0x0a33)
486 || (c >= 0x0a35 && c <= 0x0a36)
487 || (c >= 0x0a38 && c <= 0x0a39)
488 || (c >= 0x0a59 && c <= 0x0a5c)
489 || (c == 0x0a5e))
490 return 1;
492 /* Gujarati */
493 if ((c >= 0x0a85 && c <= 0x0a8b)
494 || (c == 0x0a8d)
495 || (c >= 0x0a8f && c <= 0x0a91)
496 || (c >= 0x0a93 && c <= 0x0aa8)
497 || (c >= 0x0aaa && c <= 0x0ab0)
498 || (c >= 0x0ab2 && c <= 0x0ab3)
499 || (c >= 0x0ab5 && c <= 0x0ab9)
500 || (c == 0x0ae0))
501 return 1;
503 /* Oriya */
504 if ((c >= 0x0b05 && c <= 0x0b0c)
505 || (c >= 0x0b0f && c <= 0x0b10)
506 || (c >= 0x0b13 && c <= 0x0b28)
507 || (c >= 0x0b2a && c <= 0x0b30)
508 || (c >= 0x0b32 && c <= 0x0b33)
509 || (c >= 0x0b36 && c <= 0x0b39)
510 || (c >= 0x0b5c && c <= 0x0b5d)
511 || (c >= 0x0b5f && c <= 0x0b61))
512 return 1;
514 /* Tamil */
515 if ((c >= 0x0b85 && c <= 0x0b8a)
516 || (c >= 0x0b8e && c <= 0x0b90)
517 || (c >= 0x0b92 && c <= 0x0b95)
518 || (c >= 0x0b99 && c <= 0x0b9a)
519 || (c == 0x0b9c)
520 || (c >= 0x0b9e && c <= 0x0b9f)
521 || (c >= 0x0ba3 && c <= 0x0ba4)
522 || (c >= 0x0ba8 && c <= 0x0baa)
523 || (c >= 0x0bae && c <= 0x0bb5)
524 || (c >= 0x0bb7 && c <= 0x0bb9))
525 return 1;
527 /* Telugu */
528 if ((c >= 0x0c05 && c <= 0x0c0c)
529 || (c >= 0x0c0e && c <= 0x0c10)
530 || (c >= 0x0c12 && c <= 0x0c28)
531 || (c >= 0x0c2a && c <= 0x0c33)
532 || (c >= 0x0c35 && c <= 0x0c39)
533 || (c >= 0x0c60 && c <= 0x0c61))
534 return 1;
536 /* Kannada */
537 if ((c >= 0x0c85 && c <= 0x0c8c)
538 || (c >= 0x0c8e && c <= 0x0c90)
539 || (c >= 0x0c92 && c <= 0x0ca8)
540 || (c >= 0x0caa && c <= 0x0cb3)
541 || (c >= 0x0cb5 && c <= 0x0cb9)
542 || (c >= 0x0ce0 && c <= 0x0ce1))
543 return 1;
545 /* Malayalam */
546 if ((c >= 0x0d05 && c <= 0x0d0c)
547 || (c >= 0x0d0e && c <= 0x0d10)
548 || (c >= 0x0d12 && c <= 0x0d28)
549 || (c >= 0x0d2a && c <= 0x0d39)
550 || (c >= 0x0d60 && c <= 0x0d61))
551 return 1;
553 /* Thai */
554 if ((c >= 0x0e01 && c <= 0x0e30)
555 || (c >= 0x0e32 && c <= 0x0e33)
556 || (c >= 0x0e40 && c <= 0x0e46)
557 || (c >= 0x0e4f && c <= 0x0e5b))
558 return 1;
560 /* Lao */
561 if ((c >= 0x0e81 && c <= 0x0e82)
562 || (c == 0x0e84)
563 || (c == 0x0e87)
564 || (c == 0x0e88)
565 || (c == 0x0e8a)
566 || (c == 0x0e8d)
567 || (c >= 0x0e94 && c <= 0x0e97)
568 || (c >= 0x0e99 && c <= 0x0e9f)
569 || (c >= 0x0ea1 && c <= 0x0ea3)
570 || (c == 0x0ea5)
571 || (c == 0x0ea7)
572 || (c == 0x0eaa)
573 || (c == 0x0eab))
574 return 1;
576 /* Georgian */
577 if ((c >= 0x10a0 && c <= 0x10c5)
578 || (c >= 0x10d0 && c <= 0x10f6))
579 return 1;
581 /* Hiragana */
582 if ((c >= 0x3041 && c <= 0x3093)
583 || (c >= 0x309b && c <= 0x309c))
584 return 1;
586 /* Bopmofo */
587 if ((c >= 0x3105 && c <= 0x312c))
588 return 1;
590 return 0;