1 /* CPP Library - charsets
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
3 Free Software Foundation, Inc.
5 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
7 This program is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "coretypes.h"
/* Forward declaration: the definition appears after _cpp_valid_ucn,
   which calls it.  */
28 static int ucn_valid_in_identifier (cpp_reader
*, cppchar_t
);
30 /* [lex.charset]: The character designated by the universal character
31 name \UNNNNNNNN is that character whose character short name in
32 ISO/IEC 10646 is NNNNNNNN; the character designated by the
33 universal character name \uNNNN is that character whose character
34 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
35 for a universal character name is less than 0x20 or in the range
36 0x7F-0x9F (inclusive), or if the universal character name
37 designates a character in the basic source character set, then the
38 program is ill-formed.
40 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
41 buffer end is delimited by a non-hex digit. Returns zero if UCNs
42 are not part of the relevant standard, or if the string beginning
43 at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
45 Otherwise the nonzero value of the UCN, whether valid or invalid,
46 is returned. Diagnostics are emitted for invalid values. PSTR
47 is updated to point one beyond the UCN, or to the syntactically invalid character.
50 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
51 an identifier, or 2 otherwise. */
/* Check the universal character name whose hex digits begin at *PSTR
   and report problems through cpp_error.  A nonzero IDENTIFIER_POS
   enables the identifier checks; the value 1 additionally enables the
   start-of-identifier diagnostic.  */
55 _cpp_valid_ucn (cpp_reader
*pfile
, const uchar
**pstr
, int identifier_pos
)
59 const uchar
*str
= *pstr
;
/* BASE is two characters before STR.  It marks the start of the text
   quoted in the diagnostics below (presumably the backslash of the
   "\u" / "\U" prefix -- confirm against callers).  */
60 const uchar
*base
= str
- 2;
62 /* Only attempt to interpret a UCS for C++ and C99. */
63 if (!CPP_OPTION (pfile
, cplusplus
) && !CPP_OPTION (pfile
, c99
))
66 /* We don't accept UCNs for an EBCDIC target. */
67 if (CPP_OPTION (pfile
, EBCDIC
))
/* STR[-1] is the character immediately before the digits; this branch
   is taken when it is a capital 'U'.  */
72 else if (str
[-1] == 'U')
/* Fold one more hex digit into RESULT, most significant digit first.  */
84 result
= (result
<< 4) + hex_value (c
);
90 /* We'll error when we try it out as the start of an identifier. */
/* Each diagnostic quotes the source text from BASE up to STR by
   passing the length (STR - BASE) for the "%.*s" directive.  */
91 cpp_error (pfile
, DL_ERROR
, "incomplete universal character name %.*s",
92 (int) (str
- base
), base
);
93 /* The standard permits $, @ and ` to be specified as UCNs. We use
94 hex escapes so that this also works with EBCDIC hosts. */
/* Rejected here: values below 0xa0 other than 0x24, 0x40 and 0x60,
   any value with the 0x80000000 bit set, and anything in the range
   0xD800-0xDFFF.  */
95 else if ((result
< 0xa0
96 && (result
!= 0x24 && result
!= 0x40 && result
!= 0x60))
97 || (result
& 0x80000000)
98 || (result
>= 0xD800 && result
<= 0xDFFF))
100 cpp_error (pfile
, DL_ERROR
, "%.*s is not a valid universal character",
101 (int) (str
- base
), base
);
/* The identifier checks run only when IDENTIFIER_POS is nonzero.  */
103 else if (identifier_pos
)
/* VALIDITY holds the classification returned by
   ucn_valid_in_identifier for the decoded value.  */
105 int validity
= ucn_valid_in_identifier (pfile
, result
);
108 cpp_error (pfile
, DL_ERROR
,
109 "universal character %.*s is not valid in an identifier",
110 (int) (str
- base
), base
);
/* A classification of 2 is diagnosed only for the first character of
   an identifier (IDENTIFIER_POS == 1).  */
111 else if (validity
== 2 && identifier_pos
== 1)
112 cpp_error (pfile
, DL_ERROR
,
113 "universal character %.*s is not valid at the start of an identifier",
114 (int) (str
- base
), base
);
123 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
124 the start of an identifier, and 0 if C is not valid in an
125 identifier. We assume C has already gone through the checks of _cpp_valid_ucn. */
128 ucn_valid_in_identifier (cpp_reader
*pfile
, cppchar_t c
)
130 /* None of the valid chars are outside the Basic Multilingual Plane (the low 16 bits). */
135 if (CPP_OPTION (pfile
, c99
) || !CPP_PEDANTIC (pfile
))
138 if (c
== 0x0aa || c
== 0x00ba || c
== 0x207f || c
== 0x1e9b)
150 if ((c
>= 0x05b0 && c
<= 0x05b9)
151 || (c
>= 0x05bb && c
<= 0x005bd)
153 || (c
>= 0x05c1 && c
<= 0x05c2))
157 if ((c
>= 0x06d0 && c
<= 0x06dc)
159 || (c
>= 0x06ea && c
<= 0x06ed))
163 if ((c
>= 0x0901 && c
<= 0x0903)
164 || (c
>= 0x093e && c
<= 0x094d)
165 || (c
>= 0x0950 && c
<= 0x0952)
170 if ((c
>= 0x0981 && c
<= 0x0983)
171 || (c
>= 0x09be && c
<= 0x09c4)
172 || (c
>= 0x09c7 && c
<= 0x09c8)
173 || (c
>= 0x09cb && c
<= 0x09cd)
174 || (c
>= 0x09e2 && c
<= 0x09e3))
179 || (c
>= 0x0a3e && c
<= 0x0a42)
180 || (c
>= 0x0a47 && c
<= 0x0a48)
181 || (c
>= 0x0a4b && c
<= 0x0a4d)
186 if ((c
>= 0x0a81 && c
<= 0x0a83)
187 || (c
>= 0x0abd && c
<= 0x0ac5)
188 || (c
>= 0x0ac7 && c
<= 0x0ac9)
189 || (c
>= 0x0acb && c
<= 0x0acd)
194 if ((c
>= 0x0b01 && c
<= 0x0b03)
195 || (c
>= 0x0b3e && c
<= 0x0b43)
196 || (c
>= 0x0b47 && c
<= 0x0b48)
197 || (c
>= 0x0b4b && c
<= 0x0b4d))
201 if ((c
>= 0x0b82 && c
<= 0x0b83)
202 || (c
>= 0x0bbe && c
<= 0x0bc2)
203 || (c
>= 0x0bc6 && c
<= 0x0bc8)
204 || (c
>= 0x0bc8 && c
<= 0x0bcd))
208 if ((c
>= 0x0c01 && c
<= 0x0c03)
209 || (c
>= 0x0c3e && c
<= 0x0c44)
210 || (c
>= 0x0c46 && c
<= 0x0c48)
211 || (c
>= 0x0c4a && c
<= 0x0c4d))
215 if ((c
>= 0x0c82 && c
<= 0x0c83)
216 || (c
>= 0x0cbe && c
<= 0x0cc4)
217 || (c
>= 0x0cc6 && c
<= 0x0cc8)
218 || (c
>= 0x0cca && c
<= 0x0ccd)
223 if ((c
>= 0x0d02 && c
<= 0x0d03)
224 || (c
>= 0x0d3e && c
<= 0x0d43)
225 || (c
>= 0x0d46 && c
<= 0x0d48)
226 || (c
>= 0x0d4a && c
<= 0x0d4d))
230 if ((c
>= 0x0e01 && c
<= 0x0e3a)
231 || (c
>= 0x0e40 && c
<= 0x0e5b))
235 if ((c
>= 0x0ead && c
<= 0x0eae)
236 || (c
>= 0x0eb0 && c
<= 0x0eb9)
237 || (c
>= 0x0ebb && c
<= 0x0ebd)
238 || (c
>= 0x0ec0 && c
<= 0x0ec4)
240 || (c
>= 0x0ec8 && c
<= 0x0ecd)
241 || (c
>= 0x0edc && c
<= 0x0ed))
246 || (c
>= 0x0f18 && c
<= 0x0f19)
250 || (c
>= 0x0f3e && c
<= 0x0f47)
251 || (c
>= 0x0f49 && c
<= 0x0f69)
252 || (c
>= 0x0f71 && c
<= 0x0f84)
253 || (c
>= 0x0f86 && c
<= 0x0f8b)
254 || (c
>= 0x0f90 && c
<= 0x0f95)
256 || (c
>= 0x0f99 && c
<= 0x0fad)
257 || (c
>= 0x0fb1 && c
<= 0x0fb7)
262 if ((c
>= 0x30a1 && c
<= 0x30f6)
263 || (c
>= 0x30fb && c
<= 0x30fc))
266 /* CJK Unified Ideographs. */
267 if (c
>= 0x4e00 && c
<= 0x9fa5)
271 if (c
>= 0xac00 && c
<= 0xd7a3)
275 if ((c
>= 0x0660 && c
<= 0x0669)
276 || (c
>= 0x06f0 && c
<= 0x06f9)
277 || (c
>= 0x0966 && c
<= 0x096f)
278 || (c
>= 0x09e6 && c
<= 0x09ef)
279 || (c
>= 0x0a66 && c
<= 0x0a6f)
280 || (c
>= 0x0ae6 && c
<= 0x0aef)
281 || (c
>= 0x0b66 && c
<= 0x0b6f)
282 || (c
>= 0x0be7 && c
<= 0x0bef)
283 || (c
>= 0x0c66 && c
<= 0x0c6f)
284 || (c
>= 0x0ce6 && c
<= 0x0cef)
285 || (c
>= 0x0d66 && c
<= 0x0d6f)
286 || (c
>= 0x0e50 && c
<= 0x0e59)
287 || (c
>= 0x0ed0 && c
<= 0x0ed9)
288 || (c
>= 0x0f20 && c
<= 0x0f33))
291 /* Special characters. */
294 || (c
>= 0x02b0 && c
<= 0x02b8)
296 || (c
>= 0x02bd && c
<= 0x02c1)
297 || (c
>= 0x02d0 && c
<= 0x02d1)
298 || (c
>= 0x02e0 && c
<= 0x02e4)
304 || (c
>= 0x203f && c
<= 0x2040)
307 || (c
>= 0x210a && c
<= 0x2113)
309 || (c
>= 0x2118 && c
<= 0x211d)
313 || (c
>= 0x212a && c
<= 0x2131)
314 || (c
>= 0x2133 && c
<= 0x2138)
315 || (c
>= 0x2160 && c
<= 0x2182)
316 || (c
>= 0x3005 && c
<= 0x3007)
317 || (c
>= 0x3021 && c
<= 0x3029))
321 if (CPP_OPTION (pfile
, cplusplus
) || !CPP_PEDANTIC (pfile
))
332 if (c
>= 0x05f3 && c
<= 0x05f4)
336 if ((c
>= 0x0ead && c
<= 0x0eb0)
340 || (c
>= 0x0ec0 && c
<= 0x0ec4)
346 || (c
>= 0x309d && c
<= 0x309e))
350 if ((c
>= 0x30a1 && c
<= 0x30fe))
354 if ((c
>= 0x1100 && c
<= 0x1159)
355 || (c
>= 0x1161 && c
<= 0x11a2)
356 || (c
>= 0x11a8 && c
<= 0x11f9))
359 /* CJK Unified Ideographs */
360 if ((c
>= 0xf900 && c
<= 0xfa2d)
361 || (c
>= 0xfb1f && c
<= 0xfb36)
362 || (c
>= 0xfb38 && c
<= 0xfb3c)
364 || (c
>= 0xfb40 && c
<= 0xfb41)
365 || (c
>= 0xfb42 && c
<= 0xfb44)
366 || (c
>= 0xfb46 && c
<= 0xfbb1)
367 || (c
>= 0xfbd3 && c
<= 0xfd3f)
368 || (c
>= 0xfd50 && c
<= 0xfd8f)
369 || (c
>= 0xfd92 && c
<= 0xfdc7)
370 || (c
>= 0xfdf0 && c
<= 0xfdfb)
371 || (c
>= 0xfe70 && c
<= 0xfe72)
373 || (c
>= 0xfe76 && c
<= 0xfefc)
374 || (c
>= 0xff21 && c
<= 0xff3a)
375 || (c
>= 0xff41 && c
<= 0xff5a)
376 || (c
>= 0xff66 && c
<= 0xffbe)
377 || (c
>= 0xffc2 && c
<= 0xffc7)
378 || (c
>= 0xffca && c
<= 0xffcf)
379 || (c
>= 0xffd2 && c
<= 0xffd7)
380 || (c
>= 0xffda && c
<= 0xffdc)
381 || (c
>= 0x4e00 && c
<= 0x9fa5))
386 if ((c
>= 0x00c0 && c
<= 0x00d6)
387 || (c
>= 0x00d8 && c
<= 0x00f6)
388 || (c
>= 0x00f8 && c
<= 0x01f5)
389 || (c
>= 0x01fa && c
<= 0x0217)
390 || (c
>= 0x0250 && c
<= 0x02a8)
391 || (c
>= 0x1e00 && c
<= 0x1e9a)
392 || (c
>= 0x1ea0 && c
<= 0x1ef9))
396 if ((c
>= 0x0388 && c
<= 0x038a)
398 || (c
>= 0x038e && c
<= 0x03a1)
399 || (c
>= 0x03a3 && c
<= 0x03ce)
400 || (c
>= 0x03d0 && c
<= 0x03d6)
405 || (c
>= 0x03e2 && c
<= 0x03f3)
406 || (c
>= 0x1f00 && c
<= 0x1f15)
407 || (c
>= 0x1f18 && c
<= 0x1f1d)
408 || (c
>= 0x1f20 && c
<= 0x1f45)
409 || (c
>= 0x1f48 && c
<= 0x1f4d)
410 || (c
>= 0x1f50 && c
<= 0x1f57)
414 || (c
>= 0x1f5f && c
<= 0x1f7d)
415 || (c
>= 0x1f80 && c
<= 0x1fb4)
416 || (c
>= 0x1fb6 && c
<= 0x1fbc)
417 || (c
>= 0x1fc2 && c
<= 0x1fc4)
418 || (c
>= 0x1fc6 && c
<= 0x1fcc)
419 || (c
>= 0x1fd0 && c
<= 0x1fd3)
420 || (c
>= 0x1fd6 && c
<= 0x1fdb)
421 || (c
>= 0x1fe0 && c
<= 0x1fec)
422 || (c
>= 0x1ff2 && c
<= 0x1ff4)
423 || (c
>= 0x1ff6 && c
<= 0x1ffc))
427 if ((c
>= 0x0401 && c
<= 0x040c)
428 || (c
>= 0x040f && c
<= 0x044f)
429 || (c
>= 0x0451 && c
<= 0x045c)
430 || (c
>= 0x045e && c
<= 0x0481)
431 || (c
>= 0x0490 && c
<= 0x04c4)
432 || (c
>= 0x04c7 && c
<= 0x04c8)
433 || (c
>= 0x04cb && c
<= 0x04cc)
434 || (c
>= 0x04d0 && c
<= 0x04eb)
435 || (c
>= 0x04ee && c
<= 0x04f5)
436 || (c
>= 0x04f8 && c
<= 0x04f9))
440 if ((c
>= 0x0531 && c
<= 0x0556)
441 || (c
>= 0x0561 && c
<= 0x0587))
445 if ((c
>= 0x05d0 && c
<= 0x05ea)
446 || (c
>= 0x05f0 && c
<= 0x05f2))
450 if ((c
>= 0x0621 && c
<= 0x063a)
451 || (c
>= 0x0640 && c
<= 0x0652)
452 || (c
>= 0x0670 && c
<= 0x06b7)
453 || (c
>= 0x06ba && c
<= 0x06be)
454 || (c
>= 0x06c0 && c
<= 0x06ce)
455 || (c
>= 0x06e5 && c
<= 0x06e7))
459 if ((c
>= 0x0905 && c
<= 0x0939)
460 || (c
>= 0x0958 && c
<= 0x0962))
464 if ((c
>= 0x0985 && c
<= 0x098c)
465 || (c
>= 0x098f && c
<= 0x0990)
466 || (c
>= 0x0993 && c
<= 0x09a8)
467 || (c
>= 0x09aa && c
<= 0x09b0)
469 || (c
>= 0x09b6 && c
<= 0x09b9)
470 || (c
>= 0x09dc && c
<= 0x09dd)
471 || (c
>= 0x09df && c
<= 0x09e1)
472 || (c
>= 0x09f0 && c
<= 0x09f1))
476 if ((c
>= 0x0a05 && c
<= 0x0a0a)
477 || (c
>= 0x0a0f && c
<= 0x0a10)
478 || (c
>= 0x0a13 && c
<= 0x0a28)
479 || (c
>= 0x0a2a && c
<= 0x0a30)
480 || (c
>= 0x0a32 && c
<= 0x0a33)
481 || (c
>= 0x0a35 && c
<= 0x0a36)
482 || (c
>= 0x0a38 && c
<= 0x0a39)
483 || (c
>= 0x0a59 && c
<= 0x0a5c)
488 if ((c
>= 0x0a85 && c
<= 0x0a8b)
490 || (c
>= 0x0a8f && c
<= 0x0a91)
491 || (c
>= 0x0a93 && c
<= 0x0aa8)
492 || (c
>= 0x0aaa && c
<= 0x0ab0)
493 || (c
>= 0x0ab2 && c
<= 0x0ab3)
494 || (c
>= 0x0ab5 && c
<= 0x0ab9)
499 if ((c
>= 0x0b05 && c
<= 0x0b0c)
500 || (c
>= 0x0b0f && c
<= 0x0b10)
501 || (c
>= 0x0b13 && c
<= 0x0b28)
502 || (c
>= 0x0b2a && c
<= 0x0b30)
503 || (c
>= 0x0b32 && c
<= 0x0b33)
504 || (c
>= 0x0b36 && c
<= 0x0b39)
505 || (c
>= 0x0b5c && c
<= 0x0b5d)
506 || (c
>= 0x0b5f && c
<= 0x0b61))
510 if ((c
>= 0x0b85 && c
<= 0x0b8a)
511 || (c
>= 0x0b8e && c
<= 0x0b90)
512 || (c
>= 0x0b92 && c
<= 0x0b95)
513 || (c
>= 0x0b99 && c
<= 0x0b9a)
515 || (c
>= 0x0b9e && c
<= 0x0b9f)
516 || (c
>= 0x0ba3 && c
<= 0x0ba4)
517 || (c
>= 0x0ba8 && c
<= 0x0baa)
518 || (c
>= 0x0bae && c
<= 0x0bb5)
519 || (c
>= 0x0bb7 && c
<= 0x0bb9))
523 if ((c
>= 0x0c05 && c
<= 0x0c0c)
524 || (c
>= 0x0c0e && c
<= 0x0c10)
525 || (c
>= 0x0c12 && c
<= 0x0c28)
526 || (c
>= 0x0c2a && c
<= 0x0c33)
527 || (c
>= 0x0c35 && c
<= 0x0c39)
528 || (c
>= 0x0c60 && c
<= 0x0c61))
532 if ((c
>= 0x0c85 && c
<= 0x0c8c)
533 || (c
>= 0x0c8e && c
<= 0x0c90)
534 || (c
>= 0x0c92 && c
<= 0x0ca8)
535 || (c
>= 0x0caa && c
<= 0x0cb3)
536 || (c
>= 0x0cb5 && c
<= 0x0cb9)
537 || (c
>= 0x0ce0 && c
<= 0x0ce1))
541 if ((c
>= 0x0d05 && c
<= 0x0d0c)
542 || (c
>= 0x0d0e && c
<= 0x0d10)
543 || (c
>= 0x0d12 && c
<= 0x0d28)
544 || (c
>= 0x0d2a && c
<= 0x0d39)
545 || (c
>= 0x0d60 && c
<= 0x0d61))
549 if ((c
>= 0x0e01 && c
<= 0x0e30)
550 || (c
>= 0x0e32 && c
<= 0x0e33)
551 || (c
>= 0x0e40 && c
<= 0x0e46)
552 || (c
>= 0x0e4f && c
<= 0x0e5b))
556 if ((c
>= 0x0e81 && c
<= 0x0e82)
562 || (c
>= 0x0e94 && c
<= 0x0e97)
563 || (c
>= 0x0e99 && c
<= 0x0e9f)
564 || (c
>= 0x0ea1 && c
<= 0x0ea3)
572 if ((c
>= 0x10a0 && c
<= 0x10c5)
573 || (c
>= 0x10d0 && c
<= 0x10f6))
577 if ((c
>= 0x3041 && c
<= 0x3093)
578 || (c
>= 0x309b && c
<= 0x309c))
582 if ((c
>= 0x3105 && c
<= 0x312c))