1 /* CPP Library - charsets
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
3 Free Software Foundation, Inc.
5 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
7 This program is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "coretypes.h"
/* Forward declaration of the identifier-validity classifier that is
   defined later in this file and called from _cpp_valid_ucn.  */
28 static int ucn_valid_in_identifier
PARAMS ((cpp_reader
*, cppchar_t
));
30 /* [lex.charset]: The character designated by the universal character
31 name \UNNNNNNNN is that character whose character short name in
32 ISO/IEC 10646 is NNNNNNNN; the character designated by the
33 universal character name \uNNNN is that character whose character
34 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
35 for a universal character name is less than 0x20 or in the range
36 0x7F-0x9F (inclusive), or if the universal character name
37 designates a character in the basic source character set, then the
38 program is ill-formed.
40 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
41 buffer end is delimited by a non-hex digit. Returns zero if UCNs
42 are not part of the relevant standard, or if the string beginning
43 at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
45 Otherwise the non-zero value of the UCN, whether valid or invalid,
46 is returned. Diagnostics are emitted for invalid values. PSTR
47 is updated to point one beyond the UCN, or to the syntactically invalid character.
50 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
51 an identifier, or 2 otherwise. */
/* Validate the universal character name whose hex digits begin at
   *PSTR.  The steps visible here: the UCN is only interpreted for C++
   or C99; each hex digit is folded into RESULT; and an error is
   reported for an incomplete UCN, for a value that is not a valid
   universal character, or (when IDENTIFIER_POS is non-zero) for a
   value that may not appear in, or at the start of, an identifier.  */
55 _cpp_valid_ucn (pfile
, pstr
, identifier_pos
)
62 const uchar
*str
= *pstr
;
/* BASE is two characters before the digits, i.e. at the backslash of
   the UCN prefix, so that diagnostics can quote the UCN from its
   very start using the length STR - BASE.  */
63 const uchar
*base
= str
- 2;
65 /* Only attempt to interpret a UCN for C++ and C99. */
66 if (!CPP_OPTION (pfile
, cplusplus
) && !CPP_OPTION (pfile
, c99
))
69 /* We don't accept UCNs for an EBCDIC target. */
70 if (CPP_OPTION (pfile
, EBCDIC
))
/* The character immediately before the digits distinguishes the
   upper-case 'U' prefix from the lower-case 'u' one.  */
75 else if (str
[-1] == 'U')
/* Shift the value accumulated so far left by one hex digit (4 bits)
   and add the value of the digit C.  */
87 result
= (result
<< 4) + hex_value (c
);
93 /* We'll error when we try it out as the start of an identifier. */
94 cpp_error (pfile
, DL_ERROR
, "incomplete universal character name %.*s",
95 (int) (str
- base
), base
);
/* Rejected below: values under 0xa0 other than 0x24 ('$'), 0x40 ('@')
   and 0x60 ('`'); values with bit 31 set; and the surrogate range
   0xD800-0xDFFF.  */
96 /* The standard permits $, @ and ` to be specified as UCNs. We use
97 hex escapes so that this also works with EBCDIC hosts. */
98 else if ((result
< 0xa0
99 && (result
!= 0x24 && result
!= 0x40 && result
!= 0x60))
100 || (result
& 0x80000000)
101 || (result
>= 0xD800 && result
<= 0xDFFF))
103 cpp_error (pfile
, DL_ERROR
, "%.*s is not a valid universal character",
104 (int) (str
- base
), base
);
/* Identifier checks only apply when the caller says the UCN occurs
   inside an identifier (IDENTIFIER_POS non-zero).  */
106 else if (identifier_pos
)
/* VALIDITY follows ucn_valid_in_identifier's convention: 0 = not
   valid in an identifier, 1 = valid, 2 = valid except at the start.  */
108 int validity
= ucn_valid_in_identifier (pfile
, result
);
111 cpp_error (pfile
, DL_ERROR
,
112 "universal character %.*s is not valid in an identifier",
113 (int) (str
- base
), base
);
/* IDENTIFIER_POS == 1 means this UCN is the first character of the
   identifier, where a validity of 2 is not acceptable.  */
114 else if (validity
== 2 && identifier_pos
== 1)
115 cpp_error (pfile
, DL_ERROR
,
116 "universal character %.*s is not valid at the start of an identifier",
117 (int) (str
- base
), base
);
126 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
127 the start of an identifier, and 0 if C is not valid in an
128 identifier. We assume C has already gone through the checks of _cpp_valid_ucn. */
131 ucn_valid_in_identifier (pfile
, c
)
135 /* None of the valid chars are outside the Basic Multilingual Plane (the low 16 bits). */
140 if (CPP_OPTION (pfile
, c99
) || !CPP_PEDANTIC (pfile
))
143 if (c
== 0x0aa || c
== 0x00ba || c
== 0x207f || c
== 0x1e9b)
155 if ((c
>= 0x05b0 && c
<= 0x05b9)
156 || (c
>= 0x05bb && c
<= 0x005bd)
158 || (c
>= 0x05c1 && c
<= 0x05c2))
162 if ((c
>= 0x06d0 && c
<= 0x06dc)
164 || (c
>= 0x06ea && c
<= 0x06ed))
168 if ((c
>= 0x0901 && c
<= 0x0903)
169 || (c
>= 0x093e && c
<= 0x094d)
170 || (c
>= 0x0950 && c
<= 0x0952)
175 if ((c
>= 0x0981 && c
<= 0x0983)
176 || (c
>= 0x09be && c
<= 0x09c4)
177 || (c
>= 0x09c7 && c
<= 0x09c8)
178 || (c
>= 0x09cb && c
<= 0x09cd)
179 || (c
>= 0x09e2 && c
<= 0x09e3))
184 || (c
>= 0x0a3e && c
<= 0x0a42)
185 || (c
>= 0x0a47 && c
<= 0x0a48)
186 || (c
>= 0x0a4b && c
<= 0x0a4d)
191 if ((c
>= 0x0a81 && c
<= 0x0a83)
192 || (c
>= 0x0abd && c
<= 0x0ac5)
193 || (c
>= 0x0ac7 && c
<= 0x0ac9)
194 || (c
>= 0x0acb && c
<= 0x0acd)
199 if ((c
>= 0x0b01 && c
<= 0x0b03)
200 || (c
>= 0x0b3e && c
<= 0x0b43)
201 || (c
>= 0x0b47 && c
<= 0x0b48)
202 || (c
>= 0x0b4b && c
<= 0x0b4d))
206 if ((c
>= 0x0b82 && c
<= 0x0b83)
207 || (c
>= 0x0bbe && c
<= 0x0bc2)
208 || (c
>= 0x0bc6 && c
<= 0x0bc8)
209 || (c
>= 0x0bc8 && c
<= 0x0bcd))
213 if ((c
>= 0x0c01 && c
<= 0x0c03)
214 || (c
>= 0x0c3e && c
<= 0x0c44)
215 || (c
>= 0x0c46 && c
<= 0x0c48)
216 || (c
>= 0x0c4a && c
<= 0x0c4d))
220 if ((c
>= 0x0c82 && c
<= 0x0c83)
221 || (c
>= 0x0cbe && c
<= 0x0cc4)
222 || (c
>= 0x0cc6 && c
<= 0x0cc8)
223 || (c
>= 0x0cca && c
<= 0x0ccd)
228 if ((c
>= 0x0d02 && c
<= 0x0d03)
229 || (c
>= 0x0d3e && c
<= 0x0d43)
230 || (c
>= 0x0d46 && c
<= 0x0d48)
231 || (c
>= 0x0d4a && c
<= 0x0d4d))
235 if ((c
>= 0x0e01 && c
<= 0x0e3a)
236 || (c
>= 0x0e40 && c
<= 0x0e5b))
240 if ((c
>= 0x0ead && c
<= 0x0eae)
241 || (c
>= 0x0eb0 && c
<= 0x0eb9)
242 || (c
>= 0x0ebb && c
<= 0x0ebd)
243 || (c
>= 0x0ec0 && c
<= 0x0ec4)
245 || (c
>= 0x0ec8 && c
<= 0x0ecd)
246 || (c
>= 0x0edc && c
<= 0x0ed))
251 || (c
>= 0x0f18 && c
<= 0x0f19)
255 || (c
>= 0x0f3e && c
<= 0x0f47)
256 || (c
>= 0x0f49 && c
<= 0x0f69)
257 || (c
>= 0x0f71 && c
<= 0x0f84)
258 || (c
>= 0x0f86 && c
<= 0x0f8b)
259 || (c
>= 0x0f90 && c
<= 0x0f95)
261 || (c
>= 0x0f99 && c
<= 0x0fad)
262 || (c
>= 0x0fb1 && c
<= 0x0fb7)
267 if ((c
>= 0x30a1 && c
<= 0x30f6)
268 || (c
>= 0x30fb && c
<= 0x30fc))
271 /* CJK Unified Ideographs. */
272 if (c
>= 0x4e00 && c
<= 0x9fa5)
276 if (c
>= 0xac00 && c
<= 0xd7a3)
280 if ((c
>= 0x0660 && c
<= 0x0669)
281 || (c
>= 0x06f0 && c
<= 0x06f9)
282 || (c
>= 0x0966 && c
<= 0x096f)
283 || (c
>= 0x09e6 && c
<= 0x09ef)
284 || (c
>= 0x0a66 && c
<= 0x0a6f)
285 || (c
>= 0x0ae6 && c
<= 0x0aef)
286 || (c
>= 0x0b66 && c
<= 0x0b6f)
287 || (c
>= 0x0be7 && c
<= 0x0bef)
288 || (c
>= 0x0c66 && c
<= 0x0c6f)
289 || (c
>= 0x0ce6 && c
<= 0x0cef)
290 || (c
>= 0x0d66 && c
<= 0x0d6f)
291 || (c
>= 0x0e50 && c
<= 0x0e59)
292 || (c
>= 0x0ed0 && c
<= 0x0ed9)
293 || (c
>= 0x0f20 && c
<= 0x0f33))
296 /* Special characters. */
299 || (c
>= 0x02b0 && c
<= 0x02b8)
301 || (c
>= 0x02bd && c
<= 0x02c1)
302 || (c
>= 0x02d0 && c
<= 0x02d1)
303 || (c
>= 0x02e0 && c
<= 0x02e4)
309 || (c
>= 0x203f && c
<= 0x2040)
312 || (c
>= 0x210a && c
<= 0x2113)
314 || (c
>= 0x2118 && c
<= 0x211d)
318 || (c
>= 0x212a && c
<= 0x2131)
319 || (c
>= 0x2133 && c
<= 0x2138)
320 || (c
>= 0x2160 && c
<= 0x2182)
321 || (c
>= 0x3005 && c
<= 0x3007)
322 || (c
>= 0x3021 && c
<= 0x3029))
326 if (CPP_OPTION (pfile
, cplusplus
) || !CPP_PEDANTIC (pfile
))
337 if (c
>= 0x05f3 && c
<= 0x05f4)
341 if ((c
>= 0x0ead && c
<= 0x0eb0)
345 || (c
>= 0x0ec0 && c
<= 0x0ec4)
351 || (c
>= 0x309d && c
<= 0x309e))
355 if ((c
>= 0x30a1 && c
<= 0x30fe))
359 if ((c
>= 0x1100 && c
<= 0x1159)
360 || (c
>= 0x1161 && c
<= 0x11a2)
361 || (c
>= 0x11a8 && c
<= 0x11f9))
364 /* CJK Unified Ideographs */
365 if ((c
>= 0xf900 && c
<= 0xfa2d)
366 || (c
>= 0xfb1f && c
<= 0xfb36)
367 || (c
>= 0xfb38 && c
<= 0xfb3c)
369 || (c
>= 0xfb40 && c
<= 0xfb41)
370 || (c
>= 0xfb42 && c
<= 0xfb44)
371 || (c
>= 0xfb46 && c
<= 0xfbb1)
372 || (c
>= 0xfbd3 && c
<= 0xfd3f)
373 || (c
>= 0xfd50 && c
<= 0xfd8f)
374 || (c
>= 0xfd92 && c
<= 0xfdc7)
375 || (c
>= 0xfdf0 && c
<= 0xfdfb)
376 || (c
>= 0xfe70 && c
<= 0xfe72)
378 || (c
>= 0xfe76 && c
<= 0xfefc)
379 || (c
>= 0xff21 && c
<= 0xff3a)
380 || (c
>= 0xff41 && c
<= 0xff5a)
381 || (c
>= 0xff66 && c
<= 0xffbe)
382 || (c
>= 0xffc2 && c
<= 0xffc7)
383 || (c
>= 0xffca && c
<= 0xffcf)
384 || (c
>= 0xffd2 && c
<= 0xffd7)
385 || (c
>= 0xffda && c
<= 0xffdc)
386 || (c
>= 0x4e00 && c
<= 0x9fa5))
391 if ((c
>= 0x00c0 && c
<= 0x00d6)
392 || (c
>= 0x00d8 && c
<= 0x00f6)
393 || (c
>= 0x00f8 && c
<= 0x01f5)
394 || (c
>= 0x01fa && c
<= 0x0217)
395 || (c
>= 0x0250 && c
<= 0x02a8)
396 || (c
>= 0x1e00 && c
<= 0x1e9a)
397 || (c
>= 0x1ea0 && c
<= 0x1ef9))
401 if ((c
>= 0x0388 && c
<= 0x038a)
403 || (c
>= 0x038e && c
<= 0x03a1)
404 || (c
>= 0x03a3 && c
<= 0x03ce)
405 || (c
>= 0x03d0 && c
<= 0x03d6)
410 || (c
>= 0x03e2 && c
<= 0x03f3)
411 || (c
>= 0x1f00 && c
<= 0x1f15)
412 || (c
>= 0x1f18 && c
<= 0x1f1d)
413 || (c
>= 0x1f20 && c
<= 0x1f45)
414 || (c
>= 0x1f48 && c
<= 0x1f4d)
415 || (c
>= 0x1f50 && c
<= 0x1f57)
419 || (c
>= 0x1f5f && c
<= 0x1f7d)
420 || (c
>= 0x1f80 && c
<= 0x1fb4)
421 || (c
>= 0x1fb6 && c
<= 0x1fbc)
422 || (c
>= 0x1fc2 && c
<= 0x1fc4)
423 || (c
>= 0x1fc6 && c
<= 0x1fcc)
424 || (c
>= 0x1fd0 && c
<= 0x1fd3)
425 || (c
>= 0x1fd6 && c
<= 0x1fdb)
426 || (c
>= 0x1fe0 && c
<= 0x1fec)
427 || (c
>= 0x1ff2 && c
<= 0x1ff4)
428 || (c
>= 0x1ff6 && c
<= 0x1ffc))
432 if ((c
>= 0x0401 && c
<= 0x040c)
433 || (c
>= 0x040f && c
<= 0x044f)
434 || (c
>= 0x0451 && c
<= 0x045c)
435 || (c
>= 0x045e && c
<= 0x0481)
436 || (c
>= 0x0490 && c
<= 0x04c4)
437 || (c
>= 0x04c7 && c
<= 0x04c8)
438 || (c
>= 0x04cb && c
<= 0x04cc)
439 || (c
>= 0x04d0 && c
<= 0x04eb)
440 || (c
>= 0x04ee && c
<= 0x04f5)
441 || (c
>= 0x04f8 && c
<= 0x04f9))
445 if ((c
>= 0x0531 && c
<= 0x0556)
446 || (c
>= 0x0561 && c
<= 0x0587))
450 if ((c
>= 0x05d0 && c
<= 0x05ea)
451 || (c
>= 0x05f0 && c
<= 0x05f2))
455 if ((c
>= 0x0621 && c
<= 0x063a)
456 || (c
>= 0x0640 && c
<= 0x0652)
457 || (c
>= 0x0670 && c
<= 0x06b7)
458 || (c
>= 0x06ba && c
<= 0x06be)
459 || (c
>= 0x06c0 && c
<= 0x06ce)
460 || (c
>= 0x06e5 && c
<= 0x06e7))
464 if ((c
>= 0x0905 && c
<= 0x0939)
465 || (c
>= 0x0958 && c
<= 0x0962))
469 if ((c
>= 0x0985 && c
<= 0x098c)
470 || (c
>= 0x098f && c
<= 0x0990)
471 || (c
>= 0x0993 && c
<= 0x09a8)
472 || (c
>= 0x09aa && c
<= 0x09b0)
474 || (c
>= 0x09b6 && c
<= 0x09b9)
475 || (c
>= 0x09dc && c
<= 0x09dd)
476 || (c
>= 0x09df && c
<= 0x09e1)
477 || (c
>= 0x09f0 && c
<= 0x09f1))
481 if ((c
>= 0x0a05 && c
<= 0x0a0a)
482 || (c
>= 0x0a0f && c
<= 0x0a10)
483 || (c
>= 0x0a13 && c
<= 0x0a28)
484 || (c
>= 0x0a2a && c
<= 0x0a30)
485 || (c
>= 0x0a32 && c
<= 0x0a33)
486 || (c
>= 0x0a35 && c
<= 0x0a36)
487 || (c
>= 0x0a38 && c
<= 0x0a39)
488 || (c
>= 0x0a59 && c
<= 0x0a5c)
493 if ((c
>= 0x0a85 && c
<= 0x0a8b)
495 || (c
>= 0x0a8f && c
<= 0x0a91)
496 || (c
>= 0x0a93 && c
<= 0x0aa8)
497 || (c
>= 0x0aaa && c
<= 0x0ab0)
498 || (c
>= 0x0ab2 && c
<= 0x0ab3)
499 || (c
>= 0x0ab5 && c
<= 0x0ab9)
504 if ((c
>= 0x0b05 && c
<= 0x0b0c)
505 || (c
>= 0x0b0f && c
<= 0x0b10)
506 || (c
>= 0x0b13 && c
<= 0x0b28)
507 || (c
>= 0x0b2a && c
<= 0x0b30)
508 || (c
>= 0x0b32 && c
<= 0x0b33)
509 || (c
>= 0x0b36 && c
<= 0x0b39)
510 || (c
>= 0x0b5c && c
<= 0x0b5d)
511 || (c
>= 0x0b5f && c
<= 0x0b61))
515 if ((c
>= 0x0b85 && c
<= 0x0b8a)
516 || (c
>= 0x0b8e && c
<= 0x0b90)
517 || (c
>= 0x0b92 && c
<= 0x0b95)
518 || (c
>= 0x0b99 && c
<= 0x0b9a)
520 || (c
>= 0x0b9e && c
<= 0x0b9f)
521 || (c
>= 0x0ba3 && c
<= 0x0ba4)
522 || (c
>= 0x0ba8 && c
<= 0x0baa)
523 || (c
>= 0x0bae && c
<= 0x0bb5)
524 || (c
>= 0x0bb7 && c
<= 0x0bb9))
528 if ((c
>= 0x0c05 && c
<= 0x0c0c)
529 || (c
>= 0x0c0e && c
<= 0x0c10)
530 || (c
>= 0x0c12 && c
<= 0x0c28)
531 || (c
>= 0x0c2a && c
<= 0x0c33)
532 || (c
>= 0x0c35 && c
<= 0x0c39)
533 || (c
>= 0x0c60 && c
<= 0x0c61))
537 if ((c
>= 0x0c85 && c
<= 0x0c8c)
538 || (c
>= 0x0c8e && c
<= 0x0c90)
539 || (c
>= 0x0c92 && c
<= 0x0ca8)
540 || (c
>= 0x0caa && c
<= 0x0cb3)
541 || (c
>= 0x0cb5 && c
<= 0x0cb9)
542 || (c
>= 0x0ce0 && c
<= 0x0ce1))
546 if ((c
>= 0x0d05 && c
<= 0x0d0c)
547 || (c
>= 0x0d0e && c
<= 0x0d10)
548 || (c
>= 0x0d12 && c
<= 0x0d28)
549 || (c
>= 0x0d2a && c
<= 0x0d39)
550 || (c
>= 0x0d60 && c
<= 0x0d61))
554 if ((c
>= 0x0e01 && c
<= 0x0e30)
555 || (c
>= 0x0e32 && c
<= 0x0e33)
556 || (c
>= 0x0e40 && c
<= 0x0e46)
557 || (c
>= 0x0e4f && c
<= 0x0e5b))
561 if ((c
>= 0x0e81 && c
<= 0x0e82)
567 || (c
>= 0x0e94 && c
<= 0x0e97)
568 || (c
>= 0x0e99 && c
<= 0x0e9f)
569 || (c
>= 0x0ea1 && c
<= 0x0ea3)
577 if ((c
>= 0x10a0 && c
<= 0x10c5)
578 || (c
>= 0x10d0 && c
<= 0x10f6))
582 if ((c
>= 0x3041 && c
<= 0x3093)
583 || (c
>= 0x309b && c
<= 0x309c))
587 if ((c
>= 0x3105 && c
<= 0x312c))