/* 2017-10-17 Paolo Carlini <paolo.carlini@oracle.com>
   [official-gcc.git] / libcpp / makeucnid.c
   blob fd24c00c5d1ce4191207124453d780f365bf885a  */
/* Make ucnid.h from various sources.
   Copyright (C) 2005-2017 Free Software Foundation, Inc.

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
/* Run this program as
   ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \
       > ucnid.h
*/
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>
/* Flag bits kept per code point in flags[].  The language bits are set
   by read_ucnid from the section headers of ucnid.tab; the
   normalization bits are set by read_derived from
   DerivedNormalizationProps.txt.  */

enum {
  C99 = 1,              /* Listed under "[C99]" or "[C99DIG]".  */
  CXX = 2,              /* Listed under "[CXX]".  */
  N99 = 4,              /* Listed under "[C99DIG]" (set together with C99).  */
  C11 = 8,              /* Listed under "[C11]" or "[C11NOSTART]".  */
  N11 = 16,             /* Listed under "[C11NOSTART]" (set together with C11).  */
  all_languages = C99 | CXX | C11,
  not_NFC = 32,         /* Line contained "; NFC_QC; N".  */
  not_NFKC = 64,        /* Line contained "; NFKC_QC; N".  */
  maybe_not_NFC = 128   /* Line contained "; NFC_QC; M".  */
};

/* Every table below has one slot per code point 0 .. MAX_CODE_POINT.  */
#define NUM_CODE_POINTS 0x110000
#define MAX_CODE_POINT 0x10ffff

/* Bitwise OR of the enum values above for each code point.  */
static unsigned flags[NUM_CODE_POINTS];

/* Up to two code points of the canonical decomposition of each code
   point, as read from UnicodeData.txt; zero where there is none.  */
static unsigned int all_decomp[NUM_CODE_POINTS][2];

/* Like all_decomp, but only filled in by read_table when the code point
   is valid for some language and shares a flag bit with every element
   of its decomposition.  */
static unsigned int decomp[NUM_CODE_POINTS][2];

/* Canonical combining class of each code point.  */
static unsigned char combining_value[NUM_CODE_POINTS];
/* Write the message S and a newline to stderr, then terminate the
   program with exit status 1.  Does not return.  */

static void
fail (const char *s)
{
  fputs (s, stderr);
  fputc ('\n', stderr);
  exit (1);
}
58 /* Read ucnid.tab and set the flags for language versions in header[]. */
60 static void
61 read_ucnid (const char *fname)
63 FILE *f = fopen (fname, "r");
64 unsigned fl = 0;
66 if (!f)
67 fail ("opening ucnid.tab");
68 for (;;)
70 char line[256];
72 if (!fgets (line, sizeof (line), f))
73 break;
74 if (strcmp (line, "[C99]\n") == 0)
75 fl = C99;
76 else if (strcmp (line, "[C99DIG]\n") == 0)
77 fl = C99|N99;
78 else if (strcmp (line, "[CXX]\n") == 0)
79 fl = CXX;
80 else if (strcmp (line, "[C11]\n") == 0)
81 fl = C11;
82 else if (strcmp (line, "[C11NOSTART]\n") == 0)
83 fl = C11|N11;
84 else if (isxdigit (line[0]))
86 char *l = line;
87 while (*l)
89 unsigned long start, end;
90 char *endptr;
91 start = strtoul (l, &endptr, 16);
92 if (endptr == l || (*endptr != '-' && ! isspace (*endptr)))
93 fail ("parsing ucnid.tab [1]");
94 l = endptr;
95 if (*l != '-')
96 end = start;
97 else
99 end = strtoul (l + 1, &endptr, 16);
100 if (end < start)
101 fail ("parsing ucnid.tab, end before start");
102 l = endptr;
103 if (! isspace (*l))
104 fail ("parsing ucnid.tab, junk after range");
106 while (isspace (*l))
107 l++;
108 if (end > MAX_CODE_POINT)
109 fail ("parsing ucnid.tab, end too large");
110 while (start <= end)
111 flags[start++] |= fl;
115 if (ferror (f))
116 fail ("reading ucnid.tab");
117 fclose (f);
120 /* Read UnicodeData.txt and fill in the 'decomp' table to be the
121 decompositions of characters for which both the character
122 decomposed and all the code points in the decomposition are valid
123 for some supported language version, and the 'all_decomp' table to
124 be the decompositions of all characters without those
125 constraints. */
127 static void
128 read_table (char *fname)
130 FILE * f = fopen (fname, "r");
132 if (!f)
133 fail ("opening UnicodeData.txt");
134 for (;;)
136 char line[256];
137 unsigned long codepoint, this_decomp[4];
138 char *l;
139 int i, j;
140 int decomp_useful;
142 if (!fgets (line, sizeof (line), f))
143 break;
144 codepoint = strtoul (line, &l, 16);
145 if (l == line || *l != ';')
146 fail ("parsing UnicodeData.txt, reading code point");
147 if (codepoint > MAX_CODE_POINT)
148 fail ("parsing UnicodeData.txt, code point too large");
150 do {
151 l++;
152 } while (*l != ';');
153 /* Category value. */
154 do {
155 l++;
156 } while (*l != ';');
157 /* Canonical combining class; in NFC/NFKC, they must be increasing
158 (or zero). */
159 if (! isdigit (*++l))
160 fail ("parsing UnicodeData.txt, combining class not number");
161 combining_value[codepoint] = strtoul (l, &l, 10);
162 if (*l++ != ';')
163 fail ("parsing UnicodeData.txt, junk after combining class");
165 /* Skip over bidi value. */
166 do {
167 l++;
168 } while (*l != ';');
170 /* Decomposition mapping. */
171 decomp_useful = flags[codepoint];
172 if (*++l == '<') /* Compatibility mapping. */
173 continue;
174 for (i = 0; i < 4; i++)
176 if (*l == ';')
177 break;
178 if (!isxdigit (*l))
179 fail ("parsing UnicodeData.txt, decomposition format");
180 this_decomp[i] = strtoul (l, &l, 16);
181 decomp_useful &= flags[this_decomp[i]];
182 while (isspace (*l))
183 l++;
185 if (i > 2) /* Decomposition too long. */
186 fail ("parsing UnicodeData.txt, decomposition too long");
187 for (j = 0; j < i; j++)
188 all_decomp[codepoint][j] = this_decomp[j];
189 if ((flags[codepoint] & all_languages) && decomp_useful)
190 while (--i >= 0)
191 decomp[codepoint][i] = this_decomp[i];
193 if (ferror (f))
194 fail ("reading UnicodeData.txt");
195 fclose (f);
198 /* Read DerivedNormalizationProps.txt and set the flags that say whether
199 a character is in NFC, NFKC, or is context-dependent. */
201 static void
202 read_derived (const char *fname)
204 FILE * f = fopen (fname, "r");
206 if (!f)
207 fail ("opening DerivedNormalizationProps.txt");
208 for (;;)
210 char line[256];
211 unsigned long start, end;
212 char *l;
213 bool not_NFC_p, not_NFKC_p, maybe_not_NFC_p;
215 if (!fgets (line, sizeof (line), f))
216 break;
217 not_NFC_p = (strstr (line, "; NFC_QC; N") != NULL);
218 not_NFKC_p = (strstr (line, "; NFKC_QC; N") != NULL);
219 maybe_not_NFC_p = (strstr (line, "; NFC_QC; M") != NULL);
220 if (! not_NFC_p && ! not_NFKC_p && ! maybe_not_NFC_p)
221 continue;
223 start = strtoul (line, &l, 16);
224 if (l == line)
225 fail ("parsing DerivedNormalizationProps.txt, reading start");
226 if (start > MAX_CODE_POINT)
227 fail ("parsing DerivedNormalizationProps.txt, code point too large");
228 if (*l == '.' && l[1] == '.')
229 end = strtoul (l + 2, &l, 16);
230 else
231 end = start;
233 while (start <= end)
234 flags[start++] |= ((not_NFC_p ? not_NFC : 0)
235 | (not_NFKC_p ? not_NFKC : 0)
236 | (maybe_not_NFC_p ? maybe_not_NFC : 0)
239 if (ferror (f))
240 fail ("reading DerivedNormalizationProps.txt");
241 fclose (f);
244 /* Write out the table.
245 The table consists of two words per entry. The first word is the flags
246 for the unicode code points up to and including the second word. */
248 static void
249 write_table (void)
251 unsigned i;
252 unsigned last_flag = flags[0];
253 bool really_safe = decomp[0][0] == 0;
254 unsigned char last_combine = combining_value[0];
256 printf ("static const struct ucnrange ucnranges[] = {\n");
258 for (i = 1; i <= NUM_CODE_POINTS; i++)
259 if (i == NUM_CODE_POINTS
260 || (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages))
261 || really_safe != (decomp[i][0] == 0)
262 || combining_value[i] != last_combine)
264 printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
265 last_flag & C99 ? "C99" : " 0",
266 last_flag & N99 ? "N99" : " 0",
267 last_flag & CXX ? "CXX" : " 0",
268 last_flag & C11 ? "C11" : " 0",
269 last_flag & N11 ? "N11" : " 0",
270 really_safe ? "CID" : " 0",
271 last_flag & not_NFC ? " 0" : "NFC",
272 last_flag & not_NFKC ? " 0" : "NKC",
273 last_flag & maybe_not_NFC ? "CTX" : " 0",
274 combining_value[i - 1],
275 i - 1);
276 last_flag = flags[i];
277 last_combine = combining_value[0];
278 really_safe = decomp[i][0] == 0;
281 printf ("};\n");
284 /* Return whether a given character is valid in an identifier for some
285 supported language, either as itself or as a UCN. */
287 static bool
288 char_id_valid (unsigned int c)
290 return ((flags[c] & all_languages)
291 || (c == 0x24)
292 || (c >= 0x30 && c <= 0x39)
293 || (c >= 0x41 && c <= 0x5a)
294 || (c >= 0x61 && c <= 0x7a));
297 /* Write out the switch statement over characters for which it is
298 context-dependent whether they are in NFC. */
300 static void
301 write_context_switch (void)
303 unsigned i;
304 printf ("static bool\n"
305 "check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n"
306 "{\n"
307 " switch (c)\n"
308 " {\n");
309 for (i = 0; i < NUM_CODE_POINTS; i++)
311 bool found_case = false;
312 unsigned j;
313 if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC))
314 continue;
315 if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2))
316 continue; /* Hangul handled algorithmically. */
317 printf (" case %#06x:\n"
318 " switch (p)\n"
319 "\t{\n", i);
320 /* If an NFC starter character decomposes with this character I
321 as the second character and an NFC starter character S as the
322 first character, that latter character as a previous
323 character means this character is not NFC. Furthermore, any
324 NFC starter character K made by a series of compositions of S
325 with combining characters whose combining class is greater
326 than that of I also means this character is not NFC. */
327 for (j = 0; j < NUM_CODE_POINTS; j++)
329 unsigned s, k;
330 if (all_decomp[j][1] != i)
331 continue;
332 s = all_decomp[j][0];
333 if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0)
334 continue;
335 if (char_id_valid (s))
337 found_case = true;
338 printf ("\tcase %#06x:\n", s);
340 for (k = 0; k < NUM_CODE_POINTS; k++)
342 unsigned t = k;
343 if (k == s || !char_id_valid (k))
344 continue;
345 while (all_decomp[t][1] != 0
346 && combining_value[all_decomp[t][1]] > combining_value[i])
348 if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0)
349 break;
350 t = all_decomp[t][0];
352 if (t == s)
354 found_case = true;
355 printf ("\tcase %#06x:\n", k);
359 if (found_case)
360 printf ("\t return false;\n");
361 else
362 printf ("\t/* Non-NFC cases not applicable to C/C++. */\n");
363 printf ("\tdefault:\n"
364 "\t return true;\n"
365 "\t}\n\n");
367 printf (" default:\n"
368 " cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n"
369 " return true;\n"
370 " }\n"
371 "}\n");
/* Print the copyright notice that heads the generated file, followed
   by one extra newline, on stdout.  */
/* NOTE(review): paragraph breaks and leading indentation inside the
   notice may have been lost before this text was captured - confirm
   against the upstream file.  */

static void
write_copyright (void)
{
  static const char copyright[] =
    "/* Unicode characters and various properties.\n"
    "Copyright (C) 2003-2017 Free Software Foundation, Inc.\n"
    "This program is free software; you can redistribute it and/or modify it\n"
    "under the terms of the GNU General Public License as published by the\n"
    "Free Software Foundation; either version 3, or (at your option) any\n"
    "later version.\n"
    "This program is distributed in the hope that it will be useful,\n"
    "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
    "GNU General Public License for more details.\n"
    "You should have received a copy of the GNU General Public License\n"
    "along with this program; see the file COPYING3. If not see\n"
    "<http://www.gnu.org/licenses/>.\n"
    "Copyright (C) 1991-2005 Unicode, Inc. All rights reserved.\n"
    "Distributed under the Terms of Use in\n"
    "http://www.unicode.org/copyright.html.\n"
    "Permission is hereby granted, free of charge, to any person\n"
    "obtaining a copy of the Unicode data files and any associated\n"
    "documentation (the \"Data Files\") or Unicode software and any\n"
    "associated documentation (the \"Software\") to deal in the Data Files\n"
    "or Software without restriction, including without limitation the\n"
    "rights to use, copy, modify, merge, publish, distribute, and/or\n"
    "sell copies of the Data Files or Software, and to permit persons to\n"
    "whom the Data Files or Software are furnished to do so, provided\n"
    "that (a) the above copyright notice(s) and this permission notice\n"
    "appear with all copies of the Data Files or Software, (b) both the\n"
    "above copyright notice(s) and this permission notice appear in\n"
    "associated documentation, and (c) there is clear notice in each\n"
    "modified Data File or in the Software as well as in the\n"
    "documentation associated with the Data File(s) or Software that the\n"
    "data or software has been modified.\n"
    "THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n"
    "OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n"
    "WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n"
    "NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n"
    "COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n"
    "ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n"
    "DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n"
    "WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n"
    "ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n"
    "OF THE DATA FILES OR SOFTWARE.\n"
    "Except as contained in this notice, the name of a copyright holder\n"
    "shall not be used in advertising or otherwise to promote the sale,\n"
    "use or other dealings in these Data Files or Software without prior\n"
    "written authorization of the copyright holder. */\n";

  /* Same output as puts: the text followed by a newline.  */
  fputs (copyright, stdout);
  putchar ('\n');
}
/* Main program.  Expects exactly three arguments: the paths of
   ucnid.tab, UnicodeData.txt and DerivedNormalizationProps.txt, in
   that order.  Reads all three, then writes the copyright notice, the
   range table and the NFC context switch to stdout.  Returns 0 on
   success; every error exits with status 1 via fail ().  */

int
main (int argc, char **argv)
{
  /* Both too few and too many arguments end up here, so report the
     expected command line rather than "too few".  */
  if (argc != 4)
    fail ("usage: makeucnid ucnid.tab UnicodeData.txt"
          " DerivedNormalizationProps.txt");

  /* read_table consults the language flags set by read_ucnid, so the
     order of the first two calls matters.  */
  read_ucnid (argv[1]);
  read_table (argv[2]);
  read_derived (argv[3]);

  write_copyright ();
  write_table ();
  write_context_switch ();
  return 0;
}