1 /* Make ucnid.h from various sources.
2 Copyright (C) 2005-2021 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify it
5 under the terms of the GNU General Public License as published by the
6 Free Software Foundation; either version 3, or (at your option) any
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; see the file COPYING3. If not see
16 <http://www.gnu.org/licenses/>. */
18 /* Run this program as
19 ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \
35 all_languages
= C99
| CXX
| C11
,
/* Code points run from 0 through MAX_CODE_POINT inclusive, so each
   per-code-point table below has NUM_CODE_POINTS entries.  */
#define NUM_CODE_POINTS 0x110000
#define MAX_CODE_POINT 0x10ffff

/* Bit set for each code point: the language bits are OR-ed in while
   reading ucnid.tab, the normalization bits while reading
   DerivedNormalizationProps.txt.  */
static unsigned int flags[NUM_CODE_POINTS];

/* Decomposition (at most two code points, zero when absent) of every
   code point that has a non-compatibility mapping in UnicodeData.txt.  */
static unsigned int all_decomp[NUM_CODE_POINTS][2];

/* Subset of all_decomp kept only when the code point and every member
   of its decomposition are valid for a supported language version.  */
static unsigned int decomp[NUM_CODE_POINTS][2];

/* Canonical combining class of each code point, from UnicodeData.txt.  */
static unsigned char combining_value[NUM_CODE_POINTS];
54 fprintf (stderr
, "%s\n", s
);
58 /* Read ucnid.tab and set the flags for language versions in header[]. */
61 read_ucnid (const char *fname
)
63 FILE *f
= fopen (fname
, "r");
67 fail ("opening ucnid.tab");
72 if (!fgets (line
, sizeof (line
), f
))
74 if (strcmp (line
, "[C99]\n") == 0)
76 else if (strcmp (line
, "[C99DIG]\n") == 0)
78 else if (strcmp (line
, "[CXX]\n") == 0)
80 else if (strcmp (line
, "[C11]\n") == 0)
82 else if (strcmp (line
, "[C11NOSTART]\n") == 0)
84 else if (isxdigit (line
[0]))
89 unsigned long start
, end
;
91 start
= strtoul (l
, &endptr
, 16);
92 if (endptr
== l
|| (*endptr
!= '-' && ! isspace (*endptr
)))
93 fail ("parsing ucnid.tab [1]");
99 end
= strtoul (l
+ 1, &endptr
, 16);
101 fail ("parsing ucnid.tab, end before start");
104 fail ("parsing ucnid.tab, junk after range");
108 if (end
> MAX_CODE_POINT
)
109 fail ("parsing ucnid.tab, end too large");
111 flags
[start
++] |= fl
;
116 fail ("reading ucnid.tab");
120 /* Read UnicodeData.txt and fill in the 'decomp' table to be the
121 decompositions of characters for which both the character
122 decomposed and all the code points in the decomposition are valid
123 for some supported language version, and the 'all_decomp' table to
124 be the decompositions of all characters without those
128 read_table (char *fname
)
130 FILE * f
= fopen (fname
, "r");
133 fail ("opening UnicodeData.txt");
137 unsigned long codepoint
, this_decomp
[4];
142 if (!fgets (line
, sizeof (line
), f
))
144 codepoint
= strtoul (line
, &l
, 16);
145 if (l
== line
|| *l
!= ';')
146 fail ("parsing UnicodeData.txt, reading code point");
147 if (codepoint
> MAX_CODE_POINT
)
148 fail ("parsing UnicodeData.txt, code point too large");
153 /* Category value. */
157 /* Canonical combining class; in NFC/NFKC, they must be increasing
159 if (! isdigit (*++l
))
160 fail ("parsing UnicodeData.txt, combining class not number");
161 combining_value
[codepoint
] = strtoul (l
, &l
, 10);
163 fail ("parsing UnicodeData.txt, junk after combining class");
165 /* Skip over bidi value. */
170 /* Decomposition mapping. */
171 decomp_useful
= flags
[codepoint
];
172 if (*++l
== '<') /* Compatibility mapping. */
174 for (i
= 0; i
< 4; i
++)
179 fail ("parsing UnicodeData.txt, decomposition format");
180 this_decomp
[i
] = strtoul (l
, &l
, 16);
181 decomp_useful
&= flags
[this_decomp
[i
]];
185 if (i
> 2) /* Decomposition too long. */
186 fail ("parsing UnicodeData.txt, decomposition too long");
187 for (j
= 0; j
< i
; j
++)
188 all_decomp
[codepoint
][j
] = this_decomp
[j
];
189 if ((flags
[codepoint
] & all_languages
) && decomp_useful
)
191 decomp
[codepoint
][i
] = this_decomp
[i
];
194 fail ("reading UnicodeData.txt");
198 /* Read DerivedNormalizationProps.txt and set the flags that say whether
199 a character is in NFC, NFKC, or is context-dependent. */
202 read_derived (const char *fname
)
204 FILE * f
= fopen (fname
, "r");
207 fail ("opening DerivedNormalizationProps.txt");
211 unsigned long start
, end
;
213 bool not_NFC_p
, not_NFKC_p
, maybe_not_NFC_p
;
215 if (!fgets (line
, sizeof (line
), f
))
217 not_NFC_p
= (strstr (line
, "; NFC_QC; N") != NULL
);
218 not_NFKC_p
= (strstr (line
, "; NFKC_QC; N") != NULL
);
219 maybe_not_NFC_p
= (strstr (line
, "; NFC_QC; M") != NULL
);
220 if (! not_NFC_p
&& ! not_NFKC_p
&& ! maybe_not_NFC_p
)
223 start
= strtoul (line
, &l
, 16);
225 fail ("parsing DerivedNormalizationProps.txt, reading start");
226 if (start
> MAX_CODE_POINT
)
227 fail ("parsing DerivedNormalizationProps.txt, code point too large");
228 if (*l
== '.' && l
[1] == '.')
229 end
= strtoul (l
+ 2, &l
, 16);
234 flags
[start
++] |= ((not_NFC_p
? not_NFC
: 0)
235 | (not_NFKC_p
? not_NFKC
: 0)
236 | (maybe_not_NFC_p
? maybe_not_NFC
: 0)
240 fail ("reading DerivedNormalizationProps.txt");
244 /* Write out the table.
245 The table consists of two words per entry. The first word is the flags
246 for the unicode code points up to and including the second word. */
252 unsigned last_flag
= flags
[0];
253 bool really_safe
= decomp
[0][0] == 0;
254 unsigned char last_combine
= combining_value
[0];
256 printf ("static const struct ucnrange ucnranges[] = {\n");
258 for (i
= 1; i
<= NUM_CODE_POINTS
; i
++)
259 if (i
== NUM_CODE_POINTS
260 || (flags
[i
] != last_flag
&& ((flags
[i
] | last_flag
) & all_languages
))
261 || really_safe
!= (decomp
[i
][0] == 0)
262 || combining_value
[i
] != last_combine
)
264 printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
265 last_flag
& C99
? "C99" : " 0",
266 last_flag
& N99
? "N99" : " 0",
267 last_flag
& CXX
? "CXX" : " 0",
268 last_flag
& C11
? "C11" : " 0",
269 last_flag
& N11
? "N11" : " 0",
270 really_safe
? "CID" : " 0",
271 last_flag
& not_NFC
? " 0" : "NFC",
272 last_flag
& not_NFKC
? " 0" : "NKC",
273 last_flag
& maybe_not_NFC
? "CTX" : " 0",
274 combining_value
[i
- 1],
276 last_flag
= flags
[i
];
277 last_combine
= combining_value
[0];
278 really_safe
= decomp
[i
][0] == 0;
284 /* Return whether a given character is valid in an identifier for some
285 supported language, either as itself or as a UCN. */
288 char_id_valid (unsigned int c
)
290 return ((flags
[c
] & all_languages
)
292 || (c
>= 0x30 && c
<= 0x39)
293 || (c
>= 0x41 && c
<= 0x5a)
294 || (c
>= 0x61 && c
<= 0x7a));
297 /* Write out the switch statement over characters for which it is
298 context-dependent whether they are in NFC. */
301 write_context_switch (void)
304 printf ("static bool\n"
305 "check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n"
309 for (i
= 0; i
< NUM_CODE_POINTS
; i
++)
311 bool found_case
= false;
313 if (!(flags
[i
] & all_languages
) || !(flags
[i
] & maybe_not_NFC
))
315 if ((i
>= 0x1161 && i
<= 0x1175) || (i
>= 0x11A8 && i
<= 0x11C2))
316 continue; /* Hangul handled algorithmically. */
317 printf (" case %#06x:\n"
320 /* If an NFC starter character decomposes with this character I
321 as the second character and an NFC starter character S as the
322 first character, that latter character as a previous
323 character means this character is not NFC. Furthermore, any
324 NFC starter character K made by a series of compositions of S
325 with combining characters whose combining class is greater
326 than that of I also means this character is not NFC. */
327 for (j
= 0; j
< NUM_CODE_POINTS
; j
++)
330 if (all_decomp
[j
][1] != i
)
332 s
= all_decomp
[j
][0];
333 if (combining_value
[s
] != 0 || (flags
[s
] & not_NFC
) != 0)
335 if (char_id_valid (s
))
338 printf ("\tcase %#06x:\n", s
);
340 for (k
= 0; k
< NUM_CODE_POINTS
; k
++)
343 if (k
== s
|| !char_id_valid (k
))
345 while (all_decomp
[t
][1] != 0
346 && combining_value
[all_decomp
[t
][1]] > combining_value
[i
])
348 if (combining_value
[t
] != 0 || (flags
[t
] & not_NFC
) != 0)
350 t
= all_decomp
[t
][0];
355 printf ("\tcase %#06x:\n", k
);
360 printf ("\t return false;\n");
362 printf ("\t/* Non-NFC cases not applicable to C/C++. */\n");
363 printf ("\tdefault:\n"
367 printf (" default:\n"
368 " cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n"
374 /* Print out the huge copyright notice. */
377 write_copyright (void)
379 static const char copyright
[] = "\
380 /* Unicode characters and various properties.\n\
381 Copyright (C) 2003-2021 Free Software Foundation, Inc.\n\
383 This program is free software; you can redistribute it and/or modify it\n\
384 under the terms of the GNU General Public License as published by the\n\
385 Free Software Foundation; either version 3, or (at your option) any\n\
388 This program is distributed in the hope that it will be useful,\n\
389 but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
390 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n\
391 GNU General Public License for more details.\n\
393 You should have received a copy of the GNU General Public License\n\
394 along with this program; see the file COPYING3. If not see\n\
395 <http://www.gnu.org/licenses/>.\n\
398 Copyright (C) 1991-2005 Unicode, Inc. All rights reserved.\n\
399 Distributed under the Terms of Use in\n\
400 http://www.unicode.org/copyright.html.\n\
402 Permission is hereby granted, free of charge, to any person\n\
403 obtaining a copy of the Unicode data files and any associated\n\
404 documentation (the \"Data Files\") or Unicode software and any\n\
405 associated documentation (the \"Software\") to deal in the Data Files\n\
406 or Software without restriction, including without limitation the\n\
407 rights to use, copy, modify, merge, publish, distribute, and/or\n\
408 sell copies of the Data Files or Software, and to permit persons to\n\
409 whom the Data Files or Software are furnished to do so, provided\n\
410 that (a) the above copyright notice(s) and this permission notice\n\
411 appear with all copies of the Data Files or Software, (b) both the\n\
412 above copyright notice(s) and this permission notice appear in\n\
413 associated documentation, and (c) there is clear notice in each\n\
414 modified Data File or in the Software as well as in the\n\
415 documentation associated with the Data File(s) or Software that the\n\
416 data or software has been modified.\n\
418 THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n\
419 OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n\
420 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n\
421 NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n\
422 COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n\
423 ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n\
424 DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n\
425 WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n\
426 ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n\
427 OF THE DATA FILES OR SOFTWARE.\n\
429 Except as contained in this notice, the name of a copyright holder\n\
430 shall not be used in advertising or otherwise to promote the sale,\n\
431 use or other dealings in these Data Files or Software without prior\n\
432 written authorization of the copyright holder. */\n";
440 main(int argc
, char ** argv
)
443 fail ("too few arguments to makeucn");
444 read_ucnid (argv
[1]);
445 read_table (argv
[2]);
446 read_derived (argv
[3]);
450 write_context_switch ();