Improve a comment.
[gnulib.git] / lib / gen-uni-tables.c
blobb4f16da5607ab0ad0067068109b58635963b1f9b
1 /* Generate Unicode conforming character classification tables and
2 line break properties tables and word break property tables and
3 decomposition/composition and case mapping tables from a UnicodeData file.
4 Copyright (C) 2000-2002, 2004, 2007-2024 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <https://www.gnu.org/licenses/>. */
20 /* Usage example:
21 $ gen-uni-tables /usr/local/share/www.unicode.org/Public/15.1.0/ucd/UnicodeData.txt \
22 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/PropList.txt \
23 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/DerivedCoreProperties.txt \
24 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/emoji/emoji-data.txt \
25 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/ArabicShaping.txt \
26 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/Scripts.txt \
27 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/Blocks.txt \
28 /usr/local/share/www.unicode.org/Public/3.0-Update1/PropList-3.0.1.txt \
29 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/BidiMirroring.txt \
30 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt \
31 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/LineBreak.txt \
32 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/auxiliary/WordBreakProperty.txt \
33 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/auxiliary/GraphemeBreakProperty.txt \
34 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/CompositionExclusions.txt \
35 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/SpecialCasing.txt \
36 /usr/local/share/www.unicode.org/Public/15.1.0/ucd/CaseFolding.txt \
                      15.1.0
 */
40 #include <assert.h>
41 #if __STDC_VERSION__ < 202311L
42 # include <stdbool.h>
43 #endif
44 #include <stdint.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <time.h>
/* Number of elements of the array A.
   A must be a real array object, not a pointer.  The argument is
   parenthesized so that arbitrary array-valued expressions can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
52 /* ========================================================================= */
54 /* Reading UnicodeData.txt. */
55 /* See UCD.html. */
/* This structure represents one line in the UnicodeData.txt file.
   The string members hold the textual field contents; the case mappings
   are stored as code points.  An element whose NAME is NULL has not been
   filled in (fill_attributes resets every NAME to NULL before reading).  */
struct unicode_attribute
{
  const char *name;           /* Character name; NULL if the entry is unset */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* True if the "mirrored" field starts with 'Y' */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping, or NONE */
  unsigned int lower;         /* Lowercase mapping, or NONE */
  unsigned int title;         /* Titlecase mapping, or NONE */
};
/* Missing fields are represented with "" for strings, and NONE for
   characters.  NONE is the all-ones value of 'unsigned int'.  */
#define NONE (~(unsigned int)0)
/* The entire contents of the UnicodeData.txt file.
   Indexed by code point; valid indices are 0 .. 0x10FFFF.  */
struct unicode_attribute unicode_attributes [0x110000];
83 /* Stores in unicode_attributes[i] the values from the given fields. */
84 static void
85 fill_attribute (unsigned int i,
86 const char *field1, const char *field2,
87 const char *field3, const char *field4,
88 const char *field5, const char *field6,
89 const char *field7, const char *field8,
90 const char *field9, const char *field10,
91 const char *field11, const char *field12,
92 const char *field13, const char *field14)
94 struct unicode_attribute * uni;
96 if (i >= 0x110000)
98 fprintf (stderr, "index too large\n");
99 exit (1);
101 if (strcmp (field2, "Cs") == 0)
102 /* Surrogates are UTF-16 artifacts, not real characters. Ignore them. */
103 return;
104 uni = &unicode_attributes[i];
105 /* Copy the strings. */
106 uni->name = strdup (field1);
107 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
108 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
109 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
110 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
111 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
112 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
113 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
114 uni->mirrored = (field9[0] == 'Y');
115 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
116 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
117 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
118 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
119 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 160

/* Reads the next field from STREAM into BUFFER, which has room for
   FIELDLEN bytes.  Characters are consumed up to and including DELIM;
   DELIM itself is not stored.  Carriage returns are skipped.
   Returns 1 if DELIM was reached (BUFFER is then NUL-terminated), or 0 if
   the end of the stream was hit first (BUFFER is then not terminated).
   Exits with an error message if the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  char *out = buffer;
  int stored = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Append c, keeping room for the terminating NUL.  */
      stored++;
      if (stored >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *out++ = c;
    }

  if (c == EOF)
    return 0;

  *out = '\0';
  return 1;
}
157 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
158 file. */
159 static void
160 fill_attributes (const char *unicodedata_filename)
162 unsigned int i, j;
163 FILE *stream;
164 char field0[FIELDLEN];
165 char field1[FIELDLEN];
166 char field2[FIELDLEN];
167 char field3[FIELDLEN];
168 char field4[FIELDLEN];
169 char field5[FIELDLEN];
170 char field6[FIELDLEN];
171 char field7[FIELDLEN];
172 char field8[FIELDLEN];
173 char field9[FIELDLEN];
174 char field10[FIELDLEN];
175 char field11[FIELDLEN];
176 char field12[FIELDLEN];
177 char field13[FIELDLEN];
178 char field14[FIELDLEN];
179 int lineno = 0;
181 for (i = 0; i < 0x110000; i++)
182 unicode_attributes[i].name = NULL;
184 stream = fopen (unicodedata_filename, "r");
185 if (stream == NULL)
187 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
188 exit (1);
191 for (;;)
193 int n;
195 lineno++;
196 n = getfield (stream, field0, ';');
197 n += getfield (stream, field1, ';');
198 n += getfield (stream, field2, ';');
199 n += getfield (stream, field3, ';');
200 n += getfield (stream, field4, ';');
201 n += getfield (stream, field5, ';');
202 n += getfield (stream, field6, ';');
203 n += getfield (stream, field7, ';');
204 n += getfield (stream, field8, ';');
205 n += getfield (stream, field9, ';');
206 n += getfield (stream, field10, ';');
207 n += getfield (stream, field11, ';');
208 n += getfield (stream, field12, ';');
209 n += getfield (stream, field13, ';');
210 n += getfield (stream, field14, '\n');
211 if (n == 0)
212 break;
213 if (n != 15)
215 fprintf (stderr, "short line in '%s':%d\n",
216 unicodedata_filename, lineno);
217 exit (1);
219 i = strtoul (field0, NULL, 16);
220 if (field1[0] == '<'
221 && strlen (field1) >= 9
222 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
224 /* Deal with a range. */
225 lineno++;
226 n = getfield (stream, field0, ';');
227 n += getfield (stream, field1, ';');
228 n += getfield (stream, field2, ';');
229 n += getfield (stream, field3, ';');
230 n += getfield (stream, field4, ';');
231 n += getfield (stream, field5, ';');
232 n += getfield (stream, field6, ';');
233 n += getfield (stream, field7, ';');
234 n += getfield (stream, field8, ';');
235 n += getfield (stream, field9, ';');
236 n += getfield (stream, field10, ';');
237 n += getfield (stream, field11, ';');
238 n += getfield (stream, field12, ';');
239 n += getfield (stream, field13, ';');
240 n += getfield (stream, field14, '\n');
241 if (n != 15)
243 fprintf (stderr, "missing end range in '%s':%d\n",
244 unicodedata_filename, lineno);
245 exit (1);
247 if (!(field1[0] == '<'
248 && strlen (field1) >= 8
249 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
251 fprintf (stderr, "missing end range in '%s':%d\n",
252 unicodedata_filename, lineno);
253 exit (1);
255 field1[strlen (field1) - 7] = '\0';
256 j = strtoul (field0, NULL, 16);
257 for (; i <= j; i++)
258 fill_attribute (i, field1+1, field2, field3, field4, field5,
259 field6, field7, field8, field9, field10,
260 field11, field12, field13, field14);
262 else
264 /* Single character line */
265 fill_attribute (i, field1, field2, field3, field4, field5,
266 field6, field7, field8, field9, field10,
267 field11, field12, field13, field14);
271 if (ferror (stream) || fclose (stream))
273 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
274 exit (1);
278 /* ========================================================================= */
/* Output the license notice for a library file to STREAM.
   If LGPLV2PLUS is true, the LGPLv2+ notice is written, otherwise the
   dual 'LGPLv3+ or GPLv2+' notice.
   The last line written closes an open C syntax comment.  */
static void
output_library_license (FILE *stream, bool lgplv2plus)
{
  /* These Gnulib modules are under the LGPLv2+ license.  */
  static const char * const lgplv2plus_notice[] =
    {
      " This file is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU Lesser General Public License as\n",
      " published by the Free Software Foundation; either version 2.1 of the\n",
      " License, or (at your option) any later version.\n",
      "\n",
      " This file is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU Lesser General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU Lesser General Public License\n",
      " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n"
    };
  /* These Gnulib modules are under the 'LGPLv3+ or GPLv2+' license.  */
  static const char * const dual_notice[] =
    {
      " This file is free software.\n",
      " It is dual-licensed under \"the GNU LGPLv3+ or the GNU GPLv2+\".\n",
      " You can redistribute it and/or modify it under either\n",
      " - the terms of the GNU Lesser General Public License as published\n",
      " by the Free Software Foundation, either version 3, or (at your\n",
      " option) any later version, or\n",
      " - the terms of the GNU General Public License as published by the\n",
      " Free Software Foundation; either version 2, or (at your option)\n",
      " any later version, or\n",
      " - the same dual license \"the GNU LGPLv3+ or the GNU GPLv2+\".\n",
      "\n",
      " This file is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n",
      " Lesser General Public License and the GNU General Public License\n",
      " for more details.\n",
      "\n",
      " You should have received a copy of the GNU Lesser General Public\n",
      " License and of the GNU General Public License along with this\n",
      " program. If not, see <https://www.gnu.org/licenses/>. */\n"
    };
  const char * const *notice;
  size_t notice_lines;
  size_t k;

  if (lgplv2plus)
    {
      notice = lgplv2plus_notice;
      notice_lines = sizeof lgplv2plus_notice / sizeof lgplv2plus_notice[0];
    }
  else
    {
      notice = dual_notice;
      notice_lines = sizeof dual_notice / sizeof dual_notice[0];
    }

  for (k = 0; k < notice_lines; k++)
    fputs (notice[k], stream);
}
/* Output the license notice for a tests file to STREAM.
   The last line written closes an open C syntax comment.  */
static void
output_tests_license (FILE *stream)
{
  /* Gnulib tests modules are under the GPLv3+ license.  */
  static const char * const notice[] =
    {
      " This file is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU General Public License as published\n",
      " by the Free Software Foundation, either version 3 of the License,\n",
      " or (at your option) any later version.\n",
      "\n",
      " This file is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU General Public License\n",
      " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n"
    };
  size_t k;

  for (k = 0; k < sizeof notice / sizeof notice[0]; k++)
    fputs (notice[k], stream);
}
347 /* ========================================================================= */
349 /* General category. */
350 /* See Unicode 3.0 book, section 4.5,
351 UCD.html. */
353 static bool
354 is_category_L (unsigned int ch)
356 return (unicode_attributes[ch].name != NULL
357 && unicode_attributes[ch].category[0] == 'L');
360 static bool
361 is_category_LC (unsigned int ch)
363 /* See PropertyValueAliases.txt. */
364 return (unicode_attributes[ch].name != NULL
365 && unicode_attributes[ch].category[0] == 'L'
366 && (unicode_attributes[ch].category[1] == 'u'
367 || unicode_attributes[ch].category[1] == 'l'
368 || unicode_attributes[ch].category[1] == 't'));
371 static bool
372 is_category_Lu (unsigned int ch)
374 return (unicode_attributes[ch].name != NULL
375 && unicode_attributes[ch].category[0] == 'L'
376 && unicode_attributes[ch].category[1] == 'u');
379 static bool
380 is_category_Ll (unsigned int ch)
382 return (unicode_attributes[ch].name != NULL
383 && unicode_attributes[ch].category[0] == 'L'
384 && unicode_attributes[ch].category[1] == 'l');
387 static bool
388 is_category_Lt (unsigned int ch)
390 return (unicode_attributes[ch].name != NULL
391 && unicode_attributes[ch].category[0] == 'L'
392 && unicode_attributes[ch].category[1] == 't');
395 static bool
396 is_category_Lm (unsigned int ch)
398 return (unicode_attributes[ch].name != NULL
399 && unicode_attributes[ch].category[0] == 'L'
400 && unicode_attributes[ch].category[1] == 'm');
403 static bool
404 is_category_Lo (unsigned int ch)
406 return (unicode_attributes[ch].name != NULL
407 && unicode_attributes[ch].category[0] == 'L'
408 && unicode_attributes[ch].category[1] == 'o');
411 static bool
412 is_category_M (unsigned int ch)
414 return (unicode_attributes[ch].name != NULL
415 && unicode_attributes[ch].category[0] == 'M');
418 static bool
419 is_category_Mn (unsigned int ch)
421 return (unicode_attributes[ch].name != NULL
422 && unicode_attributes[ch].category[0] == 'M'
423 && unicode_attributes[ch].category[1] == 'n');
426 static bool
427 is_category_Mc (unsigned int ch)
429 return (unicode_attributes[ch].name != NULL
430 && unicode_attributes[ch].category[0] == 'M'
431 && unicode_attributes[ch].category[1] == 'c');
434 static bool
435 is_category_Me (unsigned int ch)
437 return (unicode_attributes[ch].name != NULL
438 && unicode_attributes[ch].category[0] == 'M'
439 && unicode_attributes[ch].category[1] == 'e');
442 static bool
443 is_category_N (unsigned int ch)
445 return (unicode_attributes[ch].name != NULL
446 && unicode_attributes[ch].category[0] == 'N');
449 static bool
450 is_category_Nd (unsigned int ch)
452 return (unicode_attributes[ch].name != NULL
453 && unicode_attributes[ch].category[0] == 'N'
454 && unicode_attributes[ch].category[1] == 'd');
457 static bool
458 is_category_Nl (unsigned int ch)
460 return (unicode_attributes[ch].name != NULL
461 && unicode_attributes[ch].category[0] == 'N'
462 && unicode_attributes[ch].category[1] == 'l');
465 static bool
466 is_category_No (unsigned int ch)
468 return (unicode_attributes[ch].name != NULL
469 && unicode_attributes[ch].category[0] == 'N'
470 && unicode_attributes[ch].category[1] == 'o');
473 static bool
474 is_category_P (unsigned int ch)
476 return (unicode_attributes[ch].name != NULL
477 && unicode_attributes[ch].category[0] == 'P');
480 static bool
481 is_category_Pc (unsigned int ch)
483 return (unicode_attributes[ch].name != NULL
484 && unicode_attributes[ch].category[0] == 'P'
485 && unicode_attributes[ch].category[1] == 'c');
488 static bool
489 is_category_Pd (unsigned int ch)
491 return (unicode_attributes[ch].name != NULL
492 && unicode_attributes[ch].category[0] == 'P'
493 && unicode_attributes[ch].category[1] == 'd');
496 static bool
497 is_category_Ps (unsigned int ch)
499 return (unicode_attributes[ch].name != NULL
500 && unicode_attributes[ch].category[0] == 'P'
501 && unicode_attributes[ch].category[1] == 's');
504 static bool
505 is_category_Pe (unsigned int ch)
507 return (unicode_attributes[ch].name != NULL
508 && unicode_attributes[ch].category[0] == 'P'
509 && unicode_attributes[ch].category[1] == 'e');
512 static bool
513 is_category_Pi (unsigned int ch)
515 return (unicode_attributes[ch].name != NULL
516 && unicode_attributes[ch].category[0] == 'P'
517 && unicode_attributes[ch].category[1] == 'i');
520 static bool
521 is_category_Pf (unsigned int ch)
523 return (unicode_attributes[ch].name != NULL
524 && unicode_attributes[ch].category[0] == 'P'
525 && unicode_attributes[ch].category[1] == 'f');
528 static bool
529 is_category_Po (unsigned int ch)
531 return (unicode_attributes[ch].name != NULL
532 && unicode_attributes[ch].category[0] == 'P'
533 && unicode_attributes[ch].category[1] == 'o');
536 static bool
537 is_category_S (unsigned int ch)
539 return (unicode_attributes[ch].name != NULL
540 && unicode_attributes[ch].category[0] == 'S');
543 static bool
544 is_category_Sm (unsigned int ch)
546 return (unicode_attributes[ch].name != NULL
547 && unicode_attributes[ch].category[0] == 'S'
548 && unicode_attributes[ch].category[1] == 'm');
551 static bool
552 is_category_Sc (unsigned int ch)
554 return (unicode_attributes[ch].name != NULL
555 && unicode_attributes[ch].category[0] == 'S'
556 && unicode_attributes[ch].category[1] == 'c');
559 static bool
560 is_category_Sk (unsigned int ch)
562 return (unicode_attributes[ch].name != NULL
563 && unicode_attributes[ch].category[0] == 'S'
564 && unicode_attributes[ch].category[1] == 'k');
567 static bool
568 is_category_So (unsigned int ch)
570 return (unicode_attributes[ch].name != NULL
571 && unicode_attributes[ch].category[0] == 'S'
572 && unicode_attributes[ch].category[1] == 'o');
575 static bool
576 is_category_Z (unsigned int ch)
578 return (unicode_attributes[ch].name != NULL
579 && unicode_attributes[ch].category[0] == 'Z');
582 static bool
583 is_category_Zs (unsigned int ch)
585 return (unicode_attributes[ch].name != NULL
586 && unicode_attributes[ch].category[0] == 'Z'
587 && unicode_attributes[ch].category[1] == 's');
590 static bool
591 is_category_Zl (unsigned int ch)
593 return (unicode_attributes[ch].name != NULL
594 && unicode_attributes[ch].category[0] == 'Z'
595 && unicode_attributes[ch].category[1] == 'l');
598 static bool
599 is_category_Zp (unsigned int ch)
601 return (unicode_attributes[ch].name != NULL
602 && unicode_attributes[ch].category[0] == 'Z'
603 && unicode_attributes[ch].category[1] == 'p');
606 static bool
607 is_category_C (unsigned int ch)
609 return (unicode_attributes[ch].name == NULL
610 || unicode_attributes[ch].category[0] == 'C');
613 static bool
614 is_category_Cc (unsigned int ch)
616 return (unicode_attributes[ch].name != NULL
617 && unicode_attributes[ch].category[0] == 'C'
618 && unicode_attributes[ch].category[1] == 'c');
621 static bool
622 is_category_Cf (unsigned int ch)
624 return (unicode_attributes[ch].name != NULL
625 && unicode_attributes[ch].category[0] == 'C'
626 && unicode_attributes[ch].category[1] == 'f');
629 static bool
630 is_category_Cs (unsigned int ch)
632 return (ch >= 0xd800 && ch < 0xe000);
635 static bool
636 is_category_Co (unsigned int ch)
638 return (unicode_attributes[ch].name != NULL
639 && unicode_attributes[ch].category[0] == 'C'
640 && unicode_attributes[ch].category[1] == 'o');
643 static bool
644 is_category_Cn (unsigned int ch)
646 return (unicode_attributes[ch].name == NULL
647 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points
   for which PREDICATE returns true: "0xXXXX" for a single code point,
   "0xXXXX..0xYYYY" for a longer run.  (Listing every code point on its own
   line would yield huge text output.)
   Exits with an error message if the file cannot be written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *out = fopen (filename, "w");
  unsigned int ch;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  ch = 0;
  while (ch < 0x110000)
    {
      if (predicate (ch))
        {
          unsigned int run_start = ch;

          /* Extend the run as far as the predicate keeps holding.  */
          while (ch + 1 < 0x110000 && predicate (ch + 1))
            ch++;

          if (run_start < ch)
            fprintf (out, "0x%04X..0x%04X\n", run_start, ch);
          else
            fprintf (out, "0x%04X\n", ch);
        }
      ch++;
    }

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a C source file that lists, as "{ first, last }"
   pairs, the maximal runs of code points for which PREDICATE returns true,
   placed between the includes of test-predicate-part1.h and
   test-predicate-part2.h.  EXPRESSION becomes the body of the PREDICATE(c)
   macro defined in the generated file.
   Exits with an error message if the file cannot be written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *out = fopen (filename, "w");
  bool wrote_interval = false;
  unsigned int ch;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* File header; the license text closes the comment opened here.  */
  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
  fputs ("/* Test the Unicode character type functions.\n", out);
  fputs (" Copyright (C) 2007-2024 Free Software Foundation, Inc.\n", out);
  fputs ("\n", out);
  output_tests_license (out);
  fputs ("\n", out);
  fputs ("#include \"test-predicate-part1.h\"\n", out);
  fputs ("\n", out);

  ch = 0;
  while (ch < 0x110000)
    {
      if (predicate (ch))
        {
          unsigned int run_start = ch;

          while (ch + 1 < 0x110000 && predicate (ch + 1))
            ch++;

          /* Separate this interval from the previous one, if any.  */
          if (wrote_interval)
            fputs (",\n", out);
          fprintf (out, " { 0x%04X, 0x%04X }", run_start, ch);
          wrote_interval = true;
        }
      ch++;
    }
  if (wrote_interval)
    fputs ("\n", out);

  fputs ("\n", out);
  fprintf (out, "#define PREDICATE(c) %s\n", expression);
  fputs ("#include \"test-predicate-part2.h\"\n", out);

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
747 /* Construction of sparse 3-level tables. */
748 #define TABLE predicate_table
749 #define xmalloc malloc
750 #define xrealloc realloc
751 #include "3levelbit.h"
/* Output a boolean property in a three-level bitmap.
   Writes to FILENAME a C fragment that defines a constant struct named
   NAME, holding the table built from all code points for which PREDICATE
   returns true.  COMMENT and VERSION are only used in the comments at the
   top of the generated file.
   Exits with an error message if the file cannot be written.  */
static void
output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
{
  FILE *stream;
  unsigned int ch, i;
  struct predicate_table t;
  unsigned int level1_offset, level2_offset, level3_offset;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* %s of Unicode characters. */\n", comment);
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  /* The license notice closes the comment opened by the copyright line.
     The output file name decides which of the two library licenses is
     emitted.  */
  fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  output_library_license (stream,
                          strcmp (filename, "unictype/categ_M.h") == 0
                          || strncmp (filename, "unictype/ctype_", 15) == 0
                          || strcmp (filename, "uniwidth/width2.h") == 0);
  fprintf (stream, "\n");

  /* Build the table (implementation from 3levelbit.h).  */
  t.p = 4; /* or: 5 */
  t.q = 7; /* or: 6 */
  predicate_table_init (&t);

  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      predicate_table_add (&t, ch);

  predicate_table_finalize (&t);

  /* Offsets in t.result, in memory of this process.
     t.result is read as 5 header words, followed by the level 1, level 2
     and level 3 arrays, all made of uint32_t elements.  */
  level1_offset =
    5 * sizeof (uint32_t);
  level2_offset =
    5 * sizeof (uint32_t)
    + t.level1_size * sizeof (uint32_t);
  level3_offset =
    5 * sizeof (uint32_t)
    + t.level1_size * sizeof (uint32_t)
    + (t.level2_size << t.q) * sizeof (uint32_t);

  /* Header word 1 goes into the struct's 'header' member below; the other
     four are emitted as header_N macros.  */
  for (i = 0; i < 5; i++)
    if (i != 1)
      fprintf (stream, "#define header_%d %d\n", i,
               ((uint32_t *) t.result)[i]);

  fprintf (stream, "static const\n");
  fprintf (stream, "struct\n");
  fprintf (stream, " {\n");
  fprintf (stream, " int header[1];\n");
  fprintf (stream, " int level1[%zu];\n", t.level1_size);
  fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
  fprintf (stream, " unsigned int level3[%zu << %d];\n", t.level3_size, t.p);
  fprintf (stream, " }\n");
  fprintf (stream, "%s =\n", name);
  fprintf (stream, "{\n");
  fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
  /* Level 1: one entry per line ((i % 1) == 0 always holds).  A zero
     offset is emitted as -1; any other offset is rewritten from a byte
     offset within t.result into an expression for the generated struct.  */
  fprintf (stream, " {");
  if (t.level1_size > 1)
    fprintf (stream, "\n ");
  for (i = 0; i < t.level1_size; i++)
    {
      uint32_t offset;
      if (i > 0 && (i % 1) == 0)
        fprintf (stream, "\n ");
      offset = ((uint32_t *) (t.result + level1_offset))[i];
      if (offset == 0)
        fprintf (stream, " %5d", -1);
      else
        fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
                 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
      if (i+1 < t.level1_size)
        fprintf (stream, ",");
    }
  if (t.level1_size > 1)
    fprintf (stream, "\n ");
  fprintf (stream, " },\n");
  /* Level 2: same scheme, with offsets taken relative to the level 3
     array.  */
  fprintf (stream, " {");
  if (t.level2_size << t.q > 1)
    fprintf (stream, "\n ");
  for (i = 0; i < t.level2_size << t.q; i++)
    {
      uint32_t offset;
      if (i > 0 && (i % 1) == 0)
        fprintf (stream, "\n ");
      offset = ((uint32_t *) (t.result + level2_offset))[i];
      if (offset == 0)
        fprintf (stream, " %5d", -1);
      else
        fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
                 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
      if (i+1 < t.level2_size << t.q)
        fprintf (stream, ",");
    }
  if (t.level2_size << t.q > 1)
    fprintf (stream, "\n ");
  fprintf (stream, " },\n");
  /* Level 3: the raw 32-bit words, four per line.  */
  fprintf (stream, " {");
  if (t.level3_size << t.p > 4)
    fprintf (stream, "\n ");
  for (i = 0; i < t.level3_size << t.p; i++)
    {
      if (i > 0 && (i % 4) == 0)
        fprintf (stream, "\n ");
      fprintf (stream, " 0x%08XU",
               ((uint32_t *) (t.result + level3_offset))[i]);
      if (i+1 < t.level3_size << t.p)
        fprintf (stream, ",");
    }
  if (t.level3_size << t.p > 4)
    fprintf (stream, "\n ");
  fprintf (stream, " }\n");
  fprintf (stream, "};\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
884 /* Output all categories. */
885 static void
886 output_categories (const char *version)
888 #define CATEGORY(C) \
889 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
890 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
891 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
892 CATEGORY (L)
893 CATEGORY (LC)
894 CATEGORY (Lu)
895 CATEGORY (Ll)
896 CATEGORY (Lt)
897 CATEGORY (Lm)
898 CATEGORY (Lo)
899 CATEGORY (M)
900 CATEGORY (Mn)
901 CATEGORY (Mc)
902 CATEGORY (Me)
903 CATEGORY (N)
904 CATEGORY (Nd)
905 CATEGORY (Nl)
906 CATEGORY (No)
907 CATEGORY (P)
908 CATEGORY (Pc)
909 CATEGORY (Pd)
910 CATEGORY (Ps)
911 CATEGORY (Pe)
912 CATEGORY (Pi)
913 CATEGORY (Pf)
914 CATEGORY (Po)
915 CATEGORY (S)
916 CATEGORY (Sm)
917 CATEGORY (Sc)
918 CATEGORY (Sk)
919 CATEGORY (So)
920 CATEGORY (Z)
921 CATEGORY (Zs)
922 CATEGORY (Zl)
923 CATEGORY (Zp)
924 CATEGORY (C)
925 CATEGORY (Cc)
926 CATEGORY (Cf)
927 CATEGORY (Cs)
928 CATEGORY (Co)
929 CATEGORY (Cn)
930 #undef CATEGORY
/* Bit masks for the general categories.
   Every two-letter category owns one bit; a one-letter category (and LC)
   is the union of the bits of its members.  */
enum
{
  /* Letters.  */
  UC_CATEGORY_MASK_Lu = 1 << 0,
  UC_CATEGORY_MASK_Ll = 1 << 1,
  UC_CATEGORY_MASK_Lt = 1 << 2,
  UC_CATEGORY_MASK_Lm = 1 << 3,
  UC_CATEGORY_MASK_Lo = 1 << 4,
  UC_CATEGORY_MASK_LC = (UC_CATEGORY_MASK_Lu | UC_CATEGORY_MASK_Ll
                         | UC_CATEGORY_MASK_Lt),
  UC_CATEGORY_MASK_L  = (UC_CATEGORY_MASK_LC | UC_CATEGORY_MASK_Lm
                         | UC_CATEGORY_MASK_Lo),
  /* Marks.  */
  UC_CATEGORY_MASK_Mn = 1 << 5,
  UC_CATEGORY_MASK_Mc = 1 << 6,
  UC_CATEGORY_MASK_Me = 1 << 7,
  UC_CATEGORY_MASK_M  = (UC_CATEGORY_MASK_Mn | UC_CATEGORY_MASK_Mc
                         | UC_CATEGORY_MASK_Me),
  /* Numbers.  */
  UC_CATEGORY_MASK_Nd = 1 << 8,
  UC_CATEGORY_MASK_Nl = 1 << 9,
  UC_CATEGORY_MASK_No = 1 << 10,
  UC_CATEGORY_MASK_N  = (UC_CATEGORY_MASK_Nd | UC_CATEGORY_MASK_Nl
                         | UC_CATEGORY_MASK_No),
  /* Punctuation.  */
  UC_CATEGORY_MASK_Pc = 1 << 11,
  UC_CATEGORY_MASK_Pd = 1 << 12,
  UC_CATEGORY_MASK_Ps = 1 << 13,
  UC_CATEGORY_MASK_Pe = 1 << 14,
  UC_CATEGORY_MASK_Pi = 1 << 15,
  UC_CATEGORY_MASK_Pf = 1 << 16,
  UC_CATEGORY_MASK_Po = 1 << 17,
  UC_CATEGORY_MASK_P  = (UC_CATEGORY_MASK_Pc | UC_CATEGORY_MASK_Pd
                         | UC_CATEGORY_MASK_Ps | UC_CATEGORY_MASK_Pe
                         | UC_CATEGORY_MASK_Pi | UC_CATEGORY_MASK_Pf
                         | UC_CATEGORY_MASK_Po),
  /* Symbols.  */
  UC_CATEGORY_MASK_Sm = 1 << 18,
  UC_CATEGORY_MASK_Sc = 1 << 19,
  UC_CATEGORY_MASK_Sk = 1 << 20,
  UC_CATEGORY_MASK_So = 1 << 21,
  UC_CATEGORY_MASK_S  = (UC_CATEGORY_MASK_Sm | UC_CATEGORY_MASK_Sc
                         | UC_CATEGORY_MASK_Sk | UC_CATEGORY_MASK_So),
  /* Separators.  */
  UC_CATEGORY_MASK_Zs = 1 << 22,
  UC_CATEGORY_MASK_Zl = 1 << 23,
  UC_CATEGORY_MASK_Zp = 1 << 24,
  UC_CATEGORY_MASK_Z  = (UC_CATEGORY_MASK_Zs | UC_CATEGORY_MASK_Zl
                         | UC_CATEGORY_MASK_Zp),
  /* Others.  */
  UC_CATEGORY_MASK_Cc = 1 << 25,
  UC_CATEGORY_MASK_Cf = 1 << 26,
  UC_CATEGORY_MASK_Cs = 1 << 27,
  UC_CATEGORY_MASK_Co = 1 << 28,
  UC_CATEGORY_MASK_Cn = 1 << 29,
  UC_CATEGORY_MASK_C  = (UC_CATEGORY_MASK_Cc | UC_CATEGORY_MASK_Cf
                         | UC_CATEGORY_MASK_Cs | UC_CATEGORY_MASK_Co
                         | UC_CATEGORY_MASK_Cn)
};
975 static int
976 general_category_byname (const char *category_name)
978 if (category_name[0] != '\0'
979 && (category_name[1] == '\0' || category_name[2] == '\0'))
980 switch (category_name[0])
982 case 'L':
983 switch (category_name[1])
985 case '\0': return UC_CATEGORY_MASK_L;
986 case 'C': return UC_CATEGORY_MASK_LC;
987 case 'u': return UC_CATEGORY_MASK_Lu;
988 case 'l': return UC_CATEGORY_MASK_Ll;
989 case 't': return UC_CATEGORY_MASK_Lt;
990 case 'm': return UC_CATEGORY_MASK_Lm;
991 case 'o': return UC_CATEGORY_MASK_Lo;
993 break;
994 case 'M':
995 switch (category_name[1])
997 case '\0': return UC_CATEGORY_MASK_M;
998 case 'n': return UC_CATEGORY_MASK_Mn;
999 case 'c': return UC_CATEGORY_MASK_Mc;
1000 case 'e': return UC_CATEGORY_MASK_Me;
1002 break;
1003 case 'N':
1004 switch (category_name[1])
1006 case '\0': return UC_CATEGORY_MASK_N;
1007 case 'd': return UC_CATEGORY_MASK_Nd;
1008 case 'l': return UC_CATEGORY_MASK_Nl;
1009 case 'o': return UC_CATEGORY_MASK_No;
1011 break;
1012 case 'P':
1013 switch (category_name[1])
1015 case '\0': return UC_CATEGORY_MASK_P;
1016 case 'c': return UC_CATEGORY_MASK_Pc;
1017 case 'd': return UC_CATEGORY_MASK_Pd;
1018 case 's': return UC_CATEGORY_MASK_Ps;
1019 case 'e': return UC_CATEGORY_MASK_Pe;
1020 case 'i': return UC_CATEGORY_MASK_Pi;
1021 case 'f': return UC_CATEGORY_MASK_Pf;
1022 case 'o': return UC_CATEGORY_MASK_Po;
1024 break;
1025 case 'S':
1026 switch (category_name[1])
1028 case '\0': return UC_CATEGORY_MASK_S;
1029 case 'm': return UC_CATEGORY_MASK_Sm;
1030 case 'c': return UC_CATEGORY_MASK_Sc;
1031 case 'k': return UC_CATEGORY_MASK_Sk;
1032 case 'o': return UC_CATEGORY_MASK_So;
1034 break;
1035 case 'Z':
1036 switch (category_name[1])
1038 case '\0': return UC_CATEGORY_MASK_Z;
1039 case 's': return UC_CATEGORY_MASK_Zs;
1040 case 'l': return UC_CATEGORY_MASK_Zl;
1041 case 'p': return UC_CATEGORY_MASK_Zp;
1043 break;
1044 case 'C':
1045 switch (category_name[1])
1047 case '\0': return UC_CATEGORY_MASK_C;
1048 case 'c': return UC_CATEGORY_MASK_Cc;
1049 case 'f': return UC_CATEGORY_MASK_Cf;
1050 case 's': return UC_CATEGORY_MASK_Cs;
1051 case 'o': return UC_CATEGORY_MASK_Co;
1052 case 'n': return UC_CATEGORY_MASK_Cn;
1054 break;
1056 /* Invalid category name. */
1057 abort ();
1060 /* Construction of sparse 3-level tables. */
1061 #define TABLE category_table
1062 #define ELEMENT uint8_t
1063 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
1064 #define xmalloc malloc
1065 #define xrealloc realloc
1066 #include "3level.h"
1068 /* Output the per-character category table. */
1069 static void
1070 output_category (const char *filename, const char *version)
1072 FILE *stream;
1073 unsigned int ch, i;
1074 struct category_table t;
1075 unsigned int level1_offset, level2_offset, level3_offset;
1076 uint16_t *level3_packed;
1078 stream = fopen (filename, "w");
1079 if (stream == NULL)
1081 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1082 exit (1);
1085 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1086 fprintf (stream, "/* Categories of Unicode characters. */\n");
1087 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1088 version);
1089 fprintf (stream, "\n");
1091 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
1092 fprintf (stream, "\n");
1093 output_library_license (stream, true);
1094 fprintf (stream, "\n");
1096 t.p = 7;
1097 t.q = 9;
1098 category_table_init (&t);
1100 for (ch = 0; ch < 0x110000; ch++)
1102 int value;
1103 unsigned int log2_value;
1105 if (is_category_Cs (ch))
1106 value = UC_CATEGORY_MASK_Cs;
1107 else if (unicode_attributes[ch].name != NULL)
1108 value = general_category_byname (unicode_attributes[ch].category);
1109 else
1110 continue;
1112 /* Now value should contain exactly one bit. */
1113 assert (value != 0 && (value & (value - 1)) == 0);
1115 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1117 assert (log2_value <= 0x1f);
1119 category_table_add (&t, ch, log2_value);
1122 category_table_finalize (&t);
1124 /* Offsets in t.result, in memory of this process. */
1125 level1_offset =
1126 5 * sizeof (uint32_t);
1127 level2_offset =
1128 5 * sizeof (uint32_t)
1129 + t.level1_size * sizeof (uint32_t);
1130 level3_offset =
1131 5 * sizeof (uint32_t)
1132 + t.level1_size * sizeof (uint32_t)
1133 + (t.level2_size << t.q) * sizeof (uint32_t);
1135 for (i = 0; i < 5; i++)
1136 fprintf (stream, "#define category_header_%d %d\n", i,
1137 ((uint32_t *) t.result)[i]);
1138 fprintf (stream, "static const\n");
1139 fprintf (stream, "struct\n");
1140 fprintf (stream, " {\n");
1141 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1142 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1143 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1144 (1 << t.p) * 5 / 16);
1145 fprintf (stream, " }\n");
1146 fprintf (stream, "u_category =\n");
1147 fprintf (stream, "{\n");
1148 fprintf (stream, " {");
1149 if (t.level1_size > 8)
1150 fprintf (stream, "\n ");
1151 for (i = 0; i < t.level1_size; i++)
1153 uint32_t offset;
1154 if (i > 0 && (i % 8) == 0)
1155 fprintf (stream, "\n ");
1156 offset = ((uint32_t *) (t.result + level1_offset))[i];
1157 if (offset == 0)
1158 fprintf (stream, " %5d", -1);
1159 else
1160 fprintf (stream, " %5zu",
1161 (offset - level2_offset) / sizeof (uint32_t));
1162 if (i+1 < t.level1_size)
1163 fprintf (stream, ",");
1165 if (t.level1_size > 8)
1166 fprintf (stream, "\n ");
1167 fprintf (stream, " },\n");
1168 fprintf (stream, " {");
1169 if (t.level2_size << t.q > 8)
1170 fprintf (stream, "\n ");
1171 for (i = 0; i < t.level2_size << t.q; i++)
1173 uint32_t offset;
1174 if (i > 0 && (i % 8) == 0)
1175 fprintf (stream, "\n ");
1176 offset = ((uint32_t *) (t.result + level2_offset))[i];
1177 if (offset == 0)
1178 fprintf (stream, " %5d", -1);
1179 else
1180 fprintf (stream, " %5zu",
1181 (offset - level3_offset) / sizeof (uint8_t));
1182 if (i+1 < t.level2_size << t.q)
1183 fprintf (stream, ",");
1185 if (t.level2_size << t.q > 8)
1186 fprintf (stream, "\n ");
1187 fprintf (stream, " },\n");
1188 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1189 not 32-bit units, in order to make the lookup function easier. */
1190 level3_packed =
1191 (uint16_t *)
1192 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1193 for (i = 0; i < t.level3_size << t.p; i++)
1195 unsigned int j = (i * 5) / 16;
1196 unsigned int k = (i * 5) % 16;
1197 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1198 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1199 level3_packed[j] = value & 0xffff;
1200 level3_packed[j+1] = value >> 16;
1202 fprintf (stream, " {");
1203 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1204 fprintf (stream, "\n ");
1205 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1207 if (i > 0 && (i % 8) == 0)
1208 fprintf (stream, "\n ");
1209 fprintf (stream, " 0x%04x", level3_packed[i]);
1210 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1211 fprintf (stream, ",");
1213 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1214 fprintf (stream, "\n ");
1215 fprintf (stream, " }\n");
1216 free (level3_packed);
1217 fprintf (stream, "};\n");
1219 if (ferror (stream) || fclose (stream))
1221 fprintf (stderr, "error writing to '%s'\n", filename);
1222 exit (1);
1226 /* ========================================================================= */
1228 /* Canonical combining class. */
1229 /* See Unicode 3.0 book, section 4.2,
1230 UCD.html. */
1232 /* Construction of sparse 3-level tables. */
1233 #define TABLE combclass_table
1234 #define ELEMENT uint8_t
1235 #define DEFAULT 0
1236 #define xmalloc malloc
1237 #define xrealloc realloc
1238 #include "3level.h"
1240 /* Output the per-character combining class table. */
1241 static void
1242 output_combclass (const char *filename, const char *version)
1244 FILE *stream;
1245 unsigned int ch, i;
1246 struct combclass_table t;
1247 unsigned int level1_offset, level2_offset, level3_offset;
1249 stream = fopen (filename, "w");
1250 if (stream == NULL)
1252 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1253 exit (1);
1256 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1257 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1258 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1259 version);
1260 fprintf (stream, "\n");
1262 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
1263 fprintf (stream, "\n");
1264 output_library_license (stream, true);
1265 fprintf (stream, "\n");
1267 t.p = 7;
1268 t.q = 9;
1269 combclass_table_init (&t);
1271 for (ch = 0; ch < 0x110000; ch++)
1272 if (unicode_attributes[ch].name != NULL)
1274 int value = atoi (unicode_attributes[ch].combining);
1275 assert (value >= 0 && value <= 255);
1276 combclass_table_add (&t, ch, value);
1279 combclass_table_finalize (&t);
1281 /* Offsets in t.result, in memory of this process. */
1282 level1_offset =
1283 5 * sizeof (uint32_t);
1284 level2_offset =
1285 5 * sizeof (uint32_t)
1286 + t.level1_size * sizeof (uint32_t);
1287 level3_offset =
1288 5 * sizeof (uint32_t)
1289 + t.level1_size * sizeof (uint32_t)
1290 + (t.level2_size << t.q) * sizeof (uint32_t);
1292 for (i = 0; i < 5; i++)
1293 fprintf (stream, "#define combclass_header_%d %d\n", i,
1294 ((uint32_t *) t.result)[i]);
1295 fprintf (stream, "static const\n");
1296 fprintf (stream, "struct\n");
1297 fprintf (stream, " {\n");
1298 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1299 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1300 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1301 fprintf (stream, " }\n");
1302 fprintf (stream, "u_combclass =\n");
1303 fprintf (stream, "{\n");
1304 fprintf (stream, " {");
1305 if (t.level1_size > 8)
1306 fprintf (stream, "\n ");
1307 for (i = 0; i < t.level1_size; i++)
1309 uint32_t offset;
1310 if (i > 0 && (i % 8) == 0)
1311 fprintf (stream, "\n ");
1312 offset = ((uint32_t *) (t.result + level1_offset))[i];
1313 if (offset == 0)
1314 fprintf (stream, " %5d", -1);
1315 else
1316 fprintf (stream, " %5zu",
1317 (offset - level2_offset) / sizeof (uint32_t));
1318 if (i+1 < t.level1_size)
1319 fprintf (stream, ",");
1321 if (t.level1_size > 8)
1322 fprintf (stream, "\n ");
1323 fprintf (stream, " },\n");
1324 fprintf (stream, " {");
1325 if (t.level2_size << t.q > 8)
1326 fprintf (stream, "\n ");
1327 for (i = 0; i < t.level2_size << t.q; i++)
1329 uint32_t offset;
1330 if (i > 0 && (i % 8) == 0)
1331 fprintf (stream, "\n ");
1332 offset = ((uint32_t *) (t.result + level2_offset))[i];
1333 if (offset == 0)
1334 fprintf (stream, " %5d", -1);
1335 else
1336 fprintf (stream, " %5zu",
1337 (offset - level3_offset) / sizeof (uint8_t));
1338 if (i+1 < t.level2_size << t.q)
1339 fprintf (stream, ",");
1341 if (t.level2_size << t.q > 8)
1342 fprintf (stream, "\n ");
1343 fprintf (stream, " },\n");
1344 fprintf (stream, " {");
1345 if (t.level3_size << t.p > 8)
1346 fprintf (stream, "\n ");
1347 for (i = 0; i < t.level3_size << t.p; i++)
1349 if (i > 0 && (i % 8) == 0)
1350 fprintf (stream, "\n ");
1351 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1352 if (i+1 < t.level3_size << t.p)
1353 fprintf (stream, ",");
1355 if (t.level3_size << t.p > 8)
1356 fprintf (stream, "\n ");
1357 fprintf (stream, " }\n");
1358 fprintf (stream, "};\n");
1360 if (ferror (stream) || fclose (stream))
1362 fprintf (stderr, "error writing to '%s'\n", filename);
1363 exit (1);
1367 /* ========================================================================= */
1369 /* Bidirectional category. */
1370 /* See Unicode 3.0 book, section 4.3,
1371 UCD.html. */
/* Bidi class codes.  The values are consecutive, starting at 0, and are
   spelled out so that the numbering is visible at a glance.  */
enum
{
  UC_BIDI_L   =  0, /* Left-to-Right */
  UC_BIDI_LRE =  1, /* Left-to-Right Embedding */
  UC_BIDI_LRO =  2, /* Left-to-Right Override */
  UC_BIDI_R   =  3, /* Right-to-Left */
  UC_BIDI_AL  =  4, /* Right-to-Left Arabic */
  UC_BIDI_RLE =  5, /* Right-to-Left Embedding */
  UC_BIDI_RLO =  6, /* Right-to-Left Override */
  UC_BIDI_PDF =  7, /* Pop Directional Format */
  UC_BIDI_EN  =  8, /* European Number */
  UC_BIDI_ES  =  9, /* European Number Separator */
  UC_BIDI_ET  = 10, /* European Number Terminator */
  UC_BIDI_AN  = 11, /* Arabic Number */
  UC_BIDI_CS  = 12, /* Common Number Separator */
  UC_BIDI_NSM = 13, /* Non-Spacing Mark */
  UC_BIDI_BN  = 14, /* Boundary Neutral */
  UC_BIDI_B   = 15, /* Paragraph Separator */
  UC_BIDI_S   = 16, /* Segment Separator */
  UC_BIDI_WS  = 17, /* Whitespace */
  UC_BIDI_ON  = 18, /* Other Neutral */
  UC_BIDI_LRI = 19, /* Left-to-Right Isolate */
  UC_BIDI_RLI = 20, /* Right-to-Left Isolate */
  UC_BIDI_FSI = 21, /* First Strong Isolate */
  UC_BIDI_PDI = 22  /* Pop Directional Isolate */
};
1400 static int
1401 bidi_category_byname (const char *category_name)
1403 switch (category_name[0])
1405 case 'A':
1406 switch (category_name[1])
1408 case 'L':
1409 if (category_name[2] == '\0')
1410 return UC_BIDI_AL;
1411 break;
1412 case 'N':
1413 if (category_name[2] == '\0')
1414 return UC_BIDI_AN;
1415 break;
1417 break;
1418 case 'B':
1419 switch (category_name[1])
1421 case '\0':
1422 return UC_BIDI_B;
1423 case 'N':
1424 if (category_name[2] == '\0')
1425 return UC_BIDI_BN;
1426 break;
1428 break;
1429 case 'C':
1430 switch (category_name[1])
1432 case 'S':
1433 if (category_name[2] == '\0')
1434 return UC_BIDI_CS;
1435 break;
1437 break;
1438 case 'E':
1439 switch (category_name[1])
1441 case 'N':
1442 if (category_name[2] == '\0')
1443 return UC_BIDI_EN;
1444 break;
1445 case 'S':
1446 if (category_name[2] == '\0')
1447 return UC_BIDI_ES;
1448 break;
1449 case 'T':
1450 if (category_name[2] == '\0')
1451 return UC_BIDI_ET;
1452 break;
1454 break;
1455 case 'F':
1456 switch (category_name[1])
1458 case 'S':
1459 switch (category_name[2])
1461 case 'I':
1462 if (category_name[3] == '\0')
1463 return UC_BIDI_FSI;
1464 break;
1467 break;
1468 case 'L':
1469 switch (category_name[1])
1471 case '\0':
1472 return UC_BIDI_L;
1473 case 'R':
1474 switch (category_name[2])
1476 case 'E':
1477 if (category_name[3] == '\0')
1478 return UC_BIDI_LRE;
1479 break;
1480 case 'O':
1481 if (category_name[3] == '\0')
1482 return UC_BIDI_LRO;
1483 break;
1484 case 'I':
1485 if (category_name[3] == '\0')
1486 return UC_BIDI_LRI;
1487 break;
1489 break;
1491 break;
1492 case 'N':
1493 switch (category_name[1])
1495 case 'S':
1496 switch (category_name[2])
1498 case 'M':
1499 if (category_name[3] == '\0')
1500 return UC_BIDI_NSM;
1501 break;
1503 break;
1505 break;
1506 case 'O':
1507 switch (category_name[1])
1509 case 'N':
1510 if (category_name[2] == '\0')
1511 return UC_BIDI_ON;
1512 break;
1514 break;
1515 case 'P':
1516 switch (category_name[1])
1518 case 'D':
1519 switch (category_name[2])
1521 case 'F':
1522 if (category_name[3] == '\0')
1523 return UC_BIDI_PDF;
1524 break;
1525 case 'I':
1526 if (category_name[3] == '\0')
1527 return UC_BIDI_PDI;
1528 break;
1530 break;
1532 break;
1533 case 'R':
1534 switch (category_name[1])
1536 case '\0':
1537 return UC_BIDI_R;
1538 case 'L':
1539 switch (category_name[2])
1541 case 'E':
1542 if (category_name[3] == '\0')
1543 return UC_BIDI_RLE;
1544 break;
1545 case 'O':
1546 if (category_name[3] == '\0')
1547 return UC_BIDI_RLO;
1548 break;
1549 case 'I':
1550 if (category_name[3] == '\0')
1551 return UC_BIDI_RLI;
1552 break;
1554 break;
1556 break;
1557 case 'S':
1558 if (category_name[1] == '\0')
1559 return UC_BIDI_S;
1560 break;
1561 case 'W':
1562 switch (category_name[1])
1564 case 'S':
1565 if (category_name[2] == '\0')
1566 return UC_BIDI_WS;
1567 break;
1569 break;
1571 /* Invalid bidi category name. */
1572 abort ();
1575 static int
1576 get_bidi_category (unsigned int ch)
1578 if (unicode_attributes[ch].name != NULL)
1579 return bidi_category_byname (unicode_attributes[ch].bidi);
1580 else
1582 /* The bidi category of unassigned characters depends on the range.
1583 See UTR #9 and DerivedBidiClass.txt. */
1584 if ((ch >= 0x0590 && ch <= 0x05FF)
1585 || (ch >= 0x07FB && ch <= 0x08FF)
1586 || (ch >= 0xFB37 && ch <= 0xFB45)
1587 || (ch >= 0x10800 && ch <= 0x10FFF))
1588 return UC_BIDI_R;
1589 else if ((ch >= 0x0600 && ch <= 0x07BF)
1590 || (ch >= 0x2064 && ch <= 0x2069)
1591 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1592 || (ch >= 0xFDFE && ch <= 0xFEFE))
1593 return UC_BIDI_AL;
1594 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1595 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1596 || (ch & 0xFFFF) == 0xFFFE
1597 || (ch & 0xFFFF) == 0xFFFF
1598 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1599 return UC_BIDI_BN;
1600 else
1601 return UC_BIDI_L;
1605 /* Construction of sparse 3-level tables. */
1606 #define TABLE bidi_category_table
1607 #define ELEMENT uint8_t
1608 #define DEFAULT UC_BIDI_L
1609 #define xmalloc malloc
1610 #define xrealloc realloc
1611 #include "3level.h"
1613 /* Output the per-character bidi category table. */
1614 static void
1615 output_bidi_category (const char *filename, const char *version)
1617 FILE *stream;
1618 unsigned int ch, i;
1619 struct bidi_category_table t;
1620 unsigned int level1_offset, level2_offset, level3_offset;
1621 uint16_t *level3_packed;
1623 stream = fopen (filename, "w");
1624 if (stream == NULL)
1626 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1627 exit (1);
1630 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1631 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1632 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1633 version);
1634 fprintf (stream, "\n");
1636 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
1637 fprintf (stream, "\n");
1638 output_library_license (stream, true);
1639 fprintf (stream, "\n");
1641 t.p = 7;
1642 t.q = 9;
1643 bidi_category_table_init (&t);
1645 for (ch = 0; ch < 0x110000; ch++)
1647 int value = get_bidi_category (ch);
1649 assert (value <= 0x1f);
1651 bidi_category_table_add (&t, ch, value);
1654 bidi_category_table_finalize (&t);
1656 /* Offsets in t.result, in memory of this process. */
1657 level1_offset =
1658 5 * sizeof (uint32_t);
1659 level2_offset =
1660 5 * sizeof (uint32_t)
1661 + t.level1_size * sizeof (uint32_t);
1662 level3_offset =
1663 5 * sizeof (uint32_t)
1664 + t.level1_size * sizeof (uint32_t)
1665 + (t.level2_size << t.q) * sizeof (uint32_t);
1667 for (i = 0; i < 5; i++)
1668 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1669 ((uint32_t *) t.result)[i]);
1670 fprintf (stream, "static const\n");
1671 fprintf (stream, "struct\n");
1672 fprintf (stream, " {\n");
1673 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1674 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1675 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1676 (1 << t.p) * 5 / 16);
1677 fprintf (stream, " }\n");
1678 fprintf (stream, "u_bidi_category =\n");
1679 fprintf (stream, "{\n");
1680 fprintf (stream, " {");
1681 if (t.level1_size > 8)
1682 fprintf (stream, "\n ");
1683 for (i = 0; i < t.level1_size; i++)
1685 uint32_t offset;
1686 if (i > 0 && (i % 8) == 0)
1687 fprintf (stream, "\n ");
1688 offset = ((uint32_t *) (t.result + level1_offset))[i];
1689 if (offset == 0)
1690 fprintf (stream, " %5d", -1);
1691 else
1692 fprintf (stream, " %5zu",
1693 (offset - level2_offset) / sizeof (uint32_t));
1694 if (i+1 < t.level1_size)
1695 fprintf (stream, ",");
1697 if (t.level1_size > 8)
1698 fprintf (stream, "\n ");
1699 fprintf (stream, " },\n");
1700 fprintf (stream, " {");
1701 if (t.level2_size << t.q > 8)
1702 fprintf (stream, "\n ");
1703 for (i = 0; i < t.level2_size << t.q; i++)
1705 uint32_t offset;
1706 if (i > 0 && (i % 8) == 0)
1707 fprintf (stream, "\n ");
1708 offset = ((uint32_t *) (t.result + level2_offset))[i];
1709 if (offset == 0)
1710 fprintf (stream, " %5d", -1);
1711 else
1712 fprintf (stream, " %5zu",
1713 (offset - level3_offset) / sizeof (uint8_t));
1714 if (i+1 < t.level2_size << t.q)
1715 fprintf (stream, ",");
1717 if (t.level2_size << t.q > 8)
1718 fprintf (stream, "\n ");
1719 fprintf (stream, " },\n");
1720 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1721 not 32-bit units, in order to make the lookup function easier. */
1722 level3_packed =
1723 (uint16_t *)
1724 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1725 for (i = 0; i < t.level3_size << t.p; i++)
1727 unsigned int j = (i * 5) / 16;
1728 unsigned int k = (i * 5) % 16;
1729 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1730 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1731 level3_packed[j] = value & 0xffff;
1732 level3_packed[j+1] = value >> 16;
1734 fprintf (stream, " {");
1735 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1736 fprintf (stream, "\n ");
1737 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1739 if (i > 0 && (i % 8) == 0)
1740 fprintf (stream, "\n ");
1741 fprintf (stream, " 0x%04x", level3_packed[i]);
1742 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1743 fprintf (stream, ",");
1745 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1746 fprintf (stream, "\n ");
1747 fprintf (stream, " }\n");
1748 free (level3_packed);
1749 fprintf (stream, "};\n");
1751 if (ferror (stream) || fclose (stream))
1753 fprintf (stderr, "error writing to '%s'\n", filename);
1754 exit (1);
1758 /* ========================================================================= */
1760 /* Decimal digit value. */
1761 /* See Unicode 3.0 book, section 4.6. */
1763 static int
1764 get_decdigit_value (unsigned int ch)
1766 if (unicode_attributes[ch].name != NULL
1767 && unicode_attributes[ch].decdigit[0] != '\0')
1768 return atoi (unicode_attributes[ch].decdigit);
1769 return -1;
1772 /* Construction of sparse 3-level tables. */
1773 #define TABLE decdigit_table
1774 #define ELEMENT uint8_t
1775 #define DEFAULT 0
1776 #define xmalloc malloc
1777 #define xrealloc realloc
1778 #include "3level.h"
/* Write the unit test data for the per-character decimal digit value table
   to FILENAME.  VERSION is the Unicode version string mentioned in the
   generated header comment.
   One "{ code point, value }" line is written for each code point for which
   get_decdigit_value returns a non-negative value; lines are separated by
   ",\n" and the last one is followed by a newline.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_decimal_digit_test (const char *filename, const char *version)
{
  FILE *out = fopen (filename, "w");
  unsigned int code;
  bool any_written = false;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
  fputs ("/* Decimal digit values of Unicode characters. */\n", out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fputs ("\n", out);

  fputs ("/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n", out);
  fputs ("\n", out);
  output_tests_license (out);
  fputs ("\n", out);

  for (code = 0; code < 0x110000; code++)
    {
      int digit = get_decdigit_value (code);

      assert (digit >= -1 && digit < 10);

      if (digit < 0)
        continue;

      /* Separate entries; nothing precedes the first one.  */
      if (any_written)
        fputs (",\n", out);
      fprintf (out, " { 0x%04X, %d }", code, digit);
      any_written = true;
    }
  if (any_written)
    fputs ("\n", out);

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1831 /* Output the per-character decimal digit value table. */
1832 static void
1833 output_decimal_digit (const char *filename, const char *version)
1835 FILE *stream;
1836 unsigned int ch, i;
1837 struct decdigit_table t;
1838 unsigned int level1_offset, level2_offset, level3_offset;
1840 stream = fopen (filename, "w");
1841 if (stream == NULL)
1843 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1844 exit (1);
1847 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1848 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1849 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1850 version);
1851 fprintf (stream, "\n");
1853 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
1854 fprintf (stream, "\n");
1855 output_library_license (stream, false);
1856 fprintf (stream, "\n");
1858 t.p = 7;
1859 t.q = 9;
1860 decdigit_table_init (&t);
1862 for (ch = 0; ch < 0x110000; ch++)
1864 int value = 1 + get_decdigit_value (ch);
1866 assert (value >= 0 && value <= 10);
1868 decdigit_table_add (&t, ch, value);
1871 decdigit_table_finalize (&t);
1873 /* Offsets in t.result, in memory of this process. */
1874 level1_offset =
1875 5 * sizeof (uint32_t);
1876 level2_offset =
1877 5 * sizeof (uint32_t)
1878 + t.level1_size * sizeof (uint32_t);
1879 level3_offset =
1880 5 * sizeof (uint32_t)
1881 + t.level1_size * sizeof (uint32_t)
1882 + (t.level2_size << t.q) * sizeof (uint32_t);
1884 for (i = 0; i < 5; i++)
1885 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1886 ((uint32_t *) t.result)[i]);
1887 fprintf (stream, "static const\n");
1888 fprintf (stream, "struct\n");
1889 fprintf (stream, " {\n");
1890 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1891 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1892 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1893 t.p - 1);
1894 fprintf (stream, " }\n");
1895 fprintf (stream, "u_decdigit =\n");
1896 fprintf (stream, "{\n");
1897 fprintf (stream, " {");
1898 if (t.level1_size > 8)
1899 fprintf (stream, "\n ");
1900 for (i = 0; i < t.level1_size; i++)
1902 uint32_t offset;
1903 if (i > 0 && (i % 8) == 0)
1904 fprintf (stream, "\n ");
1905 offset = ((uint32_t *) (t.result + level1_offset))[i];
1906 if (offset == 0)
1907 fprintf (stream, " %5d", -1);
1908 else
1909 fprintf (stream, " %5zu",
1910 (offset - level2_offset) / sizeof (uint32_t));
1911 if (i+1 < t.level1_size)
1912 fprintf (stream, ",");
1914 if (t.level1_size > 8)
1915 fprintf (stream, "\n ");
1916 fprintf (stream, " },\n");
1917 fprintf (stream, " {");
1918 if (t.level2_size << t.q > 8)
1919 fprintf (stream, "\n ");
1920 for (i = 0; i < t.level2_size << t.q; i++)
1922 uint32_t offset;
1923 if (i > 0 && (i % 8) == 0)
1924 fprintf (stream, "\n ");
1925 offset = ((uint32_t *) (t.result + level2_offset))[i];
1926 if (offset == 0)
1927 fprintf (stream, " %5d", -1);
1928 else
1929 fprintf (stream, " %5zu",
1930 (offset - level3_offset) / sizeof (uint8_t));
1931 if (i+1 < t.level2_size << t.q)
1932 fprintf (stream, ",");
1934 if (t.level2_size << t.q > 8)
1935 fprintf (stream, "\n ");
1936 fprintf (stream, " },\n");
1937 /* Pack the level3 array. Each entry needs 4 bits only. */
1938 fprintf (stream, " {");
1939 if (t.level3_size << (t.p - 1) > 8)
1940 fprintf (stream, "\n ");
1941 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1943 if (i > 0 && (i % 8) == 0)
1944 fprintf (stream, "\n ");
1945 fprintf (stream, " 0x%02x",
1946 ((uint8_t *) (t.result + level3_offset))[2*i]
1947 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1948 if (i+1 < t.level3_size << (t.p - 1))
1949 fprintf (stream, ",");
1951 if (t.level3_size << (t.p - 1) > 8)
1952 fprintf (stream, "\n ");
1953 fprintf (stream, " }\n");
1954 fprintf (stream, "};\n");
1956 if (ferror (stream) || fclose (stream))
1958 fprintf (stderr, "error writing to '%s'\n", filename);
1959 exit (1);
1963 /* ========================================================================= */
1965 /* Digit value. */
1966 /* See Unicode 3.0 book, section 4.6. */
1968 static int
1969 get_digit_value (unsigned int ch)
1971 if (unicode_attributes[ch].name != NULL
1972 && unicode_attributes[ch].digit[0] != '\0')
1973 return atoi (unicode_attributes[ch].digit);
1974 return -1;
/* Write the unit test data for the per-character digit value table to
   FILENAME.  VERSION is the Unicode version string mentioned in the
   generated header comment.
   One "{ code point, value }" line is written for each code point for which
   get_digit_value returns a non-negative value; lines are separated by
   ",\n" and the last one is followed by a newline.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_digit_test (const char *filename, const char *version)
{
  FILE *out = fopen (filename, "w");
  unsigned int code;
  bool any_written = false;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
  fputs ("/* Digit values of Unicode characters. */\n", out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fputs ("\n", out);

  fputs ("/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n", out);
  fputs ("\n", out);
  output_tests_license (out);
  fputs ("\n", out);

  for (code = 0; code < 0x110000; code++)
    {
      int digit = get_digit_value (code);

      assert (digit >= -1 && digit < 10);

      if (digit < 0)
        continue;

      /* Separate entries; nothing precedes the first one.  */
      if (any_written)
        fputs (",\n", out);
      fprintf (out, " { 0x%04X, %d }", code, digit);
      any_written = true;
    }
  if (any_written)
    fputs ("\n", out);

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
2028 /* Output the per-character digit value table. */
2029 static void
2030 output_digit (const char *filename, const char *version)
2032 FILE *stream;
2033 unsigned int ch, i;
2034 struct decdigit_table t;
2035 unsigned int level1_offset, level2_offset, level3_offset;
2037 stream = fopen (filename, "w");
2038 if (stream == NULL)
2040 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2041 exit (1);
2044 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2045 fprintf (stream, "/* Digit values of Unicode characters. */\n");
2046 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2047 version);
2048 fprintf (stream, "\n");
2050 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
2051 fprintf (stream, "\n");
2052 output_library_license (stream, false);
2053 fprintf (stream, "\n");
2055 t.p = 7;
2056 t.q = 9;
2057 decdigit_table_init (&t);
2059 for (ch = 0; ch < 0x110000; ch++)
2061 int value = 1 + get_digit_value (ch);
2063 assert (value >= 0 && value <= 10);
2065 decdigit_table_add (&t, ch, value);
2068 decdigit_table_finalize (&t);
2070 /* Offsets in t.result, in memory of this process. */
2071 level1_offset =
2072 5 * sizeof (uint32_t);
2073 level2_offset =
2074 5 * sizeof (uint32_t)
2075 + t.level1_size * sizeof (uint32_t);
2076 level3_offset =
2077 5 * sizeof (uint32_t)
2078 + t.level1_size * sizeof (uint32_t)
2079 + (t.level2_size << t.q) * sizeof (uint32_t);
2081 for (i = 0; i < 5; i++)
2082 fprintf (stream, "#define digit_header_%d %d\n", i,
2083 ((uint32_t *) t.result)[i]);
2084 fprintf (stream, "static const\n");
2085 fprintf (stream, "struct\n");
2086 fprintf (stream, " {\n");
2087 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2088 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2089 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
2090 t.p - 1);
2091 fprintf (stream, " }\n");
2092 fprintf (stream, "u_digit =\n");
2093 fprintf (stream, "{\n");
2094 fprintf (stream, " {");
2095 if (t.level1_size > 8)
2096 fprintf (stream, "\n ");
2097 for (i = 0; i < t.level1_size; i++)
2099 uint32_t offset;
2100 if (i > 0 && (i % 8) == 0)
2101 fprintf (stream, "\n ");
2102 offset = ((uint32_t *) (t.result + level1_offset))[i];
2103 if (offset == 0)
2104 fprintf (stream, " %5d", -1);
2105 else
2106 fprintf (stream, " %5zu",
2107 (offset - level2_offset) / sizeof (uint32_t));
2108 if (i+1 < t.level1_size)
2109 fprintf (stream, ",");
2111 if (t.level1_size > 8)
2112 fprintf (stream, "\n ");
2113 fprintf (stream, " },\n");
2114 fprintf (stream, " {");
2115 if (t.level2_size << t.q > 8)
2116 fprintf (stream, "\n ");
2117 for (i = 0; i < t.level2_size << t.q; i++)
2119 uint32_t offset;
2120 if (i > 0 && (i % 8) == 0)
2121 fprintf (stream, "\n ");
2122 offset = ((uint32_t *) (t.result + level2_offset))[i];
2123 if (offset == 0)
2124 fprintf (stream, " %5d", -1);
2125 else
2126 fprintf (stream, " %5zu",
2127 (offset - level3_offset) / sizeof (uint8_t));
2128 if (i+1 < t.level2_size << t.q)
2129 fprintf (stream, ",");
2131 if (t.level2_size << t.q > 8)
2132 fprintf (stream, "\n ");
2133 fprintf (stream, " },\n");
2134 /* Pack the level3 array. Each entry needs 4 bits only. */
2135 fprintf (stream, " {");
2136 if (t.level3_size << (t.p - 1) > 8)
2137 fprintf (stream, "\n ");
2138 for (i = 0; i < t.level3_size << (t.p - 1); i++)
2140 if (i > 0 && (i % 8) == 0)
2141 fprintf (stream, "\n ");
2142 fprintf (stream, " 0x%02x",
2143 ((uint8_t *) (t.result + level3_offset))[2*i]
2144 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
2145 if (i+1 < t.level3_size << (t.p - 1))
2146 fprintf (stream, ",");
2148 if (t.level3_size << (t.p - 1) > 8)
2149 fprintf (stream, "\n ");
2150 fprintf (stream, " }\n");
2151 fprintf (stream, "};\n");
2153 if (ferror (stream) || fclose (stream))
2155 fprintf (stderr, "error writing to '%s'\n", filename);
2156 exit (1);
2160 /* ========================================================================= */
2162 /* Numeric value. */
2163 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as a fraction of two ints.  */
typedef struct
{
  int numerator;
  int denominator;
}
uc_fraction_t;
2167 static uc_fraction_t
2168 get_numeric_value (unsigned int ch)
2170 uc_fraction_t value;
2172 if (unicode_attributes[ch].name != NULL
2173 && unicode_attributes[ch].numeric[0] != '\0')
2175 const char *str = unicode_attributes[ch].numeric;
2176 /* str is of the form "integer" or "integer/posinteger". */
2177 value.numerator = atoi (str);
2178 if (strchr (str, '/') != NULL)
2179 value.denominator = atoi (strchr (str, '/') + 1);
2180 else
2181 value.denominator = 1;
2183 else
2185 value.numerator = 0;
2186 value.denominator = 0;
2188 return value;
2191 /* Output the unit test for the per-character numeric value table. */
2192 static void
2193 output_numeric_test (const char *filename, const char *version)
2195 FILE *stream;
2196 bool need_comma;
2197 unsigned int ch;
2199 stream = fopen (filename, "w");
2200 if (stream == NULL)
2202 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2203 exit (1);
2206 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2207 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2208 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2209 version);
2210 fprintf (stream, "\n");
2212 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
2213 fprintf (stream, "\n");
2214 output_tests_license (stream);
2215 fprintf (stream, "\n");
2217 need_comma = false;
2218 for (ch = 0; ch < 0x110000; ch++)
2220 uc_fraction_t value = get_numeric_value (ch);
2222 if (value.numerator != 0 || value.denominator != 0)
2224 if (need_comma)
2225 fprintf (stream, ",\n");
2226 fprintf (stream, " { 0x%04X, %d, %d }",
2227 ch, value.numerator, value.denominator);
2228 need_comma = true;
2231 if (need_comma)
2232 fprintf (stream, "\n");
2234 if (ferror (stream) || fclose (stream))
2236 fprintf (stderr, "error writing to '%s'\n", filename);
2237 exit (1);
2241 /* Construction of sparse 3-level tables. */
/* Parameters for "3level.h".  With TABLE set to numeric_table, the code
   further down uses struct numeric_table together with numeric_table_init,
   numeric_table_add and numeric_table_finalize.  ELEMENT is the type of one
   table entry; output_numeric stores an index into its table of fractions
   there.
   NOTE(review): presumably DEFAULT is the value of entries that were never
   added, and xmalloc/xrealloc are the allocation hooks that 3level.h calls;
   they are mapped to plain malloc/realloc here, so allocation failures are
   not checked -- confirm against 3level.h.  */
2242 #define TABLE numeric_table
2243 #define ELEMENT uint8_t
2244 #define DEFAULT 0
2245 #define xmalloc malloc
2246 #define xrealloc realloc
2247 #include "3level.h"
2249 /* Output the per-character numeric value table. */
2250 static void
2251 output_numeric (const char *filename, const char *version)
2253 FILE *stream;
2254 uc_fraction_t fractions[160];
2255 unsigned int nfractions;
2256 unsigned int ch, i, j;
2257 struct numeric_table t;
2258 unsigned int level1_offset, level2_offset, level3_offset;
2259 uint16_t *level3_packed;
2261 stream = fopen (filename, "w");
2262 if (stream == NULL)
2264 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2265 exit (1);
2268 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2269 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2270 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2271 version);
2272 fprintf (stream, "\n");
2274 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
2275 fprintf (stream, "\n");
2276 output_library_license (stream, false);
2277 fprintf (stream, "\n");
2279 /* Create table of occurring fractions. */
2280 nfractions = 0;
2281 for (ch = 0; ch < 0x110000; ch++)
2283 uc_fraction_t value = get_numeric_value (ch);
2285 for (i = 0; i < nfractions; i++)
2286 if (value.numerator == fractions[i].numerator
2287 && value.denominator == fractions[i].denominator)
2288 break;
2289 if (i == nfractions)
2291 assert (nfractions != SIZEOF (fractions));
2292 for (i = 0; i < nfractions; i++)
2293 if (value.denominator < fractions[i].denominator
2294 || (value.denominator == fractions[i].denominator
2295 && value.numerator < fractions[i].numerator))
2296 break;
2297 for (j = nfractions; j > i; j--)
2298 fractions[j] = fractions[j - 1];
2299 fractions[i] = value;
2300 nfractions++;
2304 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2305 nfractions);
2306 fprintf (stream, "{\n");
2307 for (i = 0; i < nfractions; i++)
2309 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2310 fractions[i].denominator);
2311 if (i+1 < nfractions)
2312 fprintf (stream, ",");
2313 fprintf (stream, "\n");
2315 fprintf (stream, "};\n");
2317 t.p = 7;
2318 t.q = 9;
2319 numeric_table_init (&t);
2321 for (ch = 0; ch < 0x110000; ch++)
2323 uc_fraction_t value = get_numeric_value (ch);
2325 for (i = 0; i < nfractions; i++)
2326 if (value.numerator == fractions[i].numerator
2327 && value.denominator == fractions[i].denominator)
2328 break;
2329 assert (i != nfractions);
2331 numeric_table_add (&t, ch, i);
2334 numeric_table_finalize (&t);
2336 /* Offsets in t.result, in memory of this process. */
2337 level1_offset =
2338 5 * sizeof (uint32_t);
2339 level2_offset =
2340 5 * sizeof (uint32_t)
2341 + t.level1_size * sizeof (uint32_t);
2342 level3_offset =
2343 5 * sizeof (uint32_t)
2344 + t.level1_size * sizeof (uint32_t)
2345 + (t.level2_size << t.q) * sizeof (uint32_t);
2347 for (i = 0; i < 5; i++)
2348 fprintf (stream, "#define numeric_header_%d %d\n", i,
2349 ((uint32_t *) t.result)[i]);
2350 fprintf (stream, "static const\n");
2351 fprintf (stream, "struct\n");
2352 fprintf (stream, " {\n");
2353 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2354 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2355 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2356 (1 << t.p) * 8 / 16);
2357 fprintf (stream, " }\n");
2358 fprintf (stream, "u_numeric =\n");
2359 fprintf (stream, "{\n");
2360 fprintf (stream, " {");
2361 if (t.level1_size > 8)
2362 fprintf (stream, "\n ");
2363 for (i = 0; i < t.level1_size; i++)
2365 uint32_t offset;
2366 if (i > 0 && (i % 8) == 0)
2367 fprintf (stream, "\n ");
2368 offset = ((uint32_t *) (t.result + level1_offset))[i];
2369 if (offset == 0)
2370 fprintf (stream, " %5d", -1);
2371 else
2372 fprintf (stream, " %5zu",
2373 (offset - level2_offset) / sizeof (uint32_t));
2374 if (i+1 < t.level1_size)
2375 fprintf (stream, ",");
2377 if (t.level1_size > 8)
2378 fprintf (stream, "\n ");
2379 fprintf (stream, " },\n");
2380 fprintf (stream, " {");
2381 if (t.level2_size << t.q > 8)
2382 fprintf (stream, "\n ");
2383 for (i = 0; i < t.level2_size << t.q; i++)
2385 uint32_t offset;
2386 if (i > 0 && (i % 8) == 0)
2387 fprintf (stream, "\n ");
2388 offset = ((uint32_t *) (t.result + level2_offset))[i];
2389 if (offset == 0)
2390 fprintf (stream, " %5d", -1);
2391 else
2392 fprintf (stream, " %5zu",
2393 (offset - level3_offset) / sizeof (uint8_t));
2394 if (i+1 < t.level2_size << t.q)
2395 fprintf (stream, ",");
2397 if (t.level2_size << t.q > 8)
2398 fprintf (stream, "\n ");
2399 fprintf (stream, " },\n");
2400 /* Pack the level3 array. Each entry needs 8 bits only. Use 16-bit units,
2401 not 32-bit units, in order to make the lookup function easier. */
2402 level3_packed =
2403 (uint16_t *)
2404 calloc ((t.level3_size << t.p) * 8 / 16 + 1, sizeof (uint16_t));
2405 for (i = 0; i < t.level3_size << t.p; i++)
2407 unsigned int j = (i * 8) / 16;
2408 unsigned int k = (i * 8) % 16;
2409 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2410 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2411 level3_packed[j] = value & 0xffff;
2412 level3_packed[j+1] = value >> 16;
2414 fprintf (stream, " {");
2415 if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2416 fprintf (stream, "\n ");
2417 for (i = 0; i < (t.level3_size << t.p) * 8 / 16 + 1; i++)
2419 if (i > 0 && (i % 8) == 0)
2420 fprintf (stream, "\n ");
2421 fprintf (stream, " 0x%04x", level3_packed[i]);
2422 if (i+1 < (t.level3_size << t.p) * 8 / 16 + 1)
2423 fprintf (stream, ",");
2425 if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2426 fprintf (stream, "\n ");
2427 fprintf (stream, " }\n");
2428 free (level3_packed);
2429 fprintf (stream, "};\n");
2431 if (ferror (stream) || fclose (stream))
2433 fprintf (stderr, "error writing to '%s'\n", filename);
2434 exit (1);
2438 /* ========================================================================= */
2440 /* Mirrored. */
2441 /* See Unicode 3.0 book, section 4.7,
2442 UAX #9. */
2444 /* A pair of mirrored characters. */
/* uc[0] and uc[1] are the two code points of the pair; get_mirror_value
   looks a character up in either position.  */
2445 struct mirror_pair { unsigned int uc[2]; };
2447 /* List of mirrored character pairs, from the BidiMirroring.txt file.
2448 This is a subset of the characters having the BidiMirrored property. */
/* fill_mirror stores a pair and its reverse only once, and exits with an
   error message when more than 1000 distinct pairs would be needed.  */
2449 static struct mirror_pair mirror_pairs[1000];
/* Number of entries of mirror_pairs[] that are in use.  */
2450 static unsigned int mirror_pairs_count;
2452 /* Stores in mirror_pairs[] the mirrored character pairs from the
2453 BidiMirroring.txt file. */
/* BIDIMIRRORING_FILENAME is the path of that file.  Empty lines and lines
   starting with '#' are skipped.  Every other line is split into a field
   terminated by ';', a field terminated by '#' and a field terminated by
   the end of the line; the first two fields are parsed as hexadecimal code
   points.  mirror_pairs_count is reset to 0 before reading.
   The program exits with status 1 on: failure to open or read the file, a
   line with fewer than three fields, code points that are 0, equal to each
   other or >= 0x110000, a mapping that conflicts with an earlier line, more
   pairs than mirror_pairs[] can hold, or a listed character whose
   unicode_attributes entry has no name or lacks the mirrored flag.  */
2454 static void
2455 fill_mirror (const char *bidimirroring_filename)
2457 FILE *stream;
2458 char field0[FIELDLEN];
2459 char field1[FIELDLEN];
2460 char field2[FIELDLEN];
2461 int lineno = 0;
2463 stream = fopen (bidimirroring_filename, "r");
2464 if (stream == NULL)
2466 fprintf (stderr, "error during fopen of '%s'\n", bidimirroring_filename);
2467 exit (1);
2470 mirror_pairs_count = 0;
2471 for (;;)
2473 int n;
2474 int c;
2475 unsigned int uc1;
2476 unsigned int uc2;
2477 unsigned int i;
2479 lineno++;
/* Peek at the first character of the line to recognize end of file, empty
   lines and comment lines.  */
2480 c = getc (stream);
2481 if (c == EOF)
2482 break;
2483 if (c == '\n')
2484 continue;
2485 if (c == '#')
/* Discard the rest of the comment line.  */
2487 do c = getc (stream); while (c != EOF && c != '\n');
2488 continue;
/* A data line: push the peeked character back and read the three fields.
   n accumulates the return values of getfield.
   NOTE(review): presumably getfield returns 1 when it stored a field and 0
   otherwise, so that n == 3 means "all three fields present" -- confirm
   against the definition of getfield.  */
2490 ungetc (c, stream);
2491 n = getfield (stream, field0, ';');
/* Skip the spaces between the ';' and the second field.  */
2492 do c = getc (stream); while (c == ' ');
2493 ungetc (c, stream);
2494 n += getfield (stream, field1, '#');
2495 n += getfield (stream, field2, '\n');
2496 if (n == 0)
2497 break;
2498 if (n != 3)
2500 fprintf (stderr, "short line in '%s':%d\n",
2501 bidimirroring_filename, lineno);
2502 exit (1);
2504 /* Remove trailing spaces from field1. */
2505 while (strlen (field1) > 0 && field1[strlen (field1) - 1] == ' ')
2506 field1[strlen (field1) - 1] = '\0';
2507 /* The line should contain two characters. */
2508 uc1 = strtoul (field0, NULL, 16);
2509 uc2 = strtoul (field1, NULL, 16);
/* strtoul yields 0 when no hexadecimal digits could be parsed, so 0 is
   treated as a parse error here.  */
2510 if (uc1 == 0 || uc2 == 0 || uc1 == uc2)
2512 fprintf (stderr, "parse error at '%s':%d\n",
2513 bidimirroring_filename, lineno);
2514 exit (1);
2516 /* Verify that uc1 and uc2 are in range. */
2517 if (!(uc1 < 0x110000))
2519 fprintf (stderr, "%s mentions 0x%04X, which is out-of-range.\n",
2520 bidimirroring_filename, uc1);
2521 exit (1);
2523 if (!(uc2 < 0x110000))
2525 fprintf (stderr, "%s mentions 0x%04X, which is out-of-range.\n",
2526 bidimirroring_filename, uc2);
2527 exit (1);
2529 /* Have we seen uc1 or uc2 already? */
/* The same character appearing twice on the same side of a mapping is
   always a conflict.  */
2530 for (i = 0; i < mirror_pairs_count; i++)
2532 if (uc1 == mirror_pairs[i].uc[0])
2534 fprintf (stderr, "%s: mapping conflict for 0x%04X\n",
2535 bidimirroring_filename, uc1);
2536 exit (1);
2538 if (uc2 == mirror_pairs[i].uc[1])
2540 fprintf (stderr, "%s: mapping conflict for 0x%04X\n",
2541 bidimirroring_filename, uc2);
2542 exit (1);
/* Look for a stored pair that has uc1 or uc2 on the opposite side.  It is
   acceptable only if it is exactly the reverse pair (uc2, uc1).  */
2545 for (i = 0; i < mirror_pairs_count; i++)
2546 if (uc1 == mirror_pairs[i].uc[1] || uc2 == mirror_pairs[i].uc[0])
2547 break;
2548 if (i < mirror_pairs_count)
2550 if (uc1 != mirror_pairs[i].uc[1])
2552 /* uc1 != mirror_pairs[i].uc[1], uc2 == mirror_pairs[i].uc[0] */
2553 fprintf (stderr, "%s: mapping conflict for 0x%04X\n",
2554 bidimirroring_filename, uc2);
2555 exit (1);
2557 if (uc2 != mirror_pairs[i].uc[0])
2559 /* uc1 == mirror_pairs[i].uc[1], uc2 != mirror_pairs[i].uc[0] */
2560 fprintf (stderr, "%s: mapping conflict for 0x%04X\n",
2561 bidimirroring_filename, uc1);
2562 exit (1);
2564 /* uc1 == mirror_pairs[i].uc[1], uc2 == mirror_pairs[i].uc[0].
2565 (uc1, uc2) is the reverse pair of a pair that we already had
2566 encountered: (uc2, uc1). */
2568 else
2570 /* A new pair. */
2571 if (mirror_pairs_count == SIZEOF (mirror_pairs))
2573 fprintf (stderr, "%s contains more pairs than expected, "
2574 "increase mirror_pairs' size.\n",
2575 bidimirroring_filename);
2576 exit (1);
2578 mirror_pairs[mirror_pairs_count].uc[0] = uc1;
2579 mirror_pairs[mirror_pairs_count].uc[1] = uc2;
2580 mirror_pairs_count++;
2582 /* Verify that uc1 and uc2 have the BidiMirrored property. */
2583 if (!(unicode_attributes[uc1].name != NULL
2584 && unicode_attributes[uc1].mirrored))
2586 fprintf (stderr, "%s mentions 0x%04X, which is not BidiMirrored\n",
2587 bidimirroring_filename, uc1);
2588 exit (1);
2590 if (!(unicode_attributes[uc2].name != NULL
2591 && unicode_attributes[uc2].mirrored))
2593 fprintf (stderr, "%s mentions 0x%04X, which is not BidiMirrored\n",
2594 bidimirroring_filename, uc2);
2595 exit (1);
2599 if (ferror (stream) || fclose (stream))
2601 fprintf (stderr, "error reading from '%s'\n", bidimirroring_filename);
2602 exit (1);
2606 static int
2607 get_mirror_value (unsigned int ch)
2609 bool mirrored;
2610 unsigned int mirror_char;
2611 unsigned int i;
2613 mirrored = (unicode_attributes[ch].name != NULL
2614 && unicode_attributes[ch].mirrored);
2615 mirror_char = 0xfffd;
2616 for (i = 0; i < mirror_pairs_count; i++)
2617 if (ch == mirror_pairs[i].uc[0])
2619 mirror_char = mirror_pairs[i].uc[1];
2620 break;
2622 else if (ch == mirror_pairs[i].uc[1])
2624 mirror_char = mirror_pairs[i].uc[0];
2625 break;
2627 if (mirrored)
2628 return (int) mirror_char - (int) ch;
2629 else
2631 assert (mirror_char == 0xfffd);
2632 return 0;
2636 /* Construction of sparse 3-level tables. */
/* Parameters for "3level.h".  With TABLE set to mirror_table, the code
   further down uses struct mirror_table together with mirror_table_init,
   mirror_table_add and mirror_table_finalize.  ELEMENT is the type of one
   table entry; output_mirror stores the result of get_mirror_value there.
   NOTE(review): presumably DEFAULT is the value of entries that were never
   added, and xmalloc/xrealloc are the allocation hooks that 3level.h calls;
   they are mapped to plain malloc/realloc here, so allocation failures are
   not checked -- confirm against 3level.h.  */
2637 #define TABLE mirror_table
2638 #define ELEMENT int32_t
2639 #define DEFAULT 0
2640 #define xmalloc malloc
2641 #define xrealloc realloc
2642 #include "3level.h"
2644 /* Output the per-character mirror table. */
2645 static void
2646 output_mirror (const char *filename, const char *version)
2648 FILE *stream;
2649 unsigned int ch, i;
2650 struct mirror_table t;
2651 unsigned int level1_offset, level2_offset, level3_offset;
2653 stream = fopen (filename, "w");
2654 if (stream == NULL)
2656 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2657 exit (1);
2660 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2661 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2662 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2663 version);
2664 fprintf (stream, "\n");
2666 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
2667 fprintf (stream, "\n");
2668 output_library_license (stream, false);
2669 fprintf (stream, "\n");
2671 t.p = 7;
2672 t.q = 9;
2673 mirror_table_init (&t);
2675 for (ch = 0; ch < 0x110000; ch++)
2677 int value = get_mirror_value (ch);
2679 mirror_table_add (&t, ch, value);
2682 mirror_table_finalize (&t);
2684 /* Offsets in t.result, in memory of this process. */
2685 level1_offset =
2686 5 * sizeof (uint32_t);
2687 level2_offset =
2688 5 * sizeof (uint32_t)
2689 + t.level1_size * sizeof (uint32_t);
2690 level3_offset =
2691 5 * sizeof (uint32_t)
2692 + t.level1_size * sizeof (uint32_t)
2693 + (t.level2_size << t.q) * sizeof (uint32_t);
2695 for (i = 0; i < 5; i++)
2696 fprintf (stream, "#define mirror_header_%d %d\n", i,
2697 ((uint32_t *) t.result)[i]);
2698 fprintf (stream, "static const\n");
2699 fprintf (stream, "struct\n");
2700 fprintf (stream, " {\n");
2701 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2702 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2703 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2704 fprintf (stream, " }\n");
2705 fprintf (stream, "u_mirror =\n");
2706 fprintf (stream, "{\n");
2707 fprintf (stream, " {");
2708 if (t.level1_size > 8)
2709 fprintf (stream, "\n ");
2710 for (i = 0; i < t.level1_size; i++)
2712 uint32_t offset;
2713 if (i > 0 && (i % 8) == 0)
2714 fprintf (stream, "\n ");
2715 offset = ((uint32_t *) (t.result + level1_offset))[i];
2716 if (offset == 0)
2717 fprintf (stream, " %5d", -1);
2718 else
2719 fprintf (stream, " %5zu",
2720 (offset - level2_offset) / sizeof (uint32_t));
2721 if (i+1 < t.level1_size)
2722 fprintf (stream, ",");
2724 if (t.level1_size > 8)
2725 fprintf (stream, "\n ");
2726 fprintf (stream, " },\n");
2727 fprintf (stream, " {");
2728 if (t.level2_size << t.q > 8)
2729 fprintf (stream, "\n ");
2730 for (i = 0; i < t.level2_size << t.q; i++)
2732 uint32_t offset;
2733 if (i > 0 && (i % 8) == 0)
2734 fprintf (stream, "\n ");
2735 offset = ((uint32_t *) (t.result + level2_offset))[i];
2736 if (offset == 0)
2737 fprintf (stream, " %5d", -1);
2738 else
2739 fprintf (stream, " %5zu",
2740 (offset - level3_offset) / sizeof (int32_t));
2741 if (i+1 < t.level2_size << t.q)
2742 fprintf (stream, ",");
2744 if (t.level2_size << t.q > 8)
2745 fprintf (stream, "\n ");
2746 fprintf (stream, " },\n");
2747 fprintf (stream, " {");
2748 if (t.level3_size << t.p > 8)
2749 fprintf (stream, "\n ");
2750 for (i = 0; i < t.level3_size << t.p; i++)
2752 if (i > 0 && (i % 8) == 0)
2753 fprintf (stream, "\n ");
2754 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2755 if (i+1 < t.level3_size << t.p)
2756 fprintf (stream, ",");
2758 if (t.level3_size << t.p > 8)
2759 fprintf (stream, "\n ");
2760 fprintf (stream, " }\n");
2761 fprintf (stream, "};\n");
2763 if (ferror (stream) || fclose (stream))
2765 fprintf (stderr, "error writing to '%s'\n", filename);
2766 exit (1);
2770 /* ========================================================================= */
2772 /* Particular values of the word break property. */
/* Tell whether CH is one of the seven code points that this file treats as
   having word break property MidNumLet.  */
static bool
is_WBP_MIDNUMLET (unsigned int ch)
{
  switch (ch)
    {
    case 0x002E:
    case 0x2018:
    case 0x2019:
    case 0x2024:
    case 0xFE52:
    case 0xFF07:
    case 0xFF0E:
      return true;
    default:
      return false;
    }
}
/* Tell whether CH is one of the nine code points that this file treats as
   having word break property MidLetter.  */
static bool
is_WBP_MIDLETTER (unsigned int ch)
{
  switch (ch)
    {
    case 0x003A:
    case 0x00B7:
    case 0x0387:
    case 0x055F:
    case 0x05F4:
    case 0x2027:
    case 0xFE13:
    case 0xFE55:
    case 0xFF1A:
      return true;
    default:
      return false;
    }
}
2789 /* ========================================================================= */
2791 /* Properties. */
2793 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Codes of the binary properties.  Each code is used as a bit position:
   fill_properties records that a character has the property with code P by
   setting bit (1ULL << P) in the character's unicode_properties[] element.
   62 codes are defined here, so all of them fit in the at least 64 bits of
   an unsigned long long.  The enumerators are grouped by the file that
   lists the property.  */
2794 enum
2796 /* PropList.txt */
2797 PROP_WHITE_SPACE,
2798 PROP_BIDI_CONTROL,
2799 PROP_JOIN_CONTROL,
2800 PROP_PREPENDED_CONCATENATION_MARK,
2801 PROP_DASH,
2802 PROP_HYPHEN,
2803 PROP_QUOTATION_MARK,
2804 PROP_TERMINAL_PUNCTUATION,
2805 PROP_OTHER_MATH,
2806 PROP_HEX_DIGIT,
2807 PROP_ASCII_HEX_DIGIT,
2808 PROP_OTHER_ALPHABETIC,
2809 PROP_IDEOGRAPHIC,
2810 PROP_DIACRITIC,
2811 PROP_EXTENDER,
2812 PROP_OTHER_LOWERCASE,
2813 PROP_OTHER_UPPERCASE,
2814 PROP_NONCHARACTER_CODE_POINT,
2815 PROP_OTHER_GRAPHEME_EXTEND,
2816 PROP_IDS_BINARY_OPERATOR,
2817 PROP_IDS_TRINARY_OPERATOR,
2818 PROP_IDS_UNARY_OPERATOR,
2819 PROP_RADICAL,
2820 PROP_UNIFIED_IDEOGRAPH,
2821 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2822 PROP_DEPRECATED,
2823 PROP_SOFT_DOTTED,
2824 PROP_LOGICAL_ORDER_EXCEPTION,
2825 PROP_OTHER_ID_START,
2826 PROP_OTHER_ID_CONTINUE,
2827 PROP_ID_COMPAT_MATH_CONTINUE,
2828 PROP_ID_COMPAT_MATH_START,
2829 PROP_SENTENCE_TERMINAL,
2830 PROP_VARIATION_SELECTOR,
2831 PROP_PATTERN_WHITE_SPACE,
2832 PROP_PATTERN_SYNTAX,
2833 PROP_REGIONAL_INDICATOR,
2834 /* DerivedCoreProperties.txt */
2835 PROP_MATH,
2836 PROP_ALPHABETIC,
2837 PROP_LOWERCASE,
2838 PROP_UPPERCASE,
2839 PROP_CASED,
2840 PROP_CASE_IGNORABLE,
2841 PROP_CHANGES_WHEN_LOWERCASED,
2842 PROP_CHANGES_WHEN_UPPERCASED,
2843 PROP_CHANGES_WHEN_TITLECASED,
2844 PROP_CHANGES_WHEN_CASEFOLDED,
2845 PROP_CHANGES_WHEN_CASEMAPPED,
2846 PROP_ID_START,
2847 PROP_ID_CONTINUE,
2848 PROP_XID_START,
2849 PROP_XID_CONTINUE,
2850 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2851 PROP_GRAPHEME_EXTEND,
2852 PROP_GRAPHEME_BASE,
2853 PROP_GRAPHEME_LINK,
2854 /* emoji-data.txt */
2855 PROP_EMOJI,
2856 PROP_EMOJI_PRESENTATION,
2857 PROP_EMOJI_MODIFIER,
2858 PROP_EMOJI_MODIFIER_BASE,
2859 PROP_EMOJI_COMPONENT,
2860 PROP_EXTENDED_PICTOGRAPHIC
/* For each code point 0 .. 0x10FFFF, the set of PROP_* properties it has,
   as a bit mask.  Reset by clear_properties, extended by fill_properties.  */
2862 unsigned long long unicode_properties[0x110000];
/* Values of the InCB property.  The trailing comment of each enumerator is
   the value name that fill_properties accepts for it in the data file.  */
2864 enum
2866 UC_INDIC_CONJUNCT_BREAK_NONE = 0, /* None */
2867 UC_INDIC_CONJUNCT_BREAK_CONSONANT, /* Consonant */
2868 UC_INDIC_CONJUNCT_BREAK_LINKER, /* Linker */
2869 UC_INDIC_CONJUNCT_BREAK_EXTEND /* Extend */
/* For each code point 0 .. 0x10FFFF, its InCB value.  fill_properties
   writes the elements named by "InCB" lines; all other elements keep the
   initial value 0, that is, UC_INDIC_CONJUNCT_BREAK_NONE.  */
2871 static uint8_t unicode_indic_conjunct_break[0x110000];
2873 static void
2874 clear_properties (void)
2876 unsigned int i;
2878 for (i = 0; i < 0x110000; i++)
2879 unicode_properties[i] = 0;
2882 /* Stores in unicode_properties[] the properties from the
2883 PropList.txt or DerivedCoreProperties.txt file. */
/* PROPLIST_FILENAME is the path of the file to read.  The property names of
   emoji-data.txt are recognized as well.  Bits are only ever added to
   unicode_properties[] (see the |= below), so the function can be called
   for several files in a row; clear_properties resets the array.
   Lines whose property name is "InCB;" are stored in
   unicode_indic_conjunct_break[] instead.
   The program exits with status 1 when the file cannot be opened or read,
   when a line cannot be parsed, or when it names an unknown property or an
   unknown InCB value.  */
2884 static void
2885 fill_properties (const char *proplist_filename)
2887 unsigned int i;
2888 FILE *stream;
2890 stream = fopen (proplist_filename, "r");
2891 if (stream == NULL)
2893 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2894 exit (1);
2897 for (;;)
2899 char buf[200+1];
2900 unsigned int i1, i2;
2901 char padding[200+1];
2902 char propname[200+1];
2903 char rest_of_line[200+1];
2904 unsigned int propcode;
/* Read one line, at most 200 characters of it.  The trailing "\n" in the
   format also consumes any white space that follows, which includes empty
   lines.  */
2906 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2907 break;
2909 if (buf[0] == '\0' || buf[0] == '#')
2910 continue;
/* A data line starts with either a range "XXXX..YYYY" or a single code
   point "XXXX", followed by a run of spaces and semicolons, the property
   name (up to the next space or '#'), and one more white-space delimited
   word.  Try the range form first; for the single form the range is
   [i1, i1].  */
2912 if (sscanf (buf, "%X..%X%[ ;]%[^ #]%200s", &i1, &i2, padding, propname, rest_of_line) != 5)
2914 if (sscanf (buf, "%X%[ ;]%[^ #]%200s", &i1, padding, propname, rest_of_line) != 4)
2916 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
2917 exit (1);
2919 i2 = i1;
/* Each PROP (...) line expands to "if (...) propcode = ...; else", so the
   whole list forms a single else-if chain.  The chain continues with the
   test for "InCB;" and ends with the "unknown property" branch.  */
2921 #define PROP(name,code) \
2922 if (strcmp (propname, name) == 0) propcode = code; else
2923 /* PropList.txt */
2924 PROP ("White_Space", PROP_WHITE_SPACE)
2925 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2926 PROP ("Join_Control", PROP_JOIN_CONTROL)
2927 PROP ("Prepended_Concatenation_Mark", PROP_PREPENDED_CONCATENATION_MARK)
2928 PROP ("Dash", PROP_DASH)
2929 PROP ("Hyphen", PROP_HYPHEN)
2930 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2931 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2932 PROP ("Other_Math", PROP_OTHER_MATH)
2933 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2934 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2935 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2936 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2937 PROP ("Diacritic", PROP_DIACRITIC)
2938 PROP ("Extender", PROP_EXTENDER)
2939 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2940 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2941 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2942 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2943 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2944 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2945 PROP ("IDS_Unary_Operator", PROP_IDS_UNARY_OPERATOR)
2946 PROP ("Radical", PROP_RADICAL)
2947 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2948 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2949 PROP ("Deprecated", PROP_DEPRECATED)
2950 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2951 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2952 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2953 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2954 PROP ("ID_Compat_Math_Continue", PROP_ID_COMPAT_MATH_CONTINUE)
2955 PROP ("ID_Compat_Math_Start", PROP_ID_COMPAT_MATH_START)
2956 PROP ("Sentence_Terminal", PROP_SENTENCE_TERMINAL)
2957 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2958 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2959 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2960 PROP ("Regional_Indicator", PROP_REGIONAL_INDICATOR)
2961 /* DerivedCoreProperties.txt */
2962 PROP ("Math", PROP_MATH)
2963 PROP ("Alphabetic", PROP_ALPHABETIC)
2964 PROP ("Lowercase", PROP_LOWERCASE)
2965 PROP ("Uppercase", PROP_UPPERCASE)
2966 PROP ("Cased", PROP_CASED)
2967 PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
2968 PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
2969 PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
2970 PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
2971 PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
2972 PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
2973 PROP ("ID_Start", PROP_ID_START)
2974 PROP ("ID_Continue", PROP_ID_CONTINUE)
2975 PROP ("XID_Start", PROP_XID_START)
2976 PROP ("XID_Continue", PROP_XID_CONTINUE)
2977 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2978 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2979 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2980 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2981 /* emoji-data.txt */
2982 PROP ("Emoji", PROP_EMOJI)
2983 PROP ("Emoji_Presentation", PROP_EMOJI_PRESENTATION)
2984 PROP ("Emoji_Modifier", PROP_EMOJI_MODIFIER)
2985 PROP ("Emoji_Modifier_Base", PROP_EMOJI_MODIFIER_BASE)
2986 PROP ("Emoji_Component", PROP_EMOJI_COMPONENT)
2987 PROP ("Extended_Pictographic", PROP_EXTENDED_PICTOGRAPHIC)
2988 #undef PROP
2989 /* An enum-valued property from DerivedCoreProperties.txt */
/* The name is compared including the ';', because the scan set used for
   propname stops only at a space or '#'.  The value name is then the next
   word, which the sscanf above stored in rest_of_line.  */
2990 if (strcmp (propname, "InCB;") == 0)
2992 char valuename[200+1];
2993 unsigned int valuecode;
2995 if (sscanf (rest_of_line, "%[^ #]", valuename) != 1)
2997 fprintf (stderr, "parse error 2 in '%s'\n", proplist_filename);
2998 exit (1);
3001 if (strcmp (valuename, "None") == 0)
3002 valuecode = UC_INDIC_CONJUNCT_BREAK_NONE;
3003 else if (strcmp (valuename, "Consonant") == 0)
3004 valuecode = UC_INDIC_CONJUNCT_BREAK_CONSONANT;
3005 else if (strcmp (valuename, "Linker") == 0)
3006 valuecode = UC_INDIC_CONJUNCT_BREAK_LINKER;
3007 else if (strcmp (valuename, "Extend") == 0)
3008 valuecode = UC_INDIC_CONJUNCT_BREAK_EXTEND;
3009 else
3011 fprintf (stderr, "unknown InCB value named '%s' in '%s'\n",
3012 valuename, proplist_filename);
3013 exit (1);
3016 assert (i1 <= i2 && i2 < 0x110000);
3017 for (i = i1; i <= i2; i++)
3018 unicode_indic_conjunct_break[i] = valuecode;
/* This line carries no binary property: skip the bit setting below.  */
3020 goto done_line;
3022 else
3024 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
3025 proplist_filename);
3026 exit (1);
/* Reached only through one of the PROP branches, with propcode set: add
   the property's bit to every code point of the range.  */
3029 assert (i1 <= i2 && i2 < 0x110000);
3030 for (i = i1; i <= i2; i++)
3031 unicode_properties[i] |= 1ULL << propcode;
3033 done_line: ;
3036 if (ferror (stream) || fclose (stream))
3038 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
3039 exit (1);
/* Stores in ARRAY the property named PROPERTY_NAME from the Unicode 3.0
   PropList.txt file PROPLIST_FILENAME.
   ARRAY is cleared first.  The file is scanned for the first line that
   contains PROPERTY_NAME; the lines after it, up to a line starting with
   '*' or the end of the file, must each begin with "XXXX..YYYY" (a range)
   or "XXXX" (a single code point), in hexadecimal.  ARRAY[c] is set to 1
   for every listed code point c.
   Exits with status 1 when the file cannot be opened or read, when no line
   contains PROPERTY_NAME, or when a line cannot be parsed.  */
static void
fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
{
  FILE *in;
  char line[100+1];
  unsigned int code;

  for (code = 0x110000; code > 0; code--)
    array[code - 1] = 0;

  in = fopen (proplist_filename, "r");
  if (in == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the "Property dump for: ..." line.  */
  for (;;)
    {
      if (fscanf (in, "%100[^\n]\n", line) < 1)
        {
          fprintf (stderr, "no property found in '%s'\n", proplist_filename);
          exit (1);
        }
      if (strstr (line, property_name) != NULL)
        break;
    }

  /* Process the lines of the dump.  */
  while (fscanf (in, "%100[^\n]\n", line) >= 1 && line[0] != '*')
    {
      size_t len = strlen (line);
      unsigned int first = 0;
      unsigned int last = 0;
      int parsed;

      if (len >= 10 && line[4] == '.' && line[5] == '.')
        parsed = (sscanf (line, "%4X..%4X", &first, &last) >= 2);
      else if (len >= 4)
        {
          parsed = (sscanf (line, "%4X", &first) >= 1);
          last = first;
        }
      else
        parsed = 0;

      if (!parsed)
        {
          fprintf (stderr, "parse error in property in '%s'\n",
                   proplist_filename);
          exit (1);
        }

      assert (first <= last && last < 0x110000);
      for (code = first; code <= last; code++)
        array[code] = 1;
    }

  if (ferror (in) || fclose (in))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
/* Properties from Unicode 3.0 PropList.txt file.  */

/* The paired punctuation property from the PropList.txt file.  */
char unicode_pairedpunctuation[0x110000];

/* The left of pair property from the PropList.txt file.  */
char unicode_leftofpair[0x110000];

/* Fill the two arrays above from the Unicode 3.0 PropList file
   PROPLIST30_FILENAME.  Each array is filled by its own call to
   fill_property30 with the heading text of the respective property.  */
static void
fill_properties30 (const char *proplist30_filename)
{
  static const struct { char *flags; const char *heading; } wanted[] =
    {
      { unicode_pairedpunctuation, "(Paired Punctuation)" },
      { unicode_leftofpair, "(Left of Pair)" }
    };
  size_t k;

  for (k = 0; k < sizeof wanted / sizeof wanted[0]; k++)
    fill_property30 (wanted[k].flags, proplist30_filename, wanted[k].heading);
}
3133 /* ------------------------------------------------------------------------- */
3135 /* See PropList.txt, UCD.html. */
3136 static bool
3137 is_property_white_space (unsigned int ch)
3139 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
3142 /* See Unicode 3.0 book, section 4.10,
3143 PropList.txt, UCD.html,
3144 DerivedCoreProperties.txt, UCD.html. */
3145 static bool
3146 is_property_alphabetic (unsigned int ch)
3148 bool result1 =
3149 is_category_L (ch)
3150 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
3151 /* For some reason, the following are listed as having property
3152 Alphabetic but not as having property Other_Alphabetic. */
3153 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
3154 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
3155 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
3156 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
3157 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
3158 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
3159 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
3160 || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
3161 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
3162 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
3163 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
3164 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
3165 || (ch >= 0x12400 && ch <= 0x1246E); /* CUNEIFORM NUMERIC SIGNS */
3166 bool result2 =
3167 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
3169 assert (result1 == result2);
3170 return result1;
3173 /* See PropList.txt, UCD.html. */
3174 static bool
3175 is_property_other_alphabetic (unsigned int ch)
3177 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
3180 /* See PropList.txt, UCD.html. */
3181 static bool
3182 is_property_not_a_character (unsigned int ch)
3184 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
3187 /* See PropList.txt, UCD.html,
3188 DerivedCoreProperties.txt, UCD.html. */
3189 static bool
3190 is_property_default_ignorable_code_point (unsigned int ch)
3192 bool result1 =
3193 (is_category_Cf (ch)
3194 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
3195 && !(ch >= 0x13430 && ch <= 0x1343F) /* Egyptian Hieroglyph */
3196 && ((unicode_properties[ch] & (1ULL << PROP_PREPENDED_CONCATENATION_MARK)) == 0))
3197 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
3198 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
3199 bool result2 =
3200 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
3202 assert (result1 == result2);
3203 return result1;
3206 /* See PropList.txt, UCD.html. */
3207 static bool
3208 is_property_other_default_ignorable_code_point (unsigned int ch)
3210 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
3213 /* See PropList.txt, UCD.html. */
3214 static bool
3215 is_property_deprecated (unsigned int ch)
3217 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
3220 /* See PropList.txt, UCD.html. */
3221 static bool
3222 is_property_logical_order_exception (unsigned int ch)
3224 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
3227 /* See PropList.txt, UCD.html. */
3228 static bool
3229 is_property_variation_selector (unsigned int ch)
3231 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Tell whether CH lies in one of the three private use ranges
   U+E000..U+F8FF, U+F0000..U+FFFFD, U+100000..U+10FFFD.
   See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (ch < 0xE000)
    return false;
  if (ch <= 0xF8FF)
    return true;
  if (ch < 0xF0000)
    return false;
  if (ch <= 0xFFFFD)
    return true;
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
/* Tell whether CH is an unassigned code value: a code point of general
   category Cn that is not a noncharacter.  See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
3251 /* See PropList.txt, UCD.html,
3252 DerivedCoreProperties.txt, UCD.html. */
3253 static bool
3254 is_property_uppercase (unsigned int ch)
3256 bool result1 =
3257 is_category_Lu (ch)
3258 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
3259 bool result2 =
3260 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
3262 assert (result1 == result2);
3263 return result1;
3266 /* See PropList.txt, UCD.html. */
3267 static bool
3268 is_property_other_uppercase (unsigned int ch)
3270 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
3273 /* See PropList.txt, UCD.html,
3274 DerivedCoreProperties.txt, UCD.html. */
3275 static bool
3276 is_property_lowercase (unsigned int ch)
3278 bool result1 =
3279 is_category_Ll (ch)
3280 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
3281 bool result2 =
3282 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
3284 assert (result1 == result2);
3285 return result1;
3288 /* See PropList.txt, UCD.html. */
3289 static bool
3290 is_property_other_lowercase (unsigned int ch)
3292 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Return the result of is_category_Lt (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  bool in_category = is_category_Lt (ch);

  return in_category;
}
3302 /* See DerivedCoreProperties.txt. */
3303 static bool
3304 is_property_cased (unsigned int ch)
3306 bool result1 = (is_property_lowercase (ch)
3307 || is_property_uppercase (ch)
3308 || is_category_Lt (ch));
3309 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
3311 assert (result1 == result2);
3312 return result1;
3315 /* See DerivedCoreProperties.txt. */
3316 static bool
3317 is_property_case_ignorable (unsigned int ch)
3319 bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
3320 || ch == 0x0027
3321 || is_category_Mn (ch)
3322 || is_category_Me (ch)
3323 || is_category_Cf (ch)
3324 || is_category_Lm (ch)
3325 || is_category_Sk (ch));
3326 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
3328 assert (result1 == result2);
3329 return result1;
3332 /* See DerivedCoreProperties.txt. */
3333 static bool
3334 is_property_changes_when_lowercased (unsigned int ch)
3336 bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
3337 bool result2 = (unicode_attributes[ch].name != NULL
3338 && unicode_attributes[ch].lower != NONE
3339 && unicode_attributes[ch].lower != ch);
3341 assert (result1 == result2);
3342 return result1;
3345 /* See DerivedCoreProperties.txt. */
3346 static bool
3347 is_property_changes_when_uppercased (unsigned int ch)
3349 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
3352 /* See DerivedCoreProperties.txt. */
3353 static bool
3354 is_property_changes_when_titlecased (unsigned int ch)
3356 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
3359 /* See DerivedCoreProperties.txt. */
3360 static bool
3361 is_property_changes_when_casefolded (unsigned int ch)
3363 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
3366 /* See DerivedCoreProperties.txt. */
3367 static bool
3368 is_property_changes_when_casemapped (unsigned int ch)
3370 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
3373 /* See PropList.txt, UCD.html. */
3374 static bool
3375 is_property_soft_dotted (unsigned int ch)
3377 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
3380 /* See DerivedCoreProperties.txt, UCD.html. */
3381 static bool
3382 is_property_id_start (unsigned int ch)
3384 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
3387 /* See PropList.txt, UCD.html. */
3388 static bool
3389 is_property_other_id_start (unsigned int ch)
3391 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
3394 /* See DerivedCoreProperties.txt, UCD.html. */
3395 static bool
3396 is_property_id_continue (unsigned int ch)
3398 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
3401 /* See PropList.txt, UCD.html. */
3402 static bool
3403 is_property_other_id_continue (unsigned int ch)
3405 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
3408 /* See DerivedCoreProperties.txt, UCD.html. */
3409 static bool
3410 is_property_xid_start (unsigned int ch)
3412 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
3415 /* See DerivedCoreProperties.txt, UCD.html. */
3416 static bool
3417 is_property_xid_continue (unsigned int ch)
3419 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
3422 /* See PropList.txt, UCD.html. */
3423 static bool
3424 is_property_id_compat_math_start (unsigned int ch)
3426 return ((unicode_properties[ch] & (1ULL << PROP_ID_COMPAT_MATH_START)) != 0);
3429 /* See PropList.txt, UCD.html. */
3430 static bool
3431 is_property_id_compat_math_continue (unsigned int ch)
3433 return ((unicode_properties[ch] & (1ULL << PROP_ID_COMPAT_MATH_CONTINUE)) != 0);
3436 /* See PropList.txt, UCD.html. */
3437 static bool
3438 is_property_pattern_white_space (unsigned int ch)
3440 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
3443 /* See PropList.txt, UCD.html. */
3444 static bool
3445 is_property_pattern_syntax (unsigned int ch)
3447 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
3450 /* See PropList.txt, UCD.html. */
3451 static bool
3452 is_property_join_control (unsigned int ch)
3454 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
3457 /* See DerivedCoreProperties.txt, UCD.html. */
3458 static bool
3459 is_property_grapheme_base (unsigned int ch)
3461 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3464 /* See DerivedCoreProperties.txt, UCD.html. */
3465 static bool
3466 is_property_grapheme_extend (unsigned int ch)
3468 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3471 /* See PropList.txt, UCD.html. */
3472 static bool
3473 is_property_other_grapheme_extend (unsigned int ch)
3475 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3478 /* See DerivedCoreProperties.txt, UCD.html. */
3479 static bool
3480 is_property_grapheme_link (unsigned int ch)
3482 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3485 /* See PropList.txt, UCD.html. */
3486 static bool
3487 is_property_bidi_control (unsigned int ch)
3489 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3492 /* See PropList-3.0.1.txt. */
3493 static bool
3494 is_property_bidi_left_to_right (unsigned int ch)
3496 return (get_bidi_category (ch) == UC_BIDI_L);
3499 /* See PropList-3.0.1.txt. */
3500 static bool
3501 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3503 return (get_bidi_category (ch) == UC_BIDI_R);
3506 /* See PropList-3.0.1.txt. */
3507 static bool
3508 is_property_bidi_arabic_right_to_left (unsigned int ch)
3510 return (get_bidi_category (ch) == UC_BIDI_AL);
3513 /* See PropList-3.0.1.txt. */
3514 static bool
3515 is_property_bidi_european_digit (unsigned int ch)
3517 return (get_bidi_category (ch) == UC_BIDI_EN);
3520 /* See PropList-3.0.1.txt. */
3521 static bool
3522 is_property_bidi_eur_num_separator (unsigned int ch)
3524 return (get_bidi_category (ch) == UC_BIDI_ES);
3527 /* See PropList-3.0.1.txt. */
3528 static bool
3529 is_property_bidi_eur_num_terminator (unsigned int ch)
3531 return (get_bidi_category (ch) == UC_BIDI_ET);
3534 /* See PropList-3.0.1.txt. */
3535 static bool
3536 is_property_bidi_arabic_digit (unsigned int ch)
3538 return (get_bidi_category (ch) == UC_BIDI_AN);
3541 /* See PropList-3.0.1.txt. */
3542 static bool
3543 is_property_bidi_common_separator (unsigned int ch)
3545 return (get_bidi_category (ch) == UC_BIDI_CS);
3548 /* See PropList-3.0.1.txt. */
3549 static bool
3550 is_property_bidi_block_separator (unsigned int ch)
3552 return (get_bidi_category (ch) == UC_BIDI_B);
3555 /* See PropList-3.0.1.txt. */
3556 static bool
3557 is_property_bidi_segment_separator (unsigned int ch)
3559 return (get_bidi_category (ch) == UC_BIDI_S);
3562 /* See PropList-3.0.1.txt. */
3563 static bool
3564 is_property_bidi_whitespace (unsigned int ch)
3566 return (get_bidi_category (ch) == UC_BIDI_WS);
3569 /* See PropList-3.0.1.txt. */
3570 static bool
3571 is_property_bidi_non_spacing_mark (unsigned int ch)
3573 return (get_bidi_category (ch) == UC_BIDI_NSM);
3576 /* See PropList-3.0.1.txt. */
3577 static bool
3578 is_property_bidi_boundary_neutral (unsigned int ch)
3580 return (get_bidi_category (ch) == UC_BIDI_BN);
3583 /* See PropList-3.0.1.txt. */
3584 static bool
3585 is_property_bidi_pdf (unsigned int ch)
3587 return (get_bidi_category (ch) == UC_BIDI_PDF);
3590 /* See PropList-3.0.1.txt. */
3591 static bool
3592 is_property_bidi_embedding_or_override (unsigned int ch)
3594 int category = get_bidi_category (ch);
3595 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3596 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3599 /* See PropList-3.0.1.txt. */
3600 static bool
3601 is_property_bidi_other_neutral (unsigned int ch)
3603 return (get_bidi_category (ch) == UC_BIDI_ON);
3606 /* See PropList.txt, UCD.html. */
3607 static bool
3608 is_property_hex_digit (unsigned int ch)
3610 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3613 /* See PropList.txt, UCD.html. */
3614 static bool
3615 is_property_ascii_hex_digit (unsigned int ch)
3617 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3620 /* See Unicode 3.0 book, section 4.10,
3621 PropList.txt, UCD.html. */
3622 static bool
3623 is_property_ideographic (unsigned int ch)
3625 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3628 /* See PropList.txt, UCD.html. */
3629 static bool
3630 is_property_unified_ideograph (unsigned int ch)
3632 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3635 /* See PropList.txt, UCD.html. */
3636 static bool
3637 is_property_radical (unsigned int ch)
3639 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3642 /* See PropList.txt, UCD.html. */
3643 static bool
3644 is_property_ids_unary_operator (unsigned int ch)
3646 return ((unicode_properties[ch] & (1ULL << PROP_IDS_UNARY_OPERATOR)) != 0);
3649 /* See PropList.txt, UCD.html. */
3650 static bool
3651 is_property_ids_binary_operator (unsigned int ch)
3653 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3656 /* See PropList.txt, UCD.html. */
3657 static bool
3658 is_property_ids_trinary_operator (unsigned int ch)
3660 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3663 /* See PropList-3.0.1.txt. */
3664 static bool
3665 is_property_zero_width (unsigned int ch)
3667 return is_category_Cf (ch)
3668 || (unicode_attributes[ch].name != NULL
3669 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* Return the result of is_category_Zs (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  bool in_category = is_category_Zs (ch);

  return in_category;
}
/* Return true if CH is one of a fixed list of non-breaking characters.
   See PropList-3.0.1.txt.  */
static bool
is_property_non_break (unsigned int ch)
{
  /* This is exactly the set of characters having line breaking
     property GL.  */
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3703 /* See PropList-3.0.1.txt. */
3704 static bool
3705 is_property_iso_control (unsigned int ch)
3707 bool result1 =
3708 (unicode_attributes[ch].name != NULL
3709 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3710 bool result2 =
3711 is_category_Cc (ch);
3713 assert (result1 == result2);
3714 return result1;
3717 /* See PropList-3.0.1.txt. */
3718 static bool
3719 is_property_format_control (unsigned int ch)
3721 return (is_category_Cf (ch)
3722 && get_bidi_category (ch) == UC_BIDI_BN
3723 && !is_property_join_control (ch)
3724 && ch != 0xFEFF);
3727 /* See PropList.txt, UCD.html. */
3728 static bool
3729 is_property_prepended_concatenation_mark (unsigned int ch)
3731 return ((unicode_properties[ch] & (1ULL << PROP_PREPENDED_CONCATENATION_MARK)) != 0);
3734 /* See PropList.txt, UCD.html. */
3735 static bool
3736 is_property_dash (unsigned int ch)
3738 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3741 /* See PropList.txt, UCD.html. */
3742 static bool
3743 is_property_hyphen (unsigned int ch)
3745 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* Return the result of is_category_P (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  bool in_category = is_category_P (ch);

  return in_category;
}
/* Return the result of is_category_Zl (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  bool in_category = is_category_Zl (ch);

  return in_category;
}
/* Return the result of is_category_Zp (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  bool in_category = is_category_Zp (ch);

  return in_category;
}
3769 /* See PropList.txt, UCD.html. */
3770 static bool
3771 is_property_quotation_mark (unsigned int ch)
3773 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3776 /* See PropList.txt, UCD.html. */
3777 static bool
3778 is_property_sentence_terminal (unsigned int ch)
3780 return ((unicode_properties[ch] & (1ULL << PROP_SENTENCE_TERMINAL)) != 0);
3783 /* See PropList.txt, UCD.html. */
3784 static bool
3785 is_property_terminal_punctuation (unsigned int ch)
3787 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* Return the result of is_category_Sc (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  bool in_category = is_category_Sc (ch);

  return in_category;
}
3797 /* See Unicode 3.0 book, section 4.9,
3798 PropList.txt, UCD.html,
3799 DerivedCoreProperties.txt, UCD.html. */
3800 static bool
3801 is_property_math (unsigned int ch)
3803 bool result1 =
3804 is_category_Sm (ch)
3805 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3806 bool result2 =
3807 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3809 assert (result1 == result2);
3810 return result1;
3813 /* See PropList.txt, UCD.html. */
3814 static bool
3815 is_property_other_math (unsigned int ch)
3817 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3820 /* See PropList-3.0.1.txt. */
3821 static bool
3822 is_property_paired_punctuation (unsigned int ch)
3824 return unicode_pairedpunctuation[ch];
3827 /* See PropList-3.0.1.txt. */
3828 static bool
3829 is_property_left_of_pair (unsigned int ch)
3831 return unicode_leftofpair[ch];
3834 /* See PropList-3.0.1.txt. */
3835 static bool
3836 is_property_combining (unsigned int ch)
3838 return (unicode_attributes[ch].name != NULL
3839 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3840 || is_category_Mc (ch)
3841 || is_category_Me (ch)
3842 || is_category_Mn (ch)));
3845 #if 0 /* same as is_property_bidi_non_spacing_mark */
3846 /* See PropList-3.0.1.txt.
   Compiled out by the enclosing #if 0.  Tests for bidi category
   UC_BIDI_NSM like is_property_bidi_non_spacing_mark, but additionally
   requires unicode_attributes[CH].name to be non-NULL.  */
3847 static bool
3848 is_property_non_spacing (unsigned int ch)
3850 return (unicode_attributes[ch].name != NULL
3851 && get_bidi_category (ch) == UC_BIDI_NSM);
3853 #endif
3855 /* See PropList-3.0.1.txt. */
3856 static bool
3857 is_property_composite (unsigned int ch)
3859 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3860 logical in some sense. */
3861 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3862 return true;
3863 if (unicode_attributes[ch].name != NULL
3864 && unicode_attributes[ch].decomposition != NULL)
3866 /* Test whether the decomposition contains more than one character,
3867 and the first is not a space. */
3868 const char *decomp = unicode_attributes[ch].decomposition;
3869 if (decomp[0] == '<')
3871 decomp = strchr (decomp, '>') + 1;
3872 if (decomp[0] == ' ')
3873 decomp++;
3875 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
3877 return false;
/* Return the result of is_category_Nd (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  bool in_category = is_category_Nd (ch);

  return in_category;
}
3887 /* See PropList-3.0.1.txt. */
3888 static bool
3889 is_property_numeric (unsigned int ch)
3891 return ((get_numeric_value (ch)).denominator > 0)
3892 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3893 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3896 /* See PropList.txt, UCD.html. */
3897 static bool
3898 is_property_diacritic (unsigned int ch)
3900 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3903 /* See PropList.txt, UCD.html. */
3904 static bool
3905 is_property_extender (unsigned int ch)
3907 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3910 /* See PropList-3.0.1.txt. */
3911 static bool
3912 is_property_ignorable_control (unsigned int ch)
3914 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3915 || is_category_Cf (ch))
3916 && ch != 0x0000;
3919 /* See PropList.txt, UCD.html. */
3920 static bool
3921 is_property_regional_indicator (unsigned int ch)
3923 return ((unicode_properties[ch] & (1ULL << PROP_REGIONAL_INDICATOR)) != 0);
3926 /* See emoji-data.txt, UTS #51. */
3927 static bool
3928 is_property_emoji (unsigned int ch)
3930 return ((unicode_properties[ch] & (1ULL << PROP_EMOJI)) != 0);
3933 /* See emoji-data.txt, UTS #51. */
3934 static bool
3935 is_property_emoji_presentation (unsigned int ch)
3937 return ((unicode_properties[ch] & (1ULL << PROP_EMOJI_PRESENTATION)) != 0);
3940 /* See emoji-data.txt, UTS #51. */
3941 static bool
3942 is_property_emoji_modifier (unsigned int ch)
3944 return ((unicode_properties[ch] & (1ULL << PROP_EMOJI_MODIFIER)) != 0);
3947 /* See emoji-data.txt, UTS #51. */
3948 static bool
3949 is_property_emoji_modifier_base (unsigned int ch)
3951 return ((unicode_properties[ch] & (1ULL << PROP_EMOJI_MODIFIER_BASE)) != 0);
3954 /* See emoji-data.txt, UTS #51. */
3955 static bool
3956 is_property_emoji_component (unsigned int ch)
3958 return ((unicode_properties[ch] & (1ULL << PROP_EMOJI_COMPONENT)) != 0);
3961 /* See emoji-data.txt, UTS #51. */
3962 static bool
3963 is_property_extended_pictographic (unsigned int ch)
3965 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDED_PICTOGRAPHIC)) != 0);
3968 /* ------------------------------------------------------------------------- */
3970 /* Output all properties. */
3971 static void
3972 output_properties (const char *version)
3974 #define PROPERTY(P) \
3975 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3976 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3977 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3978 PROPERTY(white_space)
3979 PROPERTY(alphabetic)
3980 PROPERTY(other_alphabetic)
3981 PROPERTY(not_a_character)
3982 PROPERTY(default_ignorable_code_point)
3983 PROPERTY(other_default_ignorable_code_point)
3984 PROPERTY(deprecated)
3985 PROPERTY(logical_order_exception)
3986 PROPERTY(variation_selector)
3987 PROPERTY(private_use)
3988 PROPERTY(unassigned_code_value)
3989 PROPERTY(uppercase)
3990 PROPERTY(other_uppercase)
3991 PROPERTY(lowercase)
3992 PROPERTY(other_lowercase)
3993 PROPERTY(titlecase)
3994 PROPERTY(cased)
3995 PROPERTY(case_ignorable)
3996 PROPERTY(changes_when_lowercased)
3997 PROPERTY(changes_when_uppercased)
3998 PROPERTY(changes_when_titlecased)
3999 PROPERTY(changes_when_casefolded)
4000 PROPERTY(changes_when_casemapped)
4001 PROPERTY(soft_dotted)
4002 PROPERTY(id_start)
4003 PROPERTY(other_id_start)
4004 PROPERTY(id_continue)
4005 PROPERTY(other_id_continue)
4006 PROPERTY(xid_start)
4007 PROPERTY(xid_continue)
4008 PROPERTY(id_compat_math_start)
4009 PROPERTY(id_compat_math_continue)
4010 PROPERTY(pattern_white_space)
4011 PROPERTY(pattern_syntax)
4012 PROPERTY(join_control)
4013 PROPERTY(grapheme_base)
4014 PROPERTY(grapheme_extend)
4015 PROPERTY(other_grapheme_extend)
4016 PROPERTY(grapheme_link)
4017 PROPERTY(bidi_control)
4018 PROPERTY(bidi_left_to_right)
4019 PROPERTY(bidi_hebrew_right_to_left)
4020 PROPERTY(bidi_arabic_right_to_left)
4021 PROPERTY(bidi_european_digit)
4022 PROPERTY(bidi_eur_num_separator)
4023 PROPERTY(bidi_eur_num_terminator)
4024 PROPERTY(bidi_arabic_digit)
4025 PROPERTY(bidi_common_separator)
4026 PROPERTY(bidi_block_separator)
4027 PROPERTY(bidi_segment_separator)
4028 PROPERTY(bidi_whitespace)
4029 PROPERTY(bidi_non_spacing_mark)
4030 PROPERTY(bidi_boundary_neutral)
4031 PROPERTY(bidi_pdf)
4032 PROPERTY(bidi_embedding_or_override)
4033 PROPERTY(bidi_other_neutral)
4034 PROPERTY(hex_digit)
4035 PROPERTY(ascii_hex_digit)
4036 PROPERTY(ideographic)
4037 PROPERTY(unified_ideograph)
4038 PROPERTY(radical)
4039 PROPERTY(ids_unary_operator)
4040 PROPERTY(ids_binary_operator)
4041 PROPERTY(ids_trinary_operator)
4042 PROPERTY(zero_width)
4043 PROPERTY(space)
4044 PROPERTY(non_break)
4045 PROPERTY(iso_control)
4046 PROPERTY(format_control)
4047 PROPERTY(prepended_concatenation_mark)
4048 PROPERTY(dash)
4049 PROPERTY(hyphen)
4050 PROPERTY(punctuation)
4051 PROPERTY(line_separator)
4052 PROPERTY(paragraph_separator)
4053 PROPERTY(quotation_mark)
4054 PROPERTY(sentence_terminal)
4055 PROPERTY(terminal_punctuation)
4056 PROPERTY(currency_symbol)
4057 PROPERTY(math)
4058 PROPERTY(other_math)
4059 PROPERTY(paired_punctuation)
4060 PROPERTY(left_of_pair)
4061 PROPERTY(combining)
4062 PROPERTY(composite)
4063 PROPERTY(decimal_digit)
4064 PROPERTY(numeric)
4065 PROPERTY(diacritic)
4066 PROPERTY(extender)
4067 PROPERTY(ignorable_control)
4068 PROPERTY(regional_indicator)
4069 PROPERTY(emoji)
4070 PROPERTY(emoji_presentation)
4071 PROPERTY(emoji_modifier)
4072 PROPERTY(emoji_modifier_base)
4073 PROPERTY(emoji_component)
4074 PROPERTY(extended_pictographic)
4075 #undef PROPERTY
4078 /* ------------------------------------------------------------------------- */
4080 /* Convert an Indic_Conjunct_Break value to a C identifier. */
4081 static const char *
4082 indic_conjunct_break_as_c_identifier (int indic_conjunct_break)
4084 #define TRY(value) if (indic_conjunct_break == value) return #value;
4085 TRY(UC_INDIC_CONJUNCT_BREAK_NONE)
4086 TRY(UC_INDIC_CONJUNCT_BREAK_CONSONANT)
4087 TRY(UC_INDIC_CONJUNCT_BREAK_LINKER)
4088 TRY(UC_INDIC_CONJUNCT_BREAK_EXTEND)
4089 #undef TRY
4090 abort ();
4093 static void
4094 output_indic_conjunct_break_test (const char *filename, const char *version)
4096 FILE *stream;
4097 bool need_comma;
4098 unsigned int ch;
4100 stream = fopen (filename, "w");
4101 if (stream == NULL)
4103 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4104 exit (1);
4107 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4108 fprintf (stream, "/* Indic_Conjunct_Break attribute of Unicode characters. */\n");
4109 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4110 version);
4111 fprintf (stream, "\n");
4113 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4114 fprintf (stream, "\n");
4115 output_tests_license (stream);
4116 fprintf (stream, "\n");
4118 need_comma = false;
4119 for (ch = 0; ch < 0x110000; ch++)
4121 int value = unicode_indic_conjunct_break[ch];
4123 if (value != UC_INDIC_CONJUNCT_BREAK_NONE)
4125 if (need_comma)
4126 fprintf (stream, ",\n");
4127 fprintf (stream, " { 0x%04X, %s }", ch, indic_conjunct_break_as_c_identifier (value));
4128 need_comma = true;
4131 if (need_comma)
4132 fprintf (stream, "\n");
4134 if (ferror (stream) || fclose (stream))
4136 fprintf (stderr, "error writing to '%s'\n", filename);
4137 exit (1);
4141 /* Construction of sparse 3-level tables.
   The macros below parameterize the generic code in 3level.h: TABLE is the
   name of the table type (used below as struct indic_conjunct_break_table)
   and ELEMENT is the element type.  DEFAULT is presumably the value that
   3level.h uses for entries that were never added -- confirm in 3level.h.
   xmalloc and xrealloc are mapped to plain malloc and realloc.  */
4142 #define TABLE indic_conjunct_break_table
4143 #define ELEMENT uint8_t
4144 #define DEFAULT UC_INDIC_CONJUNCT_BREAK_NONE
4145 #define xmalloc malloc
4146 #define xrealloc realloc
4147 #include "3level.h"
4149 static void
4150 output_indic_conjunct_break (const char *filename, const char *version)
4152 FILE *stream;
4153 unsigned int ch, i;
4154 struct indic_conjunct_break_table t;
4155 unsigned int level1_offset, level2_offset, level3_offset;
4157 stream = fopen (filename, "w");
4158 if (stream == NULL)
4160 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4161 exit (1);
4164 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4165 fprintf (stream, "/* Indic_Conjunct_Break attribute of Unicode characters. */\n");
4166 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4167 version);
4168 fprintf (stream, "\n");
4170 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4171 fprintf (stream, "\n");
4172 output_library_license (stream, false);
4173 fprintf (stream, "\n");
4175 t.p = 6; /* or 5 */
4176 t.q = 4; /* or 5 */
4177 indic_conjunct_break_table_init (&t);
4179 for (ch = 0; ch < 0x110000; ch++)
4181 uint8_t value = unicode_indic_conjunct_break[ch];
4183 assert (value <= 0x03);
4185 if (value != UC_INDIC_CONJUNCT_BREAK_NONE)
4186 indic_conjunct_break_table_add (&t, ch, value);
4189 indic_conjunct_break_table_finalize (&t);
4191 /* Offsets in t.result, in memory of this process. */
4192 level1_offset =
4193 5 * sizeof (uint32_t);
4194 level2_offset =
4195 5 * sizeof (uint32_t)
4196 + t.level1_size * sizeof (uint32_t);
4197 level3_offset =
4198 5 * sizeof (uint32_t)
4199 + t.level1_size * sizeof (uint32_t)
4200 + (t.level2_size << t.q) * sizeof (uint32_t);
4202 for (i = 0; i < 5; i++)
4203 fprintf (stream, "#define indic_conjunct_break_header_%d %d\n", i,
4204 ((uint32_t *) t.result)[i]);
4205 fprintf (stream, "static const\n");
4206 fprintf (stream, "struct\n");
4207 fprintf (stream, " {\n");
4208 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4209 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4210 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4211 (1 << t.p) * 2 / 16);
4212 fprintf (stream, " }\n");
4213 fprintf (stream, "u_indic_conjunct_break =\n");
4214 fprintf (stream, "{\n");
4215 fprintf (stream, " {");
4216 if (t.level1_size > 8)
4217 fprintf (stream, "\n ");
4218 for (i = 0; i < t.level1_size; i++)
4220 uint32_t offset;
4221 if (i > 0 && (i % 8) == 0)
4222 fprintf (stream, "\n ");
4223 offset = ((uint32_t *) (t.result + level1_offset))[i];
4224 if (offset == 0)
4225 fprintf (stream, " %5d", -1);
4226 else
4227 fprintf (stream, " %5zu",
4228 (offset - level2_offset) / sizeof (uint32_t));
4229 if (i+1 < t.level1_size)
4230 fprintf (stream, ",");
4232 if (t.level1_size > 8)
4233 fprintf (stream, "\n ");
4234 fprintf (stream, " },\n");
4235 fprintf (stream, " {");
4236 if (t.level2_size << t.q > 8)
4237 fprintf (stream, "\n ");
4238 for (i = 0; i < t.level2_size << t.q; i++)
4240 uint32_t offset;
4241 if (i > 0 && (i % 8) == 0)
4242 fprintf (stream, "\n ");
4243 offset = ((uint32_t *) (t.result + level2_offset))[i];
4244 if (offset == 0)
4245 fprintf (stream, " %5d", -1);
4246 else
4247 fprintf (stream, " %5zu",
4248 (offset - level3_offset) / sizeof (uint8_t));
4249 if (i+1 < t.level2_size << t.q)
4250 fprintf (stream, ",");
4252 if (t.level2_size << t.q > 8)
4253 fprintf (stream, "\n ");
4254 fprintf (stream, " },\n");
4255 /* Pack the level3 array. Each entry needs 2 bits only. */
4256 fprintf (stream, " {");
4257 if ((t.level3_size << t.p) * 2 / 16 > 8)
4258 fprintf (stream, "\n ");
4259 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4261 if (i > 0 && (i % 8) == 0)
4262 fprintf (stream, "\n ");
4263 fprintf (stream, " 0x%04x",
4264 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4265 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4266 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4267 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4268 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4269 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4270 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4271 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4272 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4273 fprintf (stream, ",");
4275 if ((t.level3_size << t.p) * 2 / 16 > 8)
4276 fprintf (stream, "\n ");
4277 fprintf (stream, " }\n");
4278 fprintf (stream, "};\n");
4280 if (ferror (stream) || fclose (stream))
4282 fprintf (stderr, "error writing to '%s'\n", filename);
4283 exit (1);
4287 /* ========================================================================= */
4289 /* Arabic Shaping. */
/* Joining type values, numbered consecutively from 0.  */
enum
{
  UC_JOINING_TYPE_U = 0, /* Non_Joining */
  UC_JOINING_TYPE_T = 1, /* Transparent */
  UC_JOINING_TYPE_C = 2, /* Join_Causing */
  UC_JOINING_TYPE_L = 3, /* Left_Joining */
  UC_JOINING_TYPE_R = 4, /* Right_Joining */
  UC_JOINING_TYPE_D = 5  /* Dual_Joining */
};

/* One entry per code point 0..0x10FFFF.  Being a static object, every
   entry starts out as 0, which is UC_JOINING_TYPE_U.  */
static uint8_t unicode_joining_type[0x110000];
4303 enum
4305 UC_JOINING_GROUP_NONE, /* No_Joining_Group */
4306 UC_JOINING_GROUP_AIN, /* Ain */
4307 UC_JOINING_GROUP_ALAPH, /* Alaph */
4308 UC_JOINING_GROUP_ALEF, /* Alef */
4309 UC_JOINING_GROUP_BEH, /* Beh */
4310 UC_JOINING_GROUP_BETH, /* Beth */
4311 UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
4312 UC_JOINING_GROUP_DAL, /* Dal */
4313 UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
4314 UC_JOINING_GROUP_E, /* E */
4315 UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
4316 UC_JOINING_GROUP_FE, /* Fe */
4317 UC_JOINING_GROUP_FEH, /* Feh */
4318 UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
4319 UC_JOINING_GROUP_GAF, /* Gaf */
4320 UC_JOINING_GROUP_GAMAL, /* Gamal */
4321 UC_JOINING_GROUP_HAH, /* Hah */
4322 UC_JOINING_GROUP_HE, /* He */
4323 UC_JOINING_GROUP_HEH, /* Heh */
4324 UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
4325 UC_JOINING_GROUP_HETH, /* Heth */
4326 UC_JOINING_GROUP_KAF, /* Kaf */
4327 UC_JOINING_GROUP_KAPH, /* Kaph */
4328 UC_JOINING_GROUP_KHAPH, /* Khaph */
4329 UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
4330 UC_JOINING_GROUP_LAM, /* Lam */
4331 UC_JOINING_GROUP_LAMADH, /* Lamadh */
4332 UC_JOINING_GROUP_MEEM, /* Meem */
4333 UC_JOINING_GROUP_MIM, /* Mim */
4334 UC_JOINING_GROUP_NOON, /* Noon */
4335 UC_JOINING_GROUP_NUN, /* Nun */
4336 UC_JOINING_GROUP_NYA, /* Nya */
4337 UC_JOINING_GROUP_PE, /* Pe */
4338 UC_JOINING_GROUP_QAF, /* Qaf */
4339 UC_JOINING_GROUP_QAPH, /* Qaph */
4340 UC_JOINING_GROUP_REH, /* Reh */
4341 UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
4342 UC_JOINING_GROUP_SAD, /* Sad */
4343 UC_JOINING_GROUP_SADHE, /* Sadhe */
4344 UC_JOINING_GROUP_SEEN, /* Seen */
4345 UC_JOINING_GROUP_SEMKATH, /* Semkath */
4346 UC_JOINING_GROUP_SHIN, /* Shin */
4347 UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
4348 UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
4349 UC_JOINING_GROUP_TAH, /* Tah */
4350 UC_JOINING_GROUP_TAW, /* Taw */
4351 UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
4352 UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
4353 UC_JOINING_GROUP_TETH, /* Teth */
4354 UC_JOINING_GROUP_WAW, /* Waw */
4355 UC_JOINING_GROUP_YEH, /* Yeh */
4356 UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
4357 UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
4358 UC_JOINING_GROUP_YUDH, /* Yudh */
4359 UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
4360 UC_JOINING_GROUP_ZAIN, /* Zain */
4361 UC_JOINING_GROUP_ZHAIN, /* Zhain */
4362 UC_JOINING_GROUP_ROHINGYA_YEH, /* Rohingya_Yeh */
4363 UC_JOINING_GROUP_STRAIGHT_WAW, /* Straight_Waw */
4364 UC_JOINING_GROUP_MANICHAEAN_ALEPH, /* Manichaean_Aleph */
4365 UC_JOINING_GROUP_MANICHAEAN_BETH, /* Manichaean_Beth */
4366 UC_JOINING_GROUP_MANICHAEAN_GIMEL, /* Manichaean_Gimel */
4367 UC_JOINING_GROUP_MANICHAEAN_DALETH, /* Manichaean_Daleth */
4368 UC_JOINING_GROUP_MANICHAEAN_WAW, /* Manichaean_Waw */
4369 UC_JOINING_GROUP_MANICHAEAN_ZAYIN, /* Manichaean_Zayin */
4370 UC_JOINING_GROUP_MANICHAEAN_HETH, /* Manichaean_Heth */
4371 UC_JOINING_GROUP_MANICHAEAN_TETH, /* Manichaean_Teth */
4372 UC_JOINING_GROUP_MANICHAEAN_YODH, /* Manichaean_Yodh */
4373 UC_JOINING_GROUP_MANICHAEAN_KAPH, /* Manichaean_Kaph */
4374 UC_JOINING_GROUP_MANICHAEAN_LAMEDH, /* Manichaean_Lamedh */
4375 UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, /* Manichaean_Dhamedh */
4376 UC_JOINING_GROUP_MANICHAEAN_THAMEDH, /* Manichaean_Thamedh */
4377 UC_JOINING_GROUP_MANICHAEAN_MEM, /* Manichaean_Mem */
4378 UC_JOINING_GROUP_MANICHAEAN_NUN, /* Manichaean_Nun */
4379 UC_JOINING_GROUP_MANICHAEAN_SAMEKH, /* Manichaean_Samekh */
4380 UC_JOINING_GROUP_MANICHAEAN_AYIN, /* Manichaean_Ayin */
4381 UC_JOINING_GROUP_MANICHAEAN_PE, /* Manichaean_Pe */
4382 UC_JOINING_GROUP_MANICHAEAN_SADHE, /* Manichaean_Sadhe */
4383 UC_JOINING_GROUP_MANICHAEAN_QOPH, /* Manichaean_Qoph */
4384 UC_JOINING_GROUP_MANICHAEAN_RESH, /* Manichaean_Resh */
4385 UC_JOINING_GROUP_MANICHAEAN_TAW, /* Manichaean_Taw */
4386 UC_JOINING_GROUP_MANICHAEAN_ONE, /* Manichaean_One */
4387 UC_JOINING_GROUP_MANICHAEAN_FIVE, /* Manichaean_Five */
4388 UC_JOINING_GROUP_MANICHAEAN_TEN, /* Manichaean_Ten */
4389 UC_JOINING_GROUP_MANICHAEAN_TWENTY, /* Manichaean_Twenty */
4390 UC_JOINING_GROUP_MANICHAEAN_HUNDRED, /* Manichaean_Hundred */
4391 UC_JOINING_GROUP_AFRICAN_FEH, /* African_Feh */
4392 UC_JOINING_GROUP_AFRICAN_QAF, /* African_Qaf */
4393 UC_JOINING_GROUP_AFRICAN_NOON, /* African_Noon */
4394 UC_JOINING_GROUP_MALAYALAM_NGA, /* Malayalam_Nga */
4395 UC_JOINING_GROUP_MALAYALAM_JA, /* Malayalam_Ja */
4396 UC_JOINING_GROUP_MALAYALAM_NYA, /* Malayalam_Nya */
4397 UC_JOINING_GROUP_MALAYALAM_TTA, /* Malayalam_Tta */
4398 UC_JOINING_GROUP_MALAYALAM_NNA, /* Malayalam_Nna */
4399 UC_JOINING_GROUP_MALAYALAM_NNNA, /* Malayalam_Nnna */
4400 UC_JOINING_GROUP_MALAYALAM_BHA, /* Malayalam_Bha */
4401 UC_JOINING_GROUP_MALAYALAM_RA, /* Malayalam_Ra */
4402 UC_JOINING_GROUP_MALAYALAM_LLA, /* Malayalam_Lla */
4403 UC_JOINING_GROUP_MALAYALAM_LLLA, /* Malayalam_Llla */
4404 UC_JOINING_GROUP_MALAYALAM_SSA, /* Malayalam_Ssa */
4405 UC_JOINING_GROUP_HANIFI_ROHINGYA_PA, /* Hanifi_Rohingya_Pa */
4406 UC_JOINING_GROUP_HANIFI_ROHINGYA_KINNA_YA, /* Hanifi_Rohingya_Kinna_Ya */
4407 UC_JOINING_GROUP_THIN_YEH, /* Thin_Yeh */
4408 UC_JOINING_GROUP_VERTICAL_TAIL /* Vertical_Tail */
4411 static uint8_t unicode_joining_group[0x110000];
4413 static void
4414 fill_arabicshaping (const char *arabicshaping_filename)
4416 FILE *stream;
4417 unsigned int i;
4418 int lineno;
4420 stream = fopen (arabicshaping_filename, "r");
4421 if (stream == NULL)
4423 fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename);
4424 exit (1);
4427 for (i = 0; i < 0x110000; i++)
4429 unicode_joining_type[i] = (uint8_t)~(uint8_t)0;
4430 unicode_joining_group[i] = UC_JOINING_GROUP_NONE;
4433 lineno = 0;
4434 for (;;)
4436 char buf[200+1];
4437 char separator1[200+1];
4438 char schematic_name[200+1];
4439 char separator2[200+1];
4440 char joining_type_name[200+1];
4441 char separator3[200+1];
4442 char joining_group_name[200+1];
4443 int joining_type;
4444 int joining_group;
4446 lineno++;
4447 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4448 break;
4450 if (buf[0] == '\0' || buf[0] == '#')
4451 continue;
4453 if (sscanf (buf, "%X%[; ]%[^;]%[; ]%[^;]%[; ]%100[^\n]",
4454 &i, separator1, schematic_name, separator2, joining_type_name,
4455 separator3, joining_group_name) != 7)
4457 fprintf (stderr, "parse error in '%s':%d\n",
4458 arabicshaping_filename, lineno);
4459 exit (1);
4461 assert (i < 0x110000);
4463 #define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
4464 if (false) {}
4465 TRY(UC_JOINING_TYPE_U)
4466 TRY(UC_JOINING_TYPE_T)
4467 TRY(UC_JOINING_TYPE_C)
4468 TRY(UC_JOINING_TYPE_L)
4469 TRY(UC_JOINING_TYPE_R)
4470 TRY(UC_JOINING_TYPE_D)
4471 #undef TRY
4472 else
4474 fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n",
4475 joining_type_name, arabicshaping_filename, lineno);
4476 exit (1);
4479 /* Remove trailing spaces. */
4480 while (joining_group_name[0] != '\0'
4481 && joining_group_name[strlen (joining_group_name) - 1] == ' ')
4482 joining_group_name[strlen (joining_group_name) - 1] = '\0';
4484 #define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
4485 if (false) {}
4486 TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group")
4487 TRY(UC_JOINING_GROUP_AIN, "AIN")
4488 TRY(UC_JOINING_GROUP_ALAPH, "ALAPH")
4489 TRY(UC_JOINING_GROUP_ALEF, "ALEF")
4490 TRY(UC_JOINING_GROUP_BEH, "BEH")
4491 TRY(UC_JOINING_GROUP_BETH, "BETH")
4492 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE")
4493 TRY(UC_JOINING_GROUP_DAL, "DAL")
4494 TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH")
4495 TRY(UC_JOINING_GROUP_E, "E")
4496 TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH")
4497 TRY(UC_JOINING_GROUP_FE, "FE")
4498 TRY(UC_JOINING_GROUP_FEH, "FEH")
4499 TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH")
4500 TRY(UC_JOINING_GROUP_GAF, "GAF")
4501 TRY(UC_JOINING_GROUP_GAMAL, "GAMAL")
4502 TRY(UC_JOINING_GROUP_HAH, "HAH")
4503 TRY(UC_JOINING_GROUP_HE, "HE")
4504 TRY(UC_JOINING_GROUP_HEH, "HEH")
4505 TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL")
4506 TRY(UC_JOINING_GROUP_HETH, "HETH")
4507 TRY(UC_JOINING_GROUP_KAF, "KAF")
4508 TRY(UC_JOINING_GROUP_KAPH, "KAPH")
4509 TRY(UC_JOINING_GROUP_KHAPH, "KHAPH")
4510 TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH")
4511 TRY(UC_JOINING_GROUP_LAM, "LAM")
4512 TRY(UC_JOINING_GROUP_LAMADH, "LAMADH")
4513 TRY(UC_JOINING_GROUP_MEEM, "MEEM")
4514 TRY(UC_JOINING_GROUP_MIM, "MIM")
4515 TRY(UC_JOINING_GROUP_NOON, "NOON")
4516 TRY(UC_JOINING_GROUP_NUN, "NUN")
4517 TRY(UC_JOINING_GROUP_NYA, "NYA")
4518 TRY(UC_JOINING_GROUP_PE, "PE")
4519 TRY(UC_JOINING_GROUP_QAF, "QAF")
4520 TRY(UC_JOINING_GROUP_QAPH, "QAPH")
4521 TRY(UC_JOINING_GROUP_REH, "REH")
4522 TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE")
4523 TRY(UC_JOINING_GROUP_SAD, "SAD")
4524 TRY(UC_JOINING_GROUP_SADHE, "SADHE")
4525 TRY(UC_JOINING_GROUP_SEEN, "SEEN")
4526 TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH")
4527 TRY(UC_JOINING_GROUP_SHIN, "SHIN")
4528 TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF")
4529 TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW")
4530 TRY(UC_JOINING_GROUP_TAH, "TAH")
4531 TRY(UC_JOINING_GROUP_TAW, "TAW")
4532 TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA")
4533 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL")
4534 TRY(UC_JOINING_GROUP_TETH, "TETH")
4535 TRY(UC_JOINING_GROUP_WAW, "WAW")
4536 TRY(UC_JOINING_GROUP_YEH, "YEH")
4537 TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE")
4538 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL")
4539 TRY(UC_JOINING_GROUP_YUDH, "YUDH")
4540 TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE")
4541 TRY(UC_JOINING_GROUP_ZAIN, "ZAIN")
4542 TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN")
4543 TRY(UC_JOINING_GROUP_ROHINGYA_YEH, "ROHINGYA YEH")
4544 TRY(UC_JOINING_GROUP_STRAIGHT_WAW, "STRAIGHT WAW")
4545 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH, "MANICHAEAN ALEPH")
4546 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH, "MANICHAEAN BETH")
4547 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL, "MANICHAEAN GIMEL")
4548 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH, "MANICHAEAN DALETH")
4549 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW, "MANICHAEAN WAW")
4550 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN, "MANICHAEAN ZAYIN")
4551 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH, "MANICHAEAN HETH")
4552 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH, "MANICHAEAN TETH")
4553 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH, "MANICHAEAN YODH")
4554 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH, "MANICHAEAN KAPH")
4555 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH, "MANICHAEAN LAMEDH")
4556 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, "MANICHAEAN DHAMEDH")
4557 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH, "MANICHAEAN THAMEDH")
4558 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM, "MANICHAEAN MEM")
4559 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN, "MANICHAEAN NUN")
4560 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH, "MANICHAEAN SAMEKH")
4561 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN, "MANICHAEAN AYIN")
4562 TRY(UC_JOINING_GROUP_MANICHAEAN_PE, "MANICHAEAN PE")
4563 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE, "MANICHAEAN SADHE")
4564 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH, "MANICHAEAN QOPH")
4565 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH, "MANICHAEAN RESH")
4566 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW, "MANICHAEAN TAW")
4567 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE, "MANICHAEAN ONE")
4568 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE, "MANICHAEAN FIVE")
4569 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN, "MANICHAEAN TEN")
4570 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY, "MANICHAEAN TWENTY")
4571 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED, "MANICHAEAN HUNDRED")
4572 TRY(UC_JOINING_GROUP_AFRICAN_FEH, "AFRICAN FEH")
4573 TRY(UC_JOINING_GROUP_AFRICAN_QAF, "AFRICAN QAF")
4574 TRY(UC_JOINING_GROUP_AFRICAN_NOON, "AFRICAN NOON")
4575 TRY(UC_JOINING_GROUP_MALAYALAM_NGA, "MALAYALAM NGA")
4576 TRY(UC_JOINING_GROUP_MALAYALAM_JA, "MALAYALAM JA")
4577 TRY(UC_JOINING_GROUP_MALAYALAM_NYA, "MALAYALAM NYA")
4578 TRY(UC_JOINING_GROUP_MALAYALAM_TTA, "MALAYALAM TTA")
4579 TRY(UC_JOINING_GROUP_MALAYALAM_NNA, "MALAYALAM NNA")
4580 TRY(UC_JOINING_GROUP_MALAYALAM_NNNA, "MALAYALAM NNNA")
4581 TRY(UC_JOINING_GROUP_MALAYALAM_BHA, "MALAYALAM BHA")
4582 TRY(UC_JOINING_GROUP_MALAYALAM_RA, "MALAYALAM RA")
4583 TRY(UC_JOINING_GROUP_MALAYALAM_LLA, "MALAYALAM LLA")
4584 TRY(UC_JOINING_GROUP_MALAYALAM_LLLA, "MALAYALAM LLLA")
4585 TRY(UC_JOINING_GROUP_MALAYALAM_SSA, "MALAYALAM SSA")
4586 TRY(UC_JOINING_GROUP_HANIFI_ROHINGYA_PA, "HANIFI ROHINGYA PA")
4587 TRY(UC_JOINING_GROUP_HANIFI_ROHINGYA_KINNA_YA, "HANIFI ROHINGYA KINNA YA")
4588 TRY(UC_JOINING_GROUP_THIN_YEH, "THIN YEH")
4589 TRY(UC_JOINING_GROUP_VERTICAL_TAIL, "VERTICAL TAIL")
4590 #undef TRY
4591 else
4593 fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n",
4594 joining_group_name, arabicshaping_filename, lineno);
4595 exit (1);
4598 unicode_joining_type[i] = joining_type;
4599 unicode_joining_group[i] = joining_group;
4602 if (ferror (stream) || fclose (stream))
4604 fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename);
4605 exit (1);
4609 /* Convert a Joining_Type value to a C identifier. */
4610 static const char *
4611 joining_type_as_c_identifier (int joining_type)
4613 #define TRY(value) if (joining_type == value) return #value;
4614 TRY(UC_JOINING_TYPE_U)
4615 TRY(UC_JOINING_TYPE_T)
4616 TRY(UC_JOINING_TYPE_C)
4617 TRY(UC_JOINING_TYPE_L)
4618 TRY(UC_JOINING_TYPE_R)
4619 TRY(UC_JOINING_TYPE_D)
4620 #undef TRY
4621 abort ();
4624 static void
4625 output_joining_type_test (const char *filename, const char *version)
4627 FILE *stream;
4628 bool need_comma;
4629 unsigned int ch;
4631 stream = fopen (filename, "w");
4632 if (stream == NULL)
4634 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4635 exit (1);
4638 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4639 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4640 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4641 version);
4642 fprintf (stream, "\n");
4644 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4645 fprintf (stream, "\n");
4646 output_tests_license (stream);
4647 fprintf (stream, "\n");
4649 need_comma = false;
4650 for (ch = 0; ch < 0x110000; ch++)
4652 int value = unicode_joining_type[ch];
4654 if (value != (uint8_t)~(uint8_t)0)
4656 if (need_comma)
4657 fprintf (stream, ",\n");
4658 fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value));
4659 need_comma = true;
4662 if (need_comma)
4663 fprintf (stream, "\n");
4665 if (ferror (stream) || fclose (stream))
4667 fprintf (stderr, "error writing to '%s'\n", filename);
4668 exit (1);
4672 /* Construction of sparse 3-level tables. */
4673 #define TABLE joining_type_table
4674 #define ELEMENT uint8_t
4675 #define DEFAULT (uint8_t)~(uint8_t)0
4676 #define xmalloc malloc
4677 #define xrealloc realloc
4678 #include "3level.h"
4680 static void
4681 output_joining_type (const char *filename, const char *version)
4683 FILE *stream;
4684 unsigned int ch, i;
4685 struct joining_type_table t;
4686 unsigned int level1_offset, level2_offset, level3_offset;
4687 uint8_t *level3_packed;
4689 stream = fopen (filename, "w");
4690 if (stream == NULL)
4692 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4693 exit (1);
4696 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4697 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4698 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4699 version);
4700 fprintf (stream, "\n");
4702 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4703 fprintf (stream, "\n");
4704 output_library_license (stream, true);
4705 fprintf (stream, "\n");
4707 t.p = 7;
4708 t.q = 9;
4709 joining_type_table_init (&t);
4711 for (ch = 0; ch < 0x110000; ch++)
4713 uint8_t value = unicode_joining_type[ch];
4715 assert (value == (uint8_t)~(uint8_t)0 || value <= 0x0f);
4717 joining_type_table_add (&t, ch, value);
4720 joining_type_table_finalize (&t);
4722 /* Offsets in t.result, in memory of this process. */
4723 level1_offset =
4724 5 * sizeof (uint32_t);
4725 level2_offset =
4726 5 * sizeof (uint32_t)
4727 + t.level1_size * sizeof (uint32_t);
4728 level3_offset =
4729 5 * sizeof (uint32_t)
4730 + t.level1_size * sizeof (uint32_t)
4731 + (t.level2_size << t.q) * sizeof (uint32_t);
4733 for (i = 0; i < 5; i++)
4734 fprintf (stream, "#define joining_type_header_%d %d\n", i,
4735 ((uint32_t *) t.result)[i]);
4736 fprintf (stream, "static const\n");
4737 fprintf (stream, "struct\n");
4738 fprintf (stream, " {\n");
4739 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4740 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4741 fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size,
4742 (1 << t.p) * 4 / 8);
4743 fprintf (stream, " }\n");
4744 fprintf (stream, "u_joining_type =\n");
4745 fprintf (stream, "{\n");
4746 fprintf (stream, " {");
4747 if (t.level1_size > 8)
4748 fprintf (stream, "\n ");
4749 for (i = 0; i < t.level1_size; i++)
4751 uint32_t offset;
4752 if (i > 0 && (i % 8) == 0)
4753 fprintf (stream, "\n ");
4754 offset = ((uint32_t *) (t.result + level1_offset))[i];
4755 if (offset == 0)
4756 fprintf (stream, " %5d", -1);
4757 else
4758 fprintf (stream, " %5zu",
4759 (offset - level2_offset) / sizeof (uint32_t));
4760 if (i+1 < t.level1_size)
4761 fprintf (stream, ",");
4763 if (t.level1_size > 8)
4764 fprintf (stream, "\n ");
4765 fprintf (stream, " },\n");
4766 fprintf (stream, " {");
4767 if (t.level2_size << t.q > 8)
4768 fprintf (stream, "\n ");
4769 for (i = 0; i < t.level2_size << t.q; i++)
4771 uint32_t offset;
4772 if (i > 0 && (i % 8) == 0)
4773 fprintf (stream, "\n ");
4774 offset = ((uint32_t *) (t.result + level2_offset))[i];
4775 if (offset == 0)
4776 fprintf (stream, " %5d", -1);
4777 else
4778 fprintf (stream, " %5zu",
4779 (offset - level3_offset) / sizeof (uint8_t));
4780 if (i+1 < t.level2_size << t.q)
4781 fprintf (stream, ",");
4783 if (t.level2_size << t.q > 8)
4784 fprintf (stream, "\n ");
4785 fprintf (stream, " },\n");
4786 /* Pack the level3 array. Each entry needs 4 bits only. */
4787 level3_packed =
4788 (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t));
4789 for (i = 0; i < t.level3_size << t.p; i++)
4791 unsigned int j = (i * 4) / 8;
4792 unsigned int k = (i * 4) % 8;
4793 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
4794 level3_packed[j] |= (value << k);
4796 fprintf (stream, " {");
4797 if ((t.level3_size << t.p) * 4 / 8 > 8)
4798 fprintf (stream, "\n ");
4799 for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
4801 if (i > 0 && (i % 8) == 0)
4802 fprintf (stream, "\n ");
4803 fprintf (stream, " 0x%02x", level3_packed[i]);
4804 if (i+1 < (t.level3_size << t.p) * 4 / 8)
4805 fprintf (stream, ",");
4807 if ((t.level3_size << t.p) * 4 / 8 > 8)
4808 fprintf (stream, "\n ");
4809 fprintf (stream, " }\n");
4810 free (level3_packed);
4811 fprintf (stream, "};\n");
4813 if (ferror (stream) || fclose (stream))
4815 fprintf (stderr, "error writing to '%s'\n", filename);
4816 exit (1);
4820 /* Convert a Joining_Group value to a C identifier. */
4821 static const char *
4822 joining_group_as_c_identifier (int joining_group)
4824 #define TRY(value) if (joining_group == value) return #value;
4825 TRY(UC_JOINING_GROUP_NONE)
4826 TRY(UC_JOINING_GROUP_AIN)
4827 TRY(UC_JOINING_GROUP_ALAPH)
4828 TRY(UC_JOINING_GROUP_ALEF)
4829 TRY(UC_JOINING_GROUP_BEH)
4830 TRY(UC_JOINING_GROUP_BETH)
4831 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE)
4832 TRY(UC_JOINING_GROUP_DAL)
4833 TRY(UC_JOINING_GROUP_DALATH_RISH)
4834 TRY(UC_JOINING_GROUP_E)
4835 TRY(UC_JOINING_GROUP_FARSI_YEH)
4836 TRY(UC_JOINING_GROUP_FE)
4837 TRY(UC_JOINING_GROUP_FEH)
4838 TRY(UC_JOINING_GROUP_FINAL_SEMKATH)
4839 TRY(UC_JOINING_GROUP_GAF)
4840 TRY(UC_JOINING_GROUP_GAMAL)
4841 TRY(UC_JOINING_GROUP_HAH)
4842 TRY(UC_JOINING_GROUP_HE)
4843 TRY(UC_JOINING_GROUP_HEH)
4844 TRY(UC_JOINING_GROUP_HEH_GOAL)
4845 TRY(UC_JOINING_GROUP_HETH)
4846 TRY(UC_JOINING_GROUP_KAF)
4847 TRY(UC_JOINING_GROUP_KAPH)
4848 TRY(UC_JOINING_GROUP_KHAPH)
4849 TRY(UC_JOINING_GROUP_KNOTTED_HEH)
4850 TRY(UC_JOINING_GROUP_LAM)
4851 TRY(UC_JOINING_GROUP_LAMADH)
4852 TRY(UC_JOINING_GROUP_MEEM)
4853 TRY(UC_JOINING_GROUP_MIM)
4854 TRY(UC_JOINING_GROUP_NOON)
4855 TRY(UC_JOINING_GROUP_NUN)
4856 TRY(UC_JOINING_GROUP_NYA)
4857 TRY(UC_JOINING_GROUP_PE)
4858 TRY(UC_JOINING_GROUP_QAF)
4859 TRY(UC_JOINING_GROUP_QAPH)
4860 TRY(UC_JOINING_GROUP_REH)
4861 TRY(UC_JOINING_GROUP_REVERSED_PE)
4862 TRY(UC_JOINING_GROUP_SAD)
4863 TRY(UC_JOINING_GROUP_SADHE)
4864 TRY(UC_JOINING_GROUP_SEEN)
4865 TRY(UC_JOINING_GROUP_SEMKATH)
4866 TRY(UC_JOINING_GROUP_SHIN)
4867 TRY(UC_JOINING_GROUP_SWASH_KAF)
4868 TRY(UC_JOINING_GROUP_SYRIAC_WAW)
4869 TRY(UC_JOINING_GROUP_TAH)
4870 TRY(UC_JOINING_GROUP_TAW)
4871 TRY(UC_JOINING_GROUP_TEH_MARBUTA)
4872 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL)
4873 TRY(UC_JOINING_GROUP_TETH)
4874 TRY(UC_JOINING_GROUP_WAW)
4875 TRY(UC_JOINING_GROUP_YEH)
4876 TRY(UC_JOINING_GROUP_YEH_BARREE)
4877 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL)
4878 TRY(UC_JOINING_GROUP_YUDH)
4879 TRY(UC_JOINING_GROUP_YUDH_HE)
4880 TRY(UC_JOINING_GROUP_ZAIN)
4881 TRY(UC_JOINING_GROUP_ZHAIN)
4882 TRY(UC_JOINING_GROUP_ROHINGYA_YEH)
4883 TRY(UC_JOINING_GROUP_STRAIGHT_WAW)
4884 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH)
4885 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH)
4886 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL)
4887 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH)
4888 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW)
4889 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN)
4890 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH)
4891 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH)
4892 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH)
4893 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH)
4894 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH)
4895 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH)
4896 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH)
4897 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM)
4898 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN)
4899 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH)
4900 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN)
4901 TRY(UC_JOINING_GROUP_MANICHAEAN_PE)
4902 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE)
4903 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH)
4904 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH)
4905 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW)
4906 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE)
4907 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE)
4908 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN)
4909 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY)
4910 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED)
4911 TRY(UC_JOINING_GROUP_AFRICAN_FEH)
4912 TRY(UC_JOINING_GROUP_AFRICAN_QAF)
4913 TRY(UC_JOINING_GROUP_AFRICAN_NOON)
4914 TRY(UC_JOINING_GROUP_MALAYALAM_NGA)
4915 TRY(UC_JOINING_GROUP_MALAYALAM_JA)
4916 TRY(UC_JOINING_GROUP_MALAYALAM_NYA)
4917 TRY(UC_JOINING_GROUP_MALAYALAM_TTA)
4918 TRY(UC_JOINING_GROUP_MALAYALAM_NNA)
4919 TRY(UC_JOINING_GROUP_MALAYALAM_NNNA)
4920 TRY(UC_JOINING_GROUP_MALAYALAM_BHA)
4921 TRY(UC_JOINING_GROUP_MALAYALAM_RA)
4922 TRY(UC_JOINING_GROUP_MALAYALAM_LLA)
4923 TRY(UC_JOINING_GROUP_MALAYALAM_LLLA)
4924 TRY(UC_JOINING_GROUP_MALAYALAM_SSA)
4925 TRY(UC_JOINING_GROUP_HANIFI_ROHINGYA_PA)
4926 TRY(UC_JOINING_GROUP_HANIFI_ROHINGYA_KINNA_YA)
4927 TRY(UC_JOINING_GROUP_THIN_YEH)
4928 TRY(UC_JOINING_GROUP_VERTICAL_TAIL)
4929 #undef TRY
4930 abort ();
4933 static void
4934 output_joining_group_test (const char *filename, const char *version)
4936 FILE *stream;
4937 bool need_comma;
4938 unsigned int ch;
4940 stream = fopen (filename, "w");
4941 if (stream == NULL)
4943 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4944 exit (1);
4947 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4948 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
4949 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4950 version);
4951 fprintf (stream, "\n");
4953 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4954 fprintf (stream, "\n");
4955 output_tests_license (stream);
4956 fprintf (stream, "\n");
4958 need_comma = false;
4959 for (ch = 0; ch < 0x110000; ch++)
4961 int value = unicode_joining_group[ch];
4963 if (value != UC_JOINING_GROUP_NONE)
4965 if (need_comma)
4966 fprintf (stream, ",\n");
4967 fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value));
4968 need_comma = true;
4971 if (need_comma)
4972 fprintf (stream, "\n");
4974 if (ferror (stream) || fclose (stream))
4976 fprintf (stderr, "error writing to '%s'\n", filename);
4977 exit (1);
4981 /* Construction of sparse 3-level tables. */
4982 #define TABLE joining_group_table
4983 #define ELEMENT uint8_t
4984 #define DEFAULT UC_JOINING_GROUP_NONE
4985 #define xmalloc malloc
4986 #define xrealloc realloc
4987 #include "3level.h"
4989 static void
4990 output_joining_group (const char *filename, const char *version)
4992 FILE *stream;
4993 unsigned int ch, i;
4994 struct joining_group_table t;
4995 unsigned int level1_offset, level2_offset, level3_offset;
4996 uint16_t *level3_packed;
4998 stream = fopen (filename, "w");
4999 if (stream == NULL)
5001 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5002 exit (1);
5005 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5006 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
5007 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5008 version);
5009 fprintf (stream, "\n");
5011 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
5012 fprintf (stream, "\n");
5013 output_library_license (stream, false);
5014 fprintf (stream, "\n");
5016 t.p = 7;
5017 t.q = 9;
5018 joining_group_table_init (&t);
5020 for (ch = 0; ch < 0x110000; ch++)
5022 uint8_t value = unicode_joining_group[ch];
5024 assert (value <= 0x7f);
5026 joining_group_table_add (&t, ch, value);
5029 joining_group_table_finalize (&t);
5031 /* Offsets in t.result, in memory of this process. */
5032 level1_offset =
5033 5 * sizeof (uint32_t);
5034 level2_offset =
5035 5 * sizeof (uint32_t)
5036 + t.level1_size * sizeof (uint32_t);
5037 level3_offset =
5038 5 * sizeof (uint32_t)
5039 + t.level1_size * sizeof (uint32_t)
5040 + (t.level2_size << t.q) * sizeof (uint32_t);
5042 for (i = 0; i < 5; i++)
5043 fprintf (stream, "#define joining_group_header_%d %d\n", i,
5044 ((uint32_t *) t.result)[i]);
5045 fprintf (stream, "static const\n");
5046 fprintf (stream, "struct\n");
5047 fprintf (stream, " {\n");
5048 fprintf (stream, " int level1[%zu];\n", t.level1_size);
5049 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
5050 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
5051 (1 << t.p) * 7 / 16);
5052 fprintf (stream, " }\n");
5053 fprintf (stream, "u_joining_group =\n");
5054 fprintf (stream, "{\n");
5055 fprintf (stream, " {");
5056 if (t.level1_size > 8)
5057 fprintf (stream, "\n ");
5058 for (i = 0; i < t.level1_size; i++)
5060 uint32_t offset;
5061 if (i > 0 && (i % 8) == 0)
5062 fprintf (stream, "\n ");
5063 offset = ((uint32_t *) (t.result + level1_offset))[i];
5064 if (offset == 0)
5065 fprintf (stream, " %5d", -1);
5066 else
5067 fprintf (stream, " %5zu",
5068 (offset - level2_offset) / sizeof (uint32_t));
5069 if (i+1 < t.level1_size)
5070 fprintf (stream, ",");
5072 if (t.level1_size > 8)
5073 fprintf (stream, "\n ");
5074 fprintf (stream, " },\n");
5075 fprintf (stream, " {");
5076 if (t.level2_size << t.q > 8)
5077 fprintf (stream, "\n ");
5078 for (i = 0; i < t.level2_size << t.q; i++)
5080 uint32_t offset;
5081 if (i > 0 && (i % 8) == 0)
5082 fprintf (stream, "\n ");
5083 offset = ((uint32_t *) (t.result + level2_offset))[i];
5084 if (offset == 0)
5085 fprintf (stream, " %5d", -1);
5086 else
5087 fprintf (stream, " %5zu",
5088 (offset - level3_offset) / sizeof (uint8_t));
5089 if (i+1 < t.level2_size << t.q)
5090 fprintf (stream, ",");
5092 if (t.level2_size << t.q > 8)
5093 fprintf (stream, "\n ");
5094 fprintf (stream, " },\n");
5095 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
5096 not 32-bit units, in order to make the lookup function easier. */
5097 level3_packed =
5098 (uint16_t *)
5099 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
5100 for (i = 0; i < t.level3_size << t.p; i++)
5102 unsigned int j = (i * 7) / 16;
5103 unsigned int k = (i * 7) % 16;
5104 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
5105 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
5106 level3_packed[j] = value & 0xffff;
5107 level3_packed[j+1] = value >> 16;
5109 fprintf (stream, " {");
5110 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
5111 fprintf (stream, "\n ");
5112 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
5114 if (i > 0 && (i % 8) == 0)
5115 fprintf (stream, "\n ");
5116 fprintf (stream, " 0x%04x", level3_packed[i]);
5117 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
5118 fprintf (stream, ",");
5120 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
5121 fprintf (stream, "\n ");
5122 fprintf (stream, " }\n");
5123 free (level3_packed);
5124 fprintf (stream, "};\n");
5126 if (ferror (stream) || fclose (stream))
5128 fprintf (stderr, "error writing to '%s'\n", filename);
5129 exit (1);
5133 /* ========================================================================= */
5135 /* Scripts. */
/* Names of the scripts, in the order of their first occurrence in the
   scripts file, and their number.  */
static const char *scripts[256];
static unsigned int numscripts;

/* Index in scripts[] of the script of each code point, or (uint8_t)~0 if
   the code point is not listed in the scripts file.  */
static uint8_t unicode_scripts[0x110000];

/* Reads the file SCRIPTS_FILENAME and fills scripts[], numscripts and
   unicode_scripts[].
   Each data line starts with either "<hex>..<hex>" or "<hex>", followed by
   spaces and semicolons, followed by the script name, which ends at the
   next space.  Lines that start with '#' are skipped.
   A code point that is assigned more than once produces a warning on
   stderr; the last assignment wins.
   Exits with status 1 if the file cannot be opened, read, or parsed.  */
static void
fill_scripts (const char *scripts_filename)
{
  FILE *stream = fopen (scripts_filename, "r");
  char buf[200+1];
  unsigned int cp;

  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
      exit (1);
    }

  numscripts = 0;

  for (cp = 0; cp < 0x110000; cp++)
    unicode_scripts[cp] = (uint8_t)~(uint8_t)0;

  while (fscanf (stream, "%200[^\n]\n", buf) >= 1)
    {
      unsigned int first, last;
      char padding[200+1];
      char scriptname[200+1];
      unsigned int script;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      /* Try the range form first, then the single code point form.  */
      if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &first, &last, padding, scriptname) != 4)
        {
          if (sscanf (buf, "%X%[ ;]%[^ ]", &first, padding, scriptname) != 3)
            {
              fprintf (stderr, "parse error in '%s'\n", scripts_filename);
              exit (1);
            }
          last = first;
        }
      assert (last >= first);
      assert (last < 0x110000);

      /* Search the known scripts, most recently added first.  Afterwards,
         SCRIPT is 1 + the index of the match, or 0 if there is none.  */
      script = numscripts;
      while (script > 0 && strcmp (scripts[script - 1], scriptname) != 0)
        script--;
      if (script > 0)
        script--;
      else
        {
          /* A script name that was not seen before.  */
          scripts[numscripts] = strdup (scriptname);
          script = numscripts;
          numscripts++;
          assert (numscripts != 256);
        }

      for (cp = first; cp <= last; cp++)
        {
          if (unicode_scripts[cp] != (uint8_t)~(uint8_t)0)
            fprintf (stderr, "0x%04X belongs to multiple scripts\n", cp);
          unicode_scripts[cp] = script;
        }
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", scripts_filename);
      exit (1);
    }
}
5212 /* Construction of sparse 3-level tables. */
5213 #define TABLE script_table
5214 #define ELEMENT uint8_t
5215 #define DEFAULT (uint8_t)~(uint8_t)0
5216 #define xmalloc malloc
5217 #define xrealloc realloc
5218 #include "3level.h"
5220 static void
5221 output_scripts (const char *version)
5223 const char *filename = "unictype/scripts.h";
5224 FILE *stream;
5225 unsigned int ch, s, i;
5226 struct script_table t;
5227 unsigned int level1_offset, level2_offset, level3_offset;
5229 typedef struct
5231 const char *lowercase_name;
5233 scriptinfo_t;
5234 scriptinfo_t scriptinfo[256];
5236 stream = fopen (filename, "w");
5237 if (stream == NULL)
5239 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5240 exit (1);
5243 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5244 fprintf (stream, "/* Unicode scripts. */\n");
5245 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5246 version);
5247 fprintf (stream, "\n");
5249 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
5250 fprintf (stream, "\n");
5251 output_library_license (stream, true);
5252 fprintf (stream, "\n");
5254 for (s = 0; s < numscripts; s++)
5256 char *lcp = strdup (scripts[s]);
5257 char *cp;
5259 for (cp = lcp; *cp != '\0'; cp++)
5260 if (*cp >= 'A' && *cp <= 'Z')
5261 *cp += 'a' - 'A';
5263 scriptinfo[s].lowercase_name = lcp;
5266 for (s = 0; s < numscripts; s++)
5268 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
5269 scriptinfo[s].lowercase_name);
5270 fprintf (stream, "{\n");
5271 i = 0;
5272 for (ch = 0; ch < 0x110000; ch++)
5273 if (unicode_scripts[ch] == s)
5275 unsigned int start;
5276 unsigned int end;
5278 start = ch;
5279 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
5280 ch++;
5281 end = ch;
5283 if (i > 0)
5284 fprintf (stream, ",\n");
5285 if (start == end)
5286 fprintf (stream, " { 0x%04X, 1, 1 }", start);
5287 else
5288 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
5289 start, end);
5290 i++;
5292 fprintf (stream, "\n");
5293 fprintf (stream, "};\n");
5296 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
5297 fprintf (stream, "{\n");
5298 for (s = 0; s < numscripts; s++)
5300 fprintf (stream, " {\n");
5301 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
5302 scriptinfo[s].lowercase_name);
5303 fprintf (stream, " script_%s_intervals,\n",
5304 scriptinfo[s].lowercase_name);
5305 fprintf (stream, " \"%s\"\n", scripts[s]);
5306 fprintf (stream, " }");
5307 if (s+1 < numscripts)
5308 fprintf (stream, ",");
5309 fprintf (stream, "\n");
5311 fprintf (stream, "};\n");
5313 t.p = 7;
5314 t.q = 9;
5315 script_table_init (&t);
5317 for (ch = 0; ch < 0x110000; ch++)
5319 unsigned int s = unicode_scripts[ch];
5320 if (s != (uint8_t)~(uint8_t)0)
5321 script_table_add (&t, ch, s);
5324 script_table_finalize (&t);
5326 /* Offsets in t.result, in memory of this process. */
5327 level1_offset =
5328 5 * sizeof (uint32_t);
5329 level2_offset =
5330 5 * sizeof (uint32_t)
5331 + t.level1_size * sizeof (uint32_t);
5332 level3_offset =
5333 5 * sizeof (uint32_t)
5334 + t.level1_size * sizeof (uint32_t)
5335 + (t.level2_size << t.q) * sizeof (uint32_t);
5337 for (i = 0; i < 5; i++)
5338 fprintf (stream, "#define script_header_%d %d\n", i,
5339 ((uint32_t *) t.result)[i]);
5340 fprintf (stream, "static const\n");
5341 fprintf (stream, "struct\n");
5342 fprintf (stream, " {\n");
5343 fprintf (stream, " int level1[%zu];\n", t.level1_size);
5344 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
5345 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
5346 fprintf (stream, " }\n");
5347 fprintf (stream, "u_script =\n");
5348 fprintf (stream, "{\n");
5349 fprintf (stream, " {");
5350 if (t.level1_size > 8)
5351 fprintf (stream, "\n ");
5352 for (i = 0; i < t.level1_size; i++)
5354 uint32_t offset;
5355 if (i > 0 && (i % 8) == 0)
5356 fprintf (stream, "\n ");
5357 offset = ((uint32_t *) (t.result + level1_offset))[i];
5358 if (offset == 0)
5359 fprintf (stream, " %5d", -1);
5360 else
5361 fprintf (stream, " %5zu",
5362 (offset - level2_offset) / sizeof (uint32_t));
5363 if (i+1 < t.level1_size)
5364 fprintf (stream, ",");
5366 if (t.level1_size > 8)
5367 fprintf (stream, "\n ");
5368 fprintf (stream, " },\n");
5369 fprintf (stream, " {");
5370 if (t.level2_size << t.q > 8)
5371 fprintf (stream, "\n ");
5372 for (i = 0; i < t.level2_size << t.q; i++)
5374 uint32_t offset;
5375 if (i > 0 && (i % 8) == 0)
5376 fprintf (stream, "\n ");
5377 offset = ((uint32_t *) (t.result + level2_offset))[i];
5378 if (offset == 0)
5379 fprintf (stream, " %5d", -1);
5380 else
5381 fprintf (stream, " %5zu",
5382 (offset - level3_offset) / sizeof (uint8_t));
5383 if (i+1 < t.level2_size << t.q)
5384 fprintf (stream, ",");
5386 if (t.level2_size << t.q > 8)
5387 fprintf (stream, "\n ");
5388 fprintf (stream, " },\n");
5389 fprintf (stream, " {");
5390 if (t.level3_size << t.p > 8)
5391 fprintf (stream, "\n ");
5392 for (i = 0; i < t.level3_size << t.p; i++)
5394 if (i > 0 && (i % 8) == 0)
5395 fprintf (stream, "\n ");
5396 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
5397 if (i+1 < t.level3_size << t.p)
5398 fprintf (stream, ",");
5400 if (t.level3_size << t.p > 8)
5401 fprintf (stream, "\n ");
5402 fprintf (stream, " }\n");
5403 fprintf (stream, "};\n");
5405 if (ferror (stream) || fclose (stream))
5407 fprintf (stderr, "error writing to '%s'\n", filename);
5408 exit (1);
5412 static void
5413 output_scripts_byname (const char *version)
5415 const char *filename = "unictype/scripts_byname.gperf";
5416 FILE *stream;
5417 unsigned int s;
5419 stream = fopen (filename, "w");
5420 if (stream == NULL)
5422 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5423 exit (1);
5426 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5427 fprintf (stream, "/* Unicode scripts. */\n");
5428 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5429 version);
5430 fprintf (stream, "\n");
5432 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
5433 fprintf (stream, "\n");
5434 output_library_license (stream, true);
5435 fprintf (stream, "\n");
5437 fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
5438 fprintf (stream, "%%struct-type\n");
5439 fprintf (stream, "%%language=ANSI-C\n");
5440 fprintf (stream, "%%define hash-function-name scripts_hash\n");
5441 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
5442 fprintf (stream, "%%readonly-tables\n");
5443 fprintf (stream, "%%global-table\n");
5444 fprintf (stream, "%%define word-array-name script_names\n");
5445 fprintf (stream, "%%pic\n");
5446 fprintf (stream, "%%define string-pool-name script_stringpool\n");
5447 fprintf (stream, "%%%%\n");
5448 for (s = 0; s < numscripts; s++)
5449 fprintf (stream, "%s, %u\n", scripts[s], s);
5451 if (ferror (stream) || fclose (stream))
5453 fprintf (stderr, "error writing to '%s'\n", filename);
5454 exit (1);
5458 /* ========================================================================= */
5460 /* Blocks. */
/* One named block: the character range START..END and its NAME, as filled
   in by fill_blocks.  */
typedef struct
{
  unsigned int start;
  unsigned int end;
  const char *name;
}
block_t;

/* The table of blocks.  The first numblocks entries are in use; fill_blocks
   asserts that they are sorted by increasing character range.  */
static block_t blocks[384];
static unsigned int numblocks;
5467 static void
5468 fill_blocks (const char *blocks_filename)
5470 FILE *stream;
5472 stream = fopen (blocks_filename, "r");
5473 if (stream == NULL)
5475 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
5476 exit (1);
5479 for (;;)
5481 char buf[200+1];
5482 unsigned int i1, i2;
5483 char padding[200+1];
5484 char blockname[200+1];
5486 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
5487 break;
5489 if (buf[0] == '\0' || buf[0] == '#')
5490 continue;
5492 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
5494 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
5495 exit (1);
5497 blocks[numblocks].start = i1;
5498 blocks[numblocks].end = i2;
5499 blocks[numblocks].name = strdup (blockname);
5500 /* It must be sorted. */
5501 assert (numblocks == 0 || blocks[numblocks-1].end < blocks[numblocks].start);
5502 numblocks++;
5503 assert (numblocks != SIZEOF (blocks));
5506 if (ferror (stream) || fclose (stream))
5508 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
5509 exit (1);
5513 /* Return the smallest block index among the blocks for characters >= ch. */
5514 static unsigned int
5515 block_first_index (unsigned int ch)
5517 /* Binary search. */
5518 unsigned int lo = 0;
5519 unsigned int hi = numblocks;
5520 /* Invariants:
5521 All blocks[i], i < lo, have blocks[i].end < ch,
5522 all blocks[i], i >= hi, have blocks[i].end >= ch. */
5523 while (lo < hi)
5525 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
5526 if (blocks[mid].end < ch)
5527 lo = mid + 1;
5528 else
5529 hi = mid;
5531 return hi;
5534 /* Return the largest block index among the blocks for characters <= ch,
5535 plus 1. */
5536 static unsigned int
5537 block_last_index (unsigned int ch)
5539 /* Binary search. */
5540 unsigned int lo = 0;
5541 unsigned int hi = numblocks;
5542 /* Invariants:
5543 All blocks[i], i < lo, have blocks[i].start <= ch,
5544 all blocks[i], i >= hi, have blocks[i].start > ch. */
5545 while (lo < hi)
5547 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
5548 if (blocks[mid].start <= ch)
5549 lo = mid + 1;
5550 else
5551 hi = mid;
5553 return hi;
5556 static void
5557 output_blocks (const char *version)
5559 const char *filename = "unictype/blocks.h";
5560 const unsigned int shift = 8; /* bits to shift away for array access */
5561 const unsigned int threshold = 0x28000; /* cut-off table here to save space */
5562 FILE *stream;
5563 unsigned int i;
5564 unsigned int i1;
5566 stream = fopen (filename, "w");
5567 if (stream == NULL)
5569 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5570 exit (1);
5573 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5574 fprintf (stream, "/* Unicode blocks. */\n");
5575 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5576 version);
5577 fprintf (stream, "\n");
5579 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
5580 fprintf (stream, "\n");
5581 output_library_license (stream, false);
5582 fprintf (stream, "\n");
5584 fprintf (stream, "static const uc_block_t blocks[] =\n");
5585 fprintf (stream, "{\n");
5586 for (i = 0; i < numblocks; i++)
5588 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
5589 blocks[i].end, blocks[i].name);
5590 if (i+1 < numblocks)
5591 fprintf (stream, ",");
5592 fprintf (stream, "\n");
5594 fprintf (stream, "};\n");
5595 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
5596 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
5597 fprintf (stream, "static const uint16_t blocks_level1[%d * 2] =\n",
5598 threshold >> shift);
5599 fprintf (stream, "{\n");
5600 for (i1 = 0; i1 < (threshold >> shift); i1++)
5602 unsigned int first_index = block_first_index (i1 << shift);
5603 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
5604 fprintf (stream, " %3d, %3d", first_index, last_index);
5605 if (i1+1 < (threshold >> shift))
5606 fprintf (stream, ",");
5607 fprintf (stream, "\n");
5609 fprintf (stream, "};\n");
5610 fprintf (stream, "#define blocks_upper_first_index %d\n",
5611 block_first_index (threshold));
5612 fprintf (stream, "#define blocks_upper_last_index %d\n",
5613 block_last_index (0x10FFFF));
5615 if (ferror (stream) || fclose (stream))
5617 fprintf (stderr, "error writing to '%s'\n", filename);
5618 exit (1);
5622 /* ========================================================================= */
5624 /* C and Java syntax. */
/* Identifier syntax categories of a character.  The values fit in 2 bits.  */
enum
{
  /* valid as first or subsequent character */
  UC_IDENTIFIER_START = 0,
  /* valid as subsequent character only */
  UC_IDENTIFIER_VALID = 1,
  /* not valid */
  UC_IDENTIFIER_INVALID = 2,
  /* ignorable (Java only) */
  UC_IDENTIFIER_IGNORABLE = 3
};
/* Return true if CH is a white-space character in the sense of
   ISO C 99 section 6.4.(3).  */
static bool
is_c_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':   /* space */
    case '\t':  /* horizontal tab */
    case '\n':  /* new-line */
    case '\r':  /* new-line */
    case '\v':  /* vertical tab */
    case '\f':  /* form-feed */
      return true;
    default:
      return false;
    }
}
5645 /* ISO C 99 section 6.4.2.1 and appendix D. */
5646 static int
5647 c_ident_category (unsigned int ch)
5649 /* Section 6.4.2.1. */
5650 if (ch >= '0' && ch <= '9')
5651 return UC_IDENTIFIER_VALID;
5652 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
5653 return UC_IDENTIFIER_START;
5654 /* Appendix D. */
5655 if (0
5656 /* Latin */
5657 || (ch == 0x00AA)
5658 || (ch == 0x00BA)
5659 || (ch >= 0x00C0 && ch <= 0x00D6)
5660 || (ch >= 0x00D8 && ch <= 0x00F6)
5661 || (ch >= 0x00F8 && ch <= 0x01F5)
5662 || (ch >= 0x01FA && ch <= 0x0217)
5663 || (ch >= 0x0250 && ch <= 0x02A8)
5664 || (ch >= 0x1E00 && ch <= 0x1E9B)
5665 || (ch >= 0x1EA0 && ch <= 0x1EF9)
5666 || (ch == 0x207F)
5667 /* Greek */
5668 || (ch == 0x0386)
5669 || (ch >= 0x0388 && ch <= 0x038A)
5670 || (ch == 0x038C)
5671 || (ch >= 0x038E && ch <= 0x03A1)
5672 || (ch >= 0x03A3 && ch <= 0x03CE)
5673 || (ch >= 0x03D0 && ch <= 0x03D6)
5674 || (ch == 0x03DA)
5675 || (ch == 0x03DC)
5676 || (ch == 0x03DE)
5677 || (ch == 0x03E0)
5678 || (ch >= 0x03E2 && ch <= 0x03F3)
5679 || (ch >= 0x1F00 && ch <= 0x1F15)
5680 || (ch >= 0x1F18 && ch <= 0x1F1D)
5681 || (ch >= 0x1F20 && ch <= 0x1F45)
5682 || (ch >= 0x1F48 && ch <= 0x1F4D)
5683 || (ch >= 0x1F50 && ch <= 0x1F57)
5684 || (ch == 0x1F59)
5685 || (ch == 0x1F5B)
5686 || (ch == 0x1F5D)
5687 || (ch >= 0x1F5F && ch <= 0x1F7D)
5688 || (ch >= 0x1F80 && ch <= 0x1FB4)
5689 || (ch >= 0x1FB6 && ch <= 0x1FBC)
5690 || (ch >= 0x1FC2 && ch <= 0x1FC4)
5691 || (ch >= 0x1FC6 && ch <= 0x1FCC)
5692 || (ch >= 0x1FD0 && ch <= 0x1FD3)
5693 || (ch >= 0x1FD6 && ch <= 0x1FDB)
5694 || (ch >= 0x1FE0 && ch <= 0x1FEC)
5695 || (ch >= 0x1FF2 && ch <= 0x1FF4)
5696 || (ch >= 0x1FF6 && ch <= 0x1FFC)
5697 /* Cyrillic */
5698 || (ch >= 0x0401 && ch <= 0x040C)
5699 || (ch >= 0x040E && ch <= 0x044F)
5700 || (ch >= 0x0451 && ch <= 0x045C)
5701 || (ch >= 0x045E && ch <= 0x0481)
5702 || (ch >= 0x0490 && ch <= 0x04C4)
5703 || (ch >= 0x04C7 && ch <= 0x04C8)
5704 || (ch >= 0x04CB && ch <= 0x04CC)
5705 || (ch >= 0x04D0 && ch <= 0x04EB)
5706 || (ch >= 0x04EE && ch <= 0x04F5)
5707 || (ch >= 0x04F8 && ch <= 0x04F9)
5708 /* Armenian */
5709 || (ch >= 0x0531 && ch <= 0x0556)
5710 || (ch >= 0x0561 && ch <= 0x0587)
5711 /* Hebrew */
5712 || (ch >= 0x05B0 && ch <= 0x05B9)
5713 || (ch >= 0x05BB && ch <= 0x05BD)
5714 || (ch == 0x05BF)
5715 || (ch >= 0x05C1 && ch <= 0x05C2)
5716 || (ch >= 0x05D0 && ch <= 0x05EA)
5717 || (ch >= 0x05F0 && ch <= 0x05F2)
5718 /* Arabic */
5719 || (ch >= 0x0621 && ch <= 0x063A)
5720 || (ch >= 0x0640 && ch <= 0x0652)
5721 || (ch >= 0x0670 && ch <= 0x06B7)
5722 || (ch >= 0x06BA && ch <= 0x06BE)
5723 || (ch >= 0x06C0 && ch <= 0x06CE)
5724 || (ch >= 0x06D0 && ch <= 0x06DC)
5725 || (ch >= 0x06E5 && ch <= 0x06E8)
5726 || (ch >= 0x06EA && ch <= 0x06ED)
5727 /* Devanagari */
5728 || (ch >= 0x0901 && ch <= 0x0903)
5729 || (ch >= 0x0905 && ch <= 0x0939)
5730 || (ch >= 0x093E && ch <= 0x094D)
5731 || (ch >= 0x0950 && ch <= 0x0952)
5732 || (ch >= 0x0958 && ch <= 0x0963)
5733 /* Bengali */
5734 || (ch >= 0x0981 && ch <= 0x0983)
5735 || (ch >= 0x0985 && ch <= 0x098C)
5736 || (ch >= 0x098F && ch <= 0x0990)
5737 || (ch >= 0x0993 && ch <= 0x09A8)
5738 || (ch >= 0x09AA && ch <= 0x09B0)
5739 || (ch == 0x09B2)
5740 || (ch >= 0x09B6 && ch <= 0x09B9)
5741 || (ch >= 0x09BE && ch <= 0x09C4)
5742 || (ch >= 0x09C7 && ch <= 0x09C8)
5743 || (ch >= 0x09CB && ch <= 0x09CD)
5744 || (ch >= 0x09DC && ch <= 0x09DD)
5745 || (ch >= 0x09DF && ch <= 0x09E3)
5746 || (ch >= 0x09F0 && ch <= 0x09F1)
5747 /* Gurmukhi */
5748 || (ch == 0x0A02)
5749 || (ch >= 0x0A05 && ch <= 0x0A0A)
5750 || (ch >= 0x0A0F && ch <= 0x0A10)
5751 || (ch >= 0x0A13 && ch <= 0x0A28)
5752 || (ch >= 0x0A2A && ch <= 0x0A30)
5753 || (ch >= 0x0A32 && ch <= 0x0A33)
5754 || (ch >= 0x0A35 && ch <= 0x0A36)
5755 || (ch >= 0x0A38 && ch <= 0x0A39)
5756 || (ch >= 0x0A3E && ch <= 0x0A42)
5757 || (ch >= 0x0A47 && ch <= 0x0A48)
5758 || (ch >= 0x0A4B && ch <= 0x0A4D)
5759 || (ch >= 0x0A59 && ch <= 0x0A5C)
5760 || (ch == 0x0A5E)
5761 || (ch == 0x0A74)
5762 /* Gujarati */
5763 || (ch >= 0x0A81 && ch <= 0x0A83)
5764 || (ch >= 0x0A85 && ch <= 0x0A8B)
5765 || (ch == 0x0A8D)
5766 || (ch >= 0x0A8F && ch <= 0x0A91)
5767 || (ch >= 0x0A93 && ch <= 0x0AA8)
5768 || (ch >= 0x0AAA && ch <= 0x0AB0)
5769 || (ch >= 0x0AB2 && ch <= 0x0AB3)
5770 || (ch >= 0x0AB5 && ch <= 0x0AB9)
5771 || (ch >= 0x0ABD && ch <= 0x0AC5)
5772 || (ch >= 0x0AC7 && ch <= 0x0AC9)
5773 || (ch >= 0x0ACB && ch <= 0x0ACD)
5774 || (ch == 0x0AD0)
5775 || (ch == 0x0AE0)
5776 /* Oriya */
5777 || (ch >= 0x0B01 && ch <= 0x0B03)
5778 || (ch >= 0x0B05 && ch <= 0x0B0C)
5779 || (ch >= 0x0B0F && ch <= 0x0B10)
5780 || (ch >= 0x0B13 && ch <= 0x0B28)
5781 || (ch >= 0x0B2A && ch <= 0x0B30)
5782 || (ch >= 0x0B32 && ch <= 0x0B33)
5783 || (ch >= 0x0B36 && ch <= 0x0B39)
5784 || (ch >= 0x0B3E && ch <= 0x0B43)
5785 || (ch >= 0x0B47 && ch <= 0x0B48)
5786 || (ch >= 0x0B4B && ch <= 0x0B4D)
5787 || (ch >= 0x0B5C && ch <= 0x0B5D)
5788 || (ch >= 0x0B5F && ch <= 0x0B61)
5789 /* Tamil */
5790 || (ch >= 0x0B82 && ch <= 0x0B83)
5791 || (ch >= 0x0B85 && ch <= 0x0B8A)
5792 || (ch >= 0x0B8E && ch <= 0x0B90)
5793 || (ch >= 0x0B92 && ch <= 0x0B95)
5794 || (ch >= 0x0B99 && ch <= 0x0B9A)
5795 || (ch == 0x0B9C)
5796 || (ch >= 0x0B9E && ch <= 0x0B9F)
5797 || (ch >= 0x0BA3 && ch <= 0x0BA4)
5798 || (ch >= 0x0BA8 && ch <= 0x0BAA)
5799 || (ch >= 0x0BAE && ch <= 0x0BB5)
5800 || (ch >= 0x0BB7 && ch <= 0x0BB9)
5801 || (ch >= 0x0BBE && ch <= 0x0BC2)
5802 || (ch >= 0x0BC6 && ch <= 0x0BC8)
5803 || (ch >= 0x0BCA && ch <= 0x0BCD)
5804 /* Telugu */
5805 || (ch >= 0x0C01 && ch <= 0x0C03)
5806 || (ch >= 0x0C05 && ch <= 0x0C0C)
5807 || (ch >= 0x0C0E && ch <= 0x0C10)
5808 || (ch >= 0x0C12 && ch <= 0x0C28)
5809 || (ch >= 0x0C2A && ch <= 0x0C33)
5810 || (ch >= 0x0C35 && ch <= 0x0C39)
5811 || (ch >= 0x0C3E && ch <= 0x0C44)
5812 || (ch >= 0x0C46 && ch <= 0x0C48)
5813 || (ch >= 0x0C4A && ch <= 0x0C4D)
5814 || (ch >= 0x0C60 && ch <= 0x0C61)
5815 /* Kannada */
5816 || (ch >= 0x0C82 && ch <= 0x0C83)
5817 || (ch >= 0x0C85 && ch <= 0x0C8C)
5818 || (ch >= 0x0C8E && ch <= 0x0C90)
5819 || (ch >= 0x0C92 && ch <= 0x0CA8)
5820 || (ch >= 0x0CAA && ch <= 0x0CB3)
5821 || (ch >= 0x0CB5 && ch <= 0x0CB9)
5822 || (ch >= 0x0CBE && ch <= 0x0CC4)
5823 || (ch >= 0x0CC6 && ch <= 0x0CC8)
5824 || (ch >= 0x0CCA && ch <= 0x0CCD)
5825 || (ch == 0x0CDE)
5826 || (ch >= 0x0CE0 && ch <= 0x0CE1)
5827 /* Malayalam */
5828 || (ch >= 0x0D02 && ch <= 0x0D03)
5829 || (ch >= 0x0D05 && ch <= 0x0D0C)
5830 || (ch >= 0x0D0E && ch <= 0x0D10)
5831 || (ch >= 0x0D12 && ch <= 0x0D28)
5832 || (ch >= 0x0D2A && ch <= 0x0D39)
5833 || (ch >= 0x0D3E && ch <= 0x0D43)
5834 || (ch >= 0x0D46 && ch <= 0x0D48)
5835 || (ch >= 0x0D4A && ch <= 0x0D4D)
5836 || (ch >= 0x0D60 && ch <= 0x0D61)
5837 /* Thai */
5838 || (ch >= 0x0E01 && ch <= 0x0E3A)
5839 || (ch >= 0x0E40 && ch <= 0x0E5B)
5840 /* Lao */
5841 || (ch >= 0x0E81 && ch <= 0x0E82)
5842 || (ch == 0x0E84)
5843 || (ch >= 0x0E87 && ch <= 0x0E88)
5844 || (ch == 0x0E8A)
5845 || (ch == 0x0E8D)
5846 || (ch >= 0x0E94 && ch <= 0x0E97)
5847 || (ch >= 0x0E99 && ch <= 0x0E9F)
5848 || (ch >= 0x0EA1 && ch <= 0x0EA3)
5849 || (ch == 0x0EA5)
5850 || (ch == 0x0EA7)
5851 || (ch >= 0x0EAA && ch <= 0x0EAB)
5852 || (ch >= 0x0EAD && ch <= 0x0EAE)
5853 || (ch >= 0x0EB0 && ch <= 0x0EB9)
5854 || (ch >= 0x0EBB && ch <= 0x0EBD)
5855 || (ch >= 0x0EC0 && ch <= 0x0EC4)
5856 || (ch == 0x0EC6)
5857 || (ch >= 0x0EC8 && ch <= 0x0ECD)
5858 || (ch >= 0x0EDC && ch <= 0x0EDD)
5859 /* Tibetan */
5860 || (ch == 0x0F00)
5861 || (ch >= 0x0F18 && ch <= 0x0F19)
5862 || (ch == 0x0F35)
5863 || (ch == 0x0F37)
5864 || (ch == 0x0F39)
5865 || (ch >= 0x0F3E && ch <= 0x0F47)
5866 || (ch >= 0x0F49 && ch <= 0x0F69)
5867 || (ch >= 0x0F71 && ch <= 0x0F84)
5868 || (ch >= 0x0F86 && ch <= 0x0F8B)
5869 || (ch >= 0x0F90 && ch <= 0x0F95)
5870 || (ch == 0x0F97)
5871 || (ch >= 0x0F99 && ch <= 0x0FAD)
5872 || (ch >= 0x0FB1 && ch <= 0x0FB7)
5873 || (ch == 0x0FB9)
5874 /* Georgian */
5875 || (ch >= 0x10A0 && ch <= 0x10C5)
5876 || (ch >= 0x10D0 && ch <= 0x10F6)
5877 /* Hiragana */
5878 || (ch >= 0x3041 && ch <= 0x3093)
5879 || (ch >= 0x309B && ch <= 0x309C)
5880 /* Katakana */
5881 || (ch >= 0x30A1 && ch <= 0x30F6)
5882 || (ch >= 0x30FB && ch <= 0x30FC)
5883 /* Bopomofo */
5884 || (ch >= 0x3105 && ch <= 0x312C)
5885 /* CJK Unified Ideographs */
5886 || (ch >= 0x4E00 && ch <= 0x9FA5)
5887 /* Hangul */
5888 || (ch >= 0xAC00 && ch <= 0xD7A3)
5889 /* Digits */
5890 || (ch >= 0x0660 && ch <= 0x0669)
5891 || (ch >= 0x06F0 && ch <= 0x06F9)
5892 || (ch >= 0x0966 && ch <= 0x096F)
5893 || (ch >= 0x09E6 && ch <= 0x09EF)
5894 || (ch >= 0x0A66 && ch <= 0x0A6F)
5895 || (ch >= 0x0AE6 && ch <= 0x0AEF)
5896 || (ch >= 0x0B66 && ch <= 0x0B6F)
5897 || (ch >= 0x0BE7 && ch <= 0x0BEF)
5898 || (ch >= 0x0C66 && ch <= 0x0C6F)
5899 || (ch >= 0x0CE6 && ch <= 0x0CEF)
5900 || (ch >= 0x0D66 && ch <= 0x0D6F)
5901 || (ch >= 0x0E50 && ch <= 0x0E59)
5902 || (ch >= 0x0ED0 && ch <= 0x0ED9)
5903 || (ch >= 0x0F20 && ch <= 0x0F33)
5904 /* Special characters */
5905 || (ch == 0x00B5)
5906 || (ch == 0x00B7)
5907 || (ch >= 0x02B0 && ch <= 0x02B8)
5908 || (ch == 0x02BB)
5909 || (ch >= 0x02BD && ch <= 0x02C1)
5910 || (ch >= 0x02D0 && ch <= 0x02D1)
5911 || (ch >= 0x02E0 && ch <= 0x02E4)
5912 || (ch == 0x037A)
5913 || (ch == 0x0559)
5914 || (ch == 0x093D)
5915 || (ch == 0x0B3D)
5916 || (ch == 0x1FBE)
5917 || (ch >= 0x203F && ch <= 0x2040)
5918 || (ch == 0x2102)
5919 || (ch == 0x2107)
5920 || (ch >= 0x210A && ch <= 0x2113)
5921 || (ch == 0x2115)
5922 || (ch >= 0x2118 && ch <= 0x211D)
5923 || (ch == 0x2124)
5924 || (ch == 0x2126)
5925 || (ch == 0x2128)
5926 || (ch >= 0x212A && ch <= 0x2131)
5927 || (ch >= 0x2133 && ch <= 0x2138)
5928 || (ch >= 0x2160 && ch <= 0x2182)
5929 || (ch >= 0x3005 && ch <= 0x3007)
5930 || (ch >= 0x3021 && ch <= 0x3029)
5932 return UC_IDENTIFIER_START;
5933 return UC_IDENTIFIER_INVALID;
/* Return true if CH is white space in the sense of
   The Java Language Specification, 3rd edition, §3.6.
   https://docs.oracle.com/javase/specs/jls/se6/html/lexical.html#3.6 */
static bool
is_java_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':
    case '\t':
    case '\f':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
5945 /* The Java Language Specification, 3rd edition, §3.8.
5946 https://docs.oracle.com/javase/specs/jls/se6/html/lexical.html#3.8
5947 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
5948 static int
5949 java_ident_category (unsigned int ch)
5951 /* FIXME: Check this against Sun's JDK implementation. */
5952 if (is_category_L (ch) /* = Character.isLetter(ch) */
5953 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
5954 || is_category_Sc (ch) /* currency symbol */
5955 || is_category_Pc (ch) /* connector punctuation */
5957 return UC_IDENTIFIER_START;
5958 if (is_category_Nd (ch) /* digit */
5959 || is_category_Mc (ch) /* combining mark */
5960 || is_category_Mn (ch) /* non-spacing mark */
5962 return UC_IDENTIFIER_VALID;
5963 if ((ch >= 0x0000 && ch <= 0x0008)
5964 || (ch >= 0x000E && ch <= 0x001B)
5965 || (ch >= 0x007F && ch <= 0x009F)
5966 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
5968 return UC_IDENTIFIER_IGNORABLE;
5969 return UC_IDENTIFIER_INVALID;
5972 /* Construction of sparse 3-level tables. */
5973 #define TABLE identsyntax_table
5974 #define ELEMENT uint8_t
5975 #define DEFAULT UC_IDENTIFIER_INVALID
5976 #define xmalloc malloc
5977 #define xrealloc realloc
5978 #include "3level.h"
5980 /* Output an identifier syntax categorization in a three-level bitmap. */
5981 static void
5982 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
5984 FILE *stream;
5985 unsigned int ch, i;
5986 struct identsyntax_table t;
5987 unsigned int level1_offset, level2_offset, level3_offset;
5989 stream = fopen (filename, "w");
5990 if (stream == NULL)
5992 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5993 exit (1);
5996 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5997 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
5998 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5999 version);
6000 fprintf (stream, "\n");
6002 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
6003 fprintf (stream, "\n");
6004 output_library_license (stream, false);
6005 fprintf (stream, "\n");
6007 t.p = 7; /* or 8 */
6008 t.q = 5; /* or 4 */
6009 identsyntax_table_init (&t);
6011 for (ch = 0; ch < 0x110000; ch++)
6013 int syntaxcode = predicate (ch);
6015 assert (syntaxcode <= 0x03);
6017 if (syntaxcode != UC_IDENTIFIER_INVALID)
6018 identsyntax_table_add (&t, ch, syntaxcode);
6021 identsyntax_table_finalize (&t);
6023 /* Offsets in t.result, in memory of this process. */
6024 level1_offset =
6025 5 * sizeof (uint32_t);
6026 level2_offset =
6027 5 * sizeof (uint32_t)
6028 + t.level1_size * sizeof (uint32_t);
6029 level3_offset =
6030 5 * sizeof (uint32_t)
6031 + t.level1_size * sizeof (uint32_t)
6032 + (t.level2_size << t.q) * sizeof (uint32_t);
6034 for (i = 0; i < 5; i++)
6035 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
6036 ((uint32_t *) t.result)[i]);
6037 fprintf (stream, "static const\n");
6038 fprintf (stream, "struct\n");
6039 fprintf (stream, " {\n");
6040 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6041 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
6042 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
6043 (1 << t.p) * 2 / 16);
6044 fprintf (stream, " }\n");
6045 fprintf (stream, "%s =\n", name);
6046 fprintf (stream, "{\n");
6047 fprintf (stream, " {");
6048 if (t.level1_size > 8)
6049 fprintf (stream, "\n ");
6050 for (i = 0; i < t.level1_size; i++)
6052 uint32_t offset;
6053 if (i > 0 && (i % 8) == 0)
6054 fprintf (stream, "\n ");
6055 offset = ((uint32_t *) (t.result + level1_offset))[i];
6056 if (offset == 0)
6057 fprintf (stream, " %5d", -1);
6058 else
6059 fprintf (stream, " %5zu",
6060 (offset - level2_offset) / sizeof (uint32_t));
6061 if (i+1 < t.level1_size)
6062 fprintf (stream, ",");
6064 if (t.level1_size > 8)
6065 fprintf (stream, "\n ");
6066 fprintf (stream, " },\n");
6067 fprintf (stream, " {");
6068 if (t.level2_size << t.q > 8)
6069 fprintf (stream, "\n ");
6070 for (i = 0; i < t.level2_size << t.q; i++)
6072 uint32_t offset;
6073 if (i > 0 && (i % 8) == 0)
6074 fprintf (stream, "\n ");
6075 offset = ((uint32_t *) (t.result + level2_offset))[i];
6076 if (offset == 0)
6077 fprintf (stream, " %5d", -1);
6078 else
6079 fprintf (stream, " %5zu",
6080 (offset - level3_offset) / sizeof (uint8_t));
6081 if (i+1 < t.level2_size << t.q)
6082 fprintf (stream, ",");
6084 if (t.level2_size << t.q > 8)
6085 fprintf (stream, "\n ");
6086 fprintf (stream, " },\n");
6087 /* Pack the level3 array. Each entry needs 2 bits only. */
6088 fprintf (stream, " {");
6089 if ((t.level3_size << t.p) * 2 / 16 > 8)
6090 fprintf (stream, "\n ");
6091 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
6093 if (i > 0 && (i % 8) == 0)
6094 fprintf (stream, "\n ");
6095 fprintf (stream, " 0x%04x",
6096 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
6097 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
6098 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
6099 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
6100 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
6101 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
6102 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
6103 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
6104 if (i+1 < (t.level3_size << t.p) * 2 / 16)
6105 fprintf (stream, ",");
6107 if ((t.level3_size << t.p) * 2 / 16 > 8)
6108 fprintf (stream, "\n ");
6109 fprintf (stream, " }\n");
6110 fprintf (stream, "};\n");
6112 if (ferror (stream) || fclose (stream))
6114 fprintf (stderr, "error writing to '%s'\n", filename);
6115 exit (1);
6119 static void
6120 output_ident_properties (const char *version)
6122 #define PROPERTY(P) \
6123 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
6124 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
6125 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
6126 PROPERTY(c_whitespace)
6127 PROPERTY(java_whitespace)
6128 #undef PROPERTY
6130 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
6131 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
6134 /* ========================================================================= */
6136 /* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's
6137 glibc/localedata/locales/i18n file, generated by
6138 glibc/localedata/gen-unicode-ctype.c. */
6140 /* Character mappings. */
6142 static unsigned int
6143 to_upper (unsigned int ch)
6145 if (unicode_attributes[ch].name != NULL
6146 && unicode_attributes[ch].upper != NONE)
6147 return unicode_attributes[ch].upper;
6148 else
6149 return ch;
6152 static unsigned int
6153 to_lower (unsigned int ch)
6155 if (unicode_attributes[ch].name != NULL
6156 && unicode_attributes[ch].lower != NONE)
6157 return unicode_attributes[ch].lower;
6158 else
6159 return ch;
6162 static unsigned int
6163 to_title (unsigned int ch)
6165 if (unicode_attributes[ch].name != NULL
6166 && unicode_attributes[ch].title != NONE)
6167 return unicode_attributes[ch].title;
6168 else
6169 return ch;
6172 /* Character class properties. */
/* A character counts as uppercase if to_lower changes it.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
/* A character counts as lowercase if to_upper changes it.  */
static bool
is_lower (unsigned int ch)
{
  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  if (ch == 0x00DF)
    return true;

  return to_upper (ch) != ch;
}
6188 static bool
6189 is_alpha (unsigned int ch)
6191 return (unicode_attributes[ch].name != NULL
6192 && ((unicode_attributes[ch].category[0] == 'L'
6193 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
6194 <U0E2F>, <U0E46> should belong to is_punct. */
6195 && (ch != 0x0E2F) && (ch != 0x0E46))
6196 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
6197 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
6198 || (ch == 0x0E31)
6199 || (ch >= 0x0E34 && ch <= 0x0E3A)
6200 || (ch >= 0x0E47 && ch <= 0x0E4E)
6201 /* Avoid warning for <U0345>. */
6202 || (ch == 0x0345)
6203 /* Avoid warnings for <U2160>..<U217F>. */
6204 || (unicode_attributes[ch].category[0] == 'N'
6205 && unicode_attributes[ch].category[1] == 'l')
6206 /* Avoid warnings for <U24B6>..<U24E9>. */
6207 || (unicode_attributes[ch].category[0] == 'S'
6208 && unicode_attributes[ch].category[1] == 'o'
6209 && strstr (unicode_attributes[ch].name, " LETTER ")
6210 != NULL)
6211 /* Consider all the non-ASCII digits as alphabetic.
6212 ISO C 99 forbids us to have them in category "digit",
6213 but we want iswalnum to return true on them. */
6214 || (unicode_attributes[ch].category[0] == 'N'
6215 && unicode_attributes[ch].category[1] == 'd'
6216 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH is one of the ten ASCII decimal digits.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  /* Disabled alternative: all characters of category Nd.  */
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
     7.25.2.1.5:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return 0x0030 <= ch && ch <= 0x0039;
#endif
}
/* Return true if CH satisfies is_alpha or is_digit.  */
static bool
is_alnum (unsigned int ch)
{
  if (is_alpha (ch))
    return true;
  return is_digit (ch);
}
6247 static bool
6248 is_blank (unsigned int ch)
6250 return (ch == 0x0009 /* '\t' */
6251 /* Category Zs without mention of "<noBreak>" */
6252 || (unicode_attributes[ch].name != NULL
6253 && unicode_attributes[ch].category[0] == 'Z'
6254 && unicode_attributes[ch].category[1] == 's'
6255 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
6258 static bool
6259 is_space (unsigned int ch)
6261 /* Don't make U+00A0 a space. Non-breaking space means that all programs
6262 should treat it like a punctuation character, not like a space. */
6263 return (ch == 0x0020 /* ' ' */
6264 || ch == 0x000C /* '\f' */
6265 || ch == 0x000A /* '\n' */
6266 || ch == 0x000D /* '\r' */
6267 || ch == 0x0009 /* '\t' */
6268 || ch == 0x000B /* '\v' */
6269 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
6270 || (unicode_attributes[ch].name != NULL
6271 && unicode_attributes[ch].category[0] == 'Z'
6272 && (unicode_attributes[ch].category[1] == 'l'
6273 || unicode_attributes[ch].category[1] == 'p'
6274 || (unicode_attributes[ch].category[1] == 's'
6275 && !strstr (unicode_attributes[ch].decomposition,
6276 "<noBreak>")))));
6279 static bool
6280 is_cntrl (unsigned int ch)
6282 return (unicode_attributes[ch].name != NULL
6283 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
6284 /* Categories Zl and Zp */
6285 || (unicode_attributes[ch].category[0] == 'Z'
6286 && (unicode_attributes[ch].category[1] == 'l'
6287 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH is one of the 22 ASCII hexadecimal digits.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  /* Disabled alternative, based on is_digit.  */
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     7.25.2.1.12:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  if (ch >= 0x0030 && ch <= 0x0039) /* 0..9 */
    return true;
  if (ch >= 0x0041 && ch <= 0x0046) /* A..F */
    return true;
  return ch >= 0x0061 && ch <= 0x0066; /* a..f */
#endif
}
6312 static bool
6313 is_graph (unsigned int ch)
6315 return (unicode_attributes[ch].name != NULL
6316 && strcmp (unicode_attributes[ch].name, "<control>")
6317 && !is_space (ch));
6320 static bool
6321 is_print (unsigned int ch)
6323 return (unicode_attributes[ch].name != NULL
6324 && strcmp (unicode_attributes[ch].name, "<control>")
6325 /* Categories Zl and Zp */
6326 && !(unicode_attributes[ch].name != NULL
6327 && unicode_attributes[ch].category[0] == 'Z'
6328 && (unicode_attributes[ch].category[1] == 'l'
6329 || unicode_attributes[ch].category[1] == 'p')));
/* Return true if CH satisfies is_graph but neither is_alpha nor is_digit.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  /* Disabled alternative: all characters of the categories P*.  */
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  if (!is_graph (ch))
    return false;
  if (is_alpha (ch))
    return false;
  return !is_digit (ch);
#endif
}
6345 /* Output all properties. */
6346 static void
6347 output_old_ctype (const char *version)
6349 #define PROPERTY(P) \
6350 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
6351 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
6352 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
6353 PROPERTY(alnum)
6354 PROPERTY(alpha)
6355 PROPERTY(cntrl)
6356 PROPERTY(digit)
6357 PROPERTY(graph)
6358 PROPERTY(lower)
6359 PROPERTY(print)
6360 PROPERTY(punct)
6361 PROPERTY(space)
6362 PROPERTY(upper)
6363 PROPERTY(xdigit)
6364 PROPERTY(blank)
6365 #undef PROPERTY
6368 #if 0
6370 static bool
6371 is_combining (unsigned int ch)
6373 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
6374 file. In 3.0.1 it was identical to the union of the general categories
6375 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
6376 PropList.txt file, so we take the latter definition. */
6377 return (unicode_attributes[ch].name != NULL
6378 && unicode_attributes[ch].category[0] == 'M'
6379 && (unicode_attributes[ch].category[1] == 'n'
6380 || unicode_attributes[ch].category[1] == 'c'
6381 || unicode_attributes[ch].category[1] == 'e'));
6384 static bool
6385 is_combining_level3 (unsigned int ch)
6387 return is_combining (ch)
6388 && !(unicode_attributes[ch].combining[0] != '\0'
6389 && unicode_attributes[ch].combining[0] != '0'
6390 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character:
   "<Uxxxx>" for i < 0x10000, "<Uxxxxxxxx>" otherwise.
   The result lives in a static buffer that is overwritten by the next
   call.  */
static const char *
ucs_symbol (unsigned int i)
{
  /* The longest result, "<U%08X>", has 11 characters.  */
  static char symbol[11+1];

  if (i < 0x10000)
    snprintf (symbol, sizeof (symbol), "<U%04X>", i);
  else
    snprintf (symbol, sizeof (symbol), "<U%08X>", i);
  return symbol;
}
/* Return the UCS symbol range string "<low>..<high>" for a Unicode
   characters interval.
   The result lives in a static buffer that is overwritten by the next
   call.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Two symbols of at most 11 characters each, plus "..".  */
  static char range[24+1];
  char *cursor = range;

  /* Each ucs_symbol result is copied out before the next ucs_symbol call.  */
  strcpy (cursor, ucs_symbol (low));
  cursor += strlen (cursor);
  strcpy (cursor, "..");
  cursor += 2;
  strcpy (cursor, ucs_symbol (high));
  return range;
}
/* Output a character class (= property) table.
   Writes CLASSNAME followed by the ';'-separated list of all maximal
   intervals of characters for which FUNC returns true, wrapping lines
   with "/" continuation once they would exceed max_column.  */

static void
output_charclass (FILE *stream, const char *classname,
                  bool (*func) (unsigned int))
{
  const int max_column = 75;
  char member[0x110000];
  unsigned int ch;
  bool first_item = true;
  /* Larger than max_column, so that the first item starts a new line.  */
  int column = 1000;

  for (ch = 0; ch < 0x110000; ch++)
    member[ch] = (int) func (ch);

  fprintf (stream, "%s ", classname);

  ch = 0;
  while (ch < 0x110000)
    {
      unsigned int low, high;
      char item[25];
      size_t item_len;

      if (!member[ch])
        {
          ch++;
          continue;
        }

      /* Find the end of the interval of members that starts at ch.  */
      low = ch;
      for (ch++; ch < 0x110000 && member[ch]; ch++)
        ;
      high = ch - 1;

      strcpy (item,
              low == high ? ucs_symbol (low) : ucs_symbol_range (low, high));
      item_len = strlen (item);

      if (!first_item)
        {
          fprintf (stream, ";");
          column++;
        }

      if (column + item_len > max_column)
        {
          fprintf (stream, "/\n ");
          column = 3;
        }

      fprintf (stream, "%s", item);
      column += item_len;
      first_item = false;
    }
  fprintf (stream, "\n");
}
/* Output a character mapping table.
   Writes MAPNAME followed by the ';'-separated list of pairs
   "(<from>,<to>)" for all characters that FUNC does not map to themselves,
   wrapping lines with "/" continuation once they would exceed
   max_column.  */

static void
output_charmap (FILE *stream, const char *mapname,
                unsigned int (*func) (unsigned int))
{
  const int max_column = 75;
  char mapped[0x110000];
  unsigned int ch;
  bool first_item = true;
  /* Larger than max_column, so that the first item starts a new line.  */
  int column = 1000;

  for (ch = 0; ch < 0x110000; ch++)
    mapped[ch] = (func (ch) != ch);

  fprintf (stream, "%s ", mapname);

  for (ch = 0; ch < 0x110000; ch++)
    {
      /* "(" + symbol + "," + symbol + ")", each symbol at most 11 chars.  */
      char pair[25+1];
      size_t pair_len;

      if (!mapped[ch])
        continue;

      /* Each ucs_symbol result is appended before the next ucs_symbol
         call.  */
      strcpy (pair, "(");
      strcat (pair, ucs_symbol (ch));
      strcat (pair, ",");
      strcat (pair, ucs_symbol (func (ch)));
      strcat (pair, ")");
      pair_len = strlen (pair);

      if (!first_item)
        {
          fprintf (stream, ";");
          column++;
        }

      if (column + pair_len > max_column)
        {
          fprintf (stream, "/\n ");
          column = 3;
        }

      fprintf (stream, "%s", pair);
      column += pair_len;
      first_item = false;
    }
  fprintf (stream, "\n");
}
/* Output the width table.
   This function currently writes nothing to STREAM.  */

static void
output_widthmap (FILE *stream)
{
  (void) stream;
}
6528 /* Output the tables to the given file. */
6530 static void
6531 output_tables (const char *filename, const char *version)
6533 FILE *stream;
6534 unsigned int ch;
6536 stream = fopen (filename, "w");
6537 if (stream == NULL)
6539 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6540 exit (1);
6543 fprintf (stream, "escape_char /\n");
6544 fprintf (stream, "comment_char %%\n");
6545 fprintf (stream, "\n");
6546 fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
6547 version);
6548 fprintf (stream, "\n");
6550 fprintf (stream, "LC_IDENTIFICATION\n");
6551 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
6552 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
6553 fprintf (stream, "address \"\"\n");
6554 fprintf (stream, "contact \"\"\n");
6555 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
6556 fprintf (stream, "tel \"\"\n");
6557 fprintf (stream, "fax \"\"\n");
6558 fprintf (stream, "language \"\"\n");
6559 fprintf (stream, "territory \"Earth\"\n");
6560 fprintf (stream, "revision \"%s\"\n", version);
6562 time_t now;
6563 char date[11];
6564 now = time (NULL);
6565 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
6566 fprintf (stream, "date \"%s\"\n", date);
6568 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
6569 fprintf (stream, "END LC_IDENTIFICATION\n");
6570 fprintf (stream, "\n");
6572 /* Verification. */
6573 for (ch = 0; ch < 0x110000; ch++)
6575 /* toupper restriction: "Only characters specified for the keywords
6576 lower and upper shall be specified. */
6577 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
6578 fprintf (stderr,
6579 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
6580 ucs_symbol (ch), ch, to_upper (ch));
6582 /* tolower restriction: "Only characters specified for the keywords
6583 lower and upper shall be specified. */
6584 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
6585 fprintf (stderr,
6586 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
6587 ucs_symbol (ch), ch, to_lower (ch));
6589 /* alpha restriction: "Characters classified as either upper or lower
6590 shall automatically belong to this class. */
6591 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
6592 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
6594 /* alpha restriction: "No character specified for the keywords cntrl,
6595 digit, punct or space shall be specified." */
6596 if (is_alpha (ch) && is_cntrl (ch))
6597 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
6598 if (is_alpha (ch) && is_digit (ch))
6599 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
6600 if (is_alpha (ch) && is_punct (ch))
6601 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
6602 if (is_alpha (ch) && is_space (ch))
6603 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
6605 /* space restriction: "No character specified for the keywords upper,
6606 lower, alpha, digit, graph or xdigit shall be specified."
6607 upper, lower, alpha already checked above. */
6608 if (is_space (ch) && is_digit (ch))
6609 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
6610 if (is_space (ch) && is_graph (ch))
6611 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
6612 if (is_space (ch) && is_xdigit (ch))
6613 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
6615 /* cntrl restriction: "No character specified for the keywords upper,
6616 lower, alpha, digit, punct, graph, print or xdigit shall be
6617 specified." upper, lower, alpha already checked above. */
6618 if (is_cntrl (ch) && is_digit (ch))
6619 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
6620 if (is_cntrl (ch) && is_punct (ch))
6621 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
6622 if (is_cntrl (ch) && is_graph (ch))
6623 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
6624 if (is_cntrl (ch) && is_print (ch))
6625 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
6626 if (is_cntrl (ch) && is_xdigit (ch))
6627 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
6629 /* punct restriction: "No character specified for the keywords upper,
6630 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
6631 be specified." upper, lower, alpha, cntrl already checked above. */
6632 if (is_punct (ch) && is_digit (ch))
6633 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
6634 if (is_punct (ch) && is_xdigit (ch))
6635 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
6636 if (is_punct (ch) && (ch == 0x0020))
6637 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
6639 /* graph restriction: "No character specified for the keyword cntrl
6640 shall be specified." Already checked above. */
6642 /* print restriction: "No character specified for the keyword cntrl
6643 shall be specified." Already checked above. */
6645 /* graph - print relation: differ only in the <space> character.
6646 How is this possible if there are more than one space character?!
6647 I think susv2/xbd/locale.html should speak of "space characters",
6648 not "space character". */
6649 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
6650 fprintf (stderr,
6651 "%s is print but not graph|<space>\n", ucs_symbol (ch));
6652 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
6653 fprintf (stderr,
6654 "%s is graph|<space> but not print\n", ucs_symbol (ch));
6657 fprintf (stream, "LC_CTYPE\n");
6658 output_charclass (stream, "upper", is_upper);
6659 output_charclass (stream, "lower", is_lower);
6660 output_charclass (stream, "alpha", is_alpha);
6661 output_charclass (stream, "digit", is_digit);
6662 output_charclass (stream, "outdigit", is_outdigit);
6663 output_charclass (stream, "blank", is_blank);
6664 output_charclass (stream, "space", is_space);
6665 output_charclass (stream, "cntrl", is_cntrl);
6666 output_charclass (stream, "punct", is_punct);
6667 output_charclass (stream, "xdigit", is_xdigit);
6668 output_charclass (stream, "graph", is_graph);
6669 output_charclass (stream, "print", is_print);
6670 output_charclass (stream, "class \"combining\";", is_combining);
6671 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
6672 output_charmap (stream, "toupper", to_upper);
6673 output_charmap (stream, "tolower", to_lower);
6674 output_charmap (stream, "map \"totitle\";", to_title);
6675 output_widthmap (stream);
6676 fprintf (stream, "END LC_CTYPE\n");
6678 if (ferror (stream) || fclose (stream))
6680 fprintf (stderr, "error writing to '%s'\n", filename);
6681 exit (1);
6685 #endif
6687 /* ========================================================================= */
6689 /* The width property from the EastAsianWidth.txt file.
6690 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
6691 const char * unicode_width[0x110000];
6693 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
6694 file. */
6695 static void
6696 fill_width (const char *width_filename)
6698 unsigned int i, j;
6699 FILE *stream;
6700 char field0[FIELDLEN];
6701 char field1[FIELDLEN];
6702 char field2[FIELDLEN];
6703 int lineno = 0;
6705 for (i = 0; i < 0x110000; i++)
6706 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
6708 stream = fopen (width_filename, "r");
6709 if (stream == NULL)
6711 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
6712 exit (1);
6715 for (;;)
6717 int n;
6718 int c;
6720 lineno++;
6721 c = getc (stream);
6722 if (c == EOF)
6723 break;
6724 if (c == '\n')
6725 continue;
6726 if (c == '#')
6728 do c = getc (stream); while (c != EOF && c != '\n');
6729 continue;
6731 ungetc (c, stream);
6732 n = getfield (stream, field0, ';');
6733 do c = getc (stream); while (c == ' ');
6734 ungetc (c, stream);
6735 n += getfield (stream, field1, '#');
6736 n += getfield (stream, field2, '\n');
6737 if (n == 0)
6738 break;
6739 if (n != 3)
6741 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
6742 exit (1);
6744 /* Remove trailing spaces from field0. */
6745 while (strlen (field0) > 0 && field0[strlen (field0) - 1] == ' ')
6746 field0[strlen (field0) - 1] = '\0';
6747 /* Remove trailing spaces from field1. */
6748 while (strlen (field1) > 0 && field1[strlen (field1) - 1] == ' ')
6749 field1[strlen (field1) - 1] = '\0';
6750 i = strtoul (field0, NULL, 16);
6751 if (strstr (field0, "..") != NULL)
6753 /* Deal with a range. */
6754 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
6755 for (; i <= j; i++)
6756 unicode_width[i] = strdup (field1);
6758 else
6760 /* Single character line. */
6761 unicode_width[i] = strdup (field1);
6765 if (ferror (stream) || fclose (stream))
6767 fprintf (stderr, "error reading from '%s'\n", width_filename);
6768 exit (1);
6772 /* ========================================================================= */
6774 /* Non-spacing attribute and width. */
6776 /* The non-spacing attribute table consists of:
6777 * Non-spacing characters; generated from PropList.txt or
6778 "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
6779 * Format control characters, except for characters with property
6780 Prepended_Concatenation_Mark; generated from
6781 "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt" and from
6782 "grep Prepended_Concatenation_Mark PropList.txt".
6783 Rationale for the Prepended_Concatenation_Mark exception:
6784 The Unicode standard says "Unlike most other format characters,
6785 however, they should be rendered with a visible glyph".
6786 * Zero width characters; generated from
6787 "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
6788 * Hangul Jamo characters that have conjoining behaviour:
6789 - jungseong = syllable-middle vowels
6790 - jongseong = syllable-final consonants
6791 Rationale:
6792 1) These characters act like combining characters. They have no
6793 equivalent in legacy character sets. Therefore the EastAsianWidth.txt
6794 file does not really matter for them; UAX #11 East Asian Width
6795 <https://www.unicode.org/reports/tr11/> makes it clear that it focus
6796 is on compatibility with traditional Japanese layout.
6797 By contrast, the same glyphs without conjoining behaviour are available
6798 in the U+3130..U+318F block, and these characters are mapped to legacy
6799 character sets, and traditional Japanese layout matters for them.
6800 2) glibc does the same thing, see
6801 <https://sourceware.org/bugzilla/show_bug.cgi?id=21750>
6802 <https://sourceware.org/bugzilla/show_bug.cgi?id=26120>
6805 static bool
6806 is_nonspacing (unsigned int ch)
6808 return (unicode_attributes[ch].name != NULL
6809 && (get_bidi_category (ch) == UC_BIDI_NSM
6810 || is_category_Cc (ch)
6811 || (is_category_Cf (ch)
6812 && !is_property_prepended_concatenation_mark (ch))
6813 || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0
6814 || (ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6) /* jungseong */
6815 || (ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB) /* jongseong */
6816 ) );
/* Writes to FILENAME a C source fragment that describes the set of
   characters for which is_nonspacing is true, as a two-level table:
     - nonspacing_table_data[] holds, for each block of 0x200 characters
       that contains at least one such character, a bitmap of 64 bytes
       (bit l of byte number (ch % 0x200) / 8 stands for the character
       with ch % 8 == l);
     - nonspacing_table_ind[] maps the block number ch / 0x200 to the
       number of the block's bitmap in nonspacing_table_data[], or to -1
       if the block contains no such character.
   The block that starts at 0xe0000 is always marked as empty here
   ("handled by code").
   The length of nonspacing_table_ind[] is the number of blocks up to and
   including the last non-empty one, rounded up to a multiple of 8.
   Exits with an error message if the file cannot be written.  */
static void
output_nonspacing_property (const char *filename, const char *version)
{
  FILE *stream;
  int ind[0x110000 / 0x200];
  unsigned int i;
  unsigned int ind_count;
  int next_ind;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Table of non-spacing or control characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  output_library_license (stream, true);
  fprintf (stream, "\n");

  /* Pass 1: Assign a bitmap number to each non-empty block.  */
  next_ind = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = false;
      unsigned int ch;

      if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code.  */
        for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
          if (is_nonspacing (ch))
            {
              nontrivial = true;
              break;
            }
      if (nontrivial)
        ind[i] = next_ind++;
      else
        ind[i] = -1;
    }

  /* Pass 2: Output the bitmaps, 8 rows of 8 bytes per block.  */
  fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
           next_ind);
  ind_count = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = (ind[i] >= 0);

      if (nontrivial)
        {
          unsigned int j;

          fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
          for (j = 0; j < 8; j++)
            {
              unsigned int k;

              fprintf (stream, " ");
              for (k = 0; k < 8; k++)
                {
                  unsigned int l;
                  unsigned char bits = 0;

                  for (l = 0; l < 8; l++)
                    {
                      unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;

                      if (is_nonspacing (ch))
                        bits |= 1 << l;
                    }
                  /* No comma after the very last byte of the table.  */
                  fprintf (stream, " 0x%02x%c", bits,
                           ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
                }
              fprintf (stream, " /* 0x%04x-0x%04x */\n",
                       i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
            }
          /* The index table must extend through this block.  */
          ind_count = i + 1;
        }
    }
  fprintf (stream, "};\n");

  /* Pass 3: Output the index table.  Round the number of entries (not the
     last index: otherwise the entry of the last non-empty block would be
     dropped whenever its index is a multiple of 8) up to a multiple of 8.  */
  ind_count = ((ind_count + 8 - 1) / 8) * 8;
  fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
           ind_count);
  {
    unsigned int j;

    for (j = 0; j < ind_count / 8; j++)
      {
        unsigned int k;

        fprintf (stream, " ");
        for (k = 0; k < 8; k++)
          {
            i = j * 8 + k;
            /* No comma after the very last entry of the table.  */
            fprintf (stream, " %2d%c", ind[i],
                     j == ind_count / 8 - 1 && k == 8 - 1 ? ' ' : ',');
          }
        fprintf (stream, " /* 0x%04x-0x%04x */\n",
                 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
      }
  }
  fprintf (stream, "};\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Determines whether a character has width 2, regardless of context.
   Generated from "grep '^[^;]\+;[WF]' EastAsianWidth.txt"
   and "grep '^[^;]\+;[^WF]' EastAsianWidth.txt"
   The characters are listed as inclusive intervals, in ascending order.
   Where single characters are excluded from a larger interval, the
   interval is split around them.  */
static bool
is_width2 (unsigned int ch)
{
  static const struct { unsigned int first; unsigned int last; } wide[] =
    {
      { 0x1100, 0x115F }, /* Hangul Jamo */
      { 0x231A, 0x231B }, /* Watch, Hourglass */
      { 0x2329, 0x232A }, /* Angle Brackets */
      { 0x23E9, 0x23EC }, /* Black double triangles */
      { 0x23F0, 0x23F0 }, /* Alarm clock */
      { 0x23F3, 0x23F3 }, /* Hourglass */
      { 0x25FD, 0x25FE }, /* Medium small squares */
      /* Miscellaneous symbols, dingbats */
      { 0x2614, 0x2615 },
      { 0x2648, 0x2653 },
      { 0x267F, 0x267F },
      { 0x2693, 0x2693 },
      { 0x26A1, 0x26A1 },
      { 0x26AA, 0x26AB },
      { 0x26BD, 0x26BE },
      { 0x26C4, 0x26C5 },
      { 0x26CE, 0x26CE },
      { 0x26D4, 0x26D4 },
      { 0x26EA, 0x26EA },
      { 0x26F2, 0x26F3 },
      { 0x26F5, 0x26F5 },
      { 0x26FA, 0x26FA },
      { 0x26FD, 0x26FD },
      { 0x2705, 0x2705 },
      { 0x270A, 0x270B },
      { 0x2728, 0x2728 },
      { 0x274C, 0x274C },
      { 0x274E, 0x274E },
      { 0x2753, 0x2755 },
      { 0x2757, 0x2757 },
      { 0x2795, 0x2797 },
      { 0x27B0, 0x27B0 },
      { 0x27BF, 0x27BF },
      { 0x2B1B, 0x2B1C }, /* Large squares */
      { 0x2B50, 0x2B50 },
      { 0x2B55, 0x2B55 },
      /* CJK ... Yi: 0x2E80..0xA4CF without 0x303F, 0x3248..0x324F,
         0x4DC0..0x4DFF */
      { 0x2E80, 0x303E },
      { 0x3040, 0x3247 },
      { 0x3250, 0x4DBF },
      { 0x4E00, 0xA4CF },
      { 0xA960, 0xA97C }, /* Hangul Jamo Extended-A */
      { 0xAC00, 0xD7A3 }, /* Hangul Syllables */
      { 0xF900, 0xFAFF }, /* CJK Compatibility Ideographs */
      { 0xFE10, 0xFE1F }, /* Presentation Forms for Vertical */
      { 0xFE30, 0xFE6F }, /* CJK Compatibility Forms */
      { 0xFF00, 0xFF60 }, /* Fullwidth Forms */
      { 0xFFE0, 0xFFE6 }, /* Fullwidth Signs */
      { 0x16FE0, 0x16FE3 }, /* Tangut mark, Nushu mark */
      { 0x16FF0, 0x16FF1 }, /* Vietnamese alternate reading marks */
      { 0x17000, 0x187F7 }, /* Tangut */
      { 0x18800, 0x18CD5 }, /* Tangut components */
      { 0x18D00, 0x18D08 }, /* Tangut Ideograph Supplement */
      /* Katakana letter Minnan: 0x1AFF0..0x1AFFE without 0x1AFF4,
         0x1AFFC */
      { 0x1AFF0, 0x1AFF3 },
      { 0x1AFF5, 0x1AFFB },
      { 0x1AFFD, 0x1AFFE },
      { 0x1B000, 0x1B122 }, /* Kana supplement, Kana Extended-A */
      { 0x1B150, 0x1B152 }, /* Small Hiragana */
      { 0x1B164, 0x1B167 }, /* Small Katakana */
      { 0x1B170, 0x1B2FB }, /* Nushu */
      { 0x1F004, 0x1F004 },
      { 0x1F0CF, 0x1F0CF },
      { 0x1F18E, 0x1F18E },
      { 0x1F191, 0x1F19A },
      /* Miscellaneous symbols and pictographs */
      { 0x1F200, 0x1F320 },
      { 0x1F32D, 0x1F335 },
      { 0x1F337, 0x1F37C },
      { 0x1F37E, 0x1F393 },
      { 0x1F3A0, 0x1F3CA },
      { 0x1F3CF, 0x1F3D3 },
      { 0x1F3E0, 0x1F3F0 },
      { 0x1F3F4, 0x1F3F4 },
      { 0x1F3F8, 0x1F43E },
      { 0x1F440, 0x1F440 },
      { 0x1F442, 0x1F4FC },
      { 0x1F4FF, 0x1F53D },
      { 0x1F54B, 0x1F54E },
      { 0x1F550, 0x1F567 },
      { 0x1F57A, 0x1F57A },
      { 0x1F595, 0x1F596 },
      { 0x1F5A4, 0x1F5A4 },
      { 0x1F5FB, 0x1F64F },
      { 0x1F680, 0x1F6C5 },
      { 0x1F6CC, 0x1F6CC },
      { 0x1F6D0, 0x1F6D2 },
      { 0x1F6D5, 0x1F6D7 },
      { 0x1F6DD, 0x1F6DF },
      { 0x1F6EB, 0x1F6EC },
      { 0x1F6F4, 0x1F6FC },
      { 0x1F7E0, 0x1F7EB },
      { 0x1F7F0, 0x1F7F0 },
      /* 0x1F90C..0x1F9FF without 0x1F93B, 0x1F946 */
      { 0x1F90C, 0x1F93A },
      { 0x1F93C, 0x1F945 },
      { 0x1F947, 0x1F9FF },
      { 0x1FA70, 0x1FA74 },
      { 0x1FA78, 0x1FA7C },
      { 0x1FA80, 0x1FA86 },
      { 0x1FA90, 0x1FAAC },
      { 0x1FAB0, 0x1FABA },
      { 0x1FAC0, 0x1FAC5 },
      { 0x1FAD0, 0x1FAD9 },
      { 0x1FAE0, 0x1FAE7 },
      { 0x1FAF0, 0x1FAF6 },
      { 0x20000, 0x2FFFF }, /* Supplementary Ideographic Plane */
      { 0x30000, 0x3FFFF }  /* Tertiary Ideographic Plane */
    };
  size_t k;

  for (k = 0; k < sizeof (wide) / sizeof (wide[0]); k++)
    if (ch >= wide[k].first && ch <= wide[k].last)
      return true;
  return false;
}
7048 static void
7049 output_width2_property (const char *filename, const char *version)
7051 output_predicate (filename, is_width2, "u_width2", "Width 2 property", version);
7054 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'. */
7055 static char
7056 symbolic_width (unsigned int ch)
7058 /* Test for unassigned character. */
7059 if (is_property_unassigned_code_value (ch))
7061 /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
7062 if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
7063 return 'A';
7064 if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
7065 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
7066 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
7067 || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
7068 || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */)
7069 return '2';
7070 return 0;
7072 else
7074 /* Test for non-spacing or control character. */
7075 if (is_category_Cc (ch) && ch < 0x00A0)
7076 return 0;
7077 if (is_nonspacing (ch))
7078 return '0';
7079 /* Test for double-width character. */
7080 if (unicode_width[ch] != NULL
7081 && (strcmp (unicode_width[ch], "W") == 0
7082 || strcmp (unicode_width[ch], "F") == 0))
7083 return '2';
7084 /* Test for half-width character. */
7085 if (unicode_width[ch] != NULL
7086 && strcmp (unicode_width[ch], "H") == 0)
7087 return '1';
7089 /* In ancient CJK encodings, Cyrillic and most other characters are
7090 double-width as well. */
7091 if (ch >= 0x00A1 && ch < 0x10000)
7092 return 'A';
7093 return '1';
/* Writes one line for the interval START..END of characters with the
   symbolic width VALUE: "XXXX<TAB><TAB>V" for a single character,
   "XXXX..YYYY<TAB>V" otherwise.  */
static void
output_width_interval (FILE *stream, unsigned int start, unsigned int end,
                       char value)
{
  if (end == start)
    fprintf (stream, "%04X\t\t%c\n", start, value);
  else
    fprintf (stream, "%04X..%04X\t%c\n", start, end, value);
}

/* Writes to FILENAME the list of intervals of characters with equal
   symbolic_width value.  Characters for which symbolic_width returns 0 are
   skipped: they neither start nor terminate an interval.
   Exits with an error message if the file cannot be written.  */
static void
output_width_property_test (const char *filename)
{
  FILE *stream;
  unsigned int ch;
  unsigned int interval_start = 0;
  unsigned int interval_end = 0;
  /* 0 means that no interval has been started yet.  */
  char interval_value = 0;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    {
      char value = symbolic_width (ch);

      /* Skip Cc control characters and unassigned characters.  */
      if (value == 0)
        continue;

      if (value == interval_value)
        {
          /* Extend the interval.  */
          interval_end = ch;
          continue;
        }

      /* Terminate the interval.  */
      if (interval_value != 0)
        output_width_interval (stream, interval_start, interval_end,
                               interval_value);
      /* Start a new interval.  */
      interval_start = interval_end = ch;
      interval_value = value;
    }

  /* Terminate the last interval.  */
  if (interval_value != 0)
    output_width_interval (stream, interval_start, interval_end,
                           interval_value);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7152 /* ========================================================================= */
7154 /* Line breaking classification.
7155 Updated for Unicode TR #14 revision 51. */
/* The line breaking classes, listed by numeric value.
   Not represented here:
     NL (next line), because it's equivalent to LBP_BK;
     SG (surrogates), because they are not characters;
     CJ (conditional Japanese starter), which is resolved to NS.  */
enum
{
  LBP_WJ  =  0, /* word joiner */
  LBP_GL  =  1, /* non-breaking (glue) */
  LBP_B2  =  2, /* break opportunity before and after */
  LBP_BA  =  3, /* break opportunity after */
  LBP_BB  =  4, /* break opportunity before */
  LBP_HY  =  5, /* hyphen */
  LBP_CL  =  6, /* closing punctuation */
  LBP_CP1 =  7, /* closing parenthesis, non-EastAsian character */
  LBP_CP2 =  8, /* closing parenthesis, EastAsian character */
  LBP_EX  =  9, /* exclamation/interrogation */
  LBP_IN  = 10, /* inseparable */
  LBP_NS  = 11, /* non starter */
  LBP_OP1 = 12, /* opening punctuation, non-EastAsian character */
  LBP_OP2 = 13, /* opening punctuation, EastAsian character */
  LBP_QU1 = 14, /* ambiguous quotation, neither initial nor final punctuation */
  LBP_QU2 = 15, /* ambiguous quotation, initial punctuation */
  LBP_QU3 = 16, /* ambiguous quotation, final punctuation */
  LBP_IS  = 17, /* infix separator (numeric) */
  LBP_NU  = 18, /* numeric */
  LBP_PO  = 19, /* postfix (numeric) */
  LBP_PR  = 20, /* prefix (numeric) */
  LBP_SY  = 21, /* symbols allowing breaks */
  LBP_AL  = 22, /* ordinary alphabetic and symbol characters */
  LBP_H2  = 23, /* Hangul LV syllable */
  LBP_H3  = 24, /* Hangul LVT syllable */
  LBP_ID1 = 25, /* ideographic */
  LBP_ID2 = 26, /* ideographic and potential future emoji */
  LBP_JL  = 27, /* Hangul L Jamo */
  LBP_JV  = 28, /* Hangul V Jamo */
  LBP_JT  = 29, /* Hangul T Jamo */
  LBP_HL  = 30, /* Hebrew letter */
  LBP_AP  = 31, /* Brahmic scripts: pre-base repha */
  LBP_AK  = 32, /* Brahmic scripts: consonants */
  LBP_AS  = 33, /* Brahmic scripts: independent vowels */
  LBP_VI  = 34, /* Brahmic scripts: conjoining viramas */
  LBP_VF  = 35, /* Brahmic scripts: viramas for final consonants */
  LBP_RI  = 36, /* regional indicator */
  LBP_ZWJ = 37, /* zero width joiner */
  LBP_EB  = 38, /* emoji base */
  LBP_EM  = 39, /* emoji modifier */

  /* Values >= 40 are resolved at run time.  */
  LBP_BK  = 40, /* mandatory break */
  LBP_CR  = 41, /* carriage return */
  LBP_LF  = 42, /* line feed */
  LBP_CM  = 43, /* attached characters and combining marks */
  LBP_ZW  = 44, /* zero width space */
  LBP_SP  = 45, /* space */
  LBP_CB  = 46, /* contingent break opportunity */
  LBP_AI  = 47, /* ambiguous (alphabetic or ideograph) */
  LBP_SA  = 48, /* complex context (South East Asian) */
  LBP_XX  = 49, /* unknown */

  /* Values >= 100 are shorthands for several values.
     Artificial values that exist only in this file, not in the tables.  */
  LBP_CP  = 100, /* LBP_CP1 or LBP_CP2 */
  LBP_OP  = 101, /* LBP_OP1 or LBP_OP2 */
  LBP_QU  = 102, /* LBP_QU1 or LBP_QU2 or LBP_QU3 */
  LBP_ID  = 103  /* LBP_ID1 or LBP_ID2 */
};
7221 /* Returns the line breaking classification for ch, as a bit mask. */
7222 static int64_t
7223 get_lbp (unsigned int ch)
7225 int64_t attr = 0;
7227 /* U+20BC..U+20CF are reserved for prefixes. */
7228 if (unicode_attributes[ch].name == NULL && (ch >= 0x20BC && ch <= 0x20CF))
7229 return (int64_t) 1 << LBP_PR;
7231 if (unicode_attributes[ch].name != NULL)
7233 /* mandatory break */
7234 if (ch == 0x000A)
7235 attr |= (int64_t) 1 << LBP_LF;
7236 if (ch == 0x000D)
7237 attr |= (int64_t) 1 << LBP_CR;
7238 if (ch == 0x0085 /* newline */
7239 || ch == 0x000B /* LINE TABULATION */
7240 || ch == 0x000C /* FORM FEED */
7241 || ch == 0x2028 /* LINE SEPARATOR */
7242 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
7243 attr |= (int64_t) 1 << LBP_BK;
7245 if (ch == 0x2060 /* WORD JOINER */
7246 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
7247 attr |= (int64_t) 1 << LBP_WJ;
7249 /* zero width space */
7250 if (ch == 0x200B /* ZERO WIDTH SPACE */)
7251 attr |= (int64_t) 1 << LBP_ZW;
7253 /* zero width joiner */
7254 if (ch == 0x200D /* ZERO WIDTH JOINER */)
7255 attr |= (int64_t) 1 << LBP_ZWJ;
7257 /* emoji base */
7258 if (((unicode_properties[ch] >> PROP_EMOJI_MODIFIER_BASE) & 1) != 0) /* EMOJI MODIFIER BASE */
7259 attr |= (int64_t) 1 << LBP_EB;
7261 if (((unicode_properties[ch] >> PROP_EMOJI_MODIFIER) & 1) != 0) /* EMOJI MODIFIER */
7262 attr |= (int64_t) 1 << LBP_EM;
7264 /* non-breaking (glue) */
7265 if (ch == 0x00A0 /* NO-BREAK SPACE */
7266 || ch == 0x202F /* NARROW NO-BREAK SPACE */
7267 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
7268 || ch == 0x1107F /* BRAHMI NUMBER JOINER */
7269 || (ch >= 0x13430 && ch <= 0x13436) /* EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH OVERLAY MIDDLE */
7270 || (ch >= 0x13439 && ch <= 0x1343B) /* EGYPTIAN HIEROGLYPH INSERT AT MIDDLE..EGYPTIAN HIEROGLYPH INSERT AT BOTTOM */
7271 || ch == 0x16FE4 /* KHITAN SMALL SCRIPT FILLER */
7272 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
7273 || ch == 0x2007 /* FIGURE SPACE */
7274 || ch == 0x2011 /* NON-BREAKING HYPHEN */
7275 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
7276 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
7277 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
7278 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
7279 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7280 || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
7281 || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */
7282 || ch == 0x1DCD /* COMBINING DOUBLE CIRCUMFLEX ABOVE */
7283 || ch == 0x1DFC /* COMBINING DOUBLE INVERTED BREVE BELOW */)
7284 attr |= (int64_t) 1 << LBP_GL;
7286 /* space */
7287 if (ch == 0x0020 /* SPACE */)
7288 attr |= (int64_t) 1 << LBP_SP;
7290 /* break opportunity before and after */
7291 if (ch == 0x2014 /* EM DASH */
7292 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7293 || ch == 0x2E3A /* TWO-EM DASH */
7294 || ch == 0x2E3B /* THREE-EM DASH */)
7295 attr |= (int64_t) 1 << LBP_B2;
7297 /* break opportunity after */
7298 if (/* Breaking Spaces */
7299 ch == 0x1680 /* OGHAM SPACE MARK */
7300 || ch == 0x2000 /* EN QUAD */
7301 || ch == 0x2001 /* EM QUAD */
7302 || ch == 0x2002 /* EN SPACE */
7303 || ch == 0x2003 /* EM SPACE */
7304 || ch == 0x2004 /* THREE-PER-EM SPACE */
7305 || ch == 0x2005 /* FOUR-PER-EM SPACE */
7306 || ch == 0x2006 /* SIX-PER-EM SPACE */
7307 || ch == 0x2008 /* PUNCTUATION SPACE */
7308 || ch == 0x2009 /* THIN SPACE */
7309 || ch == 0x200A /* HAIR SPACE */
7310 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
7311 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
7312 /* Tabs */
7313 || ch == 0x0009 /* tab */
7314 /* Conditional Hyphens */
7315 || ch == 0x00AD /* SOFT HYPHEN */
7316 /* Breaking Hyphens */
7317 || ch == 0x058A /* ARMENIAN HYPHEN */
7318 || ch == 0x2010 /* HYPHEN */
7319 || ch == 0x2012 /* FIGURE DASH */
7320 || ch == 0x2013 /* EN DASH */
7321 /* Visible Word Dividers */
7322 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
7323 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
7324 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
7325 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
7326 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
7327 || ch == 0x2027 /* HYPHENATION POINT */
7328 || ch == 0x007C /* VERTICAL LINE */
7329 /* Historic Word Separators */
7330 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
7331 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
7332 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
7333 || ch == 0x2056 /* THREE DOT PUNCTUATION */
7334 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
7335 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
7336 || ch == 0x205A /* TWO DOT PUNCTUATION */
7337 || ch == 0x205B /* FOUR DOT MARK */
7338 || ch == 0x205D /* TRICOLON */
7339 || ch == 0x205E /* VERTICAL FOUR DOTS */
7340 || ch == 0x2E19 /* PALM BRANCH */
7341 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
7342 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
7343 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
7344 || ch == 0x2E2D /* FIVE DOT MARK */
7345 || ch == 0x2E30 /* RING POINT */
7346 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
7347 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
7348 || ch == 0x10102 /* AEGEAN CHECK MARK */
7349 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
7350 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
7351 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
7352 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
7353 /* Dandas */
7354 || ch == 0x0964 /* DEVANAGARI DANDA */
7355 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
7356 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
7357 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
7358 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
7359 || ch == 0x104B /* MYANMAR SIGN SECTION */
7360 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
7361 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
7362 || ch == 0x17D4 /* KHMER SIGN KHAN */
7363 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
7364 || ch == 0x1B5E /* BALINESE CARIK SIKI */
7365 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
7366 || ch == 0xA8CE /* SAURASHTRA DANDA */
7367 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
7368 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
7369 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
7370 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
7371 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
7372 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
7373 /* Tibetan */
7374 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
7375 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
7376 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
7377 || ch == 0x0FBE /* TIBETAN KU RU KHA */
7378 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
7379 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
7380 /* Other Terminating Punctuation */
7381 || ch == 0x1804 /* MONGOLIAN COLON */
7382 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
7383 || ch == 0x1B5A /* BALINESE PANTI */
7384 || ch == 0x1B5B /* BALINESE PAMADA */
7385 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
7386 || ch == 0x1B60 /* BALINESE PAMENENG */
7387 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
7388 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
7389 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
7390 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
7391 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
7392 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
7393 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
7394 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
7395 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
7396 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
7397 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
7398 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
7399 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
7400 || ch == 0xA60D /* VAI COMMA */
7401 || ch == 0xA60F /* VAI QUESTION MARK */
7402 || ch == 0xA92E /* KAYAH LI SIGN CWI */
7403 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
7404 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
7405 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
7406 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
7407 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
7408 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
7409 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
7410 || (ch >= 0x11EF7 && ch <= 0x11EF8) /* MAKASAR PASSIMBANG..MAKASAR END OF SECTION */
7411 /* Letters attached to orthographic syllables */
7412 || ch == 0xA9CF /* JAVANESE PANGRANGKEP */
7413 || (ch >= 0xAA40 && ch <= 0xAA42) /* CHAM LETTER FINAL K..CHAM LETTER FINAL NG */
7414 || (ch >= 0xAA44 && ch <= 0xAA4B) /* CHAM LETTER FINAL CH..CHAM LETTER FINAL SS */
7415 || ch == 0x1133D /* GRANTHA SIGN AVAGRAHA */
7416 || ch == 0x1135D /* GRANTHA SIGN PLUTA */
7417 || ch == 0x11EF2 /* MAKASAR ANGKA */
7418 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7419 || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
7420 || ch == 0x1B7D /* BALINESE PANTI LANTANG */
7421 || ch == 0x1B7E /* BALINESE PAMADA LANTANG */
7422 || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
7423 || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
7424 || ch == 0x2E33 /* RAISED DOT */
7425 || ch == 0x2E34 /* RAISED COMMA */
7426 || ch == 0x2E3C /* STENOGRAPHIC FULL STOP */
7427 || ch == 0x2E3D /* VERTICAL SIX DOTS */
7428 || ch == 0x2E3E /* WIGGLY VERTICAL LINE */
7429 || ch == 0x2E40 /* DOUBLE HYPHEN */
7430 || ch == 0x2E41 /* REVERSED COMMA */
7431 || ch == 0x2E43 /* DASH WITH LEFT UPTURN */
7432 || ch == 0x2E44 /* DOUBLE SUSPENSION MARK */
7433 || ch == 0x2E45 /* INVERTED LOW KAVYKA */
7434 || ch == 0x2E46 /* INVERTED LOW KAVYKA WITH KAVYKA ABOVE */
7435 || ch == 0x2E47 /* LOW KAVYKA */
7436 || ch == 0x2E48 /* LOW KAVYKA WITH DOT */
7437 || ch == 0x2E49 /* DOUBLE STACKED COMMA */
7438 || ch == 0x2E4A /* DOTTED SOLIDUS */
7439 || ch == 0x2E4C /* MEDIEVAL COMMA */
7440 || ch == 0x2E4E /* PUNCTUS ELEVATUS MARK */
7441 || ch == 0x2E4F /* CORNISH VERSE DIVIDER */
7442 || ch == 0x2E5D /* OBLIQUE HYPHEN */
7443 || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
7444 || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
7445 || ch == 0xA6F3 /* BAMUM FULL STOP */
7446 || ch == 0xA6F4 /* BAMUM COLON */
7447 || ch == 0xA6F5 /* BAMUM COMMA */
7448 || ch == 0xA6F6 /* BAMUM SEMICOLON */
7449 || ch == 0xA6F7 /* BAMUM QUESTION MARK */
7450 || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
7451 || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
7452 || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
7453 || ch == 0xAAF0 /* MEETEI MAYEK CHEIKHAN */
7454 || ch == 0xAAF1 /* MEETEI MAYEK AHANG KHUDAM */
7455 || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
7456 || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
7457 || (ch >= 0x10AF0 && ch <= 0x10AF5) /* MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS */
7458 || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
7459 || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
7460 || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
7461 || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
7462 || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
7463 || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
7464 || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
7465 || ch == 0x10EAD /* YEZIDI HYPHENATION MARK */
7466 || ch == 0x11047 /* BRAHMI DANDA */
7467 || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
7468 || ch == 0x110BE /* KAITHI SECTION MARK */
7469 || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
7470 || ch == 0x110C0 /* KAITHI DANDA */
7471 || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
7472 || ch == 0x11140 /* CHAKMA SECTION MARK */
7473 || ch == 0x11141 /* CHAKMA DANDA */
7474 || ch == 0x11142 /* CHAKMA DOUBLE DANDA */
7475 || ch == 0x11143 /* CHAKMA QUESTION MARK */
7476 || ch == 0x111C5 /* SHARADA DANDA */
7477 || ch == 0x111C6 /* SHARADA DOUBLE DANDA */
7478 || ch == 0x111C8 /* SHARADA SEPARATOR */
7479 || (ch >= 0x111DD && ch <= 0x111DF) /* SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 */
7480 || ch == 0x11238 /* KHOJKI DANDA */
7481 || ch == 0x11239 /* KHOJKI DOUBLE DANDA */
7482 || ch == 0x1123B /* KHOJKI SECTION MARK */
7483 || ch == 0x1123C /* KHOJKI DOUBLE SECTION MARK */
7484 || ch == 0x112A9 /* MULTANI SECTION MARK */
7485 || (ch >= 0x1144B && ch <= 0x1144E) /* NEWA DANDA..NEWA GAP FILLER */
7486 || ch == 0x1145A /* NEWA DOUBLE COMMA */
7487 || ch == 0x1145B /* NEWA PLACEHOLDER MARK */
7488 || ch == 0x115C2 /* SIDDHAM DANDA */
7489 || ch == 0x115C3 /* SIDDHAM DOUBLE DANDA */
7490 || (ch >= 0x115C9 && ch <= 0x115D7) /* SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES */
7491 || ch == 0x11641 /* MODI DANDA */
7492 || ch == 0x11642 /* MODI DOUBLE DANDA */
7493 || (ch >= 0x1173C && ch <= 0x1173E) /* AHOM SIGN SMALL SECTION..AHOM SIGN RULAI */
7494 || (ch >= 0x11944 && ch <= 0x11946) /* DIVES AKURU DOUBLE DANDA..DIVES AKURU END OF TEXT MARK */
7495 || ch == 0x11A41 /* ZANABAZAR SQUARE MARK TSHEG */
7496 || ch == 0x11A42 /* ZANABAZAR SQUARE MARK SHAD */
7497 || ch == 0x11A43 /* ZANABAZAR SQUARE MARK DOUBLE SHAD */
7498 || ch == 0x11A44 /* ZANABAZAR SQUARE MARK LONG TSHEG */
7499 || ch == 0x11A9A /* SOYOMBO MARK TSHEG */
7500 || ch == 0x11A9B /* SOYOMBO MARK SHAD */
7501 || ch == 0x11A9C /* SOYOMBO MARK DOUBLE SHAD */
7502 || ch == 0x11AA1 /* SOYOMBO TERMINAL MARK-1 */
7503 || ch == 0x11AA2 /* SOYOMBO TERMINAL MARK-2 */
7504 || (ch >= 0x11C41 && ch <= 0x11C45) /* BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 */
7505 || ch == 0x11F43 /* KAWI DANDA */
7506 || ch == 0x11F44 /* KAWI DOUBLE DANDA */
7507 || ch == 0x11FFF /* TAMIL PUNCTUATION END OF TEXT */
7508 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
7509 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
7510 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */
7511 || ch == 0x12474 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON */
7512 || ch == 0x16A6E /* MRO DANDA */
7513 || ch == 0x16A6F /* MRO DOUBLE DANDA */
7514 || ch == 0x16AF5 /* BASSA VAH FULL STOP */
7515 || ch == 0x16B37 /* PAHAWH HMONG SIGN VOS THOM */
7516 || ch == 0x16B38 /* PAHAWH HMONG SIGN VOS TSHAB CEEB */
7517 || ch == 0x16B39 /* PAHAWH HMONG SIGN CIM CHEEM */
7518 || ch == 0x16B44 /* PAHAWH HMONG SIGN XAUS */
7519 || ch == 0x16E97 /* MEDEFAIDRIN COMMA */
7520 || ch == 0x16E98 /* MEDEFAIDRIN FULL STOP */
7521 || ch == 0x1BC9F /* DUPLOYAN PUNCTUATION CHINOOK FULL STOP */
7522 || (ch >= 0x1DA87 && ch <= 0x1DA8A) /* SIGNWRITING COMMA..SIGNWRITING COLON */)
7523 attr |= (int64_t) 1 << LBP_BA;
7525 /* break opportunity before */
7526 if (/* Dictionary Use */
7527 ch == 0x00B4 /* ACUTE ACCENT */
7528 || ch == 0x1FFD /* GREEK OXIA */
7529 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
7530 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
7531 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
7532 /* Tibetan and Phags-Pa Head Letters */
7533 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
7534 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
7535 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
7536 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
7537 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
7538 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
7539 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
7540 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
7541 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
7542 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
7543 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
7544 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
7545 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
7546 /* Mongolian */
7547 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */
7548 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7549 || ch == 0x0C77 /* TELUGU SIGN SIDDHAM */
7550 || ch == 0x0C84 /* KANNADA SIGN SIDDHAM */
7551 || ch == 0xA8FC /* DEVANAGARI SIGN SIDDHAM */
7552 || ch == 0x11175 /* MAHAJANI SECTION MARK */
7553 || ch == 0x111DB /* SHARADA SIGN SIDDHAM */
7554 || ch == 0x115C1 /* SIDDHAM SIGN SIDDHAM */
7555 || (ch >= 0x11660 && ch <= 0x1166C) /* MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT */
7556 || ch == 0x119E2 /* NANDINAGARI SIGN SIDDHAM */
7557 || ch == 0x11A3F /* ZANABAZAR SQUARE INITIAL HEAD MARK */
7558 || ch == 0x11A45 /* ZANABAZAR SQUARE INITIAL DOUBLE-LINED HEAD MARK */
7559 || ch == 0x11A9E /* SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME */
7560 || ch == 0x11A9F /* SOYOMBO HEAD MARK WITH MOON AND SUN AND FLAME */
7561 || ch == 0x11AA0 /* SOYOMBO HEAD MARK WITH MOON AND SUN */
7562 || (ch >= 0x11B00 && ch <= 0x11B09) /* DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU */
7563 || ch == 0x11C70 /* MARCHEN HEAD MARK */)
7564 attr |= (int64_t) 1 << LBP_BB;
7566 /* hyphen */
7567 if (ch == 0x002D /* HYPHEN-MINUS */)
7568 attr |= (int64_t) 1 << LBP_HY;
7570 /* contingent break opportunity */
7571 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
7572 attr |= (int64_t) 1 << LBP_CB;
7574 /* closing parenthesis */
7575 if (ch == 0x0029 /* RIGHT PARENTHESIS */
7576 || ch == 0x005D /* RIGHT SQUARE BRACKET */)
7578 if (unicode_width[ch] != NULL
7579 && (strcmp (unicode_width[ch], "W") == 0
7580 || strcmp (unicode_width[ch], "F") == 0
7581 || strcmp (unicode_width[ch], "H") == 0))
7582 attr |= (int64_t) 1 << LBP_CP2;
7583 else
7584 attr |= (int64_t) 1 << LBP_CP1;
7587 /* closing punctuation */
7588 if ((unicode_attributes[ch].category[0] == 'P'
7589 && unicode_attributes[ch].category[1] == 'e'
7590 && !(attr & (((int64_t) 1 << LBP_CP1) | ((int64_t) 1 << LBP_CP2))))
7591 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
7592 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
7593 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
7594 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
7595 || ch == 0xFE50 /* SMALL COMMA */
7596 || ch == 0xFE52 /* SMALL FULL STOP */
7597 || ch == 0xFF0C /* FULLWIDTH COMMA */
7598 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
7599 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
7600 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
7601 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7602 || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
7603 || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
7604 || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
7605 || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
7606 || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
7607 || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
7608 || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
7609 || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */
7610 || ch == 0x13438 /* EGYPTIAN HIEROGLYPH END SEGMENT */
7611 || ch == 0x1343D /* EGYPTIAN HIEROGLYPH END ENCLOSURE */
7612 || ch == 0x1343F /* EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE */
7613 || ch == 0x145CF /* ANATOLIAN HIEROGLYPH A410A END LOGOGRAM MARK */)
7614 attr |= (int64_t) 1 << LBP_CL;
7616 /* exclamation/interrogation */
7617 if (ch == 0x0021 /* EXCLAMATION MARK */
7618 || ch == 0x003F /* QUESTION MARK */
7619 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
7620 || ch == 0x061B /* ARABIC SEMICOLON */
7621 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
7622 || ch == 0x061F /* ARABIC QUESTION MARK */
7623 || ch == 0x06D4 /* ARABIC FULL STOP */
7624 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
7625 || ch == 0x0F0D /* TIBETAN MARK SHAD */
7626 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
7627 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */
7628 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7629 || ch == 0x061D /* ARABIC END OF TEXT MARK */
7630 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
7631 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
7632 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
7633 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
7634 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
7635 || ch == 0x1802 /* MONGOLIAN COMMA */
7636 || ch == 0x1803 /* MONGOLIAN FULL STOP */
7637 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
7638 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
7639 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
7640 || ch == 0x1945 /* LIMBU QUESTION MARK */
7641 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
7642 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
7643 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
7644 || ch == 0x2CFE /* COPTIC FULL STOP */
7645 || ch == 0x2E2E /* REVERSED QUESTION MARK */
7646 || ch == 0x2E53 /* MEDIEVAL EXCLAMATION MARK */
7647 || ch == 0x2E54 /* MEDIEVAL QUESTION MARK */
7648 || ch == 0xA60E /* VAI FULL STOP */
7649 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
7650 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
7651 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
7652 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
7653 || ch == 0xFE56 /* SMALL QUESTION MARK */
7654 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
7655 || ch == 0x115C4 /* SIDDHAM SEPARATOR DOT */
7656 || ch == 0x115C5 /* SIDDHAM SEPARATOR BAR */
7657 || ch == 0x11C71 /* MARCHEN MARK SHAD */)
7658 attr |= (int64_t) 1 << LBP_EX;
7660 /* inseparable */
7661 if (ch == 0x2024 /* ONE DOT LEADER */
7662 || ch == 0x2025 /* TWO DOT LEADER */
7663 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
7664 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */
7665 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7666 || ch == 0x22EF /* MIDLINE HORIZONTAL ELLIPSIS */
7667 || ch == 0x10AF6 /* MANICHAEAN PUNCTUATION LINE FILLER */)
7668 attr |= (int64_t) 1 << LBP_IN;
7670 /* non starter */
7671 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
7672 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
7673 || ch == 0x203D /* INTERROBANG */
7674 || ch == 0x2047 /* DOUBLE QUESTION MARK */
7675 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
7676 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
7677 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
7678 || ch == 0x301C /* WAVE DASH */
7679 || ch == 0x303C /* MASU MARK */
7680 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
7681 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
7682 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
7683 || ch == 0x309D /* HIRAGANA ITERATION MARK */
7684 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
7685 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
7686 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
7687 || ch == 0x30FD /* KATAKANA ITERATION MARK */
7688 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
7689 || ch == 0xFE54 /* SMALL SEMICOLON */
7690 || ch == 0xFE55 /* SMALL COLON */
7691 || ch == 0xFF1A /* FULLWIDTH COLON */
7692 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
7693 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
7694 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
7695 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
7696 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7697 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
7698 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL
7699 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
7700 || ch == 0xA015 /* YI SYLLABLE WU */
7701 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
7702 || ch == 0x16FE0 /* TANGUT ITERATION MARK */
7703 || ch == 0x16FE1 /* NUSHU ITERATION MARK */
7704 || ch == 0x16FE2 /* OLD CHINESE HOOK MARK */
7705 || ch == 0x16FE3 /* OLD CHINESE ITERATION MARK */
7706 || ch == 0x1F679 /* HEAVY INTERROBANG ORNAMENT */
7707 || ch == 0x1F67A /* SANS-SERIF INTERROBANG ORNAMENT */
7708 || ch == 0x1F67B /* HEAVY SANS-SERIF INTERROBANG ORNAMENT */)
7709 attr |= (int64_t) 1 << LBP_NS;
7711 /* opening punctuation */
7712 if ((unicode_attributes[ch].category[0] == 'P'
7713 && unicode_attributes[ch].category[1] == 's')
7714 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
7715 || ch == 0x00BF /* INVERTED QUESTION MARK */
7716 || ch == 0x2E18 /* INVERTED INTERROBANG */
7717 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7718 || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
7719 || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
7720 || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
7721 || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
7722 || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
7723 || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */
7724 || ch == 0x1342F /* EGYPTIAN HIEROGLYPH V011D */
7725 || ch == 0x13437 /* EGYPTIAN HIEROGLYPH BEGIN SEGMENT */
7726 || ch == 0x1343C /* EGYPTIAN HIEROGLYPH BEGIN ENCLOSURE */
7727 || ch == 0x1343E /* EGYPTIAN HIEROGLYPH BEGIN WALLED ENCLOSURE */
7728 || ch == 0x145CE /* ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK */
7729 || (ch >= 0x1E95E && ch <= 0x1E95F) /* ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK */)
7731 if (unicode_width[ch] != NULL
7732 && (strcmp (unicode_width[ch], "W") == 0
7733 || strcmp (unicode_width[ch], "F") == 0
7734 || strcmp (unicode_width[ch], "H") == 0))
7735 attr |= (int64_t) 1 << LBP_OP2;
7736 else
7737 attr |= (int64_t) 1 << LBP_OP1;
7740 /* ambiguous quotation */
7741 if ((unicode_attributes[ch].category[0] == 'P'
7742 && (unicode_attributes[ch].category[1] == 'f'
7743 || unicode_attributes[ch].category[1] == 'i'))
7744 || ch == 0x0022 /* QUOTATION MARK */
7745 || ch == 0x0027 /* APOSTROPHE */
7746 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
7747 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
7748 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
7749 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
7750 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
7751 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
7752 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
7753 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
7754 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
7755 || ch == 0x2E0B /* RAISED SQUARE */
7756 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7757 || ch == 0x275F /* HEAVY LOW SINGLE COMMA QUOTATION MARK ORNAMENT */
7758 || ch == 0x2760 /* HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */
7759 || ch == 0x1F676 /* SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
7760 || ch == 0x1F677 /* SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
7761 || ch == 0x1F678 /* SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */)
7763 if (unicode_attributes[ch].category[0] == 'P'
7764 && unicode_attributes[ch].category[1] == 'i')
7765 attr |= (int64_t) 1 << LBP_QU2;
7766 else if (unicode_attributes[ch].category[0] == 'P'
7767 && unicode_attributes[ch].category[1] == 'f')
7768 attr |= (int64_t) 1 << LBP_QU3;
7769 else
7770 attr |= (int64_t) 1 << LBP_QU1;
7773 /* infix separator (numeric) */
7774 if (ch == 0x002C /* COMMA */
7775 || ch == 0x002E /* FULL STOP */
7776 || ch == 0x003A /* COLON */
7777 || ch == 0x003B /* SEMICOLON */
7778 || ch == 0x037E /* GREEK QUESTION MARK */
7779 || ch == 0x0589 /* ARMENIAN FULL STOP */
7780 || ch == 0x060C /* ARABIC COMMA */
7781 || ch == 0x060D /* ARABIC DATE SEPARATOR */
7782 || ch == 0x07F8 /* NKO COMMA */
7783 || ch == 0x2044 /* FRACTION SLASH */
7784 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
7785 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
7786 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
7787 attr |= (int64_t) 1 << LBP_IS;
7789 /* numeric */
7790 if ((unicode_attributes[ch].category[0] == 'N'
7791 && unicode_attributes[ch].category[1] == 'd'
7792 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL
7793 && !(ch >= 0x1B50 && ch <= 0x1B59) /* BALINESE DIGIT ZERO..NINE */
7794 && !(ch >= 0xA9D0 && ch <= 0xA9D9) /* JAVANESE DIGIT ZERO..NINE */
7795 && !(ch >= 0xAA50 && ch <= 0xAA59) /* CHAM DIGIT ZERO..NINE */
7796 && !(ch >= 0x11066 && ch <= 0x1106F) /* BRAHMI DIGIT ZERO..NINE */
7797 && !(ch >= 0x11950 && ch <= 0x11959) /* DIVES AKURU DIGIT ZERO..NINE */
7798 && !(ch >= 0x11F50 && ch <= 0x11F59) /* KAWI DIGIT ZERO..NINE */)
7799 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
7800 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */
7801 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7802 || ch == 0x0600 /* ARABIC NUMBER SIGN */
7803 || ch == 0x0601 /* ARABIC SIGN SANAH */
7804 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
7805 || ch == 0x0603 /* ARABIC SIGN SAFHA */
7806 || ch == 0x0604 /* ARABIC SIGN SAMVAT */
7807 || ch == 0x0605 /* ARABIC NUMBER MARK ABOVE */
7808 || ch == 0x06DD /* ARABIC END OF AYAH */
7809 || ch == 0x0890 /* ARABIC POUND MARK ABOVE */
7810 || ch == 0x0891 /* ARABIC PIASTRE MARK ABOVE */
7811 || ch == 0x08E2 /* ARABIC DISPUTED END OF AYAH */
7812 || ch == 0x110BD /* KAITHI NUMBER SIGN */
7813 || ch == 0x110CD /* KAITHI NUMBER SIGN ABOVE */)
7814 attr |= (int64_t) 1 << LBP_NU;
7816 /* postfix numeric */
7817 if (ch == 0x0025 /* PERCENT SIGN */
7818 || ch == 0x00A2 /* CENT SIGN */
7819 || ch == 0x00B0 /* DEGREE SIGN */
7820 || ch == 0x060B /* AFGHANI SIGN */
7821 || ch == 0x066A /* ARABIC PERCENT SIGN */
7822 || ch == 0x2030 /* PER MILLE SIGN */
7823 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
7824 || ch == 0x2032 /* PRIME */
7825 || ch == 0x2033 /* DOUBLE PRIME */
7826 || ch == 0x2034 /* TRIPLE PRIME */
7827 || ch == 0x2035 /* REVERSED PRIME */
7828 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
7829 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
7830 || ch == 0x20A7 /* PESETA SIGN */
7831 || ch == 0x2103 /* DEGREE CELSIUS */
7832 || ch == 0x2109 /* DEGREE FAHRENHEIT */
7833 || ch == 0xFDFC /* RIAL SIGN */
7834 || ch == 0xFE6A /* SMALL PERCENT SIGN */
7835 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
7836 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
7837 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7838 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
7839 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
7840 || ch == 0x09F2 /* BENGALI RUPEE MARK */
7841 || ch == 0x09F3 /* BENGALI RUPEE SIGN */
7842 || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
7843 || ch == 0x0D79 /* MALAYALAM DATE MARK */
7844 || ch == 0x2057 /* QUADRUPLE PRIME */
7845 || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
7846 || ch == 0x20BB /* NORDIC MARK SIGN */
7847 || ch == 0x20BE /* LARI SIGN */
7848 || ch == 0x20C0 /* SOM SIGN */
7849 || ch == 0xA838 /* NORTH INDIC RUPEE MARK */
7850 || (ch >= 0x11FDD && ch <= 0x11FE0) /* TAMIL SIGN KAACU..TAMIL SIGN VARAAKAN */
7851 || ch == 0x1ECAC /* INDIC SIYAQ PLACEHOLDER */
7852 || ch == 0x1ECB0 /* INDIC SIYAQ RUPEE MARK */)
7853 attr |= (int64_t) 1 << LBP_PO;
7855 /* prefix numeric */
7856 if ((unicode_attributes[ch].category[0] == 'S'
7857 && unicode_attributes[ch].category[1] == 'c')
7858 || ch == 0x002B /* PLUS SIGN */
7859 || ch == 0x005C /* REVERSE SOLIDUS */
7860 || ch == 0x00B1 /* PLUS-MINUS SIGN */
7861 || ch == 0x2116 /* NUMERO SIGN */
7862 || ch == 0x2212 /* MINUS SIGN */
7863 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
7864 if (!(attr & ((int64_t) 1 << LBP_PO)))
7865 attr |= (int64_t) 1 << LBP_PR;
7867 /* symbols allowing breaks */
7868 if (ch == 0x002F /* SOLIDUS */)
7869 attr |= (int64_t) 1 << LBP_SY;
7871 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
7872 attr |= (int64_t) 1 << LBP_H2;
7874 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
7875 attr |= (int64_t) 1 << LBP_H3;
7877 if ((ch >= 0x05D0 && ch <= 0x05F2) || ch == 0xFB1D
7878 || (ch >= 0xFB1F && ch <= 0xFB28) || (ch >= 0xFB2A && ch <= 0xFB4F))
7879 attr |= (int64_t) 1 << LBP_HL;
7881 if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
7882 attr |= (int64_t) 1 << LBP_JL;
7884 if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
7885 attr |= (int64_t) 1 << LBP_JV;
7887 if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
7888 attr |= (int64_t) 1 << LBP_JT;
7890 /* Brahmic scripts: pre-base repha */
7891 if ((ch >= 0x11003 && ch <= 0x11004)
7892 || ch == 0x11F02
7893 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7894 || ch == 0x1193F
7895 || ch == 0x11941)
7896 attr |= (int64_t) 1 << LBP_AP;
7898 /* Brahmic scripts: consonants */
7899 if ((ch >= 0x1B05 && ch <= 0x1B33)
7900 || (ch >= 0x1B45 && ch <= 0x1B4C)
7901 || (ch >= 0xA984 && ch <= 0xA9B2)
7902 || (ch >= 0x11005 && ch <= 0x11037)
7903 || (ch >= 0x11071 && ch <= 0x11072)
7904 || ch == 0x11075
7905 || (ch >= 0x11305 && ch <= 0x1130C)
7906 || (ch >= 0x1130F && ch <= 0x11310)
7907 || (ch >= 0x11313 && ch <= 0x11328)
7908 || (ch >= 0x1132A && ch <= 0x11330)
7909 || (ch >= 0x11332 && ch <= 0x11333)
7910 || (ch >= 0x11335 && ch <= 0x11339)
7911 || (ch >= 0x11360 && ch <= 0x11361)
7912 || (ch >= 0x11F04 && ch <= 0x11F10)
7913 || (ch >= 0x11F12 && ch <= 0x11F33)
7914 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7915 || (ch >= 0x11900 && ch <= 0x11906)
7916 || ch == 0x11909
7917 || (ch >= 0x1190C && ch <= 0x11913)
7918 || (ch >= 0x11915 && ch <= 0x11916)
7919 || (ch >= 0x11918 && ch <= 0x1192F))
7920 attr |= (int64_t) 1 << LBP_AK;
7922 /* Brahmic scripts: independent vowels */
7923 if ((ch >= 0x1BC0 && ch <= 0x1BE5)
7924 || (ch >= 0xAA00 && ch <= 0xAA28)
7925 || (ch >= 0x11066 && ch <= 0x1106F)
7926 || ch == 0x11350
7927 || (ch >= 0x1135E && ch <= 0x1135F)
7928 || (ch >= 0x11EE0 && ch <= 0x11EF1)
7929 || (ch >= 0x11F50 && ch <= 0x11F59))
7930 attr |= (int64_t) 1 << LBP_AS;
7932 /* Brahmic scripts: conjoining viramas */
7933 if (ch == 0x1B44
7934 || ch == 0xA9C0
7935 || ch == 0x11046 || ch == 0x1134D || ch == 0x1193E || ch == 0x11F42)
7936 attr |= (int64_t) 1 << LBP_VI;
7938 /* Brahmic scripts: viramas for final consonants */
7939 if (ch == 0x1BF2 || ch == 0x1BF3)
7940 attr |= (int64_t) 1 << LBP_VF;
7942 if (is_property_regional_indicator (ch))
7943 attr |= (int64_t) 1 << LBP_RI;
7945 /* complex context (South East Asian) */
7946 if (((unicode_attributes[ch].category[0] == 'C'
7947 && unicode_attributes[ch].category[1] == 'f')
7948 || (unicode_attributes[ch].category[0] == 'L'
7949 && (unicode_attributes[ch].category[1] == 'm'
7950 || unicode_attributes[ch].category[1] == 'o'))
7951 || (unicode_attributes[ch].category[0] == 'M'
7952 && (unicode_attributes[ch].category[1] == 'c'
7953 || unicode_attributes[ch].category[1] == 'n')
7954 && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
7955 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7956 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
7957 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
7958 || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
7959 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
7960 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
7961 || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
7962 || (ch >= 0xA9E0 && ch <= 0xA9EF) /* Myanmar */
7963 || (ch >= 0xA9FA && ch <= 0xA9FE) /* Myanmar */
7964 || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
7965 || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */
7966 || (ch >= 0x1173A && ch <= 0x1173B) /* Ahom */
7967 || (ch >= 0x1173F && ch <= 0x11746) /* Ahom */)
7968 && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
7969 || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
7970 || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
7971 || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
7972 || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
7973 || (ch >= 0xA9E0 && ch <= 0xA9EF) /* Myanmar */
7974 || (ch >= 0xA9FA && ch <= 0xA9FE) /* Myanmar */
7975 || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */
7976 || (ch >= 0x11700 && ch <= 0x1171A) /* Ahom */
7977 || (ch >= 0x1171D && ch <= 0x1172B) /* Ahom */
7978 || (ch >= 0x1173A && ch <= 0x1173B) /* Ahom */
7979 || (ch >= 0x1173F && ch <= 0x11746) /* Ahom */))
7980 attr |= (int64_t) 1 << LBP_SA;
7982 /* attached characters and combining marks */
7983 if ((unicode_attributes[ch].category[0] == 'M'
7984 && (unicode_attributes[ch].category[1] == 'c'
7985 || unicode_attributes[ch].category[1] == 'e'
7986 || unicode_attributes[ch].category[1] == 'n')
7987 && ch != 0x1BF2 /* BATAK PANGOLAT */
7988 && ch != 0x1BF3 /* BATAK PANONGONAN */)
7989 || (unicode_attributes[ch].category[0] == 'C'
7990 && (unicode_attributes[ch].category[1] == 'c'
7991 || unicode_attributes[ch].category[1] == 'f')
7992 && ch != 0x0600 /* ARABIC NUMBER SIGN */
7993 && ch != 0x0601 /* ARABIC SIGN SANAH */
7994 && ch != 0x0602 /* ARABIC FOOTNOTE MARKER */
7995 && ch != 0x0603 /* ARABIC SIGN SAFHA */
7996 && ch != 0x0604 /* ARABIC SIGN SAMVAT */
7997 && ch != 0x0605 /* ARABIC NUMBER MARK ABOVE */
7998 && ch != 0x06DD /* ARABIC END OF AYAH */
7999 && ch != 0x0890 /* ARABIC POUND MARK ABOVE */
8000 && ch != 0x0891 /* ARABIC PIASTRE MARK ABOVE */
8001 && ch != 0x08E2 /* ARABIC DISPUTED END OF AYAH */
8002 && ch != 0x110BD /* KAITHI NUMBER SIGN */
8003 && ch != 0x110CD /* KAITHI NUMBER SIGN ABOVE */
8004 && ch != 0x13437 /* EGYPTIAN HIEROGLYPH BEGIN SEGMENT */
8005 && ch != 0x13438 /* EGYPTIAN HIEROGLYPH END SEGMENT */
8006 && ch != 0x1343C /* EGYPTIAN HIEROGLYPH BEGIN ENCLOSURE */
8007 && ch != 0x1343D /* EGYPTIAN HIEROGLYPH END ENCLOSURE */
8008 && ch != 0x1343E /* EGYPTIAN HIEROGLYPH BEGIN WALLED ENCLOSURE */
8009 && ch != 0x1343F /* EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE */)
8010 || ch == 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
8011 if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_CR) | ((int64_t) 1 << LBP_LF) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_VI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW) | ((int64_t) 1 << LBP_ZWJ))))
8012 attr |= (int64_t) 1 << LBP_CM;
8014 /* ideographic */
8015 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
8016 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
8017 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
8018 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
8019 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
8020 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
8021 /* Extra characters for compatibility with Unicode LineBreak.txt. */
8022 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
8023 || (ch >= 0x1B50 && ch <= 0x1B59) /* BALINESE DIGIT ZERO..NINE */
8024 || ch == 0x1B5C /* BALINESE WINDU */
8025 || (ch >= 0x1B61 && ch <= 0x1B6A) /* BALINESE MUSICAL SYMBOL DONG..BALINESE MUSICAL SYMBOL DANG GEDE */
8026 || (ch >= 0x1B74 && ch <= 0x1B7C) /* BALINESE MUSICAL SYMBOL RIGHT-HAND OPEN DUG..BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING */
8027 || ch == 0x231A /* WATCH */
8028 || ch == 0x231B /* HOURGLASS */
8029 || ch == 0x23F0 /* ALARM CLOCK */
8030 || ch == 0x23F1 /* STOPWATCH */
8031 || ch == 0x23F2 /* TIMER CLOCK */
8032 || ch == 0x23F3 /* HOURGLASS WITH FLOWING SAND */
8033 || ch == 0x2600 /* BLACK SUN WITH RAYS */
8034 || ch == 0x2601 /* CLOUD */
8035 || ch == 0x2602 /* UMBRELLA */
8036 || ch == 0x2603 /* SNOWMAN */
8037 || ch == 0x2614 /* UMBRELLA WITH RAIN DROPS */
8038 || ch == 0x2615 /* HOT BEVERAGE */
8039 || ch == 0x2618 /* SHAMROCK */
8040 || ch == 0x261A /* BLACK LEFT POINTING INDEX */
8041 || ch == 0x261B /* BLACK RIGHT POINTING INDEX */
8042 || ch == 0x261C /* WHITE LEFT POINTING INDEX */
8043 || ch == 0x261D /* WHITE UP POINTING INDEX */
8044 || ch == 0x261E /* WHITE RIGHT POINTING INDEX */
8045 || ch == 0x261F /* WHITE DOWN POINTING INDEX */
8046 || ch == 0x2639 /* WHITE FROWNING FACE */
8047 || ch == 0x263A /* WHITE SMILING FACE */
8048 || ch == 0x263B /* BLACK SMILING FACE */
8049 || ch == 0x2668 /* HOT SPRINGS */
8050 || ch == 0x267F /* WHEELCHAIR SYMBOL */
8051 || ch == 0x26BD /* SOCCER BALL */
8052 || ch == 0x26BE /* BASEBALL */
8053 || ch == 0x26BF /* SQUARED KEY */
8054 || ch == 0x26C0 /* WHITE DRAUGHTS MAN */
8055 || ch == 0x26C1 /* WHITE DRAUGHTS KING */
8056 || ch == 0x26C2 /* BLACK DRAUGHTS MAN */
8057 || ch == 0x26C3 /* BLACK DRAUGHTS KING */
8058 || ch == 0x26C4 /* SNOWMAN WITHOUT SNOW */
8059 || ch == 0x26C5 /* SUN BEHIND CLOUD */
8060 || ch == 0x26C6 /* RAIN */
8061 || ch == 0x26C7 /* BLACK SNOWMAN */
8062 || ch == 0x26C8 /* THUNDER CLOUD AND RAIN */
8063 || ch == 0x26CD /* DISABLED CAR */
8064 || ch == 0x26CF /* PICK */
8065 || ch == 0x26D0 /* CAR SLIDING */
8066 || ch == 0x26D1 /* HELMET WITH WHITE CROSS */
8067 || ch == 0x26D3 /* CHAINS */
8068 || ch == 0x26D4 /* NO ENTRY */
8069 || ch == 0x26D8 /* BLACK LEFT LANE MERGE */
8070 || ch == 0x26D9 /* WHITE LEFT LANE MERGE */
8071 || ch == 0x26DC /* LEFT CLOSED ENTRY */
8072 || ch == 0x26DF /* BLACK TRUCK */
8073 || ch == 0x26E0 /* RESTRICTED LEFT ENTRY-1 */
8074 || ch == 0x26E1 /* RESTRICTED LEFT ENTRY-2 */
8075 || ch == 0x26EA /* CHURCH */
8076 || ch == 0x26F1 /* UMBRELLA ON GROUND */
8077 || ch == 0x26F2 /* FOUNTAIN */
8078 || ch == 0x26F3 /* FLAG IN HOLE */
8079 || ch == 0x26F4 /* FERRY */
8080 || ch == 0x26F5 /* SAILBOAT */
8081 || ch == 0x26F7 /* SKIER */
8082 || ch == 0x26F8 /* ICE SKATE */
8083 || ch == 0x26F9 /* PERSON WITH BALL */
8084 || ch == 0x26FA /* TENT */
8085 || ch == 0x26FD /* FUEL PUMP */
8086 || ch == 0x26FE /* CUP ON BLACK SQUARE */
8087 || ch == 0x26FF /* WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE */
8088 || ch == 0x2700 /* BLACK SAFETY SCISSORS */
8089 || ch == 0x2701 /* UPPER BLADE SCISSORS */
8090 || ch == 0x2702 /* BLACK SCISSORS */
8091 || ch == 0x2703 /* LOWER BLADE SCISSORS */
8092 || ch == 0x2704 /* WHITE SCISSORS */
8093 || ch == 0x2708 /* AIRPLANE */
8094 || ch == 0x2709 /* ENVELOPE */
8095 || ch == 0x270A /* RAISED FIST */
8096 || ch == 0x270B /* RAISED HAND */
8097 || ch == 0x270C /* VICTORY HAND */
8098 || ch == 0x270D /* WRITING HAND */
8099 || ch == 0x2764 /* HEAVY BLACK HEART */
8100 || (ch >= 0x3000 && ch <= 0x33FF
8101 && !(attr & (((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP1) | ((int64_t) 1 << LBP_OP2) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP1) | ((int64_t) 1 << LBP_CP2))))
8102 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
8103 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
8104 || (ch >= 0xA9C1 && ch <= 0xA9C6) /* JAVANESE LEFT RERENGGAN..JAVANESE PADA WINDU */
8105 || (ch >= 0xA9CA && ch <= 0xA9CD) /* JAVANESE PADA ADEG..JAVANESE TURNED PADA PISELEH */
8106 || (ch >= 0xA9D0 && ch <= 0xA9D9) /* JAVANESE DIGIT ZERO..NINE */
8107 || ch == 0xA9DE /* JAVANESE PADA TIRTA TUMETES */
8108 || ch == 0xA9DF /* JAVANESE PADA ISEN-ISEN */
8109 || (ch >= 0xAA50 && ch <= 0xAA59) /* CHAM DIGIT ZERO..NINE */
8110 || ch == 0xAA5C /* CHAM PUNCTUATION SPIRAL */
8111 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
8112 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
8113 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
8114 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
8115 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
8116 || ch == 0xFE45 /* SESAME DOT */
8117 || ch == 0xFE46 /* WHITE SESAME DOT */
8118 || ch == 0xFE49 /* DASHED OVERLINE */
8119 || ch == 0xFE4A /* CENTRELINE OVERLINE */
8120 || ch == 0xFE4B /* WAVY OVERLINE */
8121 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
8122 || ch == 0xFE4D /* DASHED LOW LINE */
8123 || ch == 0xFE4E /* CENTRELINE LOW LINE */
8124 || ch == 0xFE4F /* WAVY LOW LINE */
8125 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
8126 || ch == 0xFE58 /* SMALL EM DASH */
8127 || ch == 0xFE5F /* SMALL NUMBER SIGN */
8128 || ch == 0xFE60 /* SMALL AMPERSAND */
8129 || ch == 0xFE61 /* SMALL ASTERISK */
8130 || ch == 0xFE62 /* SMALL PLUS SIGN */
8131 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
8132 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
8133 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
8134 || ch == 0xFE66 /* SMALL EQUALS SIGN */
8135 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
8136 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
8137 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
8138 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
8139 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
8140 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
8141 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
8142 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
8143 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
8144 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
8145 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
8146 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
8147 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
8148 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
8149 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
8150 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
8151 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
8152 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
8153 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
8154 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
8155 || ch == 0xFF5E /* FULLWIDTH TILDE */
8156 || ch == 0xFF66 /* Halfwidth Katakana */
8157 || (ch >= 0xFF71 && ch <= 0xFF9D) /* Halfwidth Katakana */
8158 || (ch >= 0xFFA0 && ch <= 0xFFBE) /* Halfwidth Hangul */
8159 || (ch >= 0xFFC2 && ch <= 0xFFC7) /* Halfwidth Hangul */
8160 || (ch >= 0xFFCA && ch <= 0xFFCF) /* Halfwidth Hangul */
8161 || (ch >= 0xFFD2 && ch <= 0xFFD7) /* Halfwidth Hangul */
8162 || (ch >= 0xFFDA && ch <= 0xFFDC) /* Halfwidth Hangul */
8163 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
8164 || ch == 0xFFE3 /* FULLWIDTH MACRON */
8165 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
8166 || (ch >= 0x11049 && ch <= 0x1104D) /* BRAHMI PUNCTUATION DOT..BRAHMI PUNCTUATION LOTUS */
8167 || (ch >= 0x11052 && ch <= 0x11065) /* BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND */
8168 || (ch >= 0x11950 && ch <= 0x11959) /* DIVES AKURU DIGIT ZERO..NINE */
8169 || (ch >= 0x11F45 && ch <= 0x11F4F) /* Kawi Punctuation */
8170 || (ch >= 0x17000 && ch <= 0x187F7) /* Tangut Ideograph */
8171 || (ch >= 0x18800 && ch <= 0x18AFF) /* Tangut Ideograph */
8172 || (ch >= 0x18D00 && ch <= 0x18D08) /* Tangut Ideograph Supplement */
8173 || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
8174 || (ch >= 0x1B002 && ch <= 0x1B122) /* Hentaigana, archaic Hiragana/Katakana */
8175 || (ch >= 0x1B170 && ch <= 0x1B2FB) /* Nushu */
8176 || (ch >= 0x1F000 && ch <= 0x1F02B) /* Mahjong Tiles */
8177 || (ch >= 0x1F030 && ch <= 0x1F093) /* Domino Tiles */
8178 || (ch >= 0x1F0A0 && ch <= 0x1F0F5) /* Playing Cards */
8179 || (ch >= 0x1F10D && ch <= 0x1F10F) /* Circled Symbols */
8180 || (ch >= 0x1F16D && ch <= 0x1F16F) /* Circled Symbols */
8181 || ch == 0x1F1AD /* MASK WORK SYMBOL */
8182 || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
8183 || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
8184 || (ch >= 0x1F260 && ch <= 0x1F265) /* Rounded Symbols */
8185 || (ch >= 0x1F300 && ch <= 0x1F5FF /* Miscellaneous Symbols and Pictographs */
8186 && ch != 0x1F3B5 && ch != 0x1F3B6 && ch != 0x1F3BC
8187 && ch != 0x1F4A0 && ch != 0x1F4A2 && ch != 0x1F4A4
8188 && ch != 0x1F4AF && ch != 0x1F4B1 && ch != 0x1F4B2
8189 && !(ch >= 0x1F39C && ch <= 0x1F39D)
8190 && !(ch >= 0x1F3FB && ch <= 0x1F3FF)
8191 && !(ch >= 0x1F500 && ch <= 0x1F506)
8192 && !(ch >= 0x1F517 && ch <= 0x1F524)
8193 && !(ch >= 0x1F532 && ch <= 0x1F549)
8194 && !(ch >= 0x1F5D4 && ch <= 0x1F5DB)
8195 && !(ch >= 0x1F5F4 && ch <= 0x1F5F9))
8196 || (ch >= 0x1F600 && ch <= 0x1F64F) /* Emoticons */
8197 || (ch >= 0x1F680 && ch <= 0x1F6DF) /* Transport and Map Symbols */
8198 || (ch >= 0x1F6E0 && ch <= 0x1F6EC) /* Transport and Map Symbols */
8199 || (ch >= 0x1F6F0 && ch <= 0x1F6FC) /* Transport and Map Symbols */
8200 || ch == 0x1F774 /* LOT OF FORTUNE */
8201 || ch == 0x1F775 /* OCCULTATION */
8202 || ch == 0x1F776 /* LUNAR ECLIPSE */
8203 || ch == 0x1F77B /* HAUMEA */
8204 || ch == 0x1F77C /* MAKEMAKE */
8205 || ch == 0x1F77D /* GONGGONG */
8206 || ch == 0x1F77E /* QUAOAR */
8207 || ch == 0x1F77F /* ORCUS */
8208 || (ch >= 0x1F7D5 && ch <= 0x1F7D8) /* Circled polygons */
8209 || ch == 0x1F7D9 /* NINE POINTED WHITE STAR */
8210 || (ch >= 0x1F7E0 && ch <= 0x1F7EB) /* Large circles */
8211 || ch == 0x1F7F0 /* Heavy equals sign */
8212 || (ch >= 0x1F8B0 && ch <= 0x1F8B1) /* Curved arrows */
8213 || (ch >= 0x1F90C && ch <= 0x1F9FF) /* Supplemental Symbols and Pictographs */
8214 || (ch >= 0x1FA60 && ch <= 0x1FA6D) /* Xiangqi pieces */
8215 || (ch >= 0x1FA70 && ch <= 0x1FA74) /* Emoticons */
8216 || (ch >= 0x1FA75 && ch <= 0x1FA77) /* Colored heart symbols */
8217 || (ch >= 0x1FA78 && ch <= 0x1FA7C) /* Medical pictographs */
8218 || (ch >= 0x1FA80 && ch <= 0x1FA88) /* Pictographs */
8219 || (ch >= 0x1FA90 && ch <= 0x1FABD) /* Pictographs */
8220 || (ch >= 0x1FABF && ch <= 0x1FAC2) /* Pictographs */
8221 || (ch >= 0x1FACE && ch <= 0x1FADB) /* Pictographs */
8222 || (ch >= 0x1FAE0 && ch <= 0x1FAE8) /* Pictographs */
8223 || (ch >= 0x1FAF7 && ch <= 0x1FAF8) /* Pictographs */
8224 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
8225 || (ch >= 0x2A6D7 && ch <= 0x2A6DF) /* CJK Ideograph Extension B */
8226 || (ch >= 0x2A700 && ch <= 0x2B739) /* CJK Ideograph Extension C */
8227 || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */
8228 || (ch >= 0x2B820 && ch <= 0x2CEAF) /* CJK Ideograph Extension E */
8229 || (ch >= 0x2CEB0 && ch <= 0x2EBE0) /* CJK Ideograph Extension F */
8230 || (ch >= 0x2EBF0 && ch <= 0x2EE5D) /* CJK Ideograph Extension I */
8231 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
8232 || (ch >= 0x30000 && ch <= 0x3134A) /* CJK Ideograph Extension G */
8233 || (ch >= 0x31350 && ch <= 0x323AF) /* CJK Ideograph Extension H */)
8234 if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_EB))))
8236 /* ambiguous (ideograph) ? */
8237 if ((unicode_width[ch] != NULL
8238 && unicode_width[ch][0] == 'A'
8239 && ch >= 0x2000
8240 && ch != 0x2614
8241 && ch != 0x2615
8242 && ch != 0x261C
8243 && ch != 0x261E
8244 && ch != 0x2668
8245 && ch != 0x26BE
8246 && ch != 0x26BF
8247 && !(ch >= 0x26C4 && ch <= 0x26C8)
8248 && ch != 0x26CD
8249 && ch != 0x26CF
8250 && ch != 0x26D0
8251 && ch != 0x26D1
8252 && ch != 0x26D3
8253 && ch != 0x26D4
8254 && ch != 0x26D8
8255 && ch != 0x26D9
8256 && ch != 0x26DC
8257 && ch != 0x26DF
8258 && ch != 0x26E0
8259 && ch != 0x26E1
8260 && ch != 0x26EA
8261 && !(ch >= 0x26F1 && ch <= 0x26F5)
8262 && !(ch >= 0x26F7 && ch <= 0x26FA)
8263 && !(ch >= 0x26FD && ch <= 0x26FF))
8264 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
8265 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
8266 || (ch >= 0x3248 && ch <= 0x324F) /* CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE */)
8267 attr |= (int64_t) 1 << LBP_AI;
8268 else
8269 attr |= (int64_t) 1 << LBP_ID1;
8272 /* ordinary alphabetic and symbol characters */
8273 if ((unicode_attributes[ch].category[0] == 'L'
8274 && (unicode_attributes[ch].category[1] == 'u'
8275 || unicode_attributes[ch].category[1] == 'l'
8276 || unicode_attributes[ch].category[1] == 't'
8277 || unicode_attributes[ch].category[1] == 'm'
8278 || unicode_attributes[ch].category[1] == 'o'))
8279 || (unicode_attributes[ch].category[0] == 'S'
8280 && (unicode_attributes[ch].category[1] == 'm'
8281 || unicode_attributes[ch].category[1] == 'k'
8282 || unicode_attributes[ch].category[1] == 'o'))
8283 || (unicode_attributes[ch].category[0] == 'N'
8284 && (unicode_attributes[ch].category[1] == 'l'
8285 || unicode_attributes[ch].category[1] == 'o'))
8286 || (unicode_attributes[ch].category[0] == 'P'
8287 && (unicode_attributes[ch].category[1] == 'c'
8288 || unicode_attributes[ch].category[1] == 'd'
8289 || unicode_attributes[ch].category[1] == 'o'))
8290 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
8291 || ch == 0x2061 /* FUNCTION APPLICATION */
8292 || ch == 0x2062 /* INVISIBLE TIMES */
8293 || ch == 0x2063 /* INVISIBLE SEPARATOR */
8294 || ch == 0x2064 /* INVISIBLE PLUS */
8295 /* Extra characters for compatibility with Unicode LineBreak.txt. */
8296 || ch == 0x08E2 /* ARABIC DISPUTED END OF AYAH */)
8297 if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP1) | ((int64_t) 1 << LBP_CP2) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP1) | ((int64_t) 1 << LBP_OP2) | ((int64_t) 1 << LBP_QU1) | ((int64_t) 1 << LBP_QU2) | ((int64_t) 1 << LBP_QU3) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_AP) | ((int64_t) 1 << LBP_AK) | ((int64_t) 1 << LBP_AS) | ((int64_t) 1 << LBP_VI) | ((int64_t) 1 << LBP_VF) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID1) | ((int64_t) 1 << LBP_ID2) | ((int64_t) 1 << LBP_EB) | ((int64_t) 1 << LBP_EM)))
8298 && ch != 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */
8299 && !(ch >= 0x3248 && ch <= 0x324F) /* CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE */)
8301 /* ambiguous (alphabetic) ? */
8302 if ((unicode_width[ch] != NULL
8303 && unicode_width[ch][0] == 'A'
8304 && ch >= 0x2000
8305 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
8306 && ch != 0x2022 /* BULLET */
8307 && ch != 0x203E /* OVERLINE */
8308 && ch != 0x2126 /* OHM SIGN */
8309 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
8310 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
8311 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
8312 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
8313 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
8314 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
8315 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
8316 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
8317 || ch == 0x00A7 /* SECTION SIGN */
8318 || ch == 0x00A8 /* DIAERESIS */
8319 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
8320 || ch == 0x00B2 /* SUPERSCRIPT TWO */
8321 || ch == 0x00B3 /* SUPERSCRIPT THREE */
8322 || ch == 0x00B6 /* PILCROW SIGN */
8323 || ch == 0x00B7 /* MIDDLE DOT */
8324 || ch == 0x00B8 /* CEDILLA */
8325 || ch == 0x00B9 /* SUPERSCRIPT ONE */
8326 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
8327 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
8328 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
8329 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
8330 || ch == 0x00D7 /* MULTIPLICATION SIGN */
8331 || ch == 0x00F7 /* DIVISION SIGN */
8332 || ch == 0x02C7 /* CARON */
8333 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
8334 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
8335 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
8336 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
8337 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
8338 || ch == 0x02D8 /* BREVE */
8339 || ch == 0x02D9 /* DOT ABOVE */
8340 || ch == 0x02DA /* RING ABOVE */
8341 || ch == 0x02DB /* OGONEK */
8342 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
8343 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
8344 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
8345 /* Extra characters for compatibility with Unicode LineBreak.txt. */
8346 || ch == 0x2015 /* HORIZONTAL BAR */
8347 || ch == 0x2016 /* DOUBLE VERTICAL LINE */
8348 || ch == 0x2020 /* DAGGER */
8349 || ch == 0x2021 /* DOUBLE DAGGER */
8350 || ch == 0x203B /* REFERENCE MARK */
8351 || ch == 0x2074 /* SUPERSCRIPT FOUR */
8352 || ch == 0x207F /* SUPERSCRIPT LATIN SMALL LETTER N */
8353 || (ch >= 0x2081 && ch <= 0x2084) /* SUBSCRIPT ONE..FOUR */
8354 || ch == 0x2105 /* CARE OF */
8355 || ch == 0x2113 /* SCRIPT SMALL L */
8356 || ch == 0x2121 /* TELEPHONE SIGN */
8357 || ch == 0x2122 /* TRADE MARK SIGN */
8358 || ch == 0x212B /* ANGSTROM SIGN */
8359 || ch == 0x2154 /* VULGAR FRACTION TWO THIRDS */
8360 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
8361 || ch == 0x215B /* VULGAR FRACTION ONE EIGHTH */
8362 || ch == 0x215E /* VULGAR FRACTION SEVEN EIGHTHS */
8363 || (ch >= 0x2160 && ch <= 0x216B) /* ROMAN NUMERAL ONE..TWELVE */
8364 || (ch >= 0x2170 && ch <= 0x2179) /* SMALL ROMAN NUMERAL ONE..TEN */
8365 || ch == 0x2189 /* VULGAR FRACTION ZERO THIRDS */
8366 || (ch >= 0x2190 && ch <= 0x2199) /* LEFTWARDS ARROW..SOUTH WEST ARROW */
8367 || ch == 0x21D2 /* RIGHTWARDS DOUBLE ARROW */
8368 || ch == 0x21D4 /* LEFT RIGHT DOUBLE ARROW */
8369 || ch == 0x2200 /* FOR ALL */
8370 || ch == 0x2202 /* PARTIAL DIFFERENTIAL */
8371 || ch == 0x2203 /* THERE EXISTS */
8372 || ch == 0x2207 /* NABLA */
8373 || ch == 0x2208 /* ELEMENT OF */
8374 || ch == 0x220B /* CONTAINS AS MEMBER */
8375 || ch == 0x220F /* N-ARY PRODUCT */
8376 || ch == 0x2211 /* N-ARY SUMMATION */
8377 || ch == 0x2215 /* DIVISION SLASH */
8378 || ch == 0x221A /* SQUARE ROOT */
8379 || ch == 0x221D /* PROPORTIONAL TO */
8380 || ch == 0x221E /* INFINITY */
8381 || ch == 0x221F /* RIGHT ANGLE */
8382 || ch == 0x2220 /* ANGLE */
8383 || ch == 0x2223 /* DIVIDES */
8384 || ch == 0x2225 /* PARALLEL TO */
8385 || ch == 0x2227 /* LOGICAL AND */
8386 || ch == 0x2228 /* LOGICAL OR */
8387 || ch == 0x2229 /* INTERSECTION */
8388 || ch == 0x222A /* UNION */
8389 || ch == 0x222B /* INTEGRAL */
8390 || ch == 0x222C /* DOUBLE INTEGRAL */
8391 || ch == 0x222E /* CONTOUR INTEGRAL */
8392 || ch == 0x2234 /* THEREFORE */
8393 || ch == 0x2235 /* BECAUSE */
8394 || ch == 0x2236 /* RATIO */
8395 || ch == 0x2237 /* PROPORTION */
8396 || ch == 0x223C /* TILDE OPERATOR */
8397 || ch == 0x223D /* REVERSED TILDE */
8398 || ch == 0x2248 /* ALMOST EQUAL TO */
8399 || ch == 0x224C /* ALL EQUAL TO */
8400 || ch == 0x2252 /* APPROXIMATELY EQUAL TO OR THE IMAGE OF */
8401 || ch == 0x2260 /* NOT EQUAL TO */
8402 || ch == 0x2261 /* IDENTICAL TO */
8403 || ch == 0x2264 /* LESS-THAN OR EQUAL TO */
8404 || ch == 0x2265 /* GREATER-THAN OR EQUAL TO */
8405 || ch == 0x2266 /* LESS-THAN OVER EQUAL TO */
8406 || ch == 0x2267 /* GREATER-THAN OVER EQUAL TO */
8407 || ch == 0x226A /* MUCH LESS-THAN */
8408 || ch == 0x226B /* MUCH GREATER-THAN */
8409 || ch == 0x226E /* NOT LESS-THAN */
8410 || ch == 0x226F /* NOT GREATER-THAN */
8411 || ch == 0x2282 /* SUBSET OF */
8412 || ch == 0x2283 /* SUPERSET OF */
8413 || ch == 0x2286 /* SUBSET OF OR EQUAL TO */
8414 || ch == 0x2287 /* SUPERSET OF OR EQUAL TO */
8415 || ch == 0x2295 /* CIRCLED PLUS */
8416 || ch == 0x2299 /* CIRCLED DOT OPERATOR */
8417 || ch == 0x22A5 /* UP TACK */
8418 || ch == 0x22BF /* RIGHT TRIANGLE */
8419 || ch == 0x2312 /* ARC */
8420 || (ch >= 0x2460 && ch <= 0x24E9) /* CIRCLED DIGIT ONE..CIRCLED LATIN SMALL LETTER Z */
8421 || (ch >= 0x24EB && ch <= 0x24FE) /* NEGATIVE CIRCLED NUMBER ELEVEN..NEGATIVE CIRCLED DIGIT ZERO */
8422 || (ch >= 0x2500 && ch <= 0x254B) /* BOX DRAWINGS LIGHT HORIZONTAL..BOX DRAWINGS HEAVY VERTICAL AND HORIZONTAL */
8423 || (ch >= 0x2550 && ch <= 0x2574) /* BOX DRAWINGS DOUBLE HORIZONTAL..BOX DRAWINGS LIGHT LEFT */
8424 || (ch >= 0x2580 && ch <= 0x258F) /* UPPER HALF BLOCK..LEFT ONE EIGHTH BLOCK */
8425 || (ch >= 0x2592 && ch <= 0x2595) /* MEDIUM SHADE..RIGHT ONE EIGHTH BLOCK */
8426 || ch == 0x25A0 /* BLACK SQUARE */
8427 || ch == 0x25A1 /* WHITE SQUARE */
8428 || (ch >= 0x25A3 && ch <= 0x25A9) /* WHITE SQUARE CONTAINING BLACK SMALL SQUARE..SQUARE WITH DIAGONAL CROSSHATCH FILL */
8429 || ch == 0x25B2 /* BLACK UP-POINTING TRIANGLE */
8430 || ch == 0x25B3 /* WHITE UP-POINTING TRIANGLE */
8431 || ch == 0x25B6 /* BLACK RIGHT-POINTING TRIANGLE */
8432 || ch == 0x25B7 /* WHITE RIGHT-POINTING TRIANGLE */
8433 || ch == 0x25BC /* BLACK DOWN-POINTING TRIANGLE */
8434 || ch == 0x25BD /* WHITE DOWN-POINTING TRIANGLE */
8435 || ch == 0x25C0 /* BLACK LEFT-POINTING TRIANGLE */
8436 || ch == 0x25C1 /* WHITE LEFT-POINTING TRIANGLE */
8437 || (ch >= 0x25C6 && ch <= 0x25C8) /* BLACK DIAMOND..WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND */
8438 || ch == 0x25CB /* WHITE CIRCLE */
8439 || (ch >= 0x25CE && ch <= 0x25D1) /* BULLSEYE..CIRCLE WITH RIGHT HALF BLACK */
8440 || (ch >= 0x25E2 && ch <= 0x25E5) /* BLACK LOWER RIGHT TRIANGLE..BLACK UPPER RIGHT TRIANGLE */
8441 || ch == 0x25EF /* LARGE CIRCLE */
8442 || ch == 0x2605 /* BLACK STAR */
8443 || ch == 0x2606 /* WHITE STAR */
8444 || ch == 0x2609 /* SUN */
8445 || ch == 0x260E /* BLACK TELEPHONE */
8446 || ch == 0x260F /* WHITE TELEPHONE */
8447 || ch == 0x2616 /* WHITE SHOGI PIECE */
8448 || ch == 0x2617 /* BLACK SHOGI PIECE */
8449 || ch == 0x2640 /* FEMALE SIGN */
8450 || ch == 0x2642 /* MALE SIGN */
8451 || ch == 0x2660 /* BLACK SPADE SUIT */
8452 || ch == 0x2661 /* WHITE HEART SUIT */
8453 || (ch >= 0x2663 && ch <= 0x2665) /* BLACK CLUB SUIT..BLACK HEART SUIT */
8454 || ch == 0x2667 /* WHITE CLUB SUIT */
8455 || ch == 0x2669 /* QUARTER NOTE */
8456 || ch == 0x266A /* EIGHTH NOTE */
8457 || ch == 0x266C /* BEAMED SIXTEENTH NOTES */
8458 || ch == 0x266D /* MUSIC FLAT SIGN */
8459 || ch == 0x266F /* MUSIC SHARP SIGN */
8460 || ch == 0x269E /* THREE LINES CONVERGING RIGHT */
8461 || ch == 0x269F /* THREE LINES CONVERGING LEFT */
8462 || (ch >= 0x26C9 && ch <= 0x26CC) /* TURNED WHITE SHOGI PIECE..CROSSING LANES */
8463 || ch == 0x26D2 /* CIRCLED CROSSING LANES */
8464 || (ch >= 0x26D5 && ch <= 0x26D7) /* ALTERNATE ONE-WAY LEFT WAY TRAFFIC..WHITE TWO-WAY LEFT WAY TRAFFIC */
8465 || ch == 0x26DA /* DRIVE SLOW SIGN */
8466 || ch == 0x26DB /* HEAVY WHITE DOWN-POINTING TRIANGLE */
8467 || ch == 0x26DD /* SQUARED SALTIRE */
8468 || ch == 0x26DE /* FALLING DIAGONAL IN WHITE CIRCLE IN BLACK SQUARE */
8469 || ch == 0x26E3 /* HEAVY CIRCLE WITH STROKE AND TWO DOTS ABOVE */
8470 || ch == 0x26E8 /* BLACK CROSS ON SHIELD */
8471 || ch == 0x26E9 /* SHINTO SHRINE */
8472 || (ch >= 0x26EB && ch <= 0x26F0) /* CASTLE..MOUNTAIN */
8473 || ch == 0x26F6 /* SQUARE FOUR CORNERS */
8474 || ch == 0x26FB /* JAPANESE BANK SYMBOL */
8475 || ch == 0x26FC /* HEADSTONE GRAVEYARD SYMBOL */
8476 || ch == 0x2757 /* HEAVY EXCLAMATION MARK SYMBOL */
8477 || (ch >= 0x2776 && ch <= 0x277F) /* DINGBAT NEGATIVE CIRCLED DIGIT ONE..DINGBAT NEGATIVE CIRCLED NUMBER TEN */
8478 || (ch >= 0x2B55 && ch <= 0x2B59) /* HEAVY LARGE CIRCLE..HEAVY CIRCLED SALTIRE */
8479 || ch == 0xFFFD /* REPLACEMENT CHARACTER */
8480 || (ch >= 0x1F100 && ch <= 0x1F10C) /* DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO */
8481 || (ch >= 0x1F110 && ch <= 0x1F12D) /* PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED CD */
8482 || (ch >= 0x1F130 && ch <= 0x1F169) /* SQUARED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z */
8483 || (ch >= 0x1F170 && ch <= 0x1F1AC) /* NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VOD */)
8484 attr |= (int64_t) 1 << LBP_AI;
8485 else
8486 attr |= (int64_t) 1 << LBP_AL;
8487 attr &= ~((int64_t) 1 << LBP_CM);
8490 else
8492 /* Unassigned character. */
8493 if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
8494 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
8495 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
8496 || (ch >= 0x1F02C && ch <= 0x1F02F) /* reserved */
8497 || (ch >= 0x1F094 && ch <= 0x1F09F) /* reserved */
8498 || (ch >= 0x1F0AF && ch <= 0x1F0B0) /* reserved */
8499 || ch == 0x1F0C0 /* reserved */
8500 || ch == 0x1F0D0 /* reserved */
8501 || (ch >= 0x1F0F6 && ch <= 0x1F0FF) /* reserved */
8502 || (ch >= 0x1F10D && ch <= 0x1F10F) /* reserved */
8503 || ch == 0x1F12F /* reserved */
8504 || (ch >= 0x1F16C && ch <= 0x1F16F) /* reserved */
8505 || (ch >= 0x1F1AD && ch <= 0x1F1E5) /* reserved */
8506 || (ch >= 0x1F203 && ch <= 0x1F20F) /* reserved */
8507 || (ch >= 0x1F23C && ch <= 0x1F23F) /* reserved */
8508 || (ch >= 0x1F249 && ch <= 0x1F24F) /* reserved */
8509 || (ch >= 0x1F252 && ch <= 0x1F2FF) /* reserved */
8510 || (ch >= 0x1F6D3 && ch <= 0x1F6DF) /* reserved */
8511 || (ch >= 0x1F6ED && ch <= 0x1F6EF) /* reserved */
8512 || (ch >= 0x1F6F7 && ch <= 0x1F6FF) /* reserved */
8513 || (ch >= 0x1F774 && ch <= 0x1F77F) /* reserved */
8514 || (ch >= 0x1F7D5 && ch <= 0x1F7FF) /* reserved */
8515 || (ch >= 0x1F80C && ch <= 0x1F80F) /* reserved */
8516 || (ch >= 0x1F848 && ch <= 0x1F84F) /* reserved */
8517 || (ch >= 0x1F85A && ch <= 0x1F85F) /* reserved */
8518 || (ch >= 0x1F888 && ch <= 0x1F88F) /* reserved */
8519 || (ch >= 0x1F8AE && ch <= 0x1F90F) /* reserved */
8520 || ch == 0x1F91F /* reserved */
8521 || ch == 0x1F93F /* reserved */
8522 || (ch >= 0x1F928 && ch <= 0x1F92F) /* reserved */
8523 || (ch >= 0x1F931 && ch <= 0x1F932) /* reserved */
8524 || (ch >= 0x1F94C && ch <= 0x1F94F) /* reserved */
8525 || (ch >= 0x1F95F && ch <= 0x1F97F) /* reserved */
8526 || (ch >= 0x1F992 && ch <= 0x1F9BF) /* reserved */
8527 || (ch >= 0x1F9C1 && ch <= 0x1FB92) /* reserved */
8528 || (ch >= 0x1FB94 && ch <= 0x1FBCA) /* reserved */
8529 || (ch >= 0x1FBF0 && ch <= 0x1FBF9) /* reserved */
8530 || (ch >= 0x1FC00 && ch <= 0x1FFFD) /* reserved */
8531 || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
8532 || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
8533 Supplementary Ideographic Plane (Plane 2) outside of blocks */
8534 || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
8535 Supplementary Ideographic Plane (Plane 2) outside of blocks */
8536 || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
8538 if (is_property_extended_pictographic (ch))
8539 attr |= (int64_t) 1 << LBP_ID2;
8540 else
8541 attr |= (int64_t) 1 << LBP_ID1;
8545 if (attr == 0)
8546 /* unknown */
8547 attr |= (int64_t) 1 << LBP_XX;
8549 return attr;
8552 /* Output the line breaking properties in a human readable format. */
8553 static void
8554 debug_output_lbp (FILE *stream)
8556 unsigned int i;
8558 for (i = 0; i < 0x110000; i++)
8560 int64_t attr = get_lbp (i);
8561 if (attr != (int64_t) 1 << LBP_XX)
8563 fprintf (stream, "0x%04X", i);
8564 #define PRINT_BIT(attr,bit) \
8565 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
8566 #define PRINT_BIT_ALT(attr,bit,name) \
8567 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #name);
8568 PRINT_BIT(attr,LBP_BK);
8569 PRINT_BIT(attr,LBP_CR);
8570 PRINT_BIT(attr,LBP_LF);
8571 PRINT_BIT(attr,LBP_CM);
8572 PRINT_BIT(attr,LBP_WJ);
8573 PRINT_BIT(attr,LBP_ZW);
8574 PRINT_BIT(attr,LBP_GL);
8575 PRINT_BIT(attr,LBP_SP);
8576 PRINT_BIT(attr,LBP_B2);
8577 PRINT_BIT(attr,LBP_BA);
8578 PRINT_BIT(attr,LBP_BB);
8579 PRINT_BIT(attr,LBP_HY);
8580 PRINT_BIT(attr,LBP_CB);
8581 PRINT_BIT(attr,LBP_CL);
8582 PRINT_BIT_ALT(attr,LBP_CP1,LBP_CP);
8583 PRINT_BIT_ALT(attr,LBP_CP2,LBP_CP);
8584 PRINT_BIT(attr,LBP_EX);
8585 PRINT_BIT(attr,LBP_IN);
8586 PRINT_BIT(attr,LBP_NS);
8587 PRINT_BIT_ALT(attr,LBP_OP1,LBP_OP);
8588 PRINT_BIT_ALT(attr,LBP_OP2,LBP_OP);
8589 PRINT_BIT_ALT(attr,LBP_QU1,LBP_QU);
8590 PRINT_BIT_ALT(attr,LBP_QU2,LBP_QU);
8591 PRINT_BIT_ALT(attr,LBP_QU3,LBP_QU);
8592 PRINT_BIT(attr,LBP_IS);
8593 PRINT_BIT(attr,LBP_NU);
8594 PRINT_BIT(attr,LBP_PO);
8595 PRINT_BIT(attr,LBP_PR);
8596 PRINT_BIT(attr,LBP_SY);
8597 PRINT_BIT(attr,LBP_AI);
8598 PRINT_BIT(attr,LBP_AL);
8599 PRINT_BIT(attr,LBP_H2);
8600 PRINT_BIT(attr,LBP_H3);
8601 PRINT_BIT(attr,LBP_HL);
8602 PRINT_BIT_ALT(attr,LBP_ID1,LBP_ID);
8603 PRINT_BIT_ALT(attr,LBP_ID2,LBP_ID);
8604 PRINT_BIT(attr,LBP_JL);
8605 PRINT_BIT(attr,LBP_JV);
8606 PRINT_BIT(attr,LBP_JT);
8607 PRINT_BIT(attr,LBP_AP);
8608 PRINT_BIT(attr,LBP_AK);
8609 PRINT_BIT(attr,LBP_AS);
8610 PRINT_BIT(attr,LBP_VI);
8611 PRINT_BIT(attr,LBP_VF);
8612 PRINT_BIT(attr,LBP_RI);
8613 PRINT_BIT(attr,LBP_SA);
8614 PRINT_BIT(attr,LBP_ZWJ);
8615 PRINT_BIT(attr,LBP_EB);
8616 PRINT_BIT(attr,LBP_EM);
8617 PRINT_BIT(attr,LBP_XX);
8618 #undef PRINT_BIT_ALT
8619 #undef PRINT_BIT
8620 fprintf (stream, "\n");
/* Creates (or truncates) the file FILENAME and fills it with the dump
   produced by debug_output_lbp.  If the file cannot be opened or written,
   prints a message to stderr and exits with status 1.  */
static void
debug_output_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (out);

  /* A recorded write error takes precedence; fclose is only attempted (and
     its result checked) when no such error occurred.  */
  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8646 /* The line breaking property from the LineBreak.txt file.
   The array has one element per code point (0 .. 0x10FFFF); each element
   holds one of the LBP_* values.  fill_org_lbp first resets every element
   to LBP_XX and then stores the values it parses from the file.  */
8647 int unicode_org_lbp[0x110000];
8649 /* Stores in unicode_org_lbp[] the line breaking property from the
8650 LineBreak.txt file. */
8651 static void
8652 fill_org_lbp (const char *linebreak_filename)
8654 unsigned int i, j;
8655 FILE *stream;
8656 char field0[FIELDLEN];
8657 char field1[FIELDLEN];
8658 char field2[FIELDLEN];
8659 int lineno = 0;
8661 for (i = 0; i < 0x110000; i++)
8662 unicode_org_lbp[i] = LBP_XX;
8664 stream = fopen (linebreak_filename, "r");
8665 if (stream == NULL)
8667 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
8668 exit (1);
8671 for (;;)
8673 int n;
8674 int c;
8675 int value;
8677 lineno++;
8678 c = getc (stream);
8679 if (c == EOF)
8680 break;
8681 if (c == '\n')
8682 continue;
8683 if (c == '#')
8685 do c = getc (stream); while (c != EOF && c != '\n');
8686 continue;
8688 ungetc (c, stream);
8689 n = getfield (stream, field0, ';');
8690 do c = getc (stream); while (c == ' ');
8691 ungetc (c, stream);
8692 n += getfield (stream, field1, '#');
8693 n += getfield (stream, field2, '\n');
8694 if (n == 0)
8695 break;
8696 if (n != 3)
8698 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
8699 lineno);
8700 exit (1);
8702 /* Remove trailing spaces from field0. */
8703 while (strlen (field0) > 0 && field0[strlen (field0) - 1] == ' ')
8704 field0[strlen (field0) - 1] = '\0';
8705 /* Remove trailing spaces from field1. */
8706 while (strlen (field1) > 0 && field1[strlen (field1) - 1] == ' ')
8707 field1[strlen (field1) - 1] = '\0';
8708 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
8709 if (false) {}
8710 TRY(LBP_BK)
8711 TRY(LBP_CR)
8712 TRY(LBP_LF)
8713 TRY(LBP_CM)
8714 TRY(LBP_WJ)
8715 TRY(LBP_ZW)
8716 TRY(LBP_GL)
8717 TRY(LBP_SP)
8718 TRY(LBP_B2)
8719 TRY(LBP_BA)
8720 TRY(LBP_BB)
8721 TRY(LBP_HY)
8722 TRY(LBP_CB)
8723 TRY(LBP_CL)
8724 TRY(LBP_CP)
8725 TRY(LBP_EX)
8726 TRY(LBP_IN)
8727 TRY(LBP_NS)
8728 TRY(LBP_OP)
8729 TRY(LBP_QU)
8730 TRY(LBP_IS)
8731 TRY(LBP_NU)
8732 TRY(LBP_PO)
8733 TRY(LBP_PR)
8734 TRY(LBP_SY)
8735 TRY(LBP_AI)
8736 TRY(LBP_AL)
8737 TRY(LBP_H2)
8738 TRY(LBP_H3)
8739 TRY(LBP_HL)
8740 TRY(LBP_ID)
8741 TRY(LBP_JL)
8742 TRY(LBP_JV)
8743 TRY(LBP_JT)
8744 TRY(LBP_AP)
8745 TRY(LBP_AK)
8746 TRY(LBP_AS)
8747 TRY(LBP_VI)
8748 TRY(LBP_VF)
8749 TRY(LBP_RI)
8750 TRY(LBP_SA)
8751 TRY(LBP_ZWJ)
8752 TRY(LBP_EB)
8753 TRY(LBP_EM)
8754 TRY(LBP_XX)
8755 #undef TRY
8756 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
8757 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
8758 else if (strcmp (field1, "CJ") == 0) value = LBP_NS;
8759 else
8761 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
8762 field1, linebreak_filename, lineno);
8763 exit (1);
8765 i = strtoul (field0, NULL, 16);
8766 if (strstr (field0, "..") != NULL)
8768 /* Deal with a range. */
8769 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
8770 for (; i <= j; i++)
8771 unicode_org_lbp[i] = value;
8773 else
8775 /* Single character line. */
8776 unicode_org_lbp[i] = value;
8780 if (ferror (stream) || fclose (stream))
8782 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
8783 exit (1);
8787 /* Output the line breaking properties in a human readable format. */
8788 static void
8789 debug_output_org_lbp (FILE *stream)
8791 unsigned int i;
8793 for (i = 0; i < 0x110000; i++)
8795 int attr = unicode_org_lbp[i];
8796 if (attr != LBP_XX)
8798 fprintf (stream, "0x%04X", i);
8799 #define PRINT_BIT(attr,bit) \
8800 if (attr == bit) fprintf (stream, " " #bit);
8801 PRINT_BIT(attr,LBP_BK);
8802 PRINT_BIT(attr,LBP_CR);
8803 PRINT_BIT(attr,LBP_LF);
8804 PRINT_BIT(attr,LBP_CM);
8805 PRINT_BIT(attr,LBP_WJ);
8806 PRINT_BIT(attr,LBP_ZW);
8807 PRINT_BIT(attr,LBP_GL);
8808 PRINT_BIT(attr,LBP_SP);
8809 PRINT_BIT(attr,LBP_B2);
8810 PRINT_BIT(attr,LBP_BA);
8811 PRINT_BIT(attr,LBP_BB);
8812 PRINT_BIT(attr,LBP_HY);
8813 PRINT_BIT(attr,LBP_CB);
8814 PRINT_BIT(attr,LBP_CL);
8815 PRINT_BIT(attr,LBP_CP);
8816 PRINT_BIT(attr,LBP_EX);
8817 PRINT_BIT(attr,LBP_IN);
8818 PRINT_BIT(attr,LBP_NS);
8819 PRINT_BIT(attr,LBP_OP);
8820 PRINT_BIT(attr,LBP_QU);
8821 PRINT_BIT(attr,LBP_IS);
8822 PRINT_BIT(attr,LBP_NU);
8823 PRINT_BIT(attr,LBP_PO);
8824 PRINT_BIT(attr,LBP_PR);
8825 PRINT_BIT(attr,LBP_SY);
8826 PRINT_BIT(attr,LBP_AI);
8827 PRINT_BIT(attr,LBP_AL);
8828 PRINT_BIT(attr,LBP_H2);
8829 PRINT_BIT(attr,LBP_H3);
8830 PRINT_BIT(attr,LBP_HL);
8831 PRINT_BIT(attr,LBP_ID);
8832 PRINT_BIT(attr,LBP_JL);
8833 PRINT_BIT(attr,LBP_JV);
8834 PRINT_BIT(attr,LBP_JT);
8835 PRINT_BIT(attr,LBP_AP);
8836 PRINT_BIT(attr,LBP_AK);
8837 PRINT_BIT(attr,LBP_AS);
8838 PRINT_BIT(attr,LBP_VI);
8839 PRINT_BIT(attr,LBP_VF);
8840 PRINT_BIT(attr,LBP_RI);
8841 PRINT_BIT(attr,LBP_SA);
8842 PRINT_BIT(attr,LBP_ZWJ);
8843 PRINT_BIT(attr,LBP_EB);
8844 PRINT_BIT(attr,LBP_EM);
8845 PRINT_BIT(attr,LBP_XX);
8846 #undef PRINT_BIT
8847 fprintf (stream, "\n");
/* Creates (or truncates) the file FILENAME and fills it with the dump
   produced by debug_output_org_lbp.  If the file cannot be opened or
   written, prints a message to stderr and exits with status 1.  */
static void
debug_output_org_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (out);

  /* A recorded write error takes precedence; fclose is only attempted (and
     its result checked) when no such error occurred.  */
  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8873 /* Given an enum value LBP_..., returns its name "LBP_..." as a string. */
8874 static const char *
8875 lbp_value_to_string (unsigned int value)
8877 const char *value_string;
8878 switch (value)
8880 #define CASE(x) case x: value_string = #x; break;
8881 CASE(LBP_BK);
8882 CASE(LBP_CR);
8883 CASE(LBP_LF);
8884 CASE(LBP_CM);
8885 CASE(LBP_WJ);
8886 CASE(LBP_ZW);
8887 CASE(LBP_GL);
8888 CASE(LBP_SP);
8889 CASE(LBP_B2);
8890 CASE(LBP_BA);
8891 CASE(LBP_BB);
8892 CASE(LBP_HY);
8893 CASE(LBP_CB);
8894 CASE(LBP_CL);
8895 CASE(LBP_CP1);
8896 CASE(LBP_CP2);
8897 CASE(LBP_EX);
8898 CASE(LBP_IN);
8899 CASE(LBP_NS);
8900 CASE(LBP_OP1);
8901 CASE(LBP_OP2);
8902 CASE(LBP_QU1);
8903 CASE(LBP_QU2);
8904 CASE(LBP_QU3);
8905 CASE(LBP_IS);
8906 CASE(LBP_NU);
8907 CASE(LBP_PO);
8908 CASE(LBP_PR);
8909 CASE(LBP_SY);
8910 CASE(LBP_AI);
8911 CASE(LBP_AL);
8912 CASE(LBP_H2);
8913 CASE(LBP_H3);
8914 CASE(LBP_HL);
8915 CASE(LBP_ID1);
8916 CASE(LBP_ID2);
8917 CASE(LBP_JL);
8918 CASE(LBP_JV);
8919 CASE(LBP_JT);
8920 CASE(LBP_AP);
8921 CASE(LBP_AK);
8922 CASE(LBP_AS);
8923 CASE(LBP_VI);
8924 CASE(LBP_VF);
8925 CASE(LBP_RI);
8926 CASE(LBP_SA);
8927 CASE(LBP_ZWJ);
8928 CASE(LBP_EB);
8929 CASE(LBP_EM);
8930 CASE(LBP_XX);
8931 #undef CASE
8932 default:
8933 abort ();
8935 return value_string;
8938 /* Construction of sparse 3-level tables.
   The macros below are the parameters with which "3level.h" is included.
   output_lbp uses the result as 'struct lbp_table' together with the
   functions lbp_table_init, lbp_table_add and lbp_table_finalize; the
   elements it stores are LBP_* values.
   NOTE(review): xmalloc and xrealloc are mapped to plain malloc and realloc
   here, so presumably an allocation failure inside "3level.h" is not
   diagnosed -- confirm against that header.  */
8939 #define TABLE lbp_table
8940 #define ELEMENT unsigned char
8941 #define DEFAULT LBP_XX
8942 #define xmalloc malloc
8943 #define xrealloc realloc
8944 #include "3level.h"
8946 static void
8947 output_lbp (FILE *stream1, FILE *stream2)
8949 unsigned int i;
8950 struct lbp_table t;
8951 unsigned int level1_offset, level2_offset, level3_offset;
8953 t.p = 7;
8954 t.q = 9;
8955 lbp_table_init (&t);
8957 for (i = 0; i < 0x110000; i++)
8959 int64_t attr = get_lbp (i);
8961 /* Now attr should contain exactly one bit. */
8962 assert (attr != 0 && (attr & (attr - 1)) == 0);
8964 if (attr != (int64_t) 1 << LBP_XX)
8966 unsigned int log2_attr;
8967 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
8969 lbp_table_add (&t, i, log2_attr);
8973 lbp_table_finalize (&t);
8975 level1_offset =
8976 5 * sizeof (uint32_t);
8977 level2_offset =
8978 5 * sizeof (uint32_t)
8979 + t.level1_size * sizeof (uint32_t);
8980 level3_offset =
8981 5 * sizeof (uint32_t)
8982 + t.level1_size * sizeof (uint32_t)
8983 + (t.level2_size << t.q) * sizeof (uint32_t);
8985 for (i = 0; i < 5; i++)
8986 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
8987 ((uint32_t *) t.result)[i]);
8988 fprintf (stream1, "\n");
8989 fprintf (stream1, "typedef struct\n");
8990 fprintf (stream1, " {\n");
8991 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
8992 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
8993 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
8994 fprintf (stream1, " }\n");
8995 fprintf (stream1, "lbrkprop_t;\n");
8996 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
8998 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
8999 fprintf (stream2, "{\n");
9000 fprintf (stream2, " {");
9001 if (t.level1_size > 8)
9002 fprintf (stream2, "\n ");
9003 for (i = 0; i < t.level1_size; i++)
9005 uint32_t offset;
9006 if (i > 0 && (i % 8) == 0)
9007 fprintf (stream2, "\n ");
9008 offset = ((uint32_t *) (t.result + level1_offset))[i];
9009 if (offset == 0)
9010 fprintf (stream2, " %5d", -1);
9011 else
9012 fprintf (stream2, " %5zu",
9013 (offset - level2_offset) / sizeof (uint32_t));
9014 if (i+1 < t.level1_size)
9015 fprintf (stream2, ",");
9017 if (t.level1_size > 8)
9018 fprintf (stream2, "\n ");
9019 fprintf (stream2, " },\n");
9020 fprintf (stream2, " {");
9021 if (t.level2_size << t.q > 8)
9022 fprintf (stream2, "\n ");
9023 for (i = 0; i < t.level2_size << t.q; i++)
9025 uint32_t offset;
9026 if (i > 0 && (i % 8) == 0)
9027 fprintf (stream2, "\n ");
9028 offset = ((uint32_t *) (t.result + level2_offset))[i];
9029 if (offset == 0)
9030 fprintf (stream2, " %5d", -1);
9031 else
9032 fprintf (stream2, " %5zu",
9033 (offset - level3_offset) / sizeof (unsigned char));
9034 if (i+1 < t.level2_size << t.q)
9035 fprintf (stream2, ",");
9037 if (t.level2_size << t.q > 8)
9038 fprintf (stream2, "\n ");
9039 fprintf (stream2, " },\n");
9040 fprintf (stream2, " {");
9041 if (t.level3_size << t.p > 8)
9042 fprintf (stream2, "\n ");
9043 for (i = 0; i < t.level3_size << t.p; i++)
9045 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
9046 if (i > 0 && (i % 8) == 0)
9047 fprintf (stream2, "\n ");
9048 fprintf (stream2, " %s%s", lbp_value_to_string (value),
9049 (i+1 < t.level3_size << t.p ? "," : ""));
9051 if (t.level3_size << t.p > 8)
9052 fprintf (stream2, "\n ");
9053 fprintf (stream2, " }\n");
9054 fprintf (stream2, "};\n");
/* Writes the line breaking property tables as two generated C files:
   FILENAME1 receives what output_lbp writes to its first stream, FILENAME2
   what it writes to its second stream.  Both files start with the same
   header comment, which mentions the Unicode VERSION, followed by the text
   emitted by output_library_license.
   If a file cannot be opened or written, prints a message to stderr and
   exits with status 1.  */
static void
output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *filenames[2] = { filename1, filename2 };
  FILE *streams[2];
  size_t k;

  /* Open both output files before anything is written.  */
  for (k = 0; k < 2; k++)
    {
      streams[k] = fopen (filenames[k], "w");
      if (streams[k] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[k]);
          exit (1);
        }
    }

  /* Common header of both files.  */
  for (k = 0; k < 2; k++)
    {
      FILE *out = streams[k];

      fprintf (out,
               "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"
               "/* Line breaking properties of Unicode characters. */\n"
               "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n"
               "\n",
               version);
      fprintf (out,
               "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n"
               "\n");
      output_library_license (out, false);
      fprintf (out, "\n");
    }

  output_lbp (streams[0], streams[1]);

  for (k = 0; k < 2; k++)
    {
      if (ferror (streams[k]) || fclose (streams[k]))
        {
          fprintf (stderr, "error writing to '%s'\n", filenames[k]);
          exit (1);
        }
    }
}
9105 static void
9106 output_lbrk_rules_as_tables (const char *filename, const char *version)
9108 FILE *stream;
9110 stream = fopen (filename, "w");
9111 if (stream == NULL)
9113 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9114 exit (1);
9117 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9118 fprintf (stream, "/* Table that encodes several line breaking rules. */\n");
9119 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9120 version);
9121 fprintf (stream, "\n");
9123 fprintf (stream, "/* Copyright (C) 2001-2024 Free Software Foundation, Inc.\n");
9124 fprintf (stream, "\n");
9125 output_library_license (stream, false);
9126 fprintf (stream, "\n");
9128 fprintf (stream, "#include <config.h>\n");
9129 fprintf (stream, "\n");
9130 fprintf (stream, "/* Specification. */\n");
9131 fprintf (stream, "#include \"unilbrk/lbrktables.h\"\n");
9132 fprintf (stream, "\n");
9133 fprintf (stream, "/* Define unilbrkprop, table of line breaking properties. */\n");
9134 fprintf (stream, "#include \"unilbrk/lbrkprop2.h\"\n");
9135 fprintf (stream, "\n");
9137 /* LBP_* table indices are in the range 0 .. NLBP-1. */
9138 const unsigned int NLBP = 40;
9140 unsigned int before;
9141 unsigned int after;
9142 /* Describe the table cell (before, after). */
9143 struct table_cell
9145 /* Break prohibited when no spaces, i.e. in before ÷ after */
9146 bool prohibited_no_sp;
9147 /* Break prohibited with spaces, i.e. in before SP+ ÷ after */
9148 bool prohibited_with_sp;
9150 struct table_cell table[NLBP][NLBP];
9151 /* Sets table[before][after].field to value. */
9152 #define set_table_cell(field,value) \
9153 (before == LBP_CP ? (set_table_cell_1 (LBP_CP1, field, value), set_table_cell_1 (LBP_CP2, field, value)) : \
9154 before == LBP_OP ? (set_table_cell_1 (LBP_OP1, field, value), set_table_cell_1 (LBP_OP2, field, value)) : \
9155 before == LBP_QU ? (set_table_cell_1 (LBP_QU1, field, value), set_table_cell_1 (LBP_QU2, field, value), set_table_cell_1 (LBP_QU3, field, value)) : \
9156 before == LBP_ID ? (set_table_cell_1 (LBP_ID1, field, value), set_table_cell_1 (LBP_ID2, field, value)) : \
9157 set_table_cell_1 (before, field, value))
9158 #define set_table_cell_1(row,field,value) \
9159 (after == LBP_CP ? (set_table_cell_2 (row, LBP_CP1, field, value), set_table_cell_2 (row, LBP_CP2, field, value)) : \
9160 after == LBP_OP ? (set_table_cell_2 (row, LBP_OP1, field, value), set_table_cell_2 (row, LBP_OP2, field, value)) : \
9161 after == LBP_QU ? (set_table_cell_2 (row, LBP_QU1, field, value), set_table_cell_2 (row, LBP_QU2, field, value), set_table_cell_2 (row, LBP_QU3, field, value)) : \
9162 after == LBP_ID ? (set_table_cell_2 (row, LBP_ID1, field, value), set_table_cell_2 (row, LBP_ID2, field, value)) : \
9163 set_table_cell_2 (row, after, field, value))
9164 #define set_table_cell_2(row,column,field,value) \
9165 (table[row][column].field = (value))
9167 /* Fill the table.
9168 If we were to apply the rules in top-down order (high precedence rules
9169 first), the table_cell fields have to support values false/true/unknown.
9170 If we apply the rules in the opposite order (high precedence order last),
9171 the table_cell fields need to support only the values false/true.
9172 So, that's what we do here. */
9174 /* (LB31) Break everywhere. */
9175 for (before = 0; before < NLBP; before++)
9176 for (after = 0; after < NLBP; after++)
9177 set_table_cell (prohibited_no_sp, false);
9179 /* (LB30b) Do not break between an emoji base (or potential emoji) and an
9180 emoji modifier. */
9181 before = LBP_EB; after = LBP_EM; set_table_cell (prohibited_no_sp, true);
9182 before = LBP_ID2; after = LBP_EM; set_table_cell (prohibited_no_sp, true);
9184 /* (LB30) Do not break between letters, numbers, or ordinary symbols and
9185 opening or closing parentheses (except for East Asian parentheses). */
9186 before = LBP_AL; after = LBP_OP1; set_table_cell (prohibited_no_sp, true);
9187 before = LBP_HL; after = LBP_OP1; set_table_cell (prohibited_no_sp, true);
9188 before = LBP_NU; after = LBP_OP1; set_table_cell (prohibited_no_sp, true);
9189 before = LBP_CP1; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9190 before = LBP_CP1; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9191 before = LBP_CP1; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9193 /* (LB29) Do not break between numeric punctuation and alphabetics
9194 ("e.g."). */
9195 /* We don't implement this rule, because we find it desirable to break before
9196 the HTML tag "</P>" in strings like "<P>Some sentence.</P>". */
9197 #if 0
9198 before = LBP_IS; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9199 before = LBP_IS; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9200 #endif
9202 /* (LB28a) Do not break inside the orthographic syllables of Brahmic
9203 scripts. */
9204 /* We cannot implement this rule as long as it not clear what is designated
9205 by "◌". */
9206 #if 0
9207 before = LBP_AP; after = LBP_AK; set_table_cell (prohibited_no_sp, true);
9208 before = LBP_AP; after = LBP_AS; set_table_cell (prohibited_no_sp, true);
9209 before = LBP_AK; after = LBP_VF; set_table_cell (prohibited_no_sp, true);
9210 before = LBP_AK; after = LBP_VI; set_table_cell (prohibited_no_sp, true);
9211 before = LBP_AS; after = LBP_VF; set_table_cell (prohibited_no_sp, true);
9212 before = LBP_AS; after = LBP_VI; set_table_cell (prohibited_no_sp, true);
9213 #endif
9215 /* (LB28) Do not break between alphabetics ("at"). */
9216 before = LBP_AL; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9217 before = LBP_AL; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9218 before = LBP_HL; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9219 before = LBP_HL; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9221 /* (LB27) Korean Syllable Block. */
9222 before = LBP_JL; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9223 before = LBP_JV; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9224 before = LBP_JT; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9225 before = LBP_H2; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9226 before = LBP_H3; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9227 before = LBP_PR; after = LBP_JL; set_table_cell (prohibited_no_sp, true);
9228 before = LBP_PR; after = LBP_JV; set_table_cell (prohibited_no_sp, true);
9229 before = LBP_PR; after = LBP_JT; set_table_cell (prohibited_no_sp, true);
9230 before = LBP_PR; after = LBP_H2; set_table_cell (prohibited_no_sp, true);
9231 before = LBP_PR; after = LBP_H3; set_table_cell (prohibited_no_sp, true);
9233 /* (LB26) Do not break a Korean syllable. */
9234 before = LBP_JL; after = LBP_JL; set_table_cell (prohibited_no_sp, true);
9235 before = LBP_JL; after = LBP_JV; set_table_cell (prohibited_no_sp, true);
9236 before = LBP_JL; after = LBP_H2; set_table_cell (prohibited_no_sp, true);
9237 before = LBP_JL; after = LBP_H3; set_table_cell (prohibited_no_sp, true);
9238 before = LBP_JV; after = LBP_JV; set_table_cell (prohibited_no_sp, true);
9239 before = LBP_JV; after = LBP_JT; set_table_cell (prohibited_no_sp, true);
9240 before = LBP_H2; after = LBP_JV; set_table_cell (prohibited_no_sp, true);
9241 before = LBP_H2; after = LBP_JT; set_table_cell (prohibited_no_sp, true);
9242 before = LBP_JT; after = LBP_JT; set_table_cell (prohibited_no_sp, true);
9243 before = LBP_H3; after = LBP_JT; set_table_cell (prohibited_no_sp, true);
9245 /* (LB25) Do not break between the following pairs of classes relevant to
9246 numbers. */
9247 before = LBP_CL; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9248 before = LBP_CP; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9249 before = LBP_CL; after = LBP_PR; set_table_cell (prohibited_no_sp, true);
9250 before = LBP_CP; after = LBP_PR; set_table_cell (prohibited_no_sp, true);
9251 before = LBP_NU; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9252 before = LBP_NU; after = LBP_PR; set_table_cell (prohibited_no_sp, true);
9253 before = LBP_PO; after = LBP_OP; set_table_cell (prohibited_no_sp, true);
9254 before = LBP_PO; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9255 before = LBP_PR; after = LBP_OP; set_table_cell (prohibited_no_sp, true);
9256 before = LBP_PR; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9257 before = LBP_HY; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9258 before = LBP_IS; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9259 before = LBP_NU; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9260 before = LBP_SY; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9262 /* (LB24) Do not break between numeric prefix/postfix and letters, or between
9263 letters and prefix/postfix. */
9264 before = LBP_PR; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9265 before = LBP_PR; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9266 before = LBP_PO; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9267 before = LBP_PO; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9268 before = LBP_AL; after = LBP_PR; set_table_cell (prohibited_no_sp, true);
9269 before = LBP_AL; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9270 before = LBP_HL; after = LBP_PR; set_table_cell (prohibited_no_sp, true);
9271 before = LBP_HL; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9273 /* (LB23a) Do not break between numeric prefixes and ideographs, or between
9274 ideographs and numeric postfixes. */
9275 before = LBP_PR; after = LBP_ID; set_table_cell (prohibited_no_sp, true);
9276 before = LBP_PR; after = LBP_EB; set_table_cell (prohibited_no_sp, true);
9277 before = LBP_PR; after = LBP_EM; set_table_cell (prohibited_no_sp, true);
9278 before = LBP_ID; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9279 before = LBP_EB; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9280 before = LBP_EM; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9282 /* (LB23) Do not break between digits and letters. */
9283 before = LBP_AL; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9284 before = LBP_HL; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9285 before = LBP_NU; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9286 before = LBP_NU; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9288 /* (LB22) Do not break before ellipses. */
9289 for (before = 0; before < NLBP; before++)
9291 after = LBP_IN; set_table_cell (prohibited_no_sp, true);
9294 /* (LB21b) Don’t break between Solidus and Hebrew letters. */
9295 before = LBP_SY; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9297 /* (LB21) Do not break before hyphen-minus, other hyphens, fixed-width spaces,
9298 small kana, and other non-starters, or after acute accents. */
9299 for (before = 0; before < NLBP; before++)
9301 after = LBP_BA; set_table_cell (prohibited_no_sp, true);
9302 after = LBP_HY; set_table_cell (prohibited_no_sp, true);
9303 after = LBP_NS; set_table_cell (prohibited_no_sp, true);
9305 for (after = 0; after < NLBP; after++)
9307 before = LBP_BB; set_table_cell (prohibited_no_sp, true);
9310 /* (LB19) Do not break before or after quotation marks, such as '”'. */
9311 for (before = 0; before < NLBP; before++)
9313 after = LBP_QU; set_table_cell (prohibited_no_sp, true);
9315 for (after = 0; after < NLBP; after++)
9317 before = LBP_QU; set_table_cell (prohibited_no_sp, true);
9320 /* (LB18) Break after spaces. */
9321 for (before = 0; before < NLBP; before++)
9322 for (after = 0; after < NLBP; after++)
9323 set_table_cell (prohibited_with_sp, false);
9325 /* (LB17) Do not break within '——', even with intervening spaces. */
9326 before = LBP_B2; after = LBP_B2; set_table_cell (prohibited_no_sp, true);
9327 set_table_cell (prohibited_with_sp, true);
9329 /* (LB16) Do not break between closing punctuation and a nonstarter (lb=NS),
9330 even with intervening spaces. */
9331 before = LBP_CL; after = LBP_NS; set_table_cell (prohibited_no_sp, true);
9332 set_table_cell (prohibited_with_sp, true);
9333 before = LBP_CL; after = LBP_CP; set_table_cell (prohibited_no_sp, true);
9334 set_table_cell (prohibited_with_sp, true);
9336 /* (LB15b) Do not break before an ambiguous quotation that is a final
9337 punctuation, even after spaces. */
9338 for (before = 0; before < NLBP; before++)
9340 after = LBP_QU3; set_table_cell (prohibited_no_sp, true);
9341 set_table_cell (prohibited_with_sp, true);
9344 /* (LB15a) Do not break before an ambiguous quotation that is an initial
9345 punctuation, even after spaces. */
9346 for (after = 0; after < NLBP; after++)
9348 before = LBP_QU2; set_table_cell (prohibited_no_sp, true);
9349 set_table_cell (prohibited_with_sp, true);
9352 /* (LB14) Do not break after '[', even after spaces. */
9353 for (after = 0; after < NLBP; after++)
9355 before = LBP_OP; set_table_cell (prohibited_no_sp, true);
9356 set_table_cell (prohibited_with_sp, true);
9359 /* (LB13) Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
9360 for (before = 0; before < NLBP; before++)
9362 after = LBP_CL; set_table_cell (prohibited_no_sp, true);
9363 set_table_cell (prohibited_with_sp, true);
9364 after = LBP_CP; set_table_cell (prohibited_no_sp, true);
9365 set_table_cell (prohibited_with_sp, true);
9366 after = LBP_EX; set_table_cell (prohibited_no_sp, true);
9367 set_table_cell (prohibited_with_sp, true);
9368 after = LBP_IS; set_table_cell (prohibited_no_sp, true);
9369 set_table_cell (prohibited_with_sp, true);
9370 after = LBP_SY; set_table_cell (prohibited_no_sp, true);
9371 set_table_cell (prohibited_with_sp, true);
9374 /* (LB12a) Do not break before NBSP and related characters, except after
9375 spaces and hyphens. */
9376 for (before = 0; before < NLBP; before++)
9377 if (before != LBP_BA && before != LBP_HY)
9379 after = LBP_GL; set_table_cell (prohibited_no_sp, true);
9382 /* (LB12) Do not break after NBSP and related characters. */
9383 for (after = 0; after < NLBP; after++)
9385 before = LBP_GL; set_table_cell (prohibited_no_sp, true);
9388 /* (LB11) Do not break before or after Word joiner and related characters. */
9389 for (before = 0; before < NLBP; before++)
9391 after = LBP_WJ; set_table_cell (prohibited_no_sp, true);
9392 set_table_cell (prohibited_with_sp, true);
9394 for (after = 0; after < NLBP; after++)
9396 before = LBP_WJ; set_table_cell (prohibited_no_sp, true);
9399 /* (LB10) Treat any remaining combining mark or ZWJ as AL. */
9400 /* We resolve LBP_CM at runtime, before accessing the table. */
9401 for (before = 0; before < NLBP; before++)
9402 table[before][LBP_ZWJ] = table[before][LBP_AL];
9403 for (after = 0; after < NLBP; after++)
9404 table[LBP_ZWJ][after] = table[LBP_AL][after];
9405 table[LBP_ZWJ][LBP_ZWJ] = table[LBP_AL][LBP_AL];
9407 /* (LB8a) Do not break between a zero width joiner and an ideograph, emoji
9408 base or emoji modifier. */
9409 before = LBP_ZWJ; after = LBP_ID; set_table_cell (prohibited_no_sp, true);
9410 before = LBP_ZWJ; after = LBP_EB; set_table_cell (prohibited_no_sp, true);
9411 before = LBP_ZWJ; after = LBP_EM; set_table_cell (prohibited_no_sp, true);
9413 /* Not reflected in the table:
9414 (LB30a) Break between two regional indicator symbols if and only if there are
9415 an even number of regional indicators preceding the position of the
9416 break.
9417 (LB21a) Don't break after Hebrew + Hyphen.
9418 (LB20) Break before and after unresolved CB.
9419 We resolve LBP_CB at runtime, before accessing the table.
9420 (LB9) Do not break a combining character sequence; treat it as if it has the
9421 line breaking class of the base character in all of the following rules.
9422 Treat ZWJ as if it were CM.
9423 (LB8) Break before any character following a zero-width space, even if one
9424 or more spaces intervene.
9425 We handle LBP_ZW at runtime, before accessing the table.
9426 (LB7) Do not break before spaces or zero width space.
9427 We handle LBP_ZW at runtime, before accessing the table.
9428 (LB6) Do not break before hard line breaks.
9429 We handle LBP_BK at runtime, before accessing the table.
9430 (LB5) Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
9431 (LB4) Always break after hard line breaks.
9432 (LB3) Always break at the end of text.
9433 (LB2) Never break at the start of text.
9436 fprintf (stream, "const unsigned char unilbrk_table[%u][%u] =\n", NLBP, NLBP);
9437 fprintf (stream, "{\n");
9438 fprintf (stream, " /* after */\n");
9440 fprintf (stream, " /* ");
9441 for (after = 0; after < NLBP; after++)
9442 fprintf (stream, " %-3s", lbp_value_to_string (after) + 4);
9443 fprintf (stream, " */\n");
9445 for (before = 0; before < NLBP; before++)
9447 fprintf (stream, "/* %3s */ {", lbp_value_to_string (before) + 4);
9448 for (after = 0; after < NLBP; after++)
9450 if (table[before][after].prohibited_no_sp)
9452 if (table[before][after].prohibited_with_sp)
9453 /* Prohibited break. */
9454 fprintf (stream, " P,");
9455 else
9456 /* Indirect break. */
9457 fprintf (stream, " I,");
9459 else
9461 if (table[before][after].prohibited_with_sp)
9462 abort ();
9463 else
9464 /* Direct break. */
9465 fprintf (stream, " D,");
9468 fprintf (stream, " },\n");
9470 fprintf (stream, "/* \"\" */\n");
9471 fprintf (stream, "/* before */\n");
9472 fprintf (stream, "};\n");
9474 if (ferror (stream) || fclose (stream))
9476 fprintf (stderr, "error writing to '%s'\n", filename);
9477 exit (1);
9481 /* ========================================================================= */
9483 /* Word break property.
9484 Updated for Unicode TR #29 revision 17. */
/* Possible values of the Word_Break property.
   get_wbp uses these values as bit numbers in the bit mask that it returns.
   The constants are listed in increasing numeric order; the values 18 to 21
   are not assigned here.  */
enum
{
  WBP_OTHER        = 0,
  WBP_KATAKANA     = 1,
  WBP_ALETTER      = 2,
  WBP_MIDNUMLET    = 3,
  WBP_MIDLETTER    = 4,
  WBP_MIDNUM       = 5,
  WBP_NUMERIC      = 6,
  WBP_EXTENDNUMLET = 7,
  WBP_EXTEND       = 8,
  WBP_FORMAT       = 9,
  WBP_NEWLINE      = 10,
  WBP_CR           = 11,
  WBP_LF           = 12,
  WBP_RI           = 13,
  WBP_DQ           = 14,
  WBP_SQ           = 15,
  WBP_HL           = 16,
  WBP_ZWJ          = 17,
  WBP_WSS          = 22
};
9510 /* Returns the word breaking property for ch, as a bit mask. */
9511 static int
9512 get_wbp (unsigned int ch)
9514 int attr = 0;
9516 if (unicode_attributes[ch].name != NULL)
9518 if (ch == 0x000D)
9519 attr |= 1 << WBP_CR;
9521 if (ch == 0x000A)
9522 attr |= 1 << WBP_LF;
9524 if (ch == 0x000B || ch == 0x000C
9525 || ch == 0x0085
9526 || ch == 0x2028 || ch == 0x2029)
9527 attr |= 1 << WBP_NEWLINE;
9529 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
9530 || ((unicode_properties[ch] >> PROP_OTHER_GRAPHEME_EXTEND) & 1) != 0
9531 || (unicode_attributes[ch].category != NULL
9532 && strcmp (unicode_attributes[ch].category, "Mc") == 0)
9533 || ((unicode_properties[ch] >> PROP_EMOJI_MODIFIER) & 1) != 0 /* Emoji modifier */)
9534 attr |= 1 << WBP_EXTEND;
9536 if (unicode_attributes[ch].category != NULL
9537 && strcmp (unicode_attributes[ch].category, "Cf") == 0
9538 && !(ch >= 0x0600 && ch <= 0x0605)
9539 && ch != 0x06DD
9540 && ch != 0x070F
9541 && ch != 0x0890 && ch != 0x0891 && ch != 0x08E2
9542 && ch != 0x200B && ch != 0x200C && ch != 0x200D
9543 && ch != 0x110BD && ch != 0x110CD
9544 && !(ch >= 0xe0020 && ch <= 0xe007f))
9545 attr |= 1 << WBP_FORMAT;
9547 if ((unicode_scripts[ch] < numscripts
9548 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
9549 || (ch >= 0x3031 && ch <= 0x3035)
9550 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
9551 || ch == 0xFF70)
9552 attr |= 1 << WBP_KATAKANA;
9554 if ((unicode_scripts[ch] < numscripts
9555 && strcmp (scripts[unicode_scripts[ch]], "Hebrew") == 0)
9556 && strcmp (unicode_attributes[ch].category, "Lo") == 0)
9557 attr |= 1 << WBP_HL;
9559 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
9560 || (ch >= 0x02C2 && ch <= 0x02C5)
9561 || (ch >= 0x02D2 && ch <= 0x02D7)
9562 || (ch >= 0x02DE && ch <= 0x02DF)
9563 || (ch >= 0x02E5 && ch <= 0x02EB)
9564 || ch == 0x02ED
9565 || (ch >= 0x02EF && ch <= 0x02FF)
9566 || (ch >= 0x055A && ch <= 0x055C)
9567 || ch == 0x055E
9568 || ch == 0x058A
9569 || ch == 0x05F3
9570 || ch == 0x070F
9571 || (ch >= 0xA708 && ch <= 0xA716)
9572 || (ch >= 0xA720 && ch <= 0xA721)
9573 || (ch >= 0xA789 && ch <= 0xA78A)
9574 || ch == 0xAB5B)
9575 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
9576 && (attr & (1 << WBP_KATAKANA)) == 0
9577 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
9578 && !(unicode_scripts[ch] < numscripts
9579 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
9580 && (attr & (1 << WBP_EXTEND)) == 0
9581 && (attr & (1 << WBP_HL)) == 0)
9582 attr |= 1 << WBP_ALETTER;
9584 if (is_WBP_MIDNUMLET (ch))
9585 attr |= 1 << WBP_MIDNUMLET;
9587 if (is_WBP_MIDLETTER (ch) && ch != 0x02D7)
9588 attr |= 1 << WBP_MIDLETTER;
9590 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
9591 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
9592 || ch == 0xFF1B)
9593 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
9594 attr |= 1 << WBP_MIDNUM;
9596 if ((((get_lbp (ch) >> LBP_NU) & 1) != 0
9597 || (ch >= 0x1B50 && ch <= 0x1B59) /* BALINESE DIGIT ZERO..NINE */
9598 || (ch >= 0xA9D0 && ch <= 0xA9D9) /* JAVANESE DIGIT ZERO..NINE */
9599 || (ch >= 0xAA50 && ch <= 0xAA59) /* CHAM DIGIT ZERO..NINE */
9600 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT ZERO..NINE */
9601 || (ch >= 0x11066 && ch <= 0x1106F) /* BRAHMI DIGIT ZERO..NINE */
9602 || (ch >= 0x11950 && ch <= 0x11959) /* DIVES AKURU DIGIT ZERO..NINE */
9603 || (ch >= 0x11F50 && ch <= 0x11F59) /* KAWI DIGIT ZERO..NINE */)
9604 && ch != 0x066C)
9605 attr |= 1 << WBP_NUMERIC;
9607 if ((unicode_attributes[ch].category != NULL
9608 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
9609 || ch == 0x202F /* NARROW NO-BREAK SPACE */)
9610 attr |= 1 << WBP_EXTENDNUMLET;
9612 if (is_property_regional_indicator (ch))
9613 attr |= 1 << WBP_RI;
9615 if (ch == 0x0022)
9616 attr |= 1 << WBP_DQ;
9618 if (ch == 0x0027)
9619 attr |= 1 << WBP_SQ;
9621 if (ch == 0x200D)
9622 attr |= 1 << WBP_ZWJ;
9624 if (is_category_Zs (ch) && ((get_lbp (ch) >> LBP_GL) & 1) == 0)
9625 attr |= 1 << WBP_WSS;
9628 if (attr == 0)
9629 /* other */
9630 attr |= 1 << WBP_OTHER;
9632 return attr;
9635 /* Output the word break property in a human readable format. */
9636 static void
9637 debug_output_wbp (FILE *stream)
9639 unsigned int i;
9641 for (i = 0; i < 0x110000; i++)
9643 int attr = get_wbp (i);
9644 if (attr != 1 << WBP_OTHER)
9646 fprintf (stream, "0x%04X", i);
9647 if (attr & (1 << WBP_CR))
9648 fprintf (stream, " CR");
9649 if (attr & (1 << WBP_LF))
9650 fprintf (stream, " LF");
9651 if (attr & (1 << WBP_NEWLINE))
9652 fprintf (stream, " Newline");
9653 if (attr & (1 << WBP_EXTEND))
9654 fprintf (stream, " Extend");
9655 if (attr & (1 << WBP_FORMAT))
9656 fprintf (stream, " Format");
9657 if (attr & (1 << WBP_KATAKANA))
9658 fprintf (stream, " Katakana");
9659 if (attr & (1 << WBP_ALETTER))
9660 fprintf (stream, " ALetter");
9661 if (attr & (1 << WBP_MIDNUMLET))
9662 fprintf (stream, " MidNumLet");
9663 if (attr & (1 << WBP_MIDLETTER))
9664 fprintf (stream, " MidLetter");
9665 if (attr & (1 << WBP_MIDNUM))
9666 fprintf (stream, " MidNum");
9667 if (attr & (1 << WBP_NUMERIC))
9668 fprintf (stream, " Numeric");
9669 if (attr & (1 << WBP_EXTENDNUMLET))
9670 fprintf (stream, " ExtendNumLet");
9671 if (attr & (1 << WBP_RI))
9672 fprintf (stream, " Regional_Indicator");
9673 if (attr & (1 << WBP_DQ))
9674 fprintf (stream, " Double_Quote");
9675 if (attr & (1 << WBP_SQ))
9676 fprintf (stream, " Single_Quote");
9677 if (attr & (1 << WBP_HL))
9678 fprintf (stream, " Hebrew_Letter");
9679 if (attr & (1 << WBP_ZWJ))
9680 fprintf (stream, " ZWJ");
9681 if (attr & (1 << WBP_WSS))
9682 fprintf (stream, " WSegSpace");
9683 fprintf (stream, "\n");
/* Opens FILENAME for writing and stores in it the listing produced by
   debug_output_wbp.  Exits with status 1 if the file cannot be opened or if
   writing or closing it fails.  */
static void
debug_output_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_wbp (out);

  /* fclose is only attempted when no write error has been recorded.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* The word break property from the WordBreakProperty.txt file.
   Indexed by code point (0 .. 0x10FFFF); each element holds a single WBP_*
   value (not a bit mask).  Filled by fill_org_wbp.  */
int unicode_org_wbp[0x110000];
9712 /* Stores in unicode_org_wbp[] the word break property from the
9713 WordBreakProperty.txt file. */
9714 static void
9715 fill_org_wbp (const char *wordbreakproperty_filename)
9717 unsigned int i;
9718 FILE *stream;
9720 for (i = 0; i < 0x110000; i++)
9721 unicode_org_wbp[i] = WBP_OTHER;
9723 stream = fopen (wordbreakproperty_filename, "r");
9724 if (stream == NULL)
9726 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
9727 exit (1);
9730 for (;;)
9732 char buf[200+1];
9733 unsigned int i1, i2;
9734 char padding[200+1];
9735 char propname[200+1];
9736 int propvalue;
9738 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9739 break;
9741 if (buf[0] == '\0' || buf[0] == '#')
9742 continue;
9744 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
9746 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
9748 fprintf (stderr, "parse error in '%s'\n",
9749 wordbreakproperty_filename);
9750 exit (1);
9752 i2 = i1;
9754 #define PROP(name,value) \
9755 if (strcmp (propname, name) == 0) propvalue = value; else
9756 PROP ("CR", WBP_CR)
9757 PROP ("LF", WBP_LF)
9758 PROP ("Newline", WBP_NEWLINE)
9759 PROP ("Extend", WBP_EXTEND)
9760 PROP ("Format", WBP_FORMAT)
9761 PROP ("Katakana", WBP_KATAKANA)
9762 PROP ("ALetter", WBP_ALETTER)
9763 PROP ("MidNumLet", WBP_MIDNUMLET)
9764 PROP ("MidLetter", WBP_MIDLETTER)
9765 PROP ("MidNum", WBP_MIDNUM)
9766 PROP ("Numeric", WBP_NUMERIC)
9767 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
9768 PROP ("Regional_Indicator", WBP_RI)
9769 PROP ("Double_Quote", WBP_DQ)
9770 PROP ("Single_Quote", WBP_SQ)
9771 PROP ("Hebrew_Letter", WBP_HL)
9772 PROP ("ZWJ", WBP_ZWJ)
9773 PROP ("WSegSpace", WBP_WSS)
9774 #undef PROP
9776 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
9777 wordbreakproperty_filename);
9778 exit (1);
9780 assert (i1 <= i2 && i2 < 0x110000);
9782 for (i = i1; i <= i2; i++)
9783 unicode_org_wbp[i] = propvalue;
9786 if (ferror (stream) || fclose (stream))
9788 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
9789 exit (1);
9793 /* Output the word break property in a human readable format. */
9794 static void
9795 debug_output_org_wbp (FILE *stream)
9797 unsigned int i;
9799 for (i = 0; i < 0x110000; i++)
9801 int propvalue = unicode_org_wbp[i];
9802 if (propvalue != WBP_OTHER)
9804 fprintf (stream, "0x%04X", i);
9805 #define PROP(name,value) \
9806 if (propvalue == value) fprintf (stream, " " name); else
9807 PROP ("CR", WBP_CR)
9808 PROP ("LF", WBP_LF)
9809 PROP ("Newline", WBP_NEWLINE)
9810 PROP ("Extend", WBP_EXTEND)
9811 PROP ("Format", WBP_FORMAT)
9812 PROP ("Katakana", WBP_KATAKANA)
9813 PROP ("ALetter", WBP_ALETTER)
9814 PROP ("MidNumLet", WBP_MIDNUMLET)
9815 PROP ("MidLetter", WBP_MIDLETTER)
9816 PROP ("MidNum", WBP_MIDNUM)
9817 PROP ("Numeric", WBP_NUMERIC)
9818 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
9819 PROP ("Regional_Indicator", WBP_RI)
9820 PROP ("Double_Quote", WBP_DQ)
9821 PROP ("Single_Quote", WBP_SQ)
9822 PROP ("Hebrew_Letter", WBP_HL)
9823 PROP ("ZWJ", WBP_ZWJ)
9824 PROP ("WSegSpace", WBP_WSS)
9825 #undef PROP
9826 fprintf (stream, " ??");
9827 fprintf (stream, "\n");
/* Opens FILENAME for writing and stores in it the listing produced by
   debug_output_org_wbp.  Exits with status 1 if the file cannot be opened or
   if writing or closing it fails.  */
static void
debug_output_org_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_wbp (out);

  /* fclose is only attempted when no write error has been recorded.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Construction of sparse 3-level tables.
   Including "3level.h" with these macros set provides 'struct wbp_table'
   and the functions wbp_table_init, wbp_table_add and wbp_table_finalize
   that output_wbp uses.  The elements are of type 'unsigned char', and
   WBP_OTHER is the default value.  */
#define TABLE wbp_table
#define ELEMENT unsigned char
#define DEFAULT WBP_OTHER
/* NOTE(review): presumably "3level.h" allocates through xmalloc/xrealloc;
   they are mapped to the plain libc functions here - confirm in 3level.h.  */
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
9861 static void
9862 output_wbp (FILE *stream)
9864 unsigned int i;
9865 struct wbp_table t;
9866 unsigned int level1_offset, level2_offset, level3_offset;
9868 t.p = 7;
9869 t.q = 9;
9870 wbp_table_init (&t);
9872 for (i = 0; i < 0x110000; i++)
9874 int attr = get_wbp (i);
9876 /* Now attr should contain exactly one bit. */
9877 assert (attr != 0 && (attr & (attr - 1)) == 0);
9879 if (attr != 1 << WBP_OTHER)
9881 unsigned int log2_attr;
9882 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
9884 wbp_table_add (&t, i, log2_attr);
9888 wbp_table_finalize (&t);
9890 level1_offset =
9891 5 * sizeof (uint32_t);
9892 level2_offset =
9893 5 * sizeof (uint32_t)
9894 + t.level1_size * sizeof (uint32_t);
9895 level3_offset =
9896 5 * sizeof (uint32_t)
9897 + t.level1_size * sizeof (uint32_t)
9898 + (t.level2_size << t.q) * sizeof (uint32_t);
9900 for (i = 0; i < 5; i++)
9901 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
9902 ((uint32_t *) t.result)[i]);
9903 fprintf (stream, "\n");
9904 fprintf (stream, "typedef struct\n");
9905 fprintf (stream, " {\n");
9906 fprintf (stream, " int level1[%zu];\n", t.level1_size);
9907 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
9908 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
9909 fprintf (stream, " }\n");
9910 fprintf (stream, "wbrkprop_t;\n");
9911 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
9912 fprintf (stream, "{\n");
9913 fprintf (stream, " {");
9914 if (t.level1_size > 8)
9915 fprintf (stream, "\n ");
9916 for (i = 0; i < t.level1_size; i++)
9918 uint32_t offset;
9919 if (i > 0 && (i % 8) == 0)
9920 fprintf (stream, "\n ");
9921 offset = ((uint32_t *) (t.result + level1_offset))[i];
9922 if (offset == 0)
9923 fprintf (stream, " %5d", -1);
9924 else
9925 fprintf (stream, " %5zu",
9926 (offset - level2_offset) / sizeof (uint32_t));
9927 if (i+1 < t.level1_size)
9928 fprintf (stream, ",");
9930 if (t.level1_size > 8)
9931 fprintf (stream, "\n ");
9932 fprintf (stream, " },\n");
9933 fprintf (stream, " {");
9934 if (t.level2_size << t.q > 8)
9935 fprintf (stream, "\n ");
9936 for (i = 0; i < t.level2_size << t.q; i++)
9938 uint32_t offset;
9939 if (i > 0 && (i % 8) == 0)
9940 fprintf (stream, "\n ");
9941 offset = ((uint32_t *) (t.result + level2_offset))[i];
9942 if (offset == 0)
9943 fprintf (stream, " %5d", -1);
9944 else
9945 fprintf (stream, " %5zu",
9946 (offset - level3_offset) / sizeof (unsigned char));
9947 if (i+1 < t.level2_size << t.q)
9948 fprintf (stream, ",");
9950 if (t.level2_size << t.q > 8)
9951 fprintf (stream, "\n ");
9952 fprintf (stream, " },\n");
9953 fprintf (stream, " {");
9954 if (t.level3_size << t.p > 4)
9955 fprintf (stream, "\n ");
9956 for (i = 0; i < t.level3_size << t.p; i++)
9958 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
9959 const char *value_string;
9960 switch (value)
9962 #define CASE(x) case x: value_string = #x; break;
9963 CASE(WBP_OTHER);
9964 CASE(WBP_CR);
9965 CASE(WBP_LF);
9966 CASE(WBP_NEWLINE);
9967 CASE(WBP_EXTEND);
9968 CASE(WBP_FORMAT);
9969 CASE(WBP_KATAKANA);
9970 CASE(WBP_ALETTER);
9971 CASE(WBP_MIDNUMLET);
9972 CASE(WBP_MIDLETTER);
9973 CASE(WBP_MIDNUM);
9974 CASE(WBP_NUMERIC);
9975 CASE(WBP_EXTENDNUMLET);
9976 CASE(WBP_RI);
9977 CASE(WBP_DQ);
9978 CASE(WBP_SQ);
9979 CASE(WBP_HL);
9980 CASE(WBP_ZWJ);
9981 CASE(WBP_WSS);
9982 #undef CASE
9983 default:
9984 abort ();
9986 if (i > 0 && (i % 4) == 0)
9987 fprintf (stream, "\n ");
9988 fprintf (stream, " %s%s", value_string,
9989 (i+1 < t.level3_size << t.p ? "," : ""));
9991 if (t.level3_size << t.p > 4)
9992 fprintf (stream, "\n ");
9993 fprintf (stream, " }\n");
9994 fprintf (stream, "};\n");
/* Creates the file FILENAME containing the word break property table for
   Unicode version VERSION: a "generated automatically" banner, the copyright
   line followed by the text from output_library_license, and the table
   written by output_wbp.  Exits with status 1 if the file cannot be opened
   or if writing or closing it fails.  */
static void
output_wbrk_tables (const char *filename, const char *version)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
  fputs ("/* Word breaking properties of Unicode characters. */\n", out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fputs ("\n", out);

  /* This comment is left open here.  NOTE(review): presumably
     output_library_license closes it - confirm there.  */
  fputs ("/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n", out);
  fputs ("\n", out);
  output_library_license (out, false);
  fputs ("\n", out);

  output_wbp (out);

  /* fclose is only attempted when no write error has been recorded.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
10029 /* ========================================================================= */
10031 /* Grapheme break property.
10032 Updated for Unicode TR #29 revision 29. */
/* Possible values of the Grapheme_Cluster_Break property.
   The name given for each value is the one that fill_org_gbp recognizes in
   GraphemeBreakProperty.txt.  The values are consecutive, starting at 0.  */
enum
{
  GBP_OTHER        = 0,  /* default, for code points not listed in the file */
  GBP_CR           = 1,  /* CR */
  GBP_LF           = 2,  /* LF */
  GBP_CONTROL      = 3,  /* Control */
  GBP_EXTEND       = 4,  /* Extend */
  GBP_PREPEND      = 5,  /* Prepend */
  GBP_SPACINGMARK  = 6,  /* SpacingMark */
  GBP_L            = 7,  /* L */
  GBP_V            = 8,  /* V */
  GBP_T            = 9,  /* T */
  GBP_LV           = 10, /* LV */
  GBP_LVT          = 11, /* LVT */
  GBP_RI           = 12, /* Regional_Indicator */
  GBP_ZWJ          = 13, /* ZWJ */
  GBP_EB           = 14, /* E_Base */
  GBP_EM           = 15, /* E_Modifier */
  GBP_GAZ          = 16, /* Glue_After_Zwj */
  GBP_EBG          = 17  /* E_Base_GAZ */
};
/* Construction of sparse 3-level tables.
   Including "3level.h" with these macros set provides 'struct gbp_table'
   and the functions gbp_table_init, gbp_table_add and gbp_table_finalize
   that output_gbp_table uses.  The elements are of type 'unsigned char', and
   GBP_OTHER is the default value.  */
#define TABLE gbp_table
#define ELEMENT unsigned char
#define DEFAULT GBP_OTHER
/* NOTE(review): presumably "3level.h" allocates through xmalloc/xrealloc;
   they are mapped to the plain libc functions here - confirm in 3level.h.  */
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
/* The grapheme break property from the GraphemeBreakProperty.txt file.
   Indexed by code point (0 .. 0x10FFFF); each element holds one GBP_* value.
   Filled by fill_org_gbp.  */
int unicode_org_gbp[0x110000];
10068 /* Output the unit test data for the grapheme break property. */
10069 static void
10070 output_gbp_test (const char *filename)
10072 FILE *stream;
10073 bool need_comma;
10074 unsigned int ch;
10076 stream = fopen (filename, "w");
10077 if (stream == NULL)
10079 fprintf (stderr, "cannot open '%s' for writing\n", filename);
10080 exit (1);
10083 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10084 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
10085 fprintf (stream, " Copyright (C) 2010-2024 Free Software Foundation, Inc.\n");
10086 fprintf (stream, "\n");
10087 output_tests_license (stream);
10088 fprintf (stream, "\n");
10090 need_comma = false;
10091 for (ch = 0; ch < 0x110000; ch++)
10093 int gbp = unicode_org_gbp[ch];
10094 const char *gbp_string;
10096 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
10097 ch++;
10099 switch (gbp)
10101 #define CASE(x) case x: gbp_string = #x; break;
10102 CASE (GBP_OTHER)
10103 CASE (GBP_CR)
10104 CASE (GBP_LF)
10105 CASE (GBP_CONTROL)
10106 CASE (GBP_EXTEND)
10107 CASE (GBP_PREPEND)
10108 CASE (GBP_SPACINGMARK)
10109 CASE (GBP_L)
10110 CASE (GBP_V)
10111 CASE (GBP_T)
10112 CASE (GBP_LV)
10113 CASE (GBP_LVT)
10114 CASE (GBP_RI)
10115 CASE (GBP_ZWJ)
10116 CASE (GBP_EB)
10117 CASE (GBP_EM)
10118 CASE (GBP_GAZ)
10119 CASE (GBP_EBG)
10120 #undef CASE
10121 default:
10122 abort ();
10125 if (need_comma)
10126 fprintf (stream, ",\n");
10127 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
10129 need_comma = true;
10131 fprintf (stream, "\n");
10133 if (ferror (stream) || fclose (stream))
10135 fprintf (stderr, "error writing to '%s'\n", filename);
10136 exit (1);
10140 /* Output the per-character grapheme break property table. */
10141 static void
10142 output_gbp_table (const char *filename, const char *version)
10144 FILE *stream;
10145 unsigned int ch, i;
10146 struct gbp_table t;
10147 unsigned int level1_offset, level2_offset, level3_offset;
10149 stream = fopen (filename, "w");
10150 if (stream == NULL)
10152 fprintf (stderr, "cannot open '%s' for writing\n", filename);
10153 exit (1);
10156 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10157 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
10158 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10159 version);
10160 fprintf (stream, "\n");
10162 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
10163 fprintf (stream, "\n");
10164 output_library_license (stream, false);
10165 fprintf (stream, "\n");
10167 t.p = 7;
10168 t.q = 9;
10169 gbp_table_init (&t);
10171 for (ch = 0; ch < 0x110000; ch++)
10172 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
10174 gbp_table_finalize (&t);
10176 /* Offsets in t.result, in memory of this process. */
10177 level1_offset =
10178 5 * sizeof (uint32_t);
10179 level2_offset =
10180 5 * sizeof (uint32_t)
10181 + t.level1_size * sizeof (uint32_t);
10182 level3_offset =
10183 5 * sizeof (uint32_t)
10184 + t.level1_size * sizeof (uint32_t)
10185 + (t.level2_size << t.q) * sizeof (uint32_t);
10187 for (i = 0; i < 5; i++)
10188 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
10189 ((uint32_t *) t.result)[i]);
10190 fprintf (stream, "static const\n");
10191 fprintf (stream, "struct\n");
10192 fprintf (stream, " {\n");
10193 fprintf (stream, " int level1[%zu];\n", t.level1_size);
10194 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
10195 fprintf (stream, " unsigned char level3[%zu << %d];\n",
10196 t.level3_size, t.p);
10197 fprintf (stream, " }\n");
10198 fprintf (stream, "unigbrkprop =\n");
10199 fprintf (stream, "{\n");
10200 fprintf (stream, " {");
10201 if (t.level1_size > 8)
10202 fprintf (stream, "\n ");
10203 for (i = 0; i < t.level1_size; i++)
10205 uint32_t offset;
10206 if (i > 0 && (i % 8) == 0)
10207 fprintf (stream, "\n ");
10208 offset = ((uint32_t *) (t.result + level1_offset))[i];
10209 if (offset == 0)
10210 fprintf (stream, " %5d", -1);
10211 else
10212 fprintf (stream, " %5zu",
10213 (offset - level2_offset) / sizeof (uint32_t));
10214 if (i+1 < t.level1_size)
10215 fprintf (stream, ",");
10217 if (t.level1_size > 8)
10218 fprintf (stream, "\n ");
10219 fprintf (stream, " },\n");
10220 fprintf (stream, " {");
10221 if (t.level2_size << t.q > 8)
10222 fprintf (stream, "\n ");
10223 for (i = 0; i < t.level2_size << t.q; i++)
10225 uint32_t offset;
10226 if (i > 0 && (i % 8) == 0)
10227 fprintf (stream, "\n ");
10228 offset = ((uint32_t *) (t.result + level2_offset))[i];
10229 if (offset == 0)
10230 fprintf (stream, " %5d", -1);
10231 else
10232 fprintf (stream, " %5zu",
10233 (offset - level3_offset) / sizeof (uint8_t));
10234 if (i+1 < t.level2_size << t.q)
10235 fprintf (stream, ",");
10237 if (t.level2_size << t.q > 8)
10238 fprintf (stream, "\n ");
10239 fprintf (stream, " },\n");
10240 fprintf (stream, " {");
10241 if (t.level3_size << t.p > 4)
10242 fprintf (stream, "\n ");
10243 for (i = 0; i < t.level3_size << t.p; i++)
10245 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
10246 const char *value_string;
10247 switch (value)
10249 #define CASE(x) case x: value_string = #x; break;
10250 CASE (GBP_OTHER)
10251 CASE (GBP_CR)
10252 CASE (GBP_LF)
10253 CASE (GBP_CONTROL)
10254 CASE (GBP_EXTEND)
10255 CASE (GBP_PREPEND)
10256 CASE (GBP_SPACINGMARK)
10257 CASE (GBP_L)
10258 CASE (GBP_V)
10259 CASE (GBP_T)
10260 CASE (GBP_LV)
10261 CASE (GBP_LVT)
10262 CASE (GBP_RI)
10263 CASE (GBP_ZWJ)
10264 CASE (GBP_EB)
10265 CASE (GBP_EM)
10266 CASE (GBP_GAZ)
10267 CASE (GBP_EBG)
10268 #undef CASE
10269 default:
10270 abort ();
10272 if (i > 0 && (i % 4) == 0)
10273 fprintf (stream, "\n ");
10274 fprintf (stream, " %s%s", value_string,
10275 (i+1 < t.level3_size << t.p ? "," : ""));
10277 if (t.level3_size << t.p > 4)
10278 fprintf (stream, "\n ");
10279 fprintf (stream, " }\n");
10280 fprintf (stream, "};\n");
10282 if (ferror (stream) || fclose (stream))
10284 fprintf (stderr, "error writing to '%s'\n", filename);
10285 exit (1);
10289 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
10290 GraphemeBreakProperty.txt file. */
10291 static void
10292 fill_org_gbp (const char *graphemebreakproperty_filename)
10294 unsigned int i;
10295 FILE *stream;
10296 int lineno = 0;
10298 for (i = 0; i < 0x110000; i++)
10299 unicode_org_gbp[i] = GBP_OTHER;
10301 stream = fopen (graphemebreakproperty_filename, "r");
10302 if (stream == NULL)
10304 fprintf (stderr, "error during fopen of '%s'\n",
10305 graphemebreakproperty_filename);
10306 exit (1);
10309 for (;;)
10311 char buf[200+1];
10312 unsigned int i1, i2;
10313 char padding[200+1];
10314 char propname[200+1];
10315 int propvalue;
10317 lineno++;
10318 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
10319 break;
10321 if (buf[0] == '\0' || buf[0] == '#')
10322 continue;
10324 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
10326 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
10328 fprintf (stderr, "parse error in '%s'\n",
10329 graphemebreakproperty_filename);
10330 exit (1);
10332 i2 = i1;
10334 #define PROP(name,value) \
10335 if (strcmp (propname, name) == 0) propvalue = value; else
10336 PROP ("CR", GBP_CR)
10337 PROP ("LF", GBP_LF)
10338 PROP ("Control", GBP_CONTROL)
10339 PROP ("Extend", GBP_EXTEND)
10340 PROP ("Prepend", GBP_PREPEND)
10341 PROP ("SpacingMark", GBP_SPACINGMARK)
10342 PROP ("L", GBP_L)
10343 PROP ("V", GBP_V)
10344 PROP ("T", GBP_T)
10345 PROP ("LV", GBP_LV)
10346 PROP ("LVT", GBP_LVT)
10347 PROP ("Regional_Indicator", GBP_RI)
10348 PROP ("ZWJ", GBP_ZWJ)
10349 PROP ("E_Base", GBP_EB)
10350 PROP ("E_Modifier", GBP_EM)
10351 PROP ("Glue_After_Zwj", GBP_GAZ)
10352 PROP ("E_Base_GAZ", GBP_EBG)
10353 #undef PROP
10355 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
10356 graphemebreakproperty_filename, lineno);
10357 exit (1);
10359 assert (i1 <= i2 && i2 < 0x110000);
10361 for (i = i1; i <= i2; i++)
10362 unicode_org_gbp[i] = propvalue;
10365 if (ferror (stream) || fclose (stream))
10367 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
10368 exit (1);
10372 /* ========================================================================= */
10374 /* Composition and decomposition.
10375 Updated for Unicode TR #15 revision 33. */
/* Maximum number of characters into which a single Unicode character can be
   decomposed.
   This is the size of the arrays passed to get_decomposition, which asserts
   that no decomposition in the input has more elements than this.  */
#define MAX_DECOMP_LENGTH 18
/* Decomposition types.
   The values are consecutive, starting at 0 for the canonical decomposition.
   output_decomposition stores a value as (type << 18) in a 24-bit entry whose
   bit 23 has another use, so the values must stay below 32.  */
enum
{
  UC_DECOMP_CANONICAL = 0,  /*            Canonical decomposition.                  */
  UC_DECOMP_FONT      = 1,  /* <font>     A font variant (e.g. a blackletter form). */
  UC_DECOMP_NOBREAK   = 2,  /* <noBreak>  A no-break version of a space or hyphen.  */
  UC_DECOMP_INITIAL   = 3,  /* <initial>  An initial presentation form (Arabic).    */
  UC_DECOMP_MEDIAL    = 4,  /* <medial>   A medial presentation form (Arabic).      */
  UC_DECOMP_FINAL     = 5,  /* <final>    A final presentation form (Arabic).       */
  UC_DECOMP_ISOLATED  = 6,  /* <isolated> An isolated presentation form (Arabic).   */
  UC_DECOMP_CIRCLE    = 7,  /* <circle>   An encircled form.                        */
  UC_DECOMP_SUPER     = 8,  /* <super>    A superscript form.                       */
  UC_DECOMP_SUB       = 9,  /* <sub>      A subscript form.                         */
  UC_DECOMP_VERTICAL  = 10, /* <vertical> A vertical layout presentation form.      */
  UC_DECOMP_WIDE      = 11, /* <wide>     A wide (or zenkaku) compatibility character. */
  UC_DECOMP_NARROW    = 12, /* <narrow>   A narrow (or hankaku) compatibility character. */
  UC_DECOMP_SMALL     = 13, /* <small>    A small variant form (CNS compatibility). */
  UC_DECOMP_SQUARE    = 14, /* <square>   A CJK squared font variant.               */
  UC_DECOMP_FRACTION  = 15, /* <fraction> A vulgar fraction form.                   */
  UC_DECOMP_COMPAT    = 16  /* <compat>   Otherwise unspecified compatibility character. */
};
10402 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
10403 decompositions). Return the type, or -1 for none. */
10404 static int
10405 get_decomposition (unsigned int ch,
10406 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
10408 const char *decomposition = unicode_attributes[ch].decomposition;
10410 if (decomposition != NULL && decomposition[0] != '\0')
10412 int type = UC_DECOMP_CANONICAL;
10413 unsigned int length;
10414 char *endptr;
10416 if (decomposition[0] == '<')
10418 const char *rangle;
10419 size_t typelen;
10421 rangle = strchr (decomposition + 1, '>');
10422 assert (rangle != NULL);
10423 typelen = rangle + 1 - decomposition;
10424 #define TYPE(t1,t2) \
10425 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
10426 type = t2; \
10427 else
10428 TYPE ("<font>", UC_DECOMP_FONT)
10429 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
10430 TYPE ("<initial>", UC_DECOMP_INITIAL)
10431 TYPE ("<medial>", UC_DECOMP_MEDIAL)
10432 TYPE ("<final>", UC_DECOMP_FINAL)
10433 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
10434 TYPE ("<circle>", UC_DECOMP_CIRCLE)
10435 TYPE ("<super>", UC_DECOMP_SUPER)
10436 TYPE ("<sub>", UC_DECOMP_SUB)
10437 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
10438 TYPE ("<wide>", UC_DECOMP_WIDE)
10439 TYPE ("<narrow>", UC_DECOMP_NARROW)
10440 TYPE ("<small>", UC_DECOMP_SMALL)
10441 TYPE ("<square>", UC_DECOMP_SQUARE)
10442 TYPE ("<fraction>", UC_DECOMP_FRACTION)
10443 TYPE ("<compat>", UC_DECOMP_COMPAT)
10445 fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
10446 exit (1);
10448 #undef TYPE
10449 decomposition = rangle + 1;
10450 if (decomposition[0] == ' ')
10451 decomposition++;
10453 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
10455 decomposed[length] = strtoul (decomposition, &endptr, 16);
10456 if (endptr == decomposition)
10457 break;
10458 decomposition = endptr;
10459 if (decomposition[0] == ' ')
10460 decomposition++;
10462 /* Make sure that *DECOMPOSITION is not NULL-terminated.
10463 Otherwise MAX_DECOMP_LENGTH is too small. */
10464 assert (*decomposition == '\0');
10466 *lengthp = length;
10467 return type;
10469 else
10470 return -1;
/* Construction of sparse 3-level tables.
   Including "3level.h" with these macros set provides 'struct decomp_table'
   and the functions decomp_table_init, decomp_table_add and
   decomp_table_finalize that output_decomposition uses.  The elements are of
   type uint16_t, and (uint16_t)(-1) is the default value, which
   output_decomposition emits as -1.  */
#define TABLE decomp_table
#define ELEMENT uint16_t
#define DEFAULT (uint16_t)(-1)
/* NOTE(review): presumably "3level.h" allocates through xmalloc/xrealloc;
   they are mapped to the plain libc functions here - confirm in 3level.h.  */
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
10481 static void
10482 output_decomposition (FILE *stream1, FILE *stream2)
10484 struct decomp_table t;
10485 unsigned int level1_offset, level2_offset, level3_offset;
10486 unsigned int offset;
10487 unsigned int ch;
10488 unsigned int i;
10490 t.p = 5;
10491 t.q = 5;
10492 decomp_table_init (&t);
10494 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
10495 fprintf (stream1, "\n");
10496 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
10497 offset = 0;
10499 for (ch = 0; ch < 0x110000; ch++)
10501 unsigned int length;
10502 unsigned int decomposed[MAX_DECOMP_LENGTH];
10503 int type = get_decomposition (ch, &length, decomposed);
10505 if (type >= 0)
10507 assert (offset < (1 << 15));
10508 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
10510 /* Produce length 3-bytes entries. */
10511 /* We would need a special representation of zero-length entries. */
10512 assert (length != 0);
10513 for (i = 0; i < length; i++)
10515 if (offset > 0)
10516 fprintf (stream2, ",");
10517 if ((offset % 4) == 0)
10518 fprintf (stream2, "\n ");
10519 assert (decomposed[i] < (1 << 18));
10520 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
10521 (((i+1 < length ? (1 << 23) : 0)
10522 | (i == 0 ? (type << 18) : 0)
10523 | decomposed[i]) >> 16) & 0xff,
10524 (decomposed[i] >> 8) & 0xff,
10525 decomposed[i] & 0xff);
10526 offset++;
10531 fprintf (stream2, "\n};\n");
10532 fprintf (stream2, "\n");
10534 decomp_table_finalize (&t);
10536 level1_offset =
10537 5 * sizeof (uint32_t);
10538 level2_offset =
10539 5 * sizeof (uint32_t)
10540 + t.level1_size * sizeof (uint32_t);
10541 level3_offset =
10542 5 * sizeof (uint32_t)
10543 + t.level1_size * sizeof (uint32_t)
10544 + (t.level2_size << t.q) * sizeof (uint32_t);
10546 for (i = 0; i < 5; i++)
10547 fprintf (stream1, "#define decomp_header_%d %d\n", i,
10548 ((uint32_t *) t.result)[i]);
10549 fprintf (stream1, "\n");
10550 fprintf (stream1, "typedef struct\n");
10551 fprintf (stream1, " {\n");
10552 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
10553 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
10554 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
10555 fprintf (stream1, " }\n");
10556 fprintf (stream1, "decomp_index_table_t;\n");
10557 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
10558 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
10559 fprintf (stream2, "{\n");
10560 fprintf (stream2, " {");
10561 if (t.level1_size > 8)
10562 fprintf (stream2, "\n ");
10563 for (i = 0; i < t.level1_size; i++)
10565 uint32_t offset;
10566 if (i > 0 && (i % 8) == 0)
10567 fprintf (stream2, "\n ");
10568 offset = ((uint32_t *) (t.result + level1_offset))[i];
10569 if (offset == 0)
10570 fprintf (stream2, " %5d", -1);
10571 else
10572 fprintf (stream2, " %5zu",
10573 (offset - level2_offset) / sizeof (uint32_t));
10574 if (i+1 < t.level1_size)
10575 fprintf (stream2, ",");
10577 if (t.level1_size > 8)
10578 fprintf (stream2, "\n ");
10579 fprintf (stream2, " },\n");
10580 fprintf (stream2, " {");
10581 if (t.level2_size << t.q > 8)
10582 fprintf (stream2, "\n ");
10583 for (i = 0; i < t.level2_size << t.q; i++)
10585 uint32_t offset;
10586 if (i > 0 && (i % 8) == 0)
10587 fprintf (stream2, "\n ");
10588 offset = ((uint32_t *) (t.result + level2_offset))[i];
10589 if (offset == 0)
10590 fprintf (stream2, " %5d", -1);
10591 else
10592 fprintf (stream2, " %5zu",
10593 (offset - level3_offset) / sizeof (uint16_t));
10594 if (i+1 < t.level2_size << t.q)
10595 fprintf (stream2, ",");
10597 if (t.level2_size << t.q > 8)
10598 fprintf (stream2, "\n ");
10599 fprintf (stream2, " },\n");
10600 fprintf (stream2, " {");
10601 if (t.level3_size << t.p > 8)
10602 fprintf (stream2, "\n ");
10603 for (i = 0; i < t.level3_size << t.p; i++)
10605 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
10606 if (i > 0 && (i % 8) == 0)
10607 fprintf (stream2, "\n ");
10608 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
10609 if (i+1 < t.level3_size << t.p)
10610 fprintf (stream2, ",");
10612 if (t.level3_size << t.p > 8)
10613 fprintf (stream2, "\n ");
10614 fprintf (stream2, " }\n");
10615 fprintf (stream2, "};\n");
/* Creates the two files of the decomposition tables for Unicode version
   VERSION.  FILENAME1 receives what output_decomposition writes to its first
   stream, FILENAME2 what it writes to its second stream.  Both files start
   with the same banner and license text.  Exits with status 1 if a file
   cannot be opened or if writing or closing it fails.  */
static void
output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *paths[2];
  FILE *outs[2];
  size_t k;

  paths[0] = filename1;
  paths[1] = filename2;

  /* Open both files before anything is written.  */
  for (k = 0; k < 2; k++)
    {
      outs[k] = fopen (paths[k], "w");
      if (outs[k] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", paths[k]);
          exit (1);
        }
    }

  for (k = 0; k < 2; k++)
    {
      FILE *out = outs[k];

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
      fputs ("/* Decomposition of Unicode characters. */\n", out);
      fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fputs ("\n", out);

      fputs ("/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n", out);
      fputs ("\n", out);
      output_library_license (out, true);
      fputs ("\n", out);
    }

  output_decomposition (outs[0], outs[1]);

  for (k = 0; k < 2; k++)
    {
      /* fclose is only attempted when no write error has been recorded.  */
      if (ferror (outs[k]) != 0 || fclose (outs[k]) != 0)
        {
          fprintf (stderr, "error writing to '%s'\n", paths[k]);
          exit (1);
        }
    }
}
/* The "excluded from composition" property from the CompositionExclusions.txt file.
   Indexed by code point; 1 for the code points listed in the file, else 0.  */
char unicode_composition_exclusions[0x110000];

/* Stores in unicode_composition_exclusions[] the contents of the
   CompositionExclusions.txt file.
   All elements are reset to 0.  Then, for each line that is not empty and
   does not start with '#', the hexadecimal code point at its start is marked
   with 1.  Exits with status 1 on an open failure, a malformed line, or a
   read error.  */
static void
fill_composition_exclusions (const char *compositionexclusions_filename)
{
  FILE *in;
  unsigned int ch;

  in = fopen (compositionexclusions_filename, "r");
  if (in == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    unicode_composition_exclusions[ch] = 0;

  for (;;)
    {
      char line[200+1];
      unsigned int excluded;

      if (fscanf (in, "%200[^\n]\n", line) < 1)
        break;

      if (line[0] == '\0' || line[0] == '#')
        continue;

      if (sscanf (line, "%X", &excluded) != 1)
        {
          fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
          exit (1);
        }
      assert (excluded < 0x110000);

      unicode_composition_exclusions[excluded] = 1;
    }

  if (ferror (in) || fclose (in))
    {
      fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
      exit (1);
    }
}
10713 static void
10714 debug_output_composition_tables (const char *filename)
10716 FILE *stream;
10717 unsigned int ch;
10719 stream = fopen (filename, "w");
10720 if (stream == NULL)
10722 fprintf (stderr, "cannot open '%s' for writing\n", filename);
10723 exit (1);
10726 for (ch = 0; ch < 0x110000; ch++)
10728 unsigned int length;
10729 unsigned int decomposed[MAX_DECOMP_LENGTH];
10730 int type = get_decomposition (ch, &length, decomposed);
10732 if (type == UC_DECOMP_CANONICAL
10733 /* Consider only binary decompositions.
10734 Exclude singleton decompositions. */
10735 && length == 2)
10737 unsigned int code1 = decomposed[0];
10738 unsigned int code2 = decomposed[1];
10739 unsigned int combined = ch;
10741 /* Exclude decompositions where the first part is not a starter,
10742 i.e. is not of canonical combining class 0. */
10743 if (strcmp (unicode_attributes[code1].combining, "0") == 0
10744 /* Exclude characters listed in CompositionExclusions.txt. */
10745 && !unicode_composition_exclusions[combined])
10747 /* The combined character must now also be a starter.
10748 Verify this. */
10749 assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
10751 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
10752 code1,
10753 code2,
10754 combined,
10755 unicode_attributes[code2].combining);
10760 if (ferror (stream) || fclose (stream))
10762 fprintf (stderr, "error writing to '%s'\n", filename);
10763 exit (1);
10767 static void
10768 output_composition_tables (const char *filename, const char *version)
10770 FILE *stream;
10771 unsigned int ch;
10773 stream = fopen (filename, "w");
10774 if (stream == NULL)
10776 fprintf (stderr, "cannot open '%s' for writing\n", filename);
10777 exit (1);
10780 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10781 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
10782 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10783 version);
10784 fprintf (stream, "\n");
10786 fprintf (stream, "/* Copyright (C) 2009-2024 Free Software Foundation, Inc.\n");
10787 fprintf (stream, "\n");
10788 output_library_license (stream, true);
10789 fprintf (stream, "\n");
10791 /* The composition table is a set of mappings (code1, code2) -> combined,
10792 with 928 entries,
10793 367 values for code1 (from 0x003C to 0x30FD),
10794 54 values for code2 (from 0x0300 to 0x309A).
10795 For a fixed code1, there are from 1 to 19 possible values for code2.
10796 For a fixed code2, there are from 1 to 117 possible values for code1.
10797 This is a very sparse matrix.
10799 We want an O(1) hash lookup.
10801 We could implement the hash lookup by mapping (code1, code2) to a linear
10802 combination mul1*code1 + mul2*code2, which is then used as an index into
10803 a 3-level table. But this leads to a table of size 37 KB.
10805 We use gperf to implement the hash lookup, giving it the 928 sets of
10806 4 bytes (code1, code2) as input. gperf generates a hash table of size
10807 1527, which is quite good (60% filled). It requires an auxiliary table
10808 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
10810 fprintf (stream, "struct composition_rule { char codes[6]; };\n");
10811 fprintf (stream, "%%struct-type\n");
10812 fprintf (stream, "%%language=ANSI-C\n");
10813 fprintf (stream, "%%define slot-name codes\n");
10814 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
10815 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
10816 fprintf (stream, "%%compare-lengths\n");
10817 fprintf (stream, "%%compare-strncmp\n");
10818 fprintf (stream, "%%readonly-tables\n");
10819 fprintf (stream, "%%omit-struct-type\n");
10820 fprintf (stream, "%%%%\n");
10822 for (ch = 0; ch < 0x110000; ch++)
10824 unsigned int length;
10825 unsigned int decomposed[MAX_DECOMP_LENGTH];
10826 int type = get_decomposition (ch, &length, decomposed);
10828 if (type == UC_DECOMP_CANONICAL
10829 /* Consider only binary decompositions.
10830 Exclude singleton decompositions. */
10831 && length == 2)
10833 unsigned int code1 = decomposed[0];
10834 unsigned int code2 = decomposed[1];
10835 unsigned int combined = ch;
10837 /* Exclude decompositions where the first part is not a starter,
10838 i.e. is not of canonical combining class 0. */
10839 if (strcmp (unicode_attributes[code1].combining, "0") == 0
10840 /* Exclude characters listed in CompositionExclusions.txt. */
10841 && !unicode_composition_exclusions[combined])
10843 /* The combined character must now also be a starter.
10844 Verify this. */
10845 assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
10847 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
10848 (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
10849 (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
10850 combined);
10855 if (ferror (stream) || fclose (stream))
10857 fprintf (stderr, "error writing to '%s'\n", filename);
10858 exit (1);
/* ========================================================================= */
/* Output the test for a simple character mapping table to the given file.
   FILENAME is the test source file to create.  FUNC is the mapping under
   test: one "{ ch, FUNC (ch) }" entry is written for every code point that
   FUNC does not map to itself.  FUNCTION_NAME is the name of the function
   that the generated MAP macro expands to.  VERSION is the Unicode version
   named in the file's header comment.  */
static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  FILE *out;
  bool any_entry;
  unsigned int ch;

  out = fopen (filename, "w");
  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (out, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (out, "/* Test the Unicode character mapping functions.\n");
  fprintf (out, " Copyright (C) 2009-2024 Free Software Foundation, Inc.\n");
  fprintf (out, "\n");
  output_tests_license (out);
  fprintf (out, "\n");
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (out, "\n");
  fprintf (out, "#include \"test-mapping-part1.h\"\n");
  fprintf (out, "\n");

  /* Entries are separated by ",\n"; the last one is followed by a plain
     newline.  */
  any_entry = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int mapped = func (ch);

      if (mapped == ch)
        continue;

      if (any_entry)
        fprintf (out, ",\n");
      fprintf (out, " { 0x%04X, 0x%04X }", ch, mapped);
      any_entry = true;
    }
  if (any_entry)
    fprintf (out, "\n");

  fprintf (out, "\n");
  fprintf (out, "#define MAP(c) %s (c)\n", function_name);
  fprintf (out, "#include \"test-mapping-part2.h\"\n");

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
10922 /* Construction of sparse 3-level tables. */
10923 #define TABLE mapping_table
10924 #define ELEMENT int32_t
10925 #define DEFAULT 0
10926 #define xmalloc malloc
10927 #define xrealloc realloc
10928 #include "3level.h"
10930 /* Output a simple character mapping table to the given file. */
10932 static void
10933 output_simple_mapping (const char *filename,
10934 unsigned int (*func) (unsigned int),
10935 const char *version)
10937 FILE *stream;
10938 unsigned int ch, i;
10939 struct mapping_table t;
10940 unsigned int level1_offset, level2_offset, level3_offset;
10942 stream = fopen (filename, "w");
10943 if (stream == NULL)
10945 fprintf (stderr, "cannot open '%s' for writing\n", filename);
10946 exit (1);
10949 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10950 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
10951 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10952 version);
10953 fprintf (stream, "\n");
10955 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
10956 fprintf (stream, "\n");
10957 output_library_license (stream,
10958 strcmp (filename, "unicase/tolower.h") == 0
10959 || strcmp (filename, "unicase/toupper.h") == 0);
10960 fprintf (stream, "\n");
10962 t.p = 7;
10963 t.q = 9;
10964 mapping_table_init (&t);
10966 for (ch = 0; ch < 0x110000; ch++)
10968 int value = (int) func (ch) - (int) ch;
10970 mapping_table_add (&t, ch, value);
10973 mapping_table_finalize (&t);
10975 /* Offsets in t.result, in memory of this process. */
10976 level1_offset =
10977 5 * sizeof (uint32_t);
10978 level2_offset =
10979 5 * sizeof (uint32_t)
10980 + t.level1_size * sizeof (uint32_t);
10981 level3_offset =
10982 5 * sizeof (uint32_t)
10983 + t.level1_size * sizeof (uint32_t)
10984 + (t.level2_size << t.q) * sizeof (uint32_t);
10986 for (i = 0; i < 5; i++)
10987 fprintf (stream, "#define mapping_header_%d %d\n", i,
10988 ((uint32_t *) t.result)[i]);
10989 fprintf (stream, "static const\n");
10990 fprintf (stream, "struct\n");
10991 fprintf (stream, " {\n");
10992 fprintf (stream, " int level1[%zu];\n", t.level1_size);
10993 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
10994 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
10995 fprintf (stream, " }\n");
10996 fprintf (stream, "u_mapping =\n");
10997 fprintf (stream, "{\n");
10998 fprintf (stream, " {");
10999 if (t.level1_size > 8)
11000 fprintf (stream, "\n ");
11001 for (i = 0; i < t.level1_size; i++)
11003 uint32_t offset;
11004 if (i > 0 && (i % 8) == 0)
11005 fprintf (stream, "\n ");
11006 offset = ((uint32_t *) (t.result + level1_offset))[i];
11007 if (offset == 0)
11008 fprintf (stream, " %5d", -1);
11009 else
11010 fprintf (stream, " %5zu",
11011 (offset - level2_offset) / sizeof (uint32_t));
11012 if (i+1 < t.level1_size)
11013 fprintf (stream, ",");
11015 if (t.level1_size > 8)
11016 fprintf (stream, "\n ");
11017 fprintf (stream, " },\n");
11018 fprintf (stream, " {");
11019 if (t.level2_size << t.q > 8)
11020 fprintf (stream, "\n ");
11021 for (i = 0; i < t.level2_size << t.q; i++)
11023 uint32_t offset;
11024 if (i > 0 && (i % 8) == 0)
11025 fprintf (stream, "\n ");
11026 offset = ((uint32_t *) (t.result + level2_offset))[i];
11027 if (offset == 0)
11028 fprintf (stream, " %5d", -1);
11029 else
11030 fprintf (stream, " %5zu",
11031 (offset - level3_offset) / sizeof (int32_t));
11032 if (i+1 < t.level2_size << t.q)
11033 fprintf (stream, ",");
11035 if (t.level2_size << t.q > 8)
11036 fprintf (stream, "\n ");
11037 fprintf (stream, " },\n");
11038 fprintf (stream, " {");
11039 if (t.level3_size << t.p > 8)
11040 fprintf (stream, "\n ");
11041 for (i = 0; i < t.level3_size << t.p; i++)
11043 if (i > 0 && (i % 8) == 0)
11044 fprintf (stream, "\n ");
11045 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
11046 if (i+1 < t.level3_size << t.p)
11047 fprintf (stream, ",");
11049 if (t.level3_size << t.p > 8)
11050 fprintf (stream, "\n ");
11051 fprintf (stream, " }\n");
11052 fprintf (stream, "};\n");
11054 if (ferror (stream) || fclose (stream))
11056 fprintf (stderr, "error writing to '%s'\n", filename);
11057 exit (1);
/* ========================================================================= */
/* A special casing context.
   A context is negated through x -> -x.  SCC_ALWAYS is 0 and therefore has
   no distinct negation.  */
enum
{
  SCC_ALWAYS            = 0,    /* no context condition */
  SCC_FINAL_SIGMA       = 1,    /* "Final_Sigma" */
  SCC_AFTER_SOFT_DOTTED = 2,    /* "After_Soft_Dotted" */
  SCC_MORE_ABOVE        = 3,    /* "More_Above" */
  SCC_BEFORE_DOT        = 4,    /* "Before_Dot" */
  SCC_AFTER_I           = 5     /* "After_I" */
};
/* A special casing rule.
   Each of the mapping arrays holds up to 3 code points; unused trailing
   elements are 0.  */
struct special_casing_rule
{
  /* The code point the rule applies to.  */
  unsigned int code;
  /* The lowercase, titlecase and uppercase mappings.  */
  unsigned int lower_mapping[3];
  unsigned int title_mapping[3];
  unsigned int upper_mapping[3];
  /* The casefold mapping.  */
  unsigned int casefold_mapping[3];
  /* A two-letter language code, or NULL if the rule is not restricted to a
     language.  */
  const char *language;
  /* One of the SCC_* values, or the negative of one for a negated
     context.  */
  int context;
};
/* The special casing rules.
   casing_rules is a heap-allocated array with room for
   allocated_casing_rules pointers, of which the first num_casing_rules are
   in use.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Append NEW_RULE to casing_rules.
   Only the pointer is stored; the rule itself is not copied.  The array is
   grown by doubling, with a minimum capacity of 16 entries.  Terminates the
   program if memory is exhausted.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;
      /* Keep the old pointer and capacity until the reallocation is known
         to have succeeded.  */
      new_rules =
        (struct special_casing_rule **)
        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
      if (new_rules == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
11107 /* Stores in casing_rules the special casing rules found in
11108 specialcasing_filename. */
11109 static void
11110 fill_casing_rules (const char *specialcasing_filename)
11112 FILE *stream;
11114 stream = fopen (specialcasing_filename, "r");
11115 if (stream == NULL)
11117 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
11118 exit (1);
11121 casing_rules = NULL;
11122 num_casing_rules = 0;
11123 allocated_casing_rules = 0;
11125 for (;;)
11127 char buf[200+1];
11128 char *scanptr;
11129 char *endptr;
11130 int i;
11132 unsigned int code;
11133 unsigned int lower_mapping[3];
11134 unsigned int title_mapping[3];
11135 unsigned int upper_mapping[3];
11136 char *language;
11137 int context;
11139 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
11140 break;
11142 if (buf[0] == '\0' || buf[0] == '#')
11143 continue;
11145 /* Scan code. */
11146 scanptr = buf;
11147 code = strtoul (scanptr, &endptr, 16);
11148 if (endptr == scanptr)
11150 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11151 exit (1);
11153 scanptr = endptr;
11154 if (*scanptr != ';')
11156 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11157 exit (1);
11159 scanptr++;
11161 /* Scan lower mapping. */
11162 for (i = 0; i < 3; i++)
11163 lower_mapping[i] = 0;
11164 for (i = 0; i < 3; i++)
11166 while (*scanptr == ' ')
11167 scanptr++;
11168 if (*scanptr == ';')
11169 break;
11170 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
11171 if (endptr == scanptr)
11173 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11174 exit (1);
11176 scanptr = endptr;
11178 if (*scanptr != ';')
11180 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11181 exit (1);
11183 scanptr++;
11185 /* Scan title mapping. */
11186 for (i = 0; i < 3; i++)
11187 title_mapping[i] = 0;
11188 for (i = 0; i < 3; i++)
11190 while (*scanptr == ' ')
11191 scanptr++;
11192 if (*scanptr == ';')
11193 break;
11194 title_mapping[i] = strtoul (scanptr, &endptr, 16);
11195 if (endptr == scanptr)
11197 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11198 exit (1);
11200 scanptr = endptr;
11202 if (*scanptr != ';')
11204 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11205 exit (1);
11207 scanptr++;
11209 /* Scan upper mapping. */
11210 for (i = 0; i < 3; i++)
11211 upper_mapping[i] = 0;
11212 for (i = 0; i < 3; i++)
11214 while (*scanptr == ' ')
11215 scanptr++;
11216 if (*scanptr == ';')
11217 break;
11218 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
11219 if (endptr == scanptr)
11221 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11222 exit (1);
11224 scanptr = endptr;
11226 if (*scanptr != ';')
11228 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11229 exit (1);
11231 scanptr++;
11233 /* Scan language and context. */
11234 language = NULL;
11235 context = SCC_ALWAYS;
11236 while (*scanptr == ' ')
11237 scanptr++;
11238 if (*scanptr != '\0' && *scanptr != '#')
11240 const char *word_begin = scanptr;
11241 const char *word_end;
11243 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
11244 scanptr++;
11245 word_end = scanptr;
11247 while (*scanptr == ' ')
11248 scanptr++;
11250 if (word_end - word_begin == 2)
11252 language = (char *) malloc ((word_end - word_begin) + 1);
11253 memcpy (language, word_begin, 2);
11254 language[word_end - word_begin] = '\0';
11255 word_begin = word_end = NULL;
11257 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
11259 word_begin = scanptr;
11260 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
11261 scanptr++;
11262 word_end = scanptr;
11266 if (word_end > word_begin)
11268 bool negate = false;
11270 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
11272 word_begin += 4;
11273 negate = true;
11275 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
11276 context = SCC_FINAL_SIGMA;
11277 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
11278 context = SCC_AFTER_SOFT_DOTTED;
11279 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
11280 context = SCC_MORE_ABOVE;
11281 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
11282 context = SCC_BEFORE_DOT;
11283 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
11284 context = SCC_AFTER_I;
11285 else
11287 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
11288 exit (1);
11290 if (negate)
11291 context = - context;
11294 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
11296 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11297 exit (1);
11301 /* Store the rule. */
11303 struct special_casing_rule *new_rule =
11304 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
11305 new_rule->code = code;
11306 new_rule->language = language;
11307 new_rule->context = context;
11308 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
11309 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
11310 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
11312 add_casing_rule (new_rule);
11316 if (ferror (stream) || fclose (stream))
11318 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
11319 exit (1);
/* A casefolding rule.  */
struct casefold_rule
{
  /* The code point the rule applies to.  */
  unsigned int code;
  /* The casefold mapping: up to 3 code points; unused trailing elements
     are 0.  */
  unsigned int mapping[3];
  /* A two-letter language code, or NULL if the rule is not restricted to a
     language.  */
  const char *language;
};

/* The casefolding rules.
   casefolding_rules is a heap-allocated array with room for
   allocated_casefolding_rules pointers, of which the first
   num_casefolding_rules are in use.  */
struct casefold_rule **casefolding_rules;
unsigned int num_casefolding_rules;
unsigned int allocated_casefolding_rules;

/* Stores in casefolding_rules the case folding rules found in
   casefolding_filename.
   Each rule line has the form
     code; type; mapping;
   optionally followed by a '#' comment, where type is one of C, F, S, T and
   mapping consists of up to 3 hexadecimal code points.  Lines that are empty
   or start with '#' are ignored.
   Rules of type S are dropped.  A rule of type T is stored twice, once for
   each of the languages "tr" and "az".  All other rules are stored once,
   without language restriction.
   Terminates the program on I/O errors, syntax errors and memory
   exhaustion.  */
static void
fill_casefolding_rules (const char *casefolding_filename)
{
  FILE *stream;

  stream = fopen (casefolding_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
      exit (1);
    }

  casefolding_rules = NULL;
  num_casefolding_rules = 0;
  allocated_casefolding_rules = 0;

  for (;;)
    {
      char buf[200+1];
      char *scanptr;
      char *endptr;
      int i;

      unsigned int code;
      char type;
      unsigned int mapping[3];

      /* Read one line (at most 200 characters of it).  The trailing "\n" in
         the format also swallows the line terminator and any whitespace that
         follows it.  */
      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
        break;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      /* Scan code.  */
      scanptr = buf;
      code = strtoul (scanptr, &endptr, 16);
      if (endptr == scanptr)
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr = endptr;
      if (*scanptr != ';')
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;

      /* Scan type.  */
      while (*scanptr == ' ')
        scanptr++;

      switch (*scanptr)
        {
        case 'C': case 'F': case 'S': case 'T':
          type = *scanptr;
          break;
        default:
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;
      if (*scanptr != ';')
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;

      /* Scan casefold mapping.  */
      for (i = 0; i < 3; i++)
        mapping[i] = 0;
      for (i = 0; i < 3; i++)
        {
          while (*scanptr == ' ')
            scanptr++;
          if (*scanptr == ';')
            break;
          mapping[i] = strtoul (scanptr, &endptr, 16);
          if (endptr == scanptr)
            {
              fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
              exit (1);
            }
          scanptr = endptr;
        }
      if (*scanptr != ';')
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }

      /* Ignore rules of type 'S'; we use the rules of type 'F' instead.  */
      if (type != 'S')
        {
          const char * const *languages;
          unsigned int languages_count;
          unsigned int k;

          /* Type 'T' indicates that the rule is applicable to Turkish
             languages only.  */
          if (type == 'T')
            {
              static const char * const turkish_languages[] = { "tr", "az" };
              languages = turkish_languages;
              languages_count = 2;
            }
          else
            {
              static const char * const all_languages[] = { NULL };
              languages = all_languages;
              languages_count = 1;
            }

          for (k = 0; k < languages_count; k++)
            {
              /* Store a new rule.  */
              struct casefold_rule *new_rule =
                (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
              if (new_rule == NULL)
                {
                  fprintf (stderr, "out of memory\n");
                  exit (1);
                }
              new_rule->code = code;
              memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
              new_rule->language = languages[k];

              if (num_casefolding_rules == allocated_casefolding_rules)
                {
                  /* Grow by doubling, with a minimum capacity of 16.  Keep
                     the old pointer and capacity until the reallocation is
                     known to have succeeded.  */
                  unsigned int new_allocated = 2 * allocated_casefolding_rules;
                  struct casefold_rule **new_rules;

                  if (new_allocated < 16)
                    new_allocated = 16;
                  new_rules =
                    (struct casefold_rule **)
                    realloc (casefolding_rules,
                             new_allocated * sizeof (struct casefold_rule *));
                  if (new_rules == NULL)
                    {
                      fprintf (stderr, "out of memory\n");
                      exit (1);
                    }
                  casefolding_rules = new_rules;
                  allocated_casefolding_rules = new_allocated;
                }
              casefolding_rules[num_casefolding_rules++] = new_rule;
            }
        }
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
      exit (1);
    }
}
/* Casefold mapping, when it maps to a single character.
   Indexed by code point.  */
unsigned int unicode_casefold[0x110000];

/* Return the entry of unicode_casefold for CH.
   CH must be less than 0x110000; no range check is done here.  */
static unsigned int
to_casefold (unsigned int ch)
{
  unsigned int folded = unicode_casefold[ch];

  return folded;
}
11493 /* Redistribute the casefolding_rules:
11494 - Rules that map to a single character, language independently, are stored
11495 in unicode_casefold.
11496 - Other rules are merged into casing_rules. */
11497 static void
11498 redistribute_casefolding_rules (void)
11500 unsigned int ch, i, j;
11502 /* Fill unicode_casefold[]. */
11503 for (ch = 0; ch < 0x110000; ch++)
11504 unicode_casefold[ch] = ch;
11505 for (i = 0; i < num_casefolding_rules; i++)
11507 struct casefold_rule *cfrule = casefolding_rules[i];
11509 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
11511 ch = cfrule->code;
11512 assert (ch < 0x110000);
11513 unicode_casefold[ch] = cfrule->mapping[0];
11517 /* Extend the special casing rules by filling in their casefold_mapping[]
11518 field. */
11519 for (j = 0; j < num_casing_rules; j++)
11521 struct special_casing_rule *rule = casing_rules[j];
11522 unsigned int k;
11524 rule->casefold_mapping[0] = to_casefold (rule->code);
11525 for (k = 1; k < 3; k++)
11526 rule->casefold_mapping[k] = 0;
11529 /* Now merge the other casefolding rules into casing_rules. */
11530 for (i = 0; i < num_casefolding_rules; i++)
11532 struct casefold_rule *cfrule = casefolding_rules[i];
11534 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
11536 /* Find a rule that applies to the same code, same language, and it
11537 has context SCC_ALWAYS. At the same time, update all rules that
11538 have the same code and same or more specific language. */
11539 struct special_casing_rule *found_rule = NULL;
11541 for (j = 0; j < num_casing_rules; j++)
11543 struct special_casing_rule *rule = casing_rules[j];
11545 if (rule->code == cfrule->code
11546 && (cfrule->language == NULL
11547 || (rule->language != NULL
11548 && strcmp (rule->language, cfrule->language) == 0)))
11550 memcpy (rule->casefold_mapping, cfrule->mapping,
11551 sizeof (rule->casefold_mapping));
11553 if ((cfrule->language == NULL
11554 ? rule->language == NULL
11555 : rule->language != NULL
11556 && strcmp (rule->language, cfrule->language) == 0)
11557 && rule->context == SCC_ALWAYS)
11559 /* Found it. */
11560 found_rule = rule;
11565 if (found_rule == NULL)
11567 /* Create a new rule. */
11568 struct special_casing_rule *new_rule =
11569 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
11571 /* Try to find a rule that applies to the same code, no language
11572 restriction, and with context SCC_ALWAYS. */
11573 for (j = 0; j < num_casing_rules; j++)
11575 struct special_casing_rule *rule = casing_rules[j];
11577 if (rule->code == cfrule->code
11578 && rule->context == SCC_ALWAYS
11579 && rule->language == NULL)
11581 /* Found it. */
11582 found_rule = rule;
11583 break;
11587 new_rule->code = cfrule->code;
11588 new_rule->language = cfrule->language;
11589 new_rule->context = SCC_ALWAYS;
11590 if (found_rule != NULL)
11592 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
11593 sizeof (new_rule->lower_mapping));
11594 memcpy (new_rule->title_mapping, found_rule->title_mapping,
11595 sizeof (new_rule->title_mapping));
11596 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
11597 sizeof (new_rule->upper_mapping));
11599 else
11601 unsigned int k;
11603 new_rule->lower_mapping[0] = to_lower (cfrule->code);
11604 for (k = 1; k < 3; k++)
11605 new_rule->lower_mapping[k] = 0;
11606 new_rule->title_mapping[0] = to_title (cfrule->code);
11607 for (k = 1; k < 3; k++)
11608 new_rule->title_mapping[k] = 0;
11609 new_rule->upper_mapping[0] = to_upper (cfrule->code);
11610 for (k = 1; k < 3; k++)
11611 new_rule->upper_mapping[k] = 0;
11613 memcpy (new_rule->casefold_mapping, cfrule->mapping,
11614 sizeof (new_rule->casefold_mapping));
11616 add_casing_rule (new_rule);
11622 static int
11623 compare_casing_rules (const void *a, const void *b)
11625 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
11626 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
11627 unsigned int a_code = a_rule->code;
11628 unsigned int b_code = b_rule->code;
11630 if (a_code < b_code)
11631 return -1;
11632 if (a_code > b_code)
11633 return 1;
11635 /* Sort the more specific rules before the more general ones. */
11636 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
11637 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
11640 static void
11641 sort_casing_rules (void)
11643 /* Sort the rules 1. by code, 2. by specificity. */
11644 if (num_casing_rules > 1)
11645 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
11646 compare_casing_rules);
11649 /* Output the special casing rules. */
11650 static void
11651 output_casing_rules (const char *filename, const char *version)
11653 FILE *stream;
11654 unsigned int i, j;
11655 unsigned int minor;
11657 stream = fopen (filename, "w");
11658 if (stream == NULL)
11660 fprintf (stderr, "cannot open '%s' for writing\n", filename);
11661 exit (1);
11664 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
11665 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
11666 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
11667 version);
11668 fprintf (stream, "\n");
11670 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
11671 fprintf (stream, "\n");
11672 output_library_license (stream, false);
11673 fprintf (stream, "\n");
11675 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
11676 fprintf (stream, "%%struct-type\n");
11677 fprintf (stream, "%%language=ANSI-C\n");
11678 fprintf (stream, "%%define slot-name code\n");
11679 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
11680 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
11681 fprintf (stream, "%%compare-lengths\n");
11682 fprintf (stream, "%%compare-strncmp\n");
11683 fprintf (stream, "%%readonly-tables\n");
11684 fprintf (stream, "%%omit-struct-type\n");
11685 fprintf (stream, "%%%%\n");
11687 minor = 0;
11688 for (i = 0; i < num_casing_rules; i++)
11690 struct special_casing_rule *rule = casing_rules[i];
11691 int context;
11693 if (i > 0 && rule->code == casing_rules[i - 1]->code)
11694 minor += 1;
11695 else
11696 minor = 0;
11698 if (!(rule->code < 0x10000))
11700 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
11701 exit (1);
11704 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
11705 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
11707 fprintf (stream, "%d, ",
11708 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
11710 context = rule->context;
11711 if (context < 0)
11713 fprintf (stream, "-");
11714 context = - context;
11716 else
11717 fprintf (stream, " ");
11718 switch (context)
11720 case SCC_ALWAYS:
11721 fprintf (stream, "SCC_ALWAYS ");
11722 break;
11723 case SCC_FINAL_SIGMA:
11724 fprintf (stream, "SCC_FINAL_SIGMA ");
11725 break;
11726 case SCC_AFTER_SOFT_DOTTED:
11727 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
11728 break;
11729 case SCC_MORE_ABOVE:
11730 fprintf (stream, "SCC_MORE_ABOVE ");
11731 break;
11732 case SCC_BEFORE_DOT:
11733 fprintf (stream, "SCC_BEFORE_DOT ");
11734 break;
11735 case SCC_AFTER_I:
11736 fprintf (stream, "SCC_AFTER_I ");
11737 break;
11738 default:
11739 abort ();
11741 fprintf (stream, ", ");
11743 if (rule->language != NULL)
11745 assert (strlen (rule->language) == 2);
11746 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
11748 else
11749 fprintf (stream, "{ '\\0', '\\0' }, ");
11751 fprintf (stream, "{ ");
11752 for (j = 0; j < 3; j++)
11754 if (j > 0)
11755 fprintf (stream, ", ");
11756 if (!(rule->upper_mapping[j] < 0x10000))
11758 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
11759 exit (1);
11761 if (rule->upper_mapping[j] != 0)
11762 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
11763 else
11764 fprintf (stream, " 0");
11766 fprintf (stream, " }, { ");
11767 for (j = 0; j < 3; j++)
11769 if (j > 0)
11770 fprintf (stream, ", ");
11771 if (!(rule->lower_mapping[j] < 0x10000))
11773 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
11774 exit (1);
11776 if (rule->lower_mapping[j] != 0)
11777 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
11778 else
11779 fprintf (stream, " 0");
11781 fprintf (stream, " }, { ");
11782 for (j = 0; j < 3; j++)
11784 if (j > 0)
11785 fprintf (stream, ", ");
11786 if (!(rule->title_mapping[j] < 0x10000))
11788 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
11789 exit (1);
11791 if (rule->title_mapping[j] != 0)
11792 fprintf (stream, "0x%04X", rule->title_mapping[j]);
11793 else
11794 fprintf (stream, " 0");
11796 fprintf (stream, " }, { ");
11797 for (j = 0; j < 3; j++)
11799 if (j > 0)
11800 fprintf (stream, ", ");
11801 if (!(rule->casefold_mapping[j] < 0x10000))
11803 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
11804 exit (1);
11806 if (rule->casefold_mapping[j] != 0)
11807 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
11808 else
11809 fprintf (stream, " 0");
11811 fprintf (stream, " }\n");
11814 if (ferror (stream) || fclose (stream))
11816 fprintf (stderr, "error writing to '%s'\n", filename);
11817 exit (1);
/* ========================================================================= */
/* Return true if CH is a "cased" character.
   Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.  */
static bool
is_cased (unsigned int ch)
{
  /* The three criteria are tested in the order in which the definition
     lists them; the first one that holds decides the result.  */
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  if (is_category_Lt (ch))
    return true;
  return false;
}
11835 /* Quoting the Unicode standard:
11836 Definition: A character is defined to be "case-ignorable" if it has the
11837 value MidLetter {or the value MidNumLet} for the Word_Break property or
11838 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
11839 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11840 The text marked in braces was added in Unicode 5.1.0, see
11841 <https://www.unicode.org/versions/Unicode5.1.0/> section "Update of
11842 Definition of case-ignorable". */
11843 /* Since this predicate is only used for the "Before C" and "After C"
11844 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
11845 This simplifies the evaluation of the regular expressions
11846 \p{cased} (\p{case-ignorable})* C
11848 C (\p{case-ignorable})* \p{cased}
11850 static bool
11851 is_case_ignorable (unsigned int ch)
11853 return (unicode_org_wbp[ch] == WBP_MIDLETTER
11854 || unicode_org_wbp[ch] == WBP_MIDNUMLET
11855 || is_category_Mn (ch)
11856 || is_category_Me (ch)
11857 || is_category_Cf (ch)
11858 || is_category_Lm (ch)
11859 || is_category_Sk (ch))
11860 && !is_cased (ch);
11863 /* ------------------------------------------------------------------------- */
11865 /* Output all case related properties. */
11866 static void
11867 output_casing_properties (const char *version)
11869 #define PROPERTY(FN,P) \
11870 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
11871 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
11872 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
11873 PROPERTY(cased, cased)
11874 PROPERTY(ignorable, case_ignorable)
11875 #undef PROPERTY
11878 /* ========================================================================= */
/* Output the Unicode version.
   Writes to FILENAME a C source file that defines
   _libunistring_unicode_version as (major << 8) | minor, where major and
   minor are the first two decimal components of VERSION (for example
   "15.1.0" yields major 15 and minor 1).
   Terminates the program with exit status 1 if VERSION does not start with
   "<major>.<minor>", if FILENAME cannot be opened for writing, or if a
   write error occurs.  */
static void
output_version (const char *filename, const char *version)
{
  FILE *stream;
  int major;
  int minor;

  /* Parse the version before touching the output file.  If the conversion
     fails, major and minor would be left uninitialized, and printing them
     would be undefined behaviour; better to stop without creating a bogus
     file.  */
  if (sscanf (version, "%d.%d", &major, &minor) != 2)
    {
      fprintf (stderr, "cannot parse Unicode version '%s'\n", version);
      exit (1);
    }

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Supported Unicode version.  */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
           version);
  fprintf (stream, "\n");

  fprintf (stream, "/* Copyright (C) 2024 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  output_library_license (stream, false);
  fprintf (stream, "\n");

  fprintf (stream, "#include <config.h>\n");
  fprintf (stream, "\n");

  fprintf (stream, "/* Specification.  */\n");
  fprintf (stream, "#include \"unimetadata.h\"\n");
  fprintf (stream, "\n");

  fprintf (stream, "const int _libunistring_unicode_version = (%d << 8) | %d;\n",
           major, minor);

  /* fclose flushes the stream, so its result must be checked as well.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
11924 /* ========================================================================= */
11927 main (int argc, char * argv[])
11929 const char *unicodedata_filename;
11930 const char *proplist_filename;
11931 const char *derivedproplist_filename;
11932 const char *emojidata_filename;
11933 const char *arabicshaping_filename;
11934 const char *scripts_filename;
11935 const char *blocks_filename;
11936 const char *proplist30_filename;
11937 const char *bidimirroring_filename;
11938 const char *eastasianwidth_filename;
11939 const char *linebreak_filename;
11940 const char *wordbreakproperty_filename;
11941 const char *graphemebreakproperty_filename;
11942 const char *compositionexclusions_filename;
11943 const char *specialcasing_filename;
11944 const char *casefolding_filename;
11945 const char *version;
11947 if (argc != 18)
11949 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt emoji-data.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt BidiMirroring.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
11950 argv[0]);
11951 exit (1);
11954 unicodedata_filename = argv[1];
11955 proplist_filename = argv[2];
11956 derivedproplist_filename = argv[3];
11957 emojidata_filename = argv[4];
11958 arabicshaping_filename = argv[5];
11959 scripts_filename = argv[6];
11960 blocks_filename = argv[7];
11961 proplist30_filename = argv[8];
11962 bidimirroring_filename = argv[9];
11963 eastasianwidth_filename = argv[10];
11964 linebreak_filename = argv[11];
11965 wordbreakproperty_filename = argv[12];
11966 graphemebreakproperty_filename = argv[13];
11967 compositionexclusions_filename = argv[14];
11968 specialcasing_filename = argv[15];
11969 casefolding_filename = argv[16];
11970 version = argv[17];
11972 fill_attributes (unicodedata_filename);
11973 clear_properties ();
11974 fill_properties (proplist_filename);
11975 fill_properties (derivedproplist_filename);
11976 fill_properties (emojidata_filename);
11977 fill_properties30 (proplist30_filename);
11978 fill_arabicshaping (arabicshaping_filename);
11979 fill_scripts (scripts_filename);
11980 fill_blocks (blocks_filename);
11981 fill_mirror (bidimirroring_filename);
11982 fill_width (eastasianwidth_filename);
11983 fill_org_lbp (linebreak_filename);
11984 fill_org_wbp (wordbreakproperty_filename);
11985 fill_org_gbp (graphemebreakproperty_filename);
11986 fill_composition_exclusions (compositionexclusions_filename);
11987 fill_casing_rules (specialcasing_filename);
11988 fill_casefolding_rules (casefolding_filename);
11989 redistribute_casefolding_rules ();
11990 sort_casing_rules ();
11992 output_categories (version);
11993 output_category ("unictype/categ_of.h", version);
11994 output_combclass ("unictype/combiningclass.h", version);
11995 output_bidi_category ("unictype/bidi_of.h", version);
11996 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
11997 output_decimal_digit ("unictype/decdigit.h", version);
11998 output_digit_test ("../tests/unictype/test-digit.h", version);
11999 output_digit ("unictype/digit.h", version);
12000 output_numeric_test ("../tests/unictype/test-numeric.h", version);
12001 output_numeric ("unictype/numeric.h", version);
12002 output_mirror ("unictype/mirror.h", version);
12003 output_properties (version);
12004 output_indic_conjunct_break_test ("../tests/unictype/test-incb_of.h", version);
12005 output_indic_conjunct_break ("unictype/incb_of.h", version);
12006 output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version);
12007 output_joining_type ("unictype/joiningtype_of.h", version);
12008 output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version);
12009 output_joining_group ("unictype/joininggroup_of.h", version);
12011 output_scripts (version);
12012 output_scripts_byname (version);
12013 output_blocks (version);
12014 output_ident_properties (version);
12015 output_nonspacing_property ("uniwidth/width0.h", version);
12016 output_width2_property ("uniwidth/width2.h", version);
12017 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
12018 output_old_ctype (version);
12020 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
12021 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
12022 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
12023 output_lbrk_rules_as_tables ("unilbrk/lbrktables.c", version);
12025 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
12026 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
12027 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
12029 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
12030 output_gbp_table ("unigbrk/gbrkprop.h", version);
12032 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
12033 debug_output_composition_tables ("uninorm/composition.txt");
12034 output_composition_tables ("uninorm/composition-table.gperf", version);
12036 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
12037 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
12038 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
12039 output_simple_mapping ("unicase/toupper.h", to_upper, version);
12040 output_simple_mapping ("unicase/tolower.h", to_lower, version);
12041 output_simple_mapping ("unicase/totitle.h", to_title, version);
12042 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
12043 output_casing_rules ("unicase/special-casing-table.gperf", version);
12044 output_casing_properties (version);
12046 output_version ("unimetadata/u-version.c", version);
12048 return 0;
12052 * Local Variables:
12053 * coding: utf-8
12054 * compile-command: "\
12055 * gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \\
12056 * ./gen-uni-tables \\
12057 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/UnicodeData.txt \\
12058 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/PropList.txt \\
12059 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/DerivedCoreProperties.txt \\
12060 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/emoji/emoji-data.txt \\
12061 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/ArabicShaping.txt \\
12062 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/Scripts.txt \\
12063 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/Blocks.txt \\
12064 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \\
12065 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/BidiMirroring.txt \\
12066 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/EastAsianWidth.txt \\
12067 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/LineBreak.txt \\
12068 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/auxiliary/WordBreakProperty.txt \\
12069 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/auxiliary/GraphemeBreakProperty.txt \\
12070 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/CompositionExclusions.txt \\
12071 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/SpecialCasing.txt \\
12072 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/CaseFolding.txt \\
12073 * 15.1.0 \\
12074 * && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \\
12075 * && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt \\
12076 * && clisp -C uniname/gen-uninames.lisp \\
12077 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/UnicodeData.txt \\
12078 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/NameAliases.txt \\
12079 * uniname/uninames.h \\
12080 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12081 * echo; \\
12082 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/NameAliases.txt; } \\
12083 * > ../tests/uniname/NameAliases.txt \\
12084 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12085 * echo; \\
12086 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/UnicodeData.txt; } \\
12087 * > ../tests/uniname/UnicodeData.txt \\
12088 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12089 * echo; \\
12090 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/NormalizationTest.txt; } \\
12091 * > ../tests/uninorm/NormalizationTest.txt \\
12092 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12093 * echo; \\
12094 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/auxiliary/GraphemeBreakTest.txt; } \\
12095 * > ../tests/unigbrk/GraphemeBreakTest.txt \\
12096 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12097 * echo; \\
12098 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/15.1.0/ucd/auxiliary/WordBreakTest.txt; } \\
12099 * > ../tests/uniwbrk/WordBreakTest.txt"
12100 * End: