unistr/u{8,16,32}-uctomb: Avoid possible trouble with huge strings.
[gnulib.git] / lib / gen-uni-tables.c
blob75961b41d8f79f38fdae21ba01825d18e13d90e3
1 /* Generate Unicode conforming character classification tables and
2 line break properties tables and word break property tables and
3 decomposition/composition and case mapping tables from a UnicodeData file.
4 Copyright (C) 2000-2002, 2004, 2007-2020 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Usage example:
     $ gen-uni-tables /usr/local/share/www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/PropList.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/DerivedCoreProperties.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/ArabicShaping.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/Scripts.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/Blocks.txt \
                      /usr/local/share/www.unicode.org/Public/3.0-Update1/PropList-3.0.1.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/EastAsianWidth.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/LineBreak.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakProperty.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/CompositionExclusions.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/SpecialCasing.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt \
                      9.0.0
 */
38 #include <assert.h>
39 #include <stdbool.h>
40 #include <stdint.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <time.h>
/* Number of elements of the array A.  A must be a real array, not a pointer.
   The argument is parenthesized so that any array-valued expression works.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
48 /* ========================================================================= */
50 /* Reading UnicodeData.txt. */
51 /* See UCD.html. */
/* This structure represents one line in the UnicodeData.txt file.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};

/* Missing fields are represented with "" for strings, and NONE for
   characters.  */
#define NONE (~(unsigned int)0)

/* The entire contents of the UnicodeData.txt file, indexed by code point.
   An element whose 'name' is NULL has no entry in the file.  */
struct unicode_attribute unicode_attributes [0x110000];
79 /* Stores in unicode_attributes[i] the values from the given fields. */
80 static void
81 fill_attribute (unsigned int i,
82 const char *field1, const char *field2,
83 const char *field3, const char *field4,
84 const char *field5, const char *field6,
85 const char *field7, const char *field8,
86 const char *field9, const char *field10,
87 const char *field11, const char *field12,
88 const char *field13, const char *field14)
90 struct unicode_attribute * uni;
92 if (i >= 0x110000)
94 fprintf (stderr, "index too large\n");
95 exit (1);
97 if (strcmp (field2, "Cs") == 0)
98 /* Surrogates are UTF-16 artifacts, not real characters. Ignore them. */
99 return;
100 uni = &unicode_attributes[i];
101 /* Copy the strings. */
102 uni->name = strdup (field1);
103 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
104 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
105 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
106 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
107 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
108 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
109 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
110 uni->mirrored = (field9[0] == 'Y');
111 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
112 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
113 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
114 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
115 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 160

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM; the delimiter itself is consumed.
   Carriage returns are dropped.
   BUFFER is always NUL-terminated on return, also when the end of the
   stream is reached before DELIM.
   Returns 1 when a field was successfully read, otherwise (end of stream
   or read error before DELIM) 0.
   Exits with status 1 if the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  for (; (c = getc (stream)), (c != EOF && c != delim); )
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  The limit leaves room for the final NUL.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = c;
    }

  /* Terminate unconditionally, so that the caller never sees an
     unterminated buffer, whatever the return value.  */
  *buffer = '\0';

  if (c == EOF)
    return 0;

  return 1;
}
153 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
154 file. */
155 static void
156 fill_attributes (const char *unicodedata_filename)
158 unsigned int i, j;
159 FILE *stream;
160 char field0[FIELDLEN];
161 char field1[FIELDLEN];
162 char field2[FIELDLEN];
163 char field3[FIELDLEN];
164 char field4[FIELDLEN];
165 char field5[FIELDLEN];
166 char field6[FIELDLEN];
167 char field7[FIELDLEN];
168 char field8[FIELDLEN];
169 char field9[FIELDLEN];
170 char field10[FIELDLEN];
171 char field11[FIELDLEN];
172 char field12[FIELDLEN];
173 char field13[FIELDLEN];
174 char field14[FIELDLEN];
175 int lineno = 0;
177 for (i = 0; i < 0x110000; i++)
178 unicode_attributes[i].name = NULL;
180 stream = fopen (unicodedata_filename, "r");
181 if (stream == NULL)
183 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
184 exit (1);
187 for (;;)
189 int n;
191 lineno++;
192 n = getfield (stream, field0, ';');
193 n += getfield (stream, field1, ';');
194 n += getfield (stream, field2, ';');
195 n += getfield (stream, field3, ';');
196 n += getfield (stream, field4, ';');
197 n += getfield (stream, field5, ';');
198 n += getfield (stream, field6, ';');
199 n += getfield (stream, field7, ';');
200 n += getfield (stream, field8, ';');
201 n += getfield (stream, field9, ';');
202 n += getfield (stream, field10, ';');
203 n += getfield (stream, field11, ';');
204 n += getfield (stream, field12, ';');
205 n += getfield (stream, field13, ';');
206 n += getfield (stream, field14, '\n');
207 if (n == 0)
208 break;
209 if (n != 15)
211 fprintf (stderr, "short line in '%s':%d\n",
212 unicodedata_filename, lineno);
213 exit (1);
215 i = strtoul (field0, NULL, 16);
216 if (field1[0] == '<'
217 && strlen (field1) >= 9
218 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
220 /* Deal with a range. */
221 lineno++;
222 n = getfield (stream, field0, ';');
223 n += getfield (stream, field1, ';');
224 n += getfield (stream, field2, ';');
225 n += getfield (stream, field3, ';');
226 n += getfield (stream, field4, ';');
227 n += getfield (stream, field5, ';');
228 n += getfield (stream, field6, ';');
229 n += getfield (stream, field7, ';');
230 n += getfield (stream, field8, ';');
231 n += getfield (stream, field9, ';');
232 n += getfield (stream, field10, ';');
233 n += getfield (stream, field11, ';');
234 n += getfield (stream, field12, ';');
235 n += getfield (stream, field13, ';');
236 n += getfield (stream, field14, '\n');
237 if (n != 15)
239 fprintf (stderr, "missing end range in '%s':%d\n",
240 unicodedata_filename, lineno);
241 exit (1);
243 if (!(field1[0] == '<'
244 && strlen (field1) >= 8
245 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
247 fprintf (stderr, "missing end range in '%s':%d\n",
248 unicodedata_filename, lineno);
249 exit (1);
251 field1[strlen (field1) - 7] = '\0';
252 j = strtoul (field0, NULL, 16);
253 for (; i <= j; i++)
254 fill_attribute (i, field1+1, field2, field3, field4, field5,
255 field6, field7, field8, field9, field10,
256 field11, field12, field13, field14);
258 else
260 /* Single character line */
261 fill_attribute (i, field1, field2, field3, field4, field5,
262 field6, field7, field8, field9, field10,
263 field11, field12, field13, field14);
267 if (ferror (stream) || fclose (stream))
269 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
270 exit (1);
274 /* ========================================================================= */
276 /* General category. */
277 /* See Unicode 3.0 book, section 4.5,
278 UCD.html. */
280 static bool
281 is_category_L (unsigned int ch)
283 return (unicode_attributes[ch].name != NULL
284 && unicode_attributes[ch].category[0] == 'L');
287 static bool
288 is_category_LC (unsigned int ch)
290 /* See PropertyValueAliases.txt. */
291 return (unicode_attributes[ch].name != NULL
292 && unicode_attributes[ch].category[0] == 'L'
293 && (unicode_attributes[ch].category[1] == 'u'
294 || unicode_attributes[ch].category[1] == 'l'
295 || unicode_attributes[ch].category[1] == 't'));
298 static bool
299 is_category_Lu (unsigned int ch)
301 return (unicode_attributes[ch].name != NULL
302 && unicode_attributes[ch].category[0] == 'L'
303 && unicode_attributes[ch].category[1] == 'u');
306 static bool
307 is_category_Ll (unsigned int ch)
309 return (unicode_attributes[ch].name != NULL
310 && unicode_attributes[ch].category[0] == 'L'
311 && unicode_attributes[ch].category[1] == 'l');
314 static bool
315 is_category_Lt (unsigned int ch)
317 return (unicode_attributes[ch].name != NULL
318 && unicode_attributes[ch].category[0] == 'L'
319 && unicode_attributes[ch].category[1] == 't');
322 static bool
323 is_category_Lm (unsigned int ch)
325 return (unicode_attributes[ch].name != NULL
326 && unicode_attributes[ch].category[0] == 'L'
327 && unicode_attributes[ch].category[1] == 'm');
330 static bool
331 is_category_Lo (unsigned int ch)
333 return (unicode_attributes[ch].name != NULL
334 && unicode_attributes[ch].category[0] == 'L'
335 && unicode_attributes[ch].category[1] == 'o');
338 static bool
339 is_category_M (unsigned int ch)
341 return (unicode_attributes[ch].name != NULL
342 && unicode_attributes[ch].category[0] == 'M');
345 static bool
346 is_category_Mn (unsigned int ch)
348 return (unicode_attributes[ch].name != NULL
349 && unicode_attributes[ch].category[0] == 'M'
350 && unicode_attributes[ch].category[1] == 'n');
353 static bool
354 is_category_Mc (unsigned int ch)
356 return (unicode_attributes[ch].name != NULL
357 && unicode_attributes[ch].category[0] == 'M'
358 && unicode_attributes[ch].category[1] == 'c');
361 static bool
362 is_category_Me (unsigned int ch)
364 return (unicode_attributes[ch].name != NULL
365 && unicode_attributes[ch].category[0] == 'M'
366 && unicode_attributes[ch].category[1] == 'e');
369 static bool
370 is_category_N (unsigned int ch)
372 return (unicode_attributes[ch].name != NULL
373 && unicode_attributes[ch].category[0] == 'N');
376 static bool
377 is_category_Nd (unsigned int ch)
379 return (unicode_attributes[ch].name != NULL
380 && unicode_attributes[ch].category[0] == 'N'
381 && unicode_attributes[ch].category[1] == 'd');
384 static bool
385 is_category_Nl (unsigned int ch)
387 return (unicode_attributes[ch].name != NULL
388 && unicode_attributes[ch].category[0] == 'N'
389 && unicode_attributes[ch].category[1] == 'l');
392 static bool
393 is_category_No (unsigned int ch)
395 return (unicode_attributes[ch].name != NULL
396 && unicode_attributes[ch].category[0] == 'N'
397 && unicode_attributes[ch].category[1] == 'o');
400 static bool
401 is_category_P (unsigned int ch)
403 return (unicode_attributes[ch].name != NULL
404 && unicode_attributes[ch].category[0] == 'P');
407 static bool
408 is_category_Pc (unsigned int ch)
410 return (unicode_attributes[ch].name != NULL
411 && unicode_attributes[ch].category[0] == 'P'
412 && unicode_attributes[ch].category[1] == 'c');
415 static bool
416 is_category_Pd (unsigned int ch)
418 return (unicode_attributes[ch].name != NULL
419 && unicode_attributes[ch].category[0] == 'P'
420 && unicode_attributes[ch].category[1] == 'd');
423 static bool
424 is_category_Ps (unsigned int ch)
426 return (unicode_attributes[ch].name != NULL
427 && unicode_attributes[ch].category[0] == 'P'
428 && unicode_attributes[ch].category[1] == 's');
431 static bool
432 is_category_Pe (unsigned int ch)
434 return (unicode_attributes[ch].name != NULL
435 && unicode_attributes[ch].category[0] == 'P'
436 && unicode_attributes[ch].category[1] == 'e');
439 static bool
440 is_category_Pi (unsigned int ch)
442 return (unicode_attributes[ch].name != NULL
443 && unicode_attributes[ch].category[0] == 'P'
444 && unicode_attributes[ch].category[1] == 'i');
447 static bool
448 is_category_Pf (unsigned int ch)
450 return (unicode_attributes[ch].name != NULL
451 && unicode_attributes[ch].category[0] == 'P'
452 && unicode_attributes[ch].category[1] == 'f');
455 static bool
456 is_category_Po (unsigned int ch)
458 return (unicode_attributes[ch].name != NULL
459 && unicode_attributes[ch].category[0] == 'P'
460 && unicode_attributes[ch].category[1] == 'o');
463 static bool
464 is_category_S (unsigned int ch)
466 return (unicode_attributes[ch].name != NULL
467 && unicode_attributes[ch].category[0] == 'S');
470 static bool
471 is_category_Sm (unsigned int ch)
473 return (unicode_attributes[ch].name != NULL
474 && unicode_attributes[ch].category[0] == 'S'
475 && unicode_attributes[ch].category[1] == 'm');
478 static bool
479 is_category_Sc (unsigned int ch)
481 return (unicode_attributes[ch].name != NULL
482 && unicode_attributes[ch].category[0] == 'S'
483 && unicode_attributes[ch].category[1] == 'c');
486 static bool
487 is_category_Sk (unsigned int ch)
489 return (unicode_attributes[ch].name != NULL
490 && unicode_attributes[ch].category[0] == 'S'
491 && unicode_attributes[ch].category[1] == 'k');
494 static bool
495 is_category_So (unsigned int ch)
497 return (unicode_attributes[ch].name != NULL
498 && unicode_attributes[ch].category[0] == 'S'
499 && unicode_attributes[ch].category[1] == 'o');
502 static bool
503 is_category_Z (unsigned int ch)
505 return (unicode_attributes[ch].name != NULL
506 && unicode_attributes[ch].category[0] == 'Z');
509 static bool
510 is_category_Zs (unsigned int ch)
512 return (unicode_attributes[ch].name != NULL
513 && unicode_attributes[ch].category[0] == 'Z'
514 && unicode_attributes[ch].category[1] == 's');
517 static bool
518 is_category_Zl (unsigned int ch)
520 return (unicode_attributes[ch].name != NULL
521 && unicode_attributes[ch].category[0] == 'Z'
522 && unicode_attributes[ch].category[1] == 'l');
525 static bool
526 is_category_Zp (unsigned int ch)
528 return (unicode_attributes[ch].name != NULL
529 && unicode_attributes[ch].category[0] == 'Z'
530 && unicode_attributes[ch].category[1] == 'p');
533 static bool
534 is_category_C (unsigned int ch)
536 return (unicode_attributes[ch].name == NULL
537 || unicode_attributes[ch].category[0] == 'C');
540 static bool
541 is_category_Cc (unsigned int ch)
543 return (unicode_attributes[ch].name != NULL
544 && unicode_attributes[ch].category[0] == 'C'
545 && unicode_attributes[ch].category[1] == 'c');
548 static bool
549 is_category_Cf (unsigned int ch)
551 return (unicode_attributes[ch].name != NULL
552 && unicode_attributes[ch].category[0] == 'C'
553 && unicode_attributes[ch].category[1] == 'f');
/* True if CH lies in the range 0xD800..0xDFFF.  This is decided from the
   code point alone, without consulting unicode_attributes[].  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
562 static bool
563 is_category_Co (unsigned int ch)
565 return (unicode_attributes[ch].name != NULL
566 && unicode_attributes[ch].category[0] == 'C'
567 && unicode_attributes[ch].category[1] == 'o');
570 static bool
571 is_category_Cn (unsigned int ch)
573 return (unicode_attributes[ch].name == NULL
574 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points
   below 0x110000 for which PREDICATE is true: "0xXXXX" for a single code
   point, "0xXXXX..0xYYYY" for a longer run.
   Exits with status 1 if the file cannot be opened or written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

#if 0 /* This yields huge text output.  */
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        fprintf (stream, "0x%04X\n", ch);
      }
#else
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate stays true.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }
#endif

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a C source fragment that includes
   "test-predicate-part1.h", lists every maximal run of code points below
   0x110000 for which PREDICATE is true as a "{ first, last }" pair, defines
   PREDICATE(c) as EXPRESSION, and includes "test-predicate-part2.h".
   Exits with status 1 if the file cannot be opened or written.

   NOTE(review): the text this block was recovered from had all indentation
   stripped, so runs of spaces inside the string literals below may have
   been collapsed as well -- compare with the pristine file before relying
   on the exact layout of the generated output.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character type functions.\n");
  fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-predicate-part1.h\"\n");
  fprintf (stream, "\n");

  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate stays true.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        /* The separator is written before each entry except the first,
           so that the last entry is not followed by a comma.  */
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
685 /* Construction of sparse 3-level tables. */
686 #define TABLE predicate_table
687 #define xmalloc malloc
688 #define xrealloc realloc
689 #include "3levelbit.h"
691 /* Output a boolean property in a three-level bitmap. */
692 static void
693 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
695 FILE *stream;
696 unsigned int ch, i;
697 struct predicate_table t;
698 unsigned int level1_offset, level2_offset, level3_offset;
700 stream = fopen (filename, "w");
701 if (stream == NULL)
703 fprintf (stderr, "cannot open '%s' for writing\n", filename);
704 exit (1);
707 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
708 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
709 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
710 version);
712 t.p = 4; /* or: 5 */
713 t.q = 7; /* or: 6 */
714 predicate_table_init (&t);
716 for (ch = 0; ch < 0x110000; ch++)
717 if (predicate (ch))
718 predicate_table_add (&t, ch);
720 predicate_table_finalize (&t);
722 /* Offsets in t.result, in memory of this process. */
723 level1_offset =
724 5 * sizeof (uint32_t);
725 level2_offset =
726 5 * sizeof (uint32_t)
727 + t.level1_size * sizeof (uint32_t);
728 level3_offset =
729 5 * sizeof (uint32_t)
730 + t.level1_size * sizeof (uint32_t)
731 + (t.level2_size << t.q) * sizeof (uint32_t);
733 for (i = 0; i < 5; i++)
734 if (i != 1)
735 fprintf (stream, "#define header_%d %d\n", i,
736 ((uint32_t *) t.result)[i]);
738 fprintf (stream, "static const\n");
739 fprintf (stream, "struct\n");
740 fprintf (stream, " {\n");
741 fprintf (stream, " int header[1];\n");
742 fprintf (stream, " int level1[%zu];\n", t.level1_size);
743 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
744 fprintf (stream, " unsigned int level3[%zu << %d];\n", t.level3_size, t.p);
745 fprintf (stream, " }\n");
746 fprintf (stream, "%s =\n", name);
747 fprintf (stream, "{\n");
748 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
749 fprintf (stream, " {");
750 if (t.level1_size > 1)
751 fprintf (stream, "\n ");
752 for (i = 0; i < t.level1_size; i++)
754 uint32_t offset;
755 if (i > 0 && (i % 1) == 0)
756 fprintf (stream, "\n ");
757 offset = ((uint32_t *) (t.result + level1_offset))[i];
758 if (offset == 0)
759 fprintf (stream, " %5d", -1);
760 else
761 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
762 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
763 if (i+1 < t.level1_size)
764 fprintf (stream, ",");
766 if (t.level1_size > 1)
767 fprintf (stream, "\n ");
768 fprintf (stream, " },\n");
769 fprintf (stream, " {");
770 if (t.level2_size << t.q > 1)
771 fprintf (stream, "\n ");
772 for (i = 0; i < t.level2_size << t.q; i++)
774 uint32_t offset;
775 if (i > 0 && (i % 1) == 0)
776 fprintf (stream, "\n ");
777 offset = ((uint32_t *) (t.result + level2_offset))[i];
778 if (offset == 0)
779 fprintf (stream, " %5d", -1);
780 else
781 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
782 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
783 if (i+1 < t.level2_size << t.q)
784 fprintf (stream, ",");
786 if (t.level2_size << t.q > 1)
787 fprintf (stream, "\n ");
788 fprintf (stream, " },\n");
789 fprintf (stream, " {");
790 if (t.level3_size << t.p > 4)
791 fprintf (stream, "\n ");
792 for (i = 0; i < t.level3_size << t.p; i++)
794 if (i > 0 && (i % 4) == 0)
795 fprintf (stream, "\n ");
796 fprintf (stream, " 0x%08XU",
797 ((uint32_t *) (t.result + level3_offset))[i]);
798 if (i+1 < t.level3_size << t.p)
799 fprintf (stream, ",");
801 if (t.level3_size << t.p > 4)
802 fprintf (stream, "\n ");
803 fprintf (stream, " }\n");
804 fprintf (stream, "};\n");
806 if (ferror (stream) || fclose (stream))
808 fprintf (stderr, "error writing to '%s'\n", filename);
809 exit (1);
813 /* Output all categories. */
814 static void
815 output_categories (const char *version)
817 #define CATEGORY(C) \
818 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
819 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
820 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
821 CATEGORY (L)
822 CATEGORY (LC)
823 CATEGORY (Lu)
824 CATEGORY (Ll)
825 CATEGORY (Lt)
826 CATEGORY (Lm)
827 CATEGORY (Lo)
828 CATEGORY (M)
829 CATEGORY (Mn)
830 CATEGORY (Mc)
831 CATEGORY (Me)
832 CATEGORY (N)
833 CATEGORY (Nd)
834 CATEGORY (Nl)
835 CATEGORY (No)
836 CATEGORY (P)
837 CATEGORY (Pc)
838 CATEGORY (Pd)
839 CATEGORY (Ps)
840 CATEGORY (Pe)
841 CATEGORY (Pi)
842 CATEGORY (Pf)
843 CATEGORY (Po)
844 CATEGORY (S)
845 CATEGORY (Sm)
846 CATEGORY (Sc)
847 CATEGORY (Sk)
848 CATEGORY (So)
849 CATEGORY (Z)
850 CATEGORY (Zs)
851 CATEGORY (Zl)
852 CATEGORY (Zp)
853 CATEGORY (C)
854 CATEGORY (Cc)
855 CATEGORY (Cf)
856 CATEGORY (Cs)
857 CATEGORY (Co)
858 CATEGORY (Cn)
859 #undef CATEGORY
/* Bit masks of the general categories.  Every two-letter category occupies
   one bit; a one-letter category (and LC) is the union of the bits of its
   subcategories.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_LC = 0x00000007,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};
904 static int
905 general_category_byname (const char *category_name)
907 if (category_name[0] != '\0'
908 && (category_name[1] == '\0' || category_name[2] == '\0'))
909 switch (category_name[0])
911 case 'L':
912 switch (category_name[1])
914 case '\0': return UC_CATEGORY_MASK_L;
915 case 'C': return UC_CATEGORY_MASK_LC;
916 case 'u': return UC_CATEGORY_MASK_Lu;
917 case 'l': return UC_CATEGORY_MASK_Ll;
918 case 't': return UC_CATEGORY_MASK_Lt;
919 case 'm': return UC_CATEGORY_MASK_Lm;
920 case 'o': return UC_CATEGORY_MASK_Lo;
922 break;
923 case 'M':
924 switch (category_name[1])
926 case '\0': return UC_CATEGORY_MASK_M;
927 case 'n': return UC_CATEGORY_MASK_Mn;
928 case 'c': return UC_CATEGORY_MASK_Mc;
929 case 'e': return UC_CATEGORY_MASK_Me;
931 break;
932 case 'N':
933 switch (category_name[1])
935 case '\0': return UC_CATEGORY_MASK_N;
936 case 'd': return UC_CATEGORY_MASK_Nd;
937 case 'l': return UC_CATEGORY_MASK_Nl;
938 case 'o': return UC_CATEGORY_MASK_No;
940 break;
941 case 'P':
942 switch (category_name[1])
944 case '\0': return UC_CATEGORY_MASK_P;
945 case 'c': return UC_CATEGORY_MASK_Pc;
946 case 'd': return UC_CATEGORY_MASK_Pd;
947 case 's': return UC_CATEGORY_MASK_Ps;
948 case 'e': return UC_CATEGORY_MASK_Pe;
949 case 'i': return UC_CATEGORY_MASK_Pi;
950 case 'f': return UC_CATEGORY_MASK_Pf;
951 case 'o': return UC_CATEGORY_MASK_Po;
953 break;
954 case 'S':
955 switch (category_name[1])
957 case '\0': return UC_CATEGORY_MASK_S;
958 case 'm': return UC_CATEGORY_MASK_Sm;
959 case 'c': return UC_CATEGORY_MASK_Sc;
960 case 'k': return UC_CATEGORY_MASK_Sk;
961 case 'o': return UC_CATEGORY_MASK_So;
963 break;
964 case 'Z':
965 switch (category_name[1])
967 case '\0': return UC_CATEGORY_MASK_Z;
968 case 's': return UC_CATEGORY_MASK_Zs;
969 case 'l': return UC_CATEGORY_MASK_Zl;
970 case 'p': return UC_CATEGORY_MASK_Zp;
972 break;
973 case 'C':
974 switch (category_name[1])
976 case '\0': return UC_CATEGORY_MASK_C;
977 case 'c': return UC_CATEGORY_MASK_Cc;
978 case 'f': return UC_CATEGORY_MASK_Cf;
979 case 's': return UC_CATEGORY_MASK_Cs;
980 case 'o': return UC_CATEGORY_MASK_Co;
981 case 'n': return UC_CATEGORY_MASK_Cn;
983 break;
985 /* Invalid category name. */
986 abort ();
989 /* Construction of sparse 3-level tables. */
990 #define TABLE category_table
991 #define ELEMENT uint8_t
992 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
993 #define xmalloc malloc
994 #define xrealloc realloc
995 #include "3level.h"
997 /* Output the per-character category table. */
998 static void
999 output_category (const char *filename, const char *version)
1001 FILE *stream;
1002 unsigned int ch, i;
1003 struct category_table t;
1004 unsigned int level1_offset, level2_offset, level3_offset;
1005 uint16_t *level3_packed;
1007 stream = fopen (filename, "w");
1008 if (stream == NULL)
1010 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1011 exit (1);
1014 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1015 fprintf (stream, "/* Categories of Unicode characters. */\n");
1016 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1017 version);
1019 t.p = 7;
1020 t.q = 9;
1021 category_table_init (&t);
1023 for (ch = 0; ch < 0x110000; ch++)
1025 int value;
1026 unsigned int log2_value;
1028 if (is_category_Cs (ch))
1029 value = UC_CATEGORY_MASK_Cs;
1030 else if (unicode_attributes[ch].name != NULL)
1031 value = general_category_byname (unicode_attributes[ch].category);
1032 else
1033 continue;
1035 /* Now value should contain exactly one bit. */
1036 assert (value != 0 && (value & (value - 1)) == 0);
1038 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1040 assert (log2_value <= 0x1f);
1042 category_table_add (&t, ch, log2_value);
1045 category_table_finalize (&t);
1047 /* Offsets in t.result, in memory of this process. */
1048 level1_offset =
1049 5 * sizeof (uint32_t);
1050 level2_offset =
1051 5 * sizeof (uint32_t)
1052 + t.level1_size * sizeof (uint32_t);
1053 level3_offset =
1054 5 * sizeof (uint32_t)
1055 + t.level1_size * sizeof (uint32_t)
1056 + (t.level2_size << t.q) * sizeof (uint32_t);
1058 for (i = 0; i < 5; i++)
1059 fprintf (stream, "#define category_header_%d %d\n", i,
1060 ((uint32_t *) t.result)[i]);
1061 fprintf (stream, "static const\n");
1062 fprintf (stream, "struct\n");
1063 fprintf (stream, " {\n");
1064 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1065 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1066 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1067 (1 << t.p) * 5 / 16);
1068 fprintf (stream, " }\n");
1069 fprintf (stream, "u_category =\n");
1070 fprintf (stream, "{\n");
1071 fprintf (stream, " {");
1072 if (t.level1_size > 8)
1073 fprintf (stream, "\n ");
1074 for (i = 0; i < t.level1_size; i++)
1076 uint32_t offset;
1077 if (i > 0 && (i % 8) == 0)
1078 fprintf (stream, "\n ");
1079 offset = ((uint32_t *) (t.result + level1_offset))[i];
1080 if (offset == 0)
1081 fprintf (stream, " %5d", -1);
1082 else
1083 fprintf (stream, " %5zu",
1084 (offset - level2_offset) / sizeof (uint32_t));
1085 if (i+1 < t.level1_size)
1086 fprintf (stream, ",");
1088 if (t.level1_size > 8)
1089 fprintf (stream, "\n ");
1090 fprintf (stream, " },\n");
1091 fprintf (stream, " {");
1092 if (t.level2_size << t.q > 8)
1093 fprintf (stream, "\n ");
1094 for (i = 0; i < t.level2_size << t.q; i++)
1096 uint32_t offset;
1097 if (i > 0 && (i % 8) == 0)
1098 fprintf (stream, "\n ");
1099 offset = ((uint32_t *) (t.result + level2_offset))[i];
1100 if (offset == 0)
1101 fprintf (stream, " %5d", -1);
1102 else
1103 fprintf (stream, " %5zu",
1104 (offset - level3_offset) / sizeof (uint8_t));
1105 if (i+1 < t.level2_size << t.q)
1106 fprintf (stream, ",");
1108 if (t.level2_size << t.q > 8)
1109 fprintf (stream, "\n ");
1110 fprintf (stream, " },\n");
1111 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1112 not 32-bit units, in order to make the lookup function easier. */
1113 level3_packed =
1114 (uint16_t *)
1115 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1116 for (i = 0; i < t.level3_size << t.p; i++)
1118 unsigned int j = (i * 5) / 16;
1119 unsigned int k = (i * 5) % 16;
1120 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1121 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1122 level3_packed[j] = value & 0xffff;
1123 level3_packed[j+1] = value >> 16;
1125 fprintf (stream, " {");
1126 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1127 fprintf (stream, "\n ");
1128 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1130 if (i > 0 && (i % 8) == 0)
1131 fprintf (stream, "\n ");
1132 fprintf (stream, " 0x%04x", level3_packed[i]);
1133 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1134 fprintf (stream, ",");
1136 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1137 fprintf (stream, "\n ");
1138 fprintf (stream, " }\n");
1139 free (level3_packed);
1140 fprintf (stream, "};\n");
1142 if (ferror (stream) || fclose (stream))
1144 fprintf (stderr, "error writing to '%s'\n", filename);
1145 exit (1);
1149 /* ========================================================================= */
1151 /* Canonical combining class. */
1152 /* See Unicode 3.0 book, section 4.2,
1153 UCD.html. */
1155 /* Construction of sparse 3-level tables.
   The macros below are the parameters of the "3level.h" template, which is
   included right after them.  TABLE names the table type; the code that
   follows uses it as 'struct combclass_table' together with
   combclass_table_init, combclass_table_add and combclass_table_finalize.
   ELEMENT is the type stored per character (here one byte, the combining
   class 0..255).  DEFAULT is presumably the value of characters that are
   never added -- TODO confirm against 3level.h.
   xmalloc and xrealloc are mapped to plain malloc and realloc. */
1156 #define TABLE combclass_table
1157 #define ELEMENT uint8_t
1158 #define DEFAULT 0
1159 #define xmalloc malloc
1160 #define xrealloc realloc
1161 #include "3level.h"
1163 /* Output the per-character combining class table. */
1164 static void
1165 output_combclass (const char *filename, const char *version)
1167 FILE *stream;
1168 unsigned int ch, i;
1169 struct combclass_table t;
1170 unsigned int level1_offset, level2_offset, level3_offset;
1172 stream = fopen (filename, "w");
1173 if (stream == NULL)
1175 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1176 exit (1);
1179 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1180 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1181 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1182 version);
1184 t.p = 7;
1185 t.q = 9;
1186 combclass_table_init (&t);
1188 for (ch = 0; ch < 0x110000; ch++)
1189 if (unicode_attributes[ch].name != NULL)
1191 int value = atoi (unicode_attributes[ch].combining);
1192 assert (value >= 0 && value <= 255);
1193 combclass_table_add (&t, ch, value);
1196 combclass_table_finalize (&t);
1198 /* Offsets in t.result, in memory of this process. */
1199 level1_offset =
1200 5 * sizeof (uint32_t);
1201 level2_offset =
1202 5 * sizeof (uint32_t)
1203 + t.level1_size * sizeof (uint32_t);
1204 level3_offset =
1205 5 * sizeof (uint32_t)
1206 + t.level1_size * sizeof (uint32_t)
1207 + (t.level2_size << t.q) * sizeof (uint32_t);
1209 for (i = 0; i < 5; i++)
1210 fprintf (stream, "#define combclass_header_%d %d\n", i,
1211 ((uint32_t *) t.result)[i]);
1212 fprintf (stream, "static const\n");
1213 fprintf (stream, "struct\n");
1214 fprintf (stream, " {\n");
1215 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1216 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1217 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1218 fprintf (stream, " }\n");
1219 fprintf (stream, "u_combclass =\n");
1220 fprintf (stream, "{\n");
1221 fprintf (stream, " {");
1222 if (t.level1_size > 8)
1223 fprintf (stream, "\n ");
1224 for (i = 0; i < t.level1_size; i++)
1226 uint32_t offset;
1227 if (i > 0 && (i % 8) == 0)
1228 fprintf (stream, "\n ");
1229 offset = ((uint32_t *) (t.result + level1_offset))[i];
1230 if (offset == 0)
1231 fprintf (stream, " %5d", -1);
1232 else
1233 fprintf (stream, " %5zu",
1234 (offset - level2_offset) / sizeof (uint32_t));
1235 if (i+1 < t.level1_size)
1236 fprintf (stream, ",");
1238 if (t.level1_size > 8)
1239 fprintf (stream, "\n ");
1240 fprintf (stream, " },\n");
1241 fprintf (stream, " {");
1242 if (t.level2_size << t.q > 8)
1243 fprintf (stream, "\n ");
1244 for (i = 0; i < t.level2_size << t.q; i++)
1246 uint32_t offset;
1247 if (i > 0 && (i % 8) == 0)
1248 fprintf (stream, "\n ");
1249 offset = ((uint32_t *) (t.result + level2_offset))[i];
1250 if (offset == 0)
1251 fprintf (stream, " %5d", -1);
1252 else
1253 fprintf (stream, " %5zu",
1254 (offset - level3_offset) / sizeof (uint8_t));
1255 if (i+1 < t.level2_size << t.q)
1256 fprintf (stream, ",");
1258 if (t.level2_size << t.q > 8)
1259 fprintf (stream, "\n ");
1260 fprintf (stream, " },\n");
1261 fprintf (stream, " {");
1262 if (t.level3_size << t.p > 8)
1263 fprintf (stream, "\n ");
1264 for (i = 0; i < t.level3_size << t.p; i++)
1266 if (i > 0 && (i % 8) == 0)
1267 fprintf (stream, "\n ");
1268 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1269 if (i+1 < t.level3_size << t.p)
1270 fprintf (stream, ",");
1272 if (t.level3_size << t.p > 8)
1273 fprintf (stream, "\n ");
1274 fprintf (stream, " }\n");
1275 fprintf (stream, "};\n");
1277 if (ferror (stream) || fclose (stream))
1279 fprintf (stderr, "error writing to '%s'\n", filename);
1280 exit (1);
1284 /* ========================================================================= */
1286 /* Bidirectional category. */
1287 /* See Unicode 3.0 book, section 4.3,
1288 UCD.html. */
/* Bidi classes.  The values are consecutive, starting at 0, in the order
   listed here. */
enum
{
  UC_BIDI_L   =  0, /* Left-to-Right */
  UC_BIDI_LRE =  1, /* Left-to-Right Embedding */
  UC_BIDI_LRO =  2, /* Left-to-Right Override */
  UC_BIDI_R   =  3, /* Right-to-Left */
  UC_BIDI_AL  =  4, /* Right-to-Left Arabic */
  UC_BIDI_RLE =  5, /* Right-to-Left Embedding */
  UC_BIDI_RLO =  6, /* Right-to-Left Override */
  UC_BIDI_PDF =  7, /* Pop Directional Format */
  UC_BIDI_EN  =  8, /* European Number */
  UC_BIDI_ES  =  9, /* European Number Separator */
  UC_BIDI_ET  = 10, /* European Number Terminator */
  UC_BIDI_AN  = 11, /* Arabic Number */
  UC_BIDI_CS  = 12, /* Common Number Separator */
  UC_BIDI_NSM = 13, /* Non-Spacing Mark */
  UC_BIDI_BN  = 14, /* Boundary Neutral */
  UC_BIDI_B   = 15, /* Paragraph Separator */
  UC_BIDI_S   = 16, /* Segment Separator */
  UC_BIDI_WS  = 17, /* Whitespace */
  UC_BIDI_ON  = 18, /* Other Neutral */
  UC_BIDI_LRI = 19, /* Left-to-Right Isolate */
  UC_BIDI_RLI = 20, /* Right-to-Left Isolate */
  UC_BIDI_FSI = 21, /* First Strong Isolate */
  UC_BIDI_PDI = 22  /* Pop Directional Isolate */
};
1317 static int
1318 bidi_category_byname (const char *category_name)
1320 switch (category_name[0])
1322 case 'A':
1323 switch (category_name[1])
1325 case 'L':
1326 if (category_name[2] == '\0')
1327 return UC_BIDI_AL;
1328 break;
1329 case 'N':
1330 if (category_name[2] == '\0')
1331 return UC_BIDI_AN;
1332 break;
1334 break;
1335 case 'B':
1336 switch (category_name[1])
1338 case '\0':
1339 return UC_BIDI_B;
1340 case 'N':
1341 if (category_name[2] == '\0')
1342 return UC_BIDI_BN;
1343 break;
1345 break;
1346 case 'C':
1347 switch (category_name[1])
1349 case 'S':
1350 if (category_name[2] == '\0')
1351 return UC_BIDI_CS;
1352 break;
1354 break;
1355 case 'E':
1356 switch (category_name[1])
1358 case 'N':
1359 if (category_name[2] == '\0')
1360 return UC_BIDI_EN;
1361 break;
1362 case 'S':
1363 if (category_name[2] == '\0')
1364 return UC_BIDI_ES;
1365 break;
1366 case 'T':
1367 if (category_name[2] == '\0')
1368 return UC_BIDI_ET;
1369 break;
1371 break;
1372 case 'F':
1373 switch (category_name[1])
1375 case 'S':
1376 switch (category_name[2])
1378 case 'I':
1379 if (category_name[3] == '\0')
1380 return UC_BIDI_FSI;
1381 break;
1384 break;
1385 case 'L':
1386 switch (category_name[1])
1388 case '\0':
1389 return UC_BIDI_L;
1390 case 'R':
1391 switch (category_name[2])
1393 case 'E':
1394 if (category_name[3] == '\0')
1395 return UC_BIDI_LRE;
1396 break;
1397 case 'O':
1398 if (category_name[3] == '\0')
1399 return UC_BIDI_LRO;
1400 break;
1401 case 'I':
1402 if (category_name[3] == '\0')
1403 return UC_BIDI_LRI;
1404 break;
1406 break;
1408 break;
1409 case 'N':
1410 switch (category_name[1])
1412 case 'S':
1413 switch (category_name[2])
1415 case 'M':
1416 if (category_name[3] == '\0')
1417 return UC_BIDI_NSM;
1418 break;
1420 break;
1422 break;
1423 case 'O':
1424 switch (category_name[1])
1426 case 'N':
1427 if (category_name[2] == '\0')
1428 return UC_BIDI_ON;
1429 break;
1431 break;
1432 case 'P':
1433 switch (category_name[1])
1435 case 'D':
1436 switch (category_name[2])
1438 case 'F':
1439 if (category_name[3] == '\0')
1440 return UC_BIDI_PDF;
1441 break;
1442 case 'I':
1443 if (category_name[3] == '\0')
1444 return UC_BIDI_PDI;
1445 break;
1447 break;
1449 break;
1450 case 'R':
1451 switch (category_name[1])
1453 case '\0':
1454 return UC_BIDI_R;
1455 case 'L':
1456 switch (category_name[2])
1458 case 'E':
1459 if (category_name[3] == '\0')
1460 return UC_BIDI_RLE;
1461 break;
1462 case 'O':
1463 if (category_name[3] == '\0')
1464 return UC_BIDI_RLO;
1465 break;
1466 case 'I':
1467 if (category_name[3] == '\0')
1468 return UC_BIDI_RLI;
1469 break;
1471 break;
1473 break;
1474 case 'S':
1475 if (category_name[1] == '\0')
1476 return UC_BIDI_S;
1477 break;
1478 case 'W':
1479 switch (category_name[1])
1481 case 'S':
1482 if (category_name[2] == '\0')
1483 return UC_BIDI_WS;
1484 break;
1486 break;
1488 /* Invalid bidi category name. */
1489 abort ();
1492 static int
1493 get_bidi_category (unsigned int ch)
1495 if (unicode_attributes[ch].name != NULL)
1496 return bidi_category_byname (unicode_attributes[ch].bidi);
1497 else
1499 /* The bidi category of unassigned characters depends on the range.
1500 See UTR #9 and DerivedBidiClass.txt. */
1501 if ((ch >= 0x0590 && ch <= 0x05FF)
1502 || (ch >= 0x07FB && ch <= 0x08FF)
1503 || (ch >= 0xFB37 && ch <= 0xFB45)
1504 || (ch >= 0x10800 && ch <= 0x10FFF))
1505 return UC_BIDI_R;
1506 else if ((ch >= 0x0600 && ch <= 0x07BF)
1507 || (ch >= 0x2064 && ch <= 0x2069)
1508 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1509 || (ch >= 0xFDFE && ch <= 0xFEFE))
1510 return UC_BIDI_AL;
1511 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1512 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1513 || (ch & 0xFFFF) == 0xFFFE
1514 || (ch & 0xFFFF) == 0xFFFF
1515 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1516 return UC_BIDI_BN;
1517 else
1518 return UC_BIDI_L;
1522 /* Construction of sparse 3-level tables.
   The macros below are the parameters of the "3level.h" template, which is
   included right after them.  TABLE names the table type; the code that
   follows uses it as 'struct bidi_category_table' together with
   bidi_category_table_init, bidi_category_table_add and
   bidi_category_table_finalize.  ELEMENT is the type stored per character
   (one byte holding a UC_BIDI_* value).  DEFAULT is UC_BIDI_L, presumably
   the value of characters that are never added -- TODO confirm against
   3level.h.  xmalloc and xrealloc are mapped to plain malloc and realloc. */
1523 #define TABLE bidi_category_table
1524 #define ELEMENT uint8_t
1525 #define DEFAULT UC_BIDI_L
1526 #define xmalloc malloc
1527 #define xrealloc realloc
1528 #include "3level.h"
1530 /* Output the per-character bidi category table. */
1531 static void
1532 output_bidi_category (const char *filename, const char *version)
1534 FILE *stream;
1535 unsigned int ch, i;
1536 struct bidi_category_table t;
1537 unsigned int level1_offset, level2_offset, level3_offset;
1538 uint16_t *level3_packed;
1540 stream = fopen (filename, "w");
1541 if (stream == NULL)
1543 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1544 exit (1);
1547 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1548 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1549 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1550 version);
1552 t.p = 7;
1553 t.q = 9;
1554 bidi_category_table_init (&t);
1556 for (ch = 0; ch < 0x110000; ch++)
1558 int value = get_bidi_category (ch);
1560 assert (value <= 0x1f);
1562 bidi_category_table_add (&t, ch, value);
1565 bidi_category_table_finalize (&t);
1567 /* Offsets in t.result, in memory of this process. */
1568 level1_offset =
1569 5 * sizeof (uint32_t);
1570 level2_offset =
1571 5 * sizeof (uint32_t)
1572 + t.level1_size * sizeof (uint32_t);
1573 level3_offset =
1574 5 * sizeof (uint32_t)
1575 + t.level1_size * sizeof (uint32_t)
1576 + (t.level2_size << t.q) * sizeof (uint32_t);
1578 for (i = 0; i < 5; i++)
1579 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1580 ((uint32_t *) t.result)[i]);
1581 fprintf (stream, "static const\n");
1582 fprintf (stream, "struct\n");
1583 fprintf (stream, " {\n");
1584 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1585 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1586 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1587 (1 << t.p) * 5 / 16);
1588 fprintf (stream, " }\n");
1589 fprintf (stream, "u_bidi_category =\n");
1590 fprintf (stream, "{\n");
1591 fprintf (stream, " {");
1592 if (t.level1_size > 8)
1593 fprintf (stream, "\n ");
1594 for (i = 0; i < t.level1_size; i++)
1596 uint32_t offset;
1597 if (i > 0 && (i % 8) == 0)
1598 fprintf (stream, "\n ");
1599 offset = ((uint32_t *) (t.result + level1_offset))[i];
1600 if (offset == 0)
1601 fprintf (stream, " %5d", -1);
1602 else
1603 fprintf (stream, " %5zu",
1604 (offset - level2_offset) / sizeof (uint32_t));
1605 if (i+1 < t.level1_size)
1606 fprintf (stream, ",");
1608 if (t.level1_size > 8)
1609 fprintf (stream, "\n ");
1610 fprintf (stream, " },\n");
1611 fprintf (stream, " {");
1612 if (t.level2_size << t.q > 8)
1613 fprintf (stream, "\n ");
1614 for (i = 0; i < t.level2_size << t.q; i++)
1616 uint32_t offset;
1617 if (i > 0 && (i % 8) == 0)
1618 fprintf (stream, "\n ");
1619 offset = ((uint32_t *) (t.result + level2_offset))[i];
1620 if (offset == 0)
1621 fprintf (stream, " %5d", -1);
1622 else
1623 fprintf (stream, " %5zu",
1624 (offset - level3_offset) / sizeof (uint8_t));
1625 if (i+1 < t.level2_size << t.q)
1626 fprintf (stream, ",");
1628 if (t.level2_size << t.q > 8)
1629 fprintf (stream, "\n ");
1630 fprintf (stream, " },\n");
1631 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1632 not 32-bit units, in order to make the lookup function easier. */
1633 level3_packed =
1634 (uint16_t *)
1635 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1636 for (i = 0; i < t.level3_size << t.p; i++)
1638 unsigned int j = (i * 5) / 16;
1639 unsigned int k = (i * 5) % 16;
1640 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1641 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1642 level3_packed[j] = value & 0xffff;
1643 level3_packed[j+1] = value >> 16;
1645 fprintf (stream, " {");
1646 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1647 fprintf (stream, "\n ");
1648 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1650 if (i > 0 && (i % 8) == 0)
1651 fprintf (stream, "\n ");
1652 fprintf (stream, " 0x%04x", level3_packed[i]);
1653 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1654 fprintf (stream, ",");
1656 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1657 fprintf (stream, "\n ");
1658 fprintf (stream, " }\n");
1659 free (level3_packed);
1660 fprintf (stream, "};\n");
1662 if (ferror (stream) || fclose (stream))
1664 fprintf (stderr, "error writing to '%s'\n", filename);
1665 exit (1);
1669 /* ========================================================================= */
1671 /* Decimal digit value. */
1672 /* See Unicode 3.0 book, section 4.6. */
1674 static int
1675 get_decdigit_value (unsigned int ch)
1677 if (unicode_attributes[ch].name != NULL
1678 && unicode_attributes[ch].decdigit[0] != '\0')
1679 return atoi (unicode_attributes[ch].decdigit);
1680 return -1;
1683 /* Construction of sparse 3-level tables.
   The macros below are the parameters of the "3level.h" template, which is
   included right after them.  TABLE names the table type; the code that
   follows uses it as 'struct decdigit_table' together with
   decdigit_table_init, decdigit_table_add and decdigit_table_finalize, both
   for the decimal digit table and for the digit table.  ELEMENT is the type
   stored per character; the callers store 1 + the digit value, so 0 stands
   for "no digit value".  DEFAULT is presumably the value of characters that
   are never added -- TODO confirm against 3level.h.
   xmalloc and xrealloc are mapped to plain malloc and realloc. */
1684 #define TABLE decdigit_table
1685 #define ELEMENT uint8_t
1686 #define DEFAULT 0
1687 #define xmalloc malloc
1688 #define xrealloc realloc
1689 #include "3level.h"
/* Output the unit test for the per-character decimal digit value table.
   Writes to FILENAME, after a three-line header comment that mentions
   VERSION, one " { 0xXXXX, D }" entry for every character that has a decimal
   digit value.  Entries are separated by ",\n" and the last one is followed
   by a newline.
   Exits with status 1 if FILENAME cannot be opened or written. */
static void
output_decimal_digit_test (const char *filename, const char *version)
{
  FILE *out = fopen (filename, "w");
  unsigned int n_entries = 0;
  unsigned int uc;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (out, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (out, "/* Decimal digit values of Unicode characters. */\n");
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);

  for (uc = 0; uc < 0x110000; uc++)
    {
      int value = get_decdigit_value (uc);

      assert (value >= -1 && value < 10);

      /* -1 means: no decimal digit value, nothing to emit. */
      if (value < 0)
        continue;

      /* The separator goes before every entry except the first one. */
      if (n_entries > 0)
        fprintf (out, ",\n");
      fprintf (out, " { 0x%04X, %d }", uc, value);
      n_entries++;
    }
  if (n_entries > 0)
    fprintf (out, "\n");

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1736 /* Output the per-character decimal digit value table. */
1737 static void
1738 output_decimal_digit (const char *filename, const char *version)
1740 FILE *stream;
1741 unsigned int ch, i;
1742 struct decdigit_table t;
1743 unsigned int level1_offset, level2_offset, level3_offset;
1745 stream = fopen (filename, "w");
1746 if (stream == NULL)
1748 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1749 exit (1);
1752 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1753 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1754 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1755 version);
1757 t.p = 7;
1758 t.q = 9;
1759 decdigit_table_init (&t);
1761 for (ch = 0; ch < 0x110000; ch++)
1763 int value = 1 + get_decdigit_value (ch);
1765 assert (value >= 0 && value <= 10);
1767 decdigit_table_add (&t, ch, value);
1770 decdigit_table_finalize (&t);
1772 /* Offsets in t.result, in memory of this process. */
1773 level1_offset =
1774 5 * sizeof (uint32_t);
1775 level2_offset =
1776 5 * sizeof (uint32_t)
1777 + t.level1_size * sizeof (uint32_t);
1778 level3_offset =
1779 5 * sizeof (uint32_t)
1780 + t.level1_size * sizeof (uint32_t)
1781 + (t.level2_size << t.q) * sizeof (uint32_t);
1783 for (i = 0; i < 5; i++)
1784 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1785 ((uint32_t *) t.result)[i]);
1786 fprintf (stream, "static const\n");
1787 fprintf (stream, "struct\n");
1788 fprintf (stream, " {\n");
1789 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1790 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1791 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1792 t.p - 1);
1793 fprintf (stream, " }\n");
1794 fprintf (stream, "u_decdigit =\n");
1795 fprintf (stream, "{\n");
1796 fprintf (stream, " {");
1797 if (t.level1_size > 8)
1798 fprintf (stream, "\n ");
1799 for (i = 0; i < t.level1_size; i++)
1801 uint32_t offset;
1802 if (i > 0 && (i % 8) == 0)
1803 fprintf (stream, "\n ");
1804 offset = ((uint32_t *) (t.result + level1_offset))[i];
1805 if (offset == 0)
1806 fprintf (stream, " %5d", -1);
1807 else
1808 fprintf (stream, " %5zu",
1809 (offset - level2_offset) / sizeof (uint32_t));
1810 if (i+1 < t.level1_size)
1811 fprintf (stream, ",");
1813 if (t.level1_size > 8)
1814 fprintf (stream, "\n ");
1815 fprintf (stream, " },\n");
1816 fprintf (stream, " {");
1817 if (t.level2_size << t.q > 8)
1818 fprintf (stream, "\n ");
1819 for (i = 0; i < t.level2_size << t.q; i++)
1821 uint32_t offset;
1822 if (i > 0 && (i % 8) == 0)
1823 fprintf (stream, "\n ");
1824 offset = ((uint32_t *) (t.result + level2_offset))[i];
1825 if (offset == 0)
1826 fprintf (stream, " %5d", -1);
1827 else
1828 fprintf (stream, " %5zu",
1829 (offset - level3_offset) / sizeof (uint8_t));
1830 if (i+1 < t.level2_size << t.q)
1831 fprintf (stream, ",");
1833 if (t.level2_size << t.q > 8)
1834 fprintf (stream, "\n ");
1835 fprintf (stream, " },\n");
1836 /* Pack the level3 array. Each entry needs 4 bits only. */
1837 fprintf (stream, " {");
1838 if (t.level3_size << (t.p - 1) > 8)
1839 fprintf (stream, "\n ");
1840 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1842 if (i > 0 && (i % 8) == 0)
1843 fprintf (stream, "\n ");
1844 fprintf (stream, " 0x%02x",
1845 ((uint8_t *) (t.result + level3_offset))[2*i]
1846 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1847 if (i+1 < t.level3_size << (t.p - 1))
1848 fprintf (stream, ",");
1850 if (t.level3_size << (t.p - 1) > 8)
1851 fprintf (stream, "\n ");
1852 fprintf (stream, " }\n");
1853 fprintf (stream, "};\n");
1855 if (ferror (stream) || fclose (stream))
1857 fprintf (stderr, "error writing to '%s'\n", filename);
1858 exit (1);
1862 /* ========================================================================= */
1864 /* Digit value. */
1865 /* See Unicode 3.0 book, section 4.6. */
1867 static int
1868 get_digit_value (unsigned int ch)
1870 if (unicode_attributes[ch].name != NULL
1871 && unicode_attributes[ch].digit[0] != '\0')
1872 return atoi (unicode_attributes[ch].digit);
1873 return -1;
/* Output the unit test for the per-character digit value table.
   Writes to FILENAME, after a three-line header comment that mentions
   VERSION, one " { 0xXXXX, D }" entry for every character that has a digit
   value.  Entries are separated by ",\n" and the last one is followed by a
   newline.
   Exits with status 1 if FILENAME cannot be opened or written. */
static void
output_digit_test (const char *filename, const char *version)
{
  FILE *out = fopen (filename, "w");
  unsigned int n_entries = 0;
  unsigned int uc;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (out, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (out, "/* Digit values of Unicode characters. */\n");
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);

  for (uc = 0; uc < 0x110000; uc++)
    {
      int value = get_digit_value (uc);

      assert (value >= -1 && value < 10);

      /* -1 means: no digit value, nothing to emit. */
      if (value < 0)
        continue;

      /* The separator goes before every entry except the first one. */
      if (n_entries > 0)
        fprintf (out, ",\n");
      fprintf (out, " { 0x%04X, %d }", uc, value);
      n_entries++;
    }
  if (n_entries > 0)
    fprintf (out, "\n");

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1921 /* Output the per-character digit value table. */
1922 static void
1923 output_digit (const char *filename, const char *version)
1925 FILE *stream;
1926 unsigned int ch, i;
1927 struct decdigit_table t;
1928 unsigned int level1_offset, level2_offset, level3_offset;
1930 stream = fopen (filename, "w");
1931 if (stream == NULL)
1933 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1934 exit (1);
1937 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1938 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1939 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1940 version);
1942 t.p = 7;
1943 t.q = 9;
1944 decdigit_table_init (&t);
1946 for (ch = 0; ch < 0x110000; ch++)
1948 int value = 1 + get_digit_value (ch);
1950 assert (value >= 0 && value <= 10);
1952 decdigit_table_add (&t, ch, value);
1955 decdigit_table_finalize (&t);
1957 /* Offsets in t.result, in memory of this process. */
1958 level1_offset =
1959 5 * sizeof (uint32_t);
1960 level2_offset =
1961 5 * sizeof (uint32_t)
1962 + t.level1_size * sizeof (uint32_t);
1963 level3_offset =
1964 5 * sizeof (uint32_t)
1965 + t.level1_size * sizeof (uint32_t)
1966 + (t.level2_size << t.q) * sizeof (uint32_t);
1968 for (i = 0; i < 5; i++)
1969 fprintf (stream, "#define digit_header_%d %d\n", i,
1970 ((uint32_t *) t.result)[i]);
1971 fprintf (stream, "static const\n");
1972 fprintf (stream, "struct\n");
1973 fprintf (stream, " {\n");
1974 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1975 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1976 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1977 t.p - 1);
1978 fprintf (stream, " }\n");
1979 fprintf (stream, "u_digit =\n");
1980 fprintf (stream, "{\n");
1981 fprintf (stream, " {");
1982 if (t.level1_size > 8)
1983 fprintf (stream, "\n ");
1984 for (i = 0; i < t.level1_size; i++)
1986 uint32_t offset;
1987 if (i > 0 && (i % 8) == 0)
1988 fprintf (stream, "\n ");
1989 offset = ((uint32_t *) (t.result + level1_offset))[i];
1990 if (offset == 0)
1991 fprintf (stream, " %5d", -1);
1992 else
1993 fprintf (stream, " %5zu",
1994 (offset - level2_offset) / sizeof (uint32_t));
1995 if (i+1 < t.level1_size)
1996 fprintf (stream, ",");
1998 if (t.level1_size > 8)
1999 fprintf (stream, "\n ");
2000 fprintf (stream, " },\n");
2001 fprintf (stream, " {");
2002 if (t.level2_size << t.q > 8)
2003 fprintf (stream, "\n ");
2004 for (i = 0; i < t.level2_size << t.q; i++)
2006 uint32_t offset;
2007 if (i > 0 && (i % 8) == 0)
2008 fprintf (stream, "\n ");
2009 offset = ((uint32_t *) (t.result + level2_offset))[i];
2010 if (offset == 0)
2011 fprintf (stream, " %5d", -1);
2012 else
2013 fprintf (stream, " %5zu",
2014 (offset - level3_offset) / sizeof (uint8_t));
2015 if (i+1 < t.level2_size << t.q)
2016 fprintf (stream, ",");
2018 if (t.level2_size << t.q > 8)
2019 fprintf (stream, "\n ");
2020 fprintf (stream, " },\n");
2021 /* Pack the level3 array. Each entry needs 4 bits only. */
2022 fprintf (stream, " {");
2023 if (t.level3_size << (t.p - 1) > 8)
2024 fprintf (stream, "\n ");
2025 for (i = 0; i < t.level3_size << (t.p - 1); i++)
2027 if (i > 0 && (i % 8) == 0)
2028 fprintf (stream, "\n ");
2029 fprintf (stream, " 0x%02x",
2030 ((uint8_t *) (t.result + level3_offset))[2*i]
2031 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
2032 if (i+1 < t.level3_size << (t.p - 1))
2033 fprintf (stream, ",");
2035 if (t.level3_size << (t.p - 1) > 8)
2036 fprintf (stream, "\n ");
2037 fprintf (stream, " }\n");
2038 fprintf (stream, "};\n");
2040 if (ferror (stream) || fclose (stream))
2042 fprintf (stderr, "error writing to '%s'\n", filename);
2043 exit (1);
2047 /* ========================================================================= */
2049 /* Numeric value. */
2050 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value, represented as the fraction numerator/denominator.
   The code in this file uses { 0, 0 } for "no numeric value". */
typedef struct
{
  int numerator;
  int denominator;
}
uc_fraction_t;
2054 static uc_fraction_t
2055 get_numeric_value (unsigned int ch)
2057 uc_fraction_t value;
2059 if (unicode_attributes[ch].name != NULL
2060 && unicode_attributes[ch].numeric[0] != '\0')
2062 const char *str = unicode_attributes[ch].numeric;
2063 /* str is of the form "integer" or "integer/posinteger". */
2064 value.numerator = atoi (str);
2065 if (strchr (str, '/') != NULL)
2066 value.denominator = atoi (strchr (str, '/') + 1);
2067 else
2068 value.denominator = 1;
2070 else
2072 value.numerator = 0;
2073 value.denominator = 0;
2075 return value;
2078 /* Output the unit test for the per-character numeric value table. */
2079 static void
2080 output_numeric_test (const char *filename, const char *version)
2082 FILE *stream;
2083 bool need_comma;
2084 unsigned int ch;
2086 stream = fopen (filename, "w");
2087 if (stream == NULL)
2089 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2090 exit (1);
2093 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2094 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2095 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2096 version);
2098 need_comma = false;
2099 for (ch = 0; ch < 0x110000; ch++)
2101 uc_fraction_t value = get_numeric_value (ch);
2103 if (value.numerator != 0 || value.denominator != 0)
2105 if (need_comma)
2106 fprintf (stream, ",\n");
2107 fprintf (stream, " { 0x%04X, %d, %d }",
2108 ch, value.numerator, value.denominator);
2109 need_comma = true;
2112 if (need_comma)
2113 fprintf (stream, "\n");
2115 if (ferror (stream) || fclose (stream))
2117 fprintf (stderr, "error writing to '%s'\n", filename);
2118 exit (1);
2122 /* Construction of sparse 3-level tables.
   The macros below are the parameters of the "3level.h" template, which is
   included right after them.  TABLE names the table type; the code that
   follows uses it as 'struct numeric_table' together with
   numeric_table_init, numeric_table_add and numeric_table_finalize.
   ELEMENT is the type stored per character: one byte, holding an index into
   the list of occurring fractions.  DEFAULT is presumably the value of
   characters that are never added -- TODO confirm against 3level.h.
   xmalloc and xrealloc are mapped to plain malloc and realloc. */
2123 #define TABLE numeric_table
2124 #define ELEMENT uint8_t
2125 #define DEFAULT 0
2126 #define xmalloc malloc
2127 #define xrealloc realloc
2128 #include "3level.h"
2130 /* Output the per-character numeric value table. */
2131 static void
2132 output_numeric (const char *filename, const char *version)
2134 FILE *stream;
2135 uc_fraction_t fractions[160];
2136 unsigned int nfractions;
2137 unsigned int ch, i, j;
2138 struct numeric_table t;
2139 unsigned int level1_offset, level2_offset, level3_offset;
2140 uint16_t *level3_packed;
2142 stream = fopen (filename, "w");
2143 if (stream == NULL)
2145 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2146 exit (1);
2149 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2150 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2151 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2152 version);
2154 /* Create table of occurring fractions. */
2155 nfractions = 0;
2156 for (ch = 0; ch < 0x110000; ch++)
2158 uc_fraction_t value = get_numeric_value (ch);
2160 for (i = 0; i < nfractions; i++)
2161 if (value.numerator == fractions[i].numerator
2162 && value.denominator == fractions[i].denominator)
2163 break;
2164 if (i == nfractions)
2166 assert (nfractions != SIZEOF (fractions));
2167 for (i = 0; i < nfractions; i++)
2168 if (value.denominator < fractions[i].denominator
2169 || (value.denominator == fractions[i].denominator
2170 && value.numerator < fractions[i].numerator))
2171 break;
2172 for (j = nfractions; j > i; j--)
2173 fractions[j] = fractions[j - 1];
2174 fractions[i] = value;
2175 nfractions++;
2179 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2180 nfractions);
2181 fprintf (stream, "{\n");
2182 for (i = 0; i < nfractions; i++)
2184 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2185 fractions[i].denominator);
2186 if (i+1 < nfractions)
2187 fprintf (stream, ",");
2188 fprintf (stream, "\n");
2190 fprintf (stream, "};\n");
2192 t.p = 7;
2193 t.q = 9;
2194 numeric_table_init (&t);
2196 for (ch = 0; ch < 0x110000; ch++)
2198 uc_fraction_t value = get_numeric_value (ch);
2200 for (i = 0; i < nfractions; i++)
2201 if (value.numerator == fractions[i].numerator
2202 && value.denominator == fractions[i].denominator)
2203 break;
2204 assert (i != nfractions);
2206 numeric_table_add (&t, ch, i);
2209 numeric_table_finalize (&t);
2211 /* Offsets in t.result, in memory of this process. */
2212 level1_offset =
2213 5 * sizeof (uint32_t);
2214 level2_offset =
2215 5 * sizeof (uint32_t)
2216 + t.level1_size * sizeof (uint32_t);
2217 level3_offset =
2218 5 * sizeof (uint32_t)
2219 + t.level1_size * sizeof (uint32_t)
2220 + (t.level2_size << t.q) * sizeof (uint32_t);
2222 for (i = 0; i < 5; i++)
2223 fprintf (stream, "#define numeric_header_%d %d\n", i,
2224 ((uint32_t *) t.result)[i]);
2225 fprintf (stream, "static const\n");
2226 fprintf (stream, "struct\n");
2227 fprintf (stream, " {\n");
2228 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2229 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2230 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2231 (1 << t.p) * 8 / 16);
2232 fprintf (stream, " }\n");
2233 fprintf (stream, "u_numeric =\n");
2234 fprintf (stream, "{\n");
2235 fprintf (stream, " {");
2236 if (t.level1_size > 8)
2237 fprintf (stream, "\n ");
2238 for (i = 0; i < t.level1_size; i++)
2240 uint32_t offset;
2241 if (i > 0 && (i % 8) == 0)
2242 fprintf (stream, "\n ");
2243 offset = ((uint32_t *) (t.result + level1_offset))[i];
2244 if (offset == 0)
2245 fprintf (stream, " %5d", -1);
2246 else
2247 fprintf (stream, " %5zu",
2248 (offset - level2_offset) / sizeof (uint32_t));
2249 if (i+1 < t.level1_size)
2250 fprintf (stream, ",");
2252 if (t.level1_size > 8)
2253 fprintf (stream, "\n ");
2254 fprintf (stream, " },\n");
2255 fprintf (stream, " {");
2256 if (t.level2_size << t.q > 8)
2257 fprintf (stream, "\n ");
2258 for (i = 0; i < t.level2_size << t.q; i++)
2260 uint32_t offset;
2261 if (i > 0 && (i % 8) == 0)
2262 fprintf (stream, "\n ");
2263 offset = ((uint32_t *) (t.result + level2_offset))[i];
2264 if (offset == 0)
2265 fprintf (stream, " %5d", -1);
2266 else
2267 fprintf (stream, " %5zu",
2268 (offset - level3_offset) / sizeof (uint8_t));
2269 if (i+1 < t.level2_size << t.q)
2270 fprintf (stream, ",");
2272 if (t.level2_size << t.q > 8)
2273 fprintf (stream, "\n ");
2274 fprintf (stream, " },\n");
2275 /* Pack the level3 array. Each entry needs 8 bits only. Use 16-bit units,
2276 not 32-bit units, in order to make the lookup function easier. */
2277 level3_packed =
2278 (uint16_t *)
2279 calloc ((t.level3_size << t.p) * 8 / 16 + 1, sizeof (uint16_t));
2280 for (i = 0; i < t.level3_size << t.p; i++)
2282 unsigned int j = (i * 8) / 16;
2283 unsigned int k = (i * 8) % 16;
2284 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2285 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2286 level3_packed[j] = value & 0xffff;
2287 level3_packed[j+1] = value >> 16;
2289 fprintf (stream, " {");
2290 if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2291 fprintf (stream, "\n ");
2292 for (i = 0; i < (t.level3_size << t.p) * 8 / 16 + 1; i++)
2294 if (i > 0 && (i % 8) == 0)
2295 fprintf (stream, "\n ");
2296 fprintf (stream, " 0x%04x", level3_packed[i]);
2297 if (i+1 < (t.level3_size << t.p) * 8 / 16 + 1)
2298 fprintf (stream, ",");
2300 if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2301 fprintf (stream, "\n ");
2302 fprintf (stream, " }\n");
2303 free (level3_packed);
2304 fprintf (stream, "};\n");
2306 if (ferror (stream) || fclose (stream))
2308 fprintf (stderr, "error writing to '%s'\n", filename);
2309 exit (1);
2313 /* ========================================================================= */
2315 /* Mirrored. */
2316 /* See Unicode 3.0 book, section 4.7,
2317 UAX #9. */
/* Pairs of characters that mirror each other.  Each row holds the two
   members of one pair.  This is a subset of the characters having the
   BidiMirrored property.  */
static unsigned int mirror_pairs[][2] =
{
  { 0x0028, 0x0029 }, { 0x003C, 0x003E }, { 0x005B, 0x005D }, { 0x007B, 0x007D },
  { 0x00AB, 0x00BB }, { 0x2039, 0x203A }, { 0x2045, 0x2046 }, { 0x207D, 0x207E },
  { 0x208D, 0x208E }, { 0x2208, 0x220B }, { 0x220A, 0x220D }, { 0x223C, 0x223D },
  { 0x2243, 0x22CD }, { 0x2252, 0x2253 }, { 0x2254, 0x2255 }, { 0x2264, 0x2265 },
  { 0x2266, 0x2267 }, { 0x226A, 0x226B }, { 0x2276, 0x2277 }, { 0x2278, 0x2279 },
  { 0x227A, 0x227B }, { 0x227C, 0x227D }, { 0x2282, 0x2283 }, { 0x2286, 0x2287 },
  { 0x228F, 0x2290 }, { 0x2291, 0x2292 }, { 0x22A2, 0x22A3 }, { 0x22B0, 0x22B1 },
  { 0x22B2, 0x22B3 }, { 0x22B4, 0x22B5 }, { 0x22B6, 0x22B7 }, { 0x22C9, 0x22CA },
  { 0x22CB, 0x22CC }, { 0x22D0, 0x22D1 }, { 0x22D6, 0x22D7 }, { 0x22D8, 0x22D9 },
  { 0x22DA, 0x22DB }, { 0x22DC, 0x22DD }, { 0x22DE, 0x22DF }, { 0x22F0, 0x22F1 },
  { 0x2308, 0x2309 }, { 0x230A, 0x230B }, { 0x2329, 0x232A }, { 0x3008, 0x3009 },
  { 0x300A, 0x300B }, { 0x300C, 0x300D }, { 0x300E, 0x300F }, { 0x3010, 0x3011 },
  { 0x3014, 0x3015 }, { 0x3016, 0x3017 }, { 0x3018, 0x3019 }, { 0x301A, 0x301B }
};
2377 static int
2378 get_mirror_value (unsigned int ch)
2380 bool mirrored;
2381 unsigned int mirror_char;
2382 unsigned int i;
2384 mirrored = (unicode_attributes[ch].name != NULL
2385 && unicode_attributes[ch].mirrored);
2386 mirror_char = 0xfffd;
2387 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2388 if (ch == mirror_pairs[i][0])
2390 mirror_char = mirror_pairs[i][1];
2391 break;
2393 else if (ch == mirror_pairs[i][1])
2395 mirror_char = mirror_pairs[i][0];
2396 break;
2398 if (mirrored)
2399 return (int) mirror_char - (int) ch;
2400 else
2402 assert (mirror_char == 0xfffd);
2403 return 0;
2407 /* Construction of sparse 3-level tables. */
2408 #define TABLE mirror_table
2409 #define ELEMENT int32_t
2410 #define DEFAULT 0
2411 #define xmalloc malloc
2412 #define xrealloc realloc
2413 #include "3level.h"
2415 /* Output the per-character mirror table. */
2416 static void
2417 output_mirror (const char *filename, const char *version)
2419 FILE *stream;
2420 unsigned int ch, i;
2421 struct mirror_table t;
2422 unsigned int level1_offset, level2_offset, level3_offset;
2424 stream = fopen (filename, "w");
2425 if (stream == NULL)
2427 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2428 exit (1);
2431 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2432 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2433 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2434 version);
2436 t.p = 7;
2437 t.q = 9;
2438 mirror_table_init (&t);
2440 for (ch = 0; ch < 0x110000; ch++)
2442 int value = get_mirror_value (ch);
2444 mirror_table_add (&t, ch, value);
2447 mirror_table_finalize (&t);
2449 /* Offsets in t.result, in memory of this process. */
2450 level1_offset =
2451 5 * sizeof (uint32_t);
2452 level2_offset =
2453 5 * sizeof (uint32_t)
2454 + t.level1_size * sizeof (uint32_t);
2455 level3_offset =
2456 5 * sizeof (uint32_t)
2457 + t.level1_size * sizeof (uint32_t)
2458 + (t.level2_size << t.q) * sizeof (uint32_t);
2460 for (i = 0; i < 5; i++)
2461 fprintf (stream, "#define mirror_header_%d %d\n", i,
2462 ((uint32_t *) t.result)[i]);
2463 fprintf (stream, "static const\n");
2464 fprintf (stream, "struct\n");
2465 fprintf (stream, " {\n");
2466 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2467 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2468 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2469 fprintf (stream, " }\n");
2470 fprintf (stream, "u_mirror =\n");
2471 fprintf (stream, "{\n");
2472 fprintf (stream, " {");
2473 if (t.level1_size > 8)
2474 fprintf (stream, "\n ");
2475 for (i = 0; i < t.level1_size; i++)
2477 uint32_t offset;
2478 if (i > 0 && (i % 8) == 0)
2479 fprintf (stream, "\n ");
2480 offset = ((uint32_t *) (t.result + level1_offset))[i];
2481 if (offset == 0)
2482 fprintf (stream, " %5d", -1);
2483 else
2484 fprintf (stream, " %5zu",
2485 (offset - level2_offset) / sizeof (uint32_t));
2486 if (i+1 < t.level1_size)
2487 fprintf (stream, ",");
2489 if (t.level1_size > 8)
2490 fprintf (stream, "\n ");
2491 fprintf (stream, " },\n");
2492 fprintf (stream, " {");
2493 if (t.level2_size << t.q > 8)
2494 fprintf (stream, "\n ");
2495 for (i = 0; i < t.level2_size << t.q; i++)
2497 uint32_t offset;
2498 if (i > 0 && (i % 8) == 0)
2499 fprintf (stream, "\n ");
2500 offset = ((uint32_t *) (t.result + level2_offset))[i];
2501 if (offset == 0)
2502 fprintf (stream, " %5d", -1);
2503 else
2504 fprintf (stream, " %5zu",
2505 (offset - level3_offset) / sizeof (int32_t));
2506 if (i+1 < t.level2_size << t.q)
2507 fprintf (stream, ",");
2509 if (t.level2_size << t.q > 8)
2510 fprintf (stream, "\n ");
2511 fprintf (stream, " },\n");
2512 fprintf (stream, " {");
2513 if (t.level3_size << t.p > 8)
2514 fprintf (stream, "\n ");
2515 for (i = 0; i < t.level3_size << t.p; i++)
2517 if (i > 0 && (i % 8) == 0)
2518 fprintf (stream, "\n ");
2519 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2520 if (i+1 < t.level3_size << t.p)
2521 fprintf (stream, ",");
2523 if (t.level3_size << t.p > 8)
2524 fprintf (stream, "\n ");
2525 fprintf (stream, " }\n");
2526 fprintf (stream, "};\n");
2528 if (ferror (stream) || fclose (stream))
2530 fprintf (stderr, "error writing to '%s'\n", filename);
2531 exit (1);
2535 /* ========================================================================= */
2537 /* Particular values of the word break property. */
/* Returns true if CH is one of the code points listed here for the
   MidNumLet value of the word break property.  */
static bool
is_WBP_MIDNUMLET (unsigned int ch)
{
  switch (ch)
    {
    case 0x002E:
    case 0x2018:
    case 0x2019:
    case 0x2024:
    case 0xFE52:
    case 0xFF07:
    case 0xFF0E:
      return true;
    default:
      return false;
    }
}
/* Returns true if CH is one of the code points listed here for the
   MidLetter value of the word break property.  */
static bool
is_WBP_MIDLETTER (unsigned int ch)
{
  switch (ch)
    {
    case 0x003A:
    case 0x00B7:
    case 0x02D7:
    case 0x0387:
    case 0x05F4:
    case 0x2027:
    case 0xFE13:
    case 0xFE55:
    case 0xFF1A:
      return true;
    default:
      return false;
    }
}
2554 /* ========================================================================= */
2556 /* Properties. */
/* Reading PropList.txt and DerivedCoreProperties.txt.
   Each enumerator is the bit position of one property within an entry of
   unicode_properties[].  The values are consecutive, starting at 0.  */
enum
{
  /* PropList.txt */
  PROP_WHITE_SPACE = 0,
  PROP_BIDI_CONTROL,
  PROP_JOIN_CONTROL,
  PROP_DASH,
  PROP_HYPHEN,
  PROP_QUOTATION_MARK,
  PROP_TERMINAL_PUNCTUATION,
  PROP_OTHER_MATH,

  PROP_HEX_DIGIT,
  PROP_ASCII_HEX_DIGIT,
  PROP_OTHER_ALPHABETIC,
  PROP_IDEOGRAPHIC,
  PROP_DIACRITIC,
  PROP_EXTENDER,
  PROP_OTHER_LOWERCASE,
  PROP_OTHER_UPPERCASE,

  PROP_NONCHARACTER_CODE_POINT,
  PROP_OTHER_GRAPHEME_EXTEND,
  PROP_IDS_BINARY_OPERATOR,
  PROP_IDS_TRINARY_OPERATOR,
  PROP_RADICAL,
  PROP_UNIFIED_IDEOGRAPH,
  PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_DEPRECATED,

  PROP_SOFT_DOTTED,
  PROP_LOGICAL_ORDER_EXCEPTION,
  PROP_OTHER_ID_START,
  PROP_OTHER_ID_CONTINUE,
  PROP_STERM,
  PROP_VARIATION_SELECTOR,
  PROP_PATTERN_WHITE_SPACE,
  PROP_PATTERN_SYNTAX,

  PROP_PREPENDED_CONCATENATION_MARK,

  /* DerivedCoreProperties.txt */
  PROP_MATH,
  PROP_ALPHABETIC,
  PROP_LOWERCASE,
  PROP_UPPERCASE,
  PROP_CASED,
  PROP_CASE_IGNORABLE,

  PROP_CHANGES_WHEN_LOWERCASED,
  PROP_CHANGES_WHEN_UPPERCASED,
  PROP_CHANGES_WHEN_TITLECASED,
  PROP_CHANGES_WHEN_CASEFOLDED,
  PROP_CHANGES_WHEN_CASEMAPPED,

  PROP_ID_START,
  PROP_ID_CONTINUE,
  PROP_XID_START,
  PROP_XID_CONTINUE,

  PROP_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_GRAPHEME_EXTEND,
  PROP_GRAPHEME_BASE,
  PROP_GRAPHEME_LINK
};
/* One set of property bits per code point.  */
unsigned long long unicode_properties[0x110000];

/* Resets every entry of unicode_properties[] to zero.  */
static void
clear_properties (void)
{
  unsigned long long *entry = unicode_properties;
  unsigned long long *const end = unicode_properties + 0x110000;

  while (entry < end)
    *entry++ = 0;
}
2627 /* Stores in unicode_properties[] the properties from the
2628 PropList.txt or DerivedCoreProperties.txt file. */
2629 static void
2630 fill_properties (const char *proplist_filename)
2632 unsigned int i;
2633 FILE *stream;
2635 stream = fopen (proplist_filename, "r");
2636 if (stream == NULL)
2638 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2639 exit (1);
2642 for (;;)
2644 char buf[200+1];
2645 unsigned int i1, i2;
2646 char padding[200+1];
2647 char propname[200+1];
2648 unsigned int propvalue;
2650 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2651 break;
2653 if (buf[0] == '\0' || buf[0] == '#')
2654 continue;
2656 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2658 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2660 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
2661 exit (1);
2663 i2 = i1;
2665 #define PROP(name,value) \
2666 if (strcmp (propname, name) == 0) propvalue = value; else
2667 /* PropList.txt */
2668 PROP ("White_Space", PROP_WHITE_SPACE)
2669 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2670 PROP ("Join_Control", PROP_JOIN_CONTROL)
2671 PROP ("Dash", PROP_DASH)
2672 PROP ("Hyphen", PROP_HYPHEN)
2673 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2674 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2675 PROP ("Other_Math", PROP_OTHER_MATH)
2676 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2677 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2678 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2679 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2680 PROP ("Diacritic", PROP_DIACRITIC)
2681 PROP ("Extender", PROP_EXTENDER)
2682 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2683 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2684 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2685 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2686 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2687 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2688 PROP ("Radical", PROP_RADICAL)
2689 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2690 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2691 PROP ("Deprecated", PROP_DEPRECATED)
2692 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2693 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2694 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2695 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2696 PROP ("Sentence_Terminal", PROP_STERM)
2697 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2698 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2699 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2700 PROP ("Prepended_Concatenation_Mark", PROP_PREPENDED_CONCATENATION_MARK)
2701 /* DerivedCoreProperties.txt */
2702 PROP ("Math", PROP_MATH)
2703 PROP ("Alphabetic", PROP_ALPHABETIC)
2704 PROP ("Lowercase", PROP_LOWERCASE)
2705 PROP ("Uppercase", PROP_UPPERCASE)
2706 PROP ("Cased", PROP_CASED)
2707 PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
2708 PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
2709 PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
2710 PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
2711 PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
2712 PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
2713 PROP ("ID_Start", PROP_ID_START)
2714 PROP ("ID_Continue", PROP_ID_CONTINUE)
2715 PROP ("XID_Start", PROP_XID_START)
2716 PROP ("XID_Continue", PROP_XID_CONTINUE)
2717 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2718 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2719 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2720 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2721 #undef PROP
2723 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
2724 proplist_filename);
2725 exit (1);
2727 assert (i1 <= i2 && i2 < 0x110000);
2729 for (i = i1; i <= i2; i++)
2730 unicode_properties[i] |= 1ULL << propvalue;
2733 if (ferror (stream) || fclose (stream))
2735 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2736 exit (1);
/* Stores in array the given property from the Unicode 3.0 PropList.txt
   file.
   ARRAY is first cleared.  The file is then scanned for the first line
   containing PROPERTY_NAME; the lines after it, up to end of file or a
   line starting with '*', are read as "XXXX" or "XXXX..YYYY" entries, and
   array[c] is set to 1 for every code point c they cover.  Exits with
   status 1 on open, read or parse failure, or if PROPERTY_NAME is not
   found.  */
static void
fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
{
  char buf[100+1];
  FILE *stream;
  unsigned int c;

  for (c = 0; c < 0x110000; c++)
    array[c] = 0;

  stream = fopen (proplist_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the "Property dump for: ..." line.  */
  for (;;)
    {
      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        {
          fprintf (stderr, "no property found in '%s'\n", proplist_filename);
          exit (1);
        }
      if (strstr (buf, property_name) != NULL)
        break;
    }

  /* Read the entries that follow it.  */
  while (fscanf (stream, "%100[^\n]\n", buf) >= 1 && buf[0] != '*')
    {
      unsigned int first, last;
      const size_t len = strlen (buf);
      bool parsed;

      if (len >= 10 && buf[4] == '.' && buf[5] == '.')
        parsed = (sscanf (buf, "%4X..%4X", &first, &last) >= 2);
      else if (len >= 4)
        {
          parsed = (sscanf (buf, "%4X", &first) >= 1);
          last = first;
        }
      else
        parsed = false;

      if (!parsed)
        {
          fprintf (stderr, "parse error in property in '%s'\n",
                   proplist_filename);
          exit (1);
        }

      assert (first <= last && last < 0x110000);
      for (c = first; c <= last; c++)
        array[c] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
/* Properties from Unicode 3.0 PropList.txt file.  */

/* The paired punctuation property from the PropList.txt file.  */
char unicode_pairedpunctuation[0x110000];

/* The left of pair property from the PropList.txt file.  */
char unicode_leftofpair[0x110000];

/* Fills unicode_pairedpunctuation[] and unicode_leftofpair[], in this
   order, from the file PROPLIST30_FILENAME.  */
static void
fill_properties30 (const char *proplist30_filename)
{
  const struct { char *flags; const char *label; } wanted[] =
    {
      { unicode_pairedpunctuation, "(Paired Punctuation)" },
      { unicode_leftofpair, "(Left of Pair)" }
    };
  unsigned int k;

  for (k = 0; k < sizeof (wanted) / sizeof (wanted[0]); k++)
    fill_property30 (wanted[k].flags, proplist30_filename, wanted[k].label);
}
2830 /* ------------------------------------------------------------------------- */
2832 /* See PropList.txt, UCD.html. */
2833 static bool
2834 is_property_white_space (unsigned int ch)
2836 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2839 /* See Unicode 3.0 book, section 4.10,
2840 PropList.txt, UCD.html,
2841 DerivedCoreProperties.txt, UCD.html. */
2842 static bool
2843 is_property_alphabetic (unsigned int ch)
2845 bool result1 =
2846 is_category_L (ch)
2847 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2848 /* For some reason, the following are listed as having property
2849 Alphabetic but not as having property Other_Alphabetic. */
2850 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2851 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2852 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2853 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2854 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2855 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2856 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2857 || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
2858 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2859 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2860 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2861 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2862 || (ch >= 0x12400 && ch <= 0x1246E); /* CUNEIFORM NUMERIC SIGNS */
2863 bool result2 =
2864 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2866 assert (result1 == result2);
2867 return result1;
2870 /* See PropList.txt, UCD.html. */
2871 static bool
2872 is_property_other_alphabetic (unsigned int ch)
2874 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2877 /* See PropList.txt, UCD.html. */
2878 static bool
2879 is_property_not_a_character (unsigned int ch)
2881 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2884 /* See PropList.txt, UCD.html,
2885 DerivedCoreProperties.txt, UCD.html. */
2886 static bool
2887 is_property_default_ignorable_code_point (unsigned int ch)
2889 bool result1 =
2890 (is_category_Cf (ch)
2891 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2892 && !((ch >= 0x0600 && ch <= 0x0605) || ch == 0x06DD || ch == 0x070F)
2893 /* For some reason, the following are not listed as having property
2894 Default_Ignorable_Code_Point. */
2895 && !(ch == 0x110BD)
2896 && !(ch == 0x8E2))
2897 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2898 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2899 bool result2 =
2900 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2902 assert (result1 == result2);
2903 return result1;
2906 /* See PropList.txt, UCD.html. */
2907 static bool
2908 is_property_other_default_ignorable_code_point (unsigned int ch)
2910 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2913 /* See PropList.txt, UCD.html. */
2914 static bool
2915 is_property_deprecated (unsigned int ch)
2917 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2920 /* See PropList.txt, UCD.html. */
2921 static bool
2922 is_property_logical_order_exception (unsigned int ch)
2924 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2927 /* See PropList.txt, UCD.html. */
2928 static bool
2929 is_property_variation_selector (unsigned int ch)
2931 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Returns true if CH lies in one of the three private use ranges.
   See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (0xE000 <= ch && ch <= 0xF8FF)
    return true;
  if (0xF0000 <= ch && ch <= 0xFFFFD)
    return true;
  return 0x100000 <= ch && ch <= 0x10FFFD;
}
/* Returns true if CH has category Cn and is not a noncharacter.
   See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
2951 /* See PropList.txt, UCD.html,
2952 DerivedCoreProperties.txt, UCD.html. */
2953 static bool
2954 is_property_uppercase (unsigned int ch)
2956 bool result1 =
2957 is_category_Lu (ch)
2958 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2959 bool result2 =
2960 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2962 assert (result1 == result2);
2963 return result1;
2966 /* See PropList.txt, UCD.html. */
2967 static bool
2968 is_property_other_uppercase (unsigned int ch)
2970 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2973 /* See PropList.txt, UCD.html,
2974 DerivedCoreProperties.txt, UCD.html. */
2975 static bool
2976 is_property_lowercase (unsigned int ch)
2978 bool result1 =
2979 is_category_Ll (ch)
2980 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2981 bool result2 =
2982 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2984 assert (result1 == result2);
2985 return result1;
2988 /* See PropList.txt, UCD.html. */
2989 static bool
2990 is_property_other_lowercase (unsigned int ch)
2992 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Returns true if CH has general category Lt.
   See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  const bool titlecase = is_category_Lt (ch);

  return titlecase;
}
3002 /* See DerivedCoreProperties.txt. */
3003 static bool
3004 is_property_cased (unsigned int ch)
3006 bool result1 = (is_property_lowercase (ch)
3007 || is_property_uppercase (ch)
3008 || is_category_Lt (ch));
3009 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
3011 assert (result1 == result2);
3012 return result1;
3015 /* See DerivedCoreProperties.txt. */
3016 static bool
3017 is_property_case_ignorable (unsigned int ch)
3019 bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
3020 || ch == 0x0027
3021 || is_category_Mn (ch)
3022 || is_category_Me (ch)
3023 || is_category_Cf (ch)
3024 || is_category_Lm (ch)
3025 || is_category_Sk (ch));
3026 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
3028 assert (result1 == result2);
3029 return result1;
3032 /* See DerivedCoreProperties.txt. */
3033 static bool
3034 is_property_changes_when_lowercased (unsigned int ch)
3036 bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
3037 bool result2 = (unicode_attributes[ch].name != NULL
3038 && unicode_attributes[ch].lower != NONE
3039 && unicode_attributes[ch].lower != ch);
3041 assert (result1 == result2);
3042 return result1;
3045 /* See DerivedCoreProperties.txt. */
3046 static bool
3047 is_property_changes_when_uppercased (unsigned int ch)
3049 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
3052 /* See DerivedCoreProperties.txt. */
3053 static bool
3054 is_property_changes_when_titlecased (unsigned int ch)
3056 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
3059 /* See DerivedCoreProperties.txt. */
3060 static bool
3061 is_property_changes_when_casefolded (unsigned int ch)
3063 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
3066 /* See DerivedCoreProperties.txt. */
3067 static bool
3068 is_property_changes_when_casemapped (unsigned int ch)
3070 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
3073 /* See PropList.txt, UCD.html. */
3074 static bool
3075 is_property_soft_dotted (unsigned int ch)
3077 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
3080 /* See DerivedCoreProperties.txt, UCD.html. */
3081 static bool
3082 is_property_id_start (unsigned int ch)
3084 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
3087 /* See PropList.txt, UCD.html. */
3088 static bool
3089 is_property_other_id_start (unsigned int ch)
3091 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
3094 /* See DerivedCoreProperties.txt, UCD.html. */
3095 static bool
3096 is_property_id_continue (unsigned int ch)
3098 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
3101 /* See PropList.txt, UCD.html. */
3102 static bool
3103 is_property_other_id_continue (unsigned int ch)
3105 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
3108 /* See DerivedCoreProperties.txt, UCD.html. */
3109 static bool
3110 is_property_xid_start (unsigned int ch)
3112 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
3115 /* See DerivedCoreProperties.txt, UCD.html. */
3116 static bool
3117 is_property_xid_continue (unsigned int ch)
3119 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
3122 /* See PropList.txt, UCD.html. */
3123 static bool
3124 is_property_pattern_white_space (unsigned int ch)
3126 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
3129 /* See PropList.txt, UCD.html. */
3130 static bool
3131 is_property_pattern_syntax (unsigned int ch)
3133 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
3136 /* See PropList.txt, UCD.html. */
3137 static bool
3138 is_property_join_control (unsigned int ch)
3140 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
3143 /* See DerivedCoreProperties.txt, UCD.html. */
3144 static bool
3145 is_property_grapheme_base (unsigned int ch)
3147 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3150 /* See DerivedCoreProperties.txt, UCD.html. */
3151 static bool
3152 is_property_grapheme_extend (unsigned int ch)
3154 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3157 /* See PropList.txt, UCD.html. */
3158 static bool
3159 is_property_other_grapheme_extend (unsigned int ch)
3161 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3164 /* See DerivedCoreProperties.txt, UCD.html. */
3165 static bool
3166 is_property_grapheme_link (unsigned int ch)
3168 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3171 /* See PropList.txt, UCD.html. */
3172 static bool
3173 is_property_bidi_control (unsigned int ch)
3175 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3178 /* See PropList-3.0.1.txt. */
3179 static bool
3180 is_property_bidi_left_to_right (unsigned int ch)
3182 return (get_bidi_category (ch) == UC_BIDI_L);
3185 /* See PropList-3.0.1.txt. */
3186 static bool
3187 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3189 return (get_bidi_category (ch) == UC_BIDI_R);
3192 /* See PropList-3.0.1.txt. */
3193 static bool
3194 is_property_bidi_arabic_right_to_left (unsigned int ch)
3196 return (get_bidi_category (ch) == UC_BIDI_AL);
3199 /* See PropList-3.0.1.txt. */
3200 static bool
3201 is_property_bidi_european_digit (unsigned int ch)
3203 return (get_bidi_category (ch) == UC_BIDI_EN);
3206 /* See PropList-3.0.1.txt. */
3207 static bool
3208 is_property_bidi_eur_num_separator (unsigned int ch)
3210 return (get_bidi_category (ch) == UC_BIDI_ES);
3213 /* See PropList-3.0.1.txt. */
3214 static bool
3215 is_property_bidi_eur_num_terminator (unsigned int ch)
3217 return (get_bidi_category (ch) == UC_BIDI_ET);
3220 /* See PropList-3.0.1.txt. */
3221 static bool
3222 is_property_bidi_arabic_digit (unsigned int ch)
3224 return (get_bidi_category (ch) == UC_BIDI_AN);
3227 /* See PropList-3.0.1.txt. */
3228 static bool
3229 is_property_bidi_common_separator (unsigned int ch)
3231 return (get_bidi_category (ch) == UC_BIDI_CS);
3234 /* See PropList-3.0.1.txt. */
3235 static bool
3236 is_property_bidi_block_separator (unsigned int ch)
3238 return (get_bidi_category (ch) == UC_BIDI_B);
3241 /* See PropList-3.0.1.txt. */
3242 static bool
3243 is_property_bidi_segment_separator (unsigned int ch)
3245 return (get_bidi_category (ch) == UC_BIDI_S);
3248 /* See PropList-3.0.1.txt. */
3249 static bool
3250 is_property_bidi_whitespace (unsigned int ch)
3252 return (get_bidi_category (ch) == UC_BIDI_WS);
3255 /* See PropList-3.0.1.txt. */
3256 static bool
3257 is_property_bidi_non_spacing_mark (unsigned int ch)
3259 return (get_bidi_category (ch) == UC_BIDI_NSM);
3262 /* See PropList-3.0.1.txt. */
3263 static bool
3264 is_property_bidi_boundary_neutral (unsigned int ch)
3266 return (get_bidi_category (ch) == UC_BIDI_BN);
3269 /* See PropList-3.0.1.txt. */
3270 static bool
3271 is_property_bidi_pdf (unsigned int ch)
3273 return (get_bidi_category (ch) == UC_BIDI_PDF);
3276 /* See PropList-3.0.1.txt. */
3277 static bool
3278 is_property_bidi_embedding_or_override (unsigned int ch)
3280 int category = get_bidi_category (ch);
3281 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3282 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3285 /* See PropList-3.0.1.txt. */
3286 static bool
3287 is_property_bidi_other_neutral (unsigned int ch)
3289 return (get_bidi_category (ch) == UC_BIDI_ON);
3292 /* See PropList.txt, UCD.html. */
3293 static bool
3294 is_property_hex_digit (unsigned int ch)
3296 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3299 /* See PropList.txt, UCD.html. */
3300 static bool
3301 is_property_ascii_hex_digit (unsigned int ch)
3303 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3306 /* See Unicode 3.0 book, section 4.10,
3307 PropList.txt, UCD.html. */
3308 static bool
3309 is_property_ideographic (unsigned int ch)
3311 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3314 /* See PropList.txt, UCD.html. */
3315 static bool
3316 is_property_unified_ideograph (unsigned int ch)
3318 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3321 /* See PropList.txt, UCD.html. */
3322 static bool
3323 is_property_radical (unsigned int ch)
3325 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3328 /* See PropList.txt, UCD.html. */
3329 static bool
3330 is_property_ids_binary_operator (unsigned int ch)
3332 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3335 /* See PropList.txt, UCD.html. */
3336 static bool
3337 is_property_ids_trinary_operator (unsigned int ch)
3339 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3342 /* See PropList-3.0.1.txt. */
3343 static bool
3344 is_property_zero_width (unsigned int ch)
3346 return is_category_Cf (ch)
3347 || (unicode_attributes[ch].name != NULL
3348 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* Returns true if CH has general category Zs.
   See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  const bool space = is_category_Zs (ch);

  return space;
}
/* Returns true if CH is one of the non-breaking characters listed below.
   See PropList-3.0.1.txt.  */
static bool
is_property_non_break (unsigned int ch)
{
  /* This is exactly the set of characters having line breaking
     property GL.  */
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3382 /* See PropList-3.0.1.txt. */
3383 static bool
3384 is_property_iso_control (unsigned int ch)
3386 bool result1 =
3387 (unicode_attributes[ch].name != NULL
3388 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3389 bool result2 =
3390 is_category_Cc (ch);
3392 assert (result1 == result2);
3393 return result1;
3396 /* See PropList-3.0.1.txt. */
3397 static bool
3398 is_property_format_control (unsigned int ch)
3400 return (is_category_Cf (ch)
3401 && get_bidi_category (ch) == UC_BIDI_BN
3402 && !is_property_join_control (ch)
3403 && ch != 0xFEFF);
3406 /* See PropList.txt, UCD.html. */
3407 static bool
3408 is_property_dash (unsigned int ch)
3410 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3413 /* See PropList.txt, UCD.html. */
3414 static bool
3415 is_property_hyphen (unsigned int ch)
3417 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* Punctuation property: the result of is_category_P for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  bool in_p = is_category_P (ch);

  return in_p;
}
/* Line separator property: the result of is_category_Zl for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  if (is_category_Zl (ch))
    return true;
  return false;
}
/* Paragraph separator property: the result of is_category_Zp for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  bool in_zp = is_category_Zp (ch);

  return in_zp;
}
3441 /* See PropList.txt, UCD.html. */
3442 static bool
3443 is_property_quotation_mark (unsigned int ch)
3445 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3448 /* See PropList.txt, UCD.html. */
3449 static bool
3450 is_property_sentence_terminal (unsigned int ch)
3452 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3455 /* See PropList.txt, UCD.html. */
3456 static bool
3457 is_property_terminal_punctuation (unsigned int ch)
3459 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* Currency symbol property: the result of is_category_Sc for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  bool in_sc = is_category_Sc (ch);

  return in_sc;
}
3469 /* See Unicode 3.0 book, section 4.9,
3470 PropList.txt, UCD.html,
3471 DerivedCoreProperties.txt, UCD.html. */
3472 static bool
3473 is_property_math (unsigned int ch)
3475 bool result1 =
3476 is_category_Sm (ch)
3477 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3478 bool result2 =
3479 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3481 assert (result1 == result2);
3482 return result1;
3485 /* See PropList.txt, UCD.html. */
3486 static bool
3487 is_property_other_math (unsigned int ch)
3489 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3492 /* See PropList-3.0.1.txt. */
3493 static bool
3494 is_property_paired_punctuation (unsigned int ch)
3496 return unicode_pairedpunctuation[ch];
3499 /* See PropList-3.0.1.txt. */
3500 static bool
3501 is_property_left_of_pair (unsigned int ch)
3503 return unicode_leftofpair[ch];
3506 /* See PropList-3.0.1.txt. */
3507 static bool
3508 is_property_combining (unsigned int ch)
3510 return (unicode_attributes[ch].name != NULL
3511 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3512 || is_category_Mc (ch)
3513 || is_category_Me (ch)
3514 || is_category_Mn (ch)));
#if 0 /* same as is_property_bidi_non_spacing_mark */
/* Non-spacing property (compiled out): CH must have a name and bidi
   category UC_BIDI_NSM.  See PropList-3.0.1.txt.  */
static bool
is_property_non_spacing (unsigned int ch)
{
  if (unicode_attributes[ch].name == NULL)
    return false;
  return get_bidi_category (ch) == UC_BIDI_NSM;
}
#endif
3527 /* See PropList-3.0.1.txt. */
3528 static bool
3529 is_property_composite (unsigned int ch)
3531 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3532 logical in some sense. */
3533 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3534 return true;
3535 if (unicode_attributes[ch].name != NULL
3536 && unicode_attributes[ch].decomposition != NULL)
3538 /* Test whether the decomposition contains more than one character,
3539 and the first is not a space. */
3540 const char *decomp = unicode_attributes[ch].decomposition;
3541 if (decomp[0] == '<')
3543 decomp = strchr (decomp, '>') + 1;
3544 if (decomp[0] == ' ')
3545 decomp++;
3547 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
3549 return false;
/* Decimal digit property: the result of is_category_Nd for CH.
   See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  if (is_category_Nd (ch))
    return true;
  return false;
}
3559 /* See PropList-3.0.1.txt. */
3560 static bool
3561 is_property_numeric (unsigned int ch)
3563 return ((get_numeric_value (ch)).denominator > 0)
3564 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3565 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3568 /* See PropList.txt, UCD.html. */
3569 static bool
3570 is_property_diacritic (unsigned int ch)
3572 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3575 /* See PropList.txt, UCD.html. */
3576 static bool
3577 is_property_extender (unsigned int ch)
3579 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3582 /* See PropList-3.0.1.txt. */
3583 static bool
3584 is_property_ignorable_control (unsigned int ch)
3586 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3587 || is_category_Cf (ch))
3588 && ch != 0x0000;
3591 /* ------------------------------------------------------------------------- */
3593 /* Output all properties.
   For every property P listed below, the predicate is_property_P is passed
   to three output routines, in this order:
     - debug_output_predicate, with file name "unictype/pr_P.txt";
     - output_predicate_test, with file name
       "../tests/unictype/test-pr_P.c" and the expression string
       "uc_is_property_P (c)";
     - output_predicate, with file name "unictype/pr_P.h", the name
       "u_property_P", the string "Properties" and VERSION.
   VERSION is only forwarded to output_predicate.  */
3594 static void
3595 output_properties (const char *version)
/* PROPERTY(P) expands to the three calls described above; #P supplies the
   property name inside the strings and ## builds the predicate name.  Each
   expansion already ends in a semicolon, so the uses below have none.  */
3597 #define PROPERTY(P) \
3598 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3599 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3600 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3601 PROPERTY(white_space)
3602 PROPERTY(alphabetic)
3603 PROPERTY(other_alphabetic)
3604 PROPERTY(not_a_character)
3605 PROPERTY(default_ignorable_code_point)
3606 PROPERTY(other_default_ignorable_code_point)
3607 PROPERTY(deprecated)
3608 PROPERTY(logical_order_exception)
3609 PROPERTY(variation_selector)
3610 PROPERTY(private_use)
3611 PROPERTY(unassigned_code_value)
3612 PROPERTY(uppercase)
3613 PROPERTY(other_uppercase)
3614 PROPERTY(lowercase)
3615 PROPERTY(other_lowercase)
3616 PROPERTY(titlecase)
3617 PROPERTY(cased)
3618 PROPERTY(case_ignorable)
3619 PROPERTY(changes_when_lowercased)
3620 PROPERTY(changes_when_uppercased)
3621 PROPERTY(changes_when_titlecased)
3622 PROPERTY(changes_when_casefolded)
3623 PROPERTY(changes_when_casemapped)
3624 PROPERTY(soft_dotted)
3625 PROPERTY(id_start)
3626 PROPERTY(other_id_start)
3627 PROPERTY(id_continue)
3628 PROPERTY(other_id_continue)
3629 PROPERTY(xid_start)
3630 PROPERTY(xid_continue)
3631 PROPERTY(pattern_white_space)
3632 PROPERTY(pattern_syntax)
3633 PROPERTY(join_control)
3634 PROPERTY(grapheme_base)
3635 PROPERTY(grapheme_extend)
3636 PROPERTY(other_grapheme_extend)
3637 PROPERTY(grapheme_link)
3638 PROPERTY(bidi_control)
3639 PROPERTY(bidi_left_to_right)
3640 PROPERTY(bidi_hebrew_right_to_left)
3641 PROPERTY(bidi_arabic_right_to_left)
3642 PROPERTY(bidi_european_digit)
3643 PROPERTY(bidi_eur_num_separator)
3644 PROPERTY(bidi_eur_num_terminator)
3645 PROPERTY(bidi_arabic_digit)
3646 PROPERTY(bidi_common_separator)
3647 PROPERTY(bidi_block_separator)
3648 PROPERTY(bidi_segment_separator)
3649 PROPERTY(bidi_whitespace)
3650 PROPERTY(bidi_non_spacing_mark)
3651 PROPERTY(bidi_boundary_neutral)
3652 PROPERTY(bidi_pdf)
3653 PROPERTY(bidi_embedding_or_override)
3654 PROPERTY(bidi_other_neutral)
3655 PROPERTY(hex_digit)
3656 PROPERTY(ascii_hex_digit)
3657 PROPERTY(ideographic)
3658 PROPERTY(unified_ideograph)
3659 PROPERTY(radical)
3660 PROPERTY(ids_binary_operator)
3661 PROPERTY(ids_trinary_operator)
3662 PROPERTY(zero_width)
3663 PROPERTY(space)
3664 PROPERTY(non_break)
3665 PROPERTY(iso_control)
3666 PROPERTY(format_control)
3667 PROPERTY(dash)
3668 PROPERTY(hyphen)
3669 PROPERTY(punctuation)
3670 PROPERTY(line_separator)
3671 PROPERTY(paragraph_separator)
3672 PROPERTY(quotation_mark)
3673 PROPERTY(sentence_terminal)
3674 PROPERTY(terminal_punctuation)
3675 PROPERTY(currency_symbol)
3676 PROPERTY(math)
3677 PROPERTY(other_math)
3678 PROPERTY(paired_punctuation)
3679 PROPERTY(left_of_pair)
3680 PROPERTY(combining)
3681 PROPERTY(composite)
3682 PROPERTY(decimal_digit)
3683 PROPERTY(numeric)
3684 PROPERTY(diacritic)
3685 PROPERTY(extender)
3686 PROPERTY(ignorable_control)
3687 #undef PROPERTY
3690 /* ========================================================================= */
3692 /* Arabic Shaping. */
/* Values of the Joining_Type property.  The numbering is written out
   explicitly; it is the same numbering an enum without initializers
   gets.  */
enum
{
  UC_JOINING_TYPE_U = 0, /* Non_Joining */
  UC_JOINING_TYPE_T = 1, /* Transparent */
  UC_JOINING_TYPE_C = 2, /* Join_Causing */
  UC_JOINING_TYPE_L = 3, /* Left_Joining */
  UC_JOINING_TYPE_R = 4, /* Right_Joining */
  UC_JOINING_TYPE_D = 5  /* Dual_Joining */
};
/* Joining type per code point, indexed by code point (0 .. 0x10FFFF).
   fill_arabicshaping stores (uint8_t)~(uint8_t)0 for every code point first
   and then a UC_JOINING_TYPE_* value for each code point listed in the input
   file, so the all-ones byte means "no entry".  */
3704 static uint8_t unicode_joining_type[0x110000];
/* Values of the Joining_Group property.  The comment next to each constant
   gives the corresponding property value name.  UC_JOINING_GROUP_NONE is the
   first constant and is what fill_arabicshaping stores for code points
   without an entry.  */
3706 enum
3708 UC_JOINING_GROUP_NONE, /* No_Joining_Group */
3709 UC_JOINING_GROUP_AIN, /* Ain */
3710 UC_JOINING_GROUP_ALAPH, /* Alaph */
3711 UC_JOINING_GROUP_ALEF, /* Alef */
3712 UC_JOINING_GROUP_BEH, /* Beh */
3713 UC_JOINING_GROUP_BETH, /* Beth */
3714 UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
3715 UC_JOINING_GROUP_DAL, /* Dal */
3716 UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
3717 UC_JOINING_GROUP_E, /* E */
3718 UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
3719 UC_JOINING_GROUP_FE, /* Fe */
3720 UC_JOINING_GROUP_FEH, /* Feh */
3721 UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
3722 UC_JOINING_GROUP_GAF, /* Gaf */
3723 UC_JOINING_GROUP_GAMAL, /* Gamal */
3724 UC_JOINING_GROUP_HAH, /* Hah */
3725 UC_JOINING_GROUP_HE, /* He */
3726 UC_JOINING_GROUP_HEH, /* Heh */
3727 UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
3728 UC_JOINING_GROUP_HETH, /* Heth */
3729 UC_JOINING_GROUP_KAF, /* Kaf */
3730 UC_JOINING_GROUP_KAPH, /* Kaph */
3731 UC_JOINING_GROUP_KHAPH, /* Khaph */
3732 UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
3733 UC_JOINING_GROUP_LAM, /* Lam */
3734 UC_JOINING_GROUP_LAMADH, /* Lamadh */
3735 UC_JOINING_GROUP_MEEM, /* Meem */
3736 UC_JOINING_GROUP_MIM, /* Mim */
3737 UC_JOINING_GROUP_NOON, /* Noon */
3738 UC_JOINING_GROUP_NUN, /* Nun */
3739 UC_JOINING_GROUP_NYA, /* Nya */
3740 UC_JOINING_GROUP_PE, /* Pe */
3741 UC_JOINING_GROUP_QAF, /* Qaf */
3742 UC_JOINING_GROUP_QAPH, /* Qaph */
3743 UC_JOINING_GROUP_REH, /* Reh */
3744 UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
3745 UC_JOINING_GROUP_SAD, /* Sad */
3746 UC_JOINING_GROUP_SADHE, /* Sadhe */
3747 UC_JOINING_GROUP_SEEN, /* Seen */
3748 UC_JOINING_GROUP_SEMKATH, /* Semkath */
3749 UC_JOINING_GROUP_SHIN, /* Shin */
3750 UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
3751 UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
3752 UC_JOINING_GROUP_TAH, /* Tah */
3753 UC_JOINING_GROUP_TAW, /* Taw */
3754 UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
3755 UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
3756 UC_JOINING_GROUP_TETH, /* Teth */
3757 UC_JOINING_GROUP_WAW, /* Waw */
3758 UC_JOINING_GROUP_YEH, /* Yeh */
3759 UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
3760 UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
3761 UC_JOINING_GROUP_YUDH, /* Yudh */
3762 UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
3763 UC_JOINING_GROUP_ZAIN, /* Zain */
3764 UC_JOINING_GROUP_ZHAIN, /* Zhain */
3765 UC_JOINING_GROUP_ROHINGYA_YEH, /* Rohingya_Yeh */
3766 UC_JOINING_GROUP_STRAIGHT_WAW, /* Straight_Waw */
3767 UC_JOINING_GROUP_MANICHAEAN_ALEPH, /* Manichaean_Aleph */
3768 UC_JOINING_GROUP_MANICHAEAN_BETH, /* Manichaean_Beth */
3769 UC_JOINING_GROUP_MANICHAEAN_GIMEL, /* Manichaean_Gimel */
3770 UC_JOINING_GROUP_MANICHAEAN_DALETH, /* Manichaean_Daleth */
3771 UC_JOINING_GROUP_MANICHAEAN_WAW, /* Manichaean_Waw */
3772 UC_JOINING_GROUP_MANICHAEAN_ZAYIN, /* Manichaean_Zayin */
3773 UC_JOINING_GROUP_MANICHAEAN_HETH, /* Manichaean_Heth */
3774 UC_JOINING_GROUP_MANICHAEAN_TETH, /* Manichaean_Teth */
3775 UC_JOINING_GROUP_MANICHAEAN_YODH, /* Manichaean_Yodh */
3776 UC_JOINING_GROUP_MANICHAEAN_KAPH, /* Manichaean_Kaph */
3777 UC_JOINING_GROUP_MANICHAEAN_LAMEDH, /* Manichaean_Lamedh */
3778 UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, /* Manichaean_Dhamedh */
3779 UC_JOINING_GROUP_MANICHAEAN_THAMEDH, /* Manichaean_Thamedh */
3780 UC_JOINING_GROUP_MANICHAEAN_MEM, /* Manichaean_Mem */
3781 UC_JOINING_GROUP_MANICHAEAN_NUN, /* Manichaean_Nun */
3782 UC_JOINING_GROUP_MANICHAEAN_SAMEKH, /* Manichaean_Samekh */
3783 UC_JOINING_GROUP_MANICHAEAN_AYIN, /* Manichaean_Ayin */
3784 UC_JOINING_GROUP_MANICHAEAN_PE, /* Manichaean_Pe */
3785 UC_JOINING_GROUP_MANICHAEAN_SADHE, /* Manichaean_Sadhe */
3786 UC_JOINING_GROUP_MANICHAEAN_QOPH, /* Manichaean_Qoph */
3787 UC_JOINING_GROUP_MANICHAEAN_RESH, /* Manichaean_Resh */
3788 UC_JOINING_GROUP_MANICHAEAN_TAW, /* Manichaean_Taw */
3789 UC_JOINING_GROUP_MANICHAEAN_ONE, /* Manichaean_One */
3790 UC_JOINING_GROUP_MANICHAEAN_FIVE, /* Manichaean_Five */
3791 UC_JOINING_GROUP_MANICHAEAN_TEN, /* Manichaean_Ten */
3792 UC_JOINING_GROUP_MANICHAEAN_TWENTY, /* Manichaean_Twenty */
3793 UC_JOINING_GROUP_MANICHAEAN_HUNDRED, /* Manichaean_Hundred */
3794 UC_JOINING_GROUP_AFRICAN_FEH, /* African_Feh */
3795 UC_JOINING_GROUP_AFRICAN_QAF, /* African_Qaf */
3796 UC_JOINING_GROUP_AFRICAN_NOON /* African_Noon */
/* Joining group per code point, indexed by code point (0 .. 0x10FFFF).
   fill_arabicshaping stores UC_JOINING_GROUP_NONE for every code point first
   and then a UC_JOINING_GROUP_* value for each code point listed in the
   input file.  */
3799 static uint8_t unicode_joining_group[0x110000];
/* Read the file ARABICSHAPING_FILENAME and fill unicode_joining_type[] and
   unicode_joining_group[] from it.
   Each data line is parsed as four fields separated by ';' and spaces:
   code point (hexadecimal), schematic name, joining type, joining group.
   Lines that are empty or start with '#' are ignored.
   On failure to open the file, on a line that does not parse, on an
   unknown joining type or joining group, or on a read error, a message is
   printed to stderr and the program exits with status 1.  */
3801 static void
3802 fill_arabicshaping (const char *arabicshaping_filename)
3804 FILE *stream;
3805 unsigned int i;
3806 int lineno;
3808 stream = fopen (arabicshaping_filename, "r");
3809 if (stream == NULL)
3811 fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename);
3812 exit (1);
/* Defaults for code points that the file does not mention: the all-ones
   byte as joining type, UC_JOINING_GROUP_NONE as joining group.  */
3815 for (i = 0; i < 0x110000; i++)
3817 unicode_joining_type[i] = (uint8_t)~(uint8_t)0;
3818 unicode_joining_group[i] = UC_JOINING_GROUP_NONE;
3821 lineno = 0;
3822 for (;;)
3824 char buf[200+1];
3825 char separator1[200+1];
3826 char schematic_name[200+1];
3827 char separator2[200+1];
3828 char joining_type_name[200+1];
3829 char separator3[200+1];
3830 char joining_group_name[200+1];
3831 int joining_type;
3832 int joining_group;
/* Read up to 200 characters of one line into buf.  The loop ends as soon
   as nothing could be stored (end of file, or a line with no characters
   before its newline).
   NOTE(review): the trailing "\n" in the format is a white-space directive,
   so it presumably also consumes following blank lines without lineno being
   incremented for them; the line numbers in the messages below may then be
   too small -- confirm.  */
3834 lineno++;
3835 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3836 break;
3838 if (buf[0] == '\0' || buf[0] == '#')
3839 continue;
/* The code point is stored into i; all seven conversions must succeed.  The
   unbounded conversions read from buf, which holds at most 200 characters,
   the same as each destination buffer.  */
3841 if (sscanf (buf, "%X%[; ]%[^;]%[; ]%[^;]%[; ]%100[^\n]",
3842 &i, separator1, schematic_name, separator2, joining_type_name,
3843 separator3, joining_group_name) != 7)
3845 fprintf (stderr, "parse error in '%s':%d\n",
3846 arabicshaping_filename, lineno);
3847 exit (1);
3849 assert (i < 0x110000);
/* Map the joining type field to a UC_JOINING_TYPE_* constant.  #name + 16
   skips the 16 characters of "UC_JOINING_TYPE_", so the field is compared
   against the remaining suffix of the constant's name ("U", "T", ...).  */
3851 #define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
3852 if (false) {}
3853 TRY(UC_JOINING_TYPE_U)
3854 TRY(UC_JOINING_TYPE_T)
3855 TRY(UC_JOINING_TYPE_C)
3856 TRY(UC_JOINING_TYPE_L)
3857 TRY(UC_JOINING_TYPE_R)
3858 TRY(UC_JOINING_TYPE_D)
3859 #undef TRY
3860 else
3862 fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n",
3863 joining_type_name, arabicshaping_filename, lineno);
3864 exit (1);
3867 /* Remove trailing spaces. */
3868 while (joining_group_name[0] != '\0'
3869 && joining_group_name[strlen (joining_group_name) - 1] == ' ')
3870 joining_group_name[strlen (joining_group_name) - 1] = '\0';
/* Map the joining group field to a UC_JOINING_GROUP_* constant.  The field
   must match one of the strings below exactly.  */
3872 #define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
3873 if (false) {}
3874 TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group")
3875 TRY(UC_JOINING_GROUP_AIN, "AIN")
3876 TRY(UC_JOINING_GROUP_ALAPH, "ALAPH")
3877 TRY(UC_JOINING_GROUP_ALEF, "ALEF")
3878 TRY(UC_JOINING_GROUP_BEH, "BEH")
3879 TRY(UC_JOINING_GROUP_BETH, "BETH")
3880 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE")
3881 TRY(UC_JOINING_GROUP_DAL, "DAL")
3882 TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH")
3883 TRY(UC_JOINING_GROUP_E, "E")
3884 TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH")
3885 TRY(UC_JOINING_GROUP_FE, "FE")
3886 TRY(UC_JOINING_GROUP_FEH, "FEH")
3887 TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH")
3888 TRY(UC_JOINING_GROUP_GAF, "GAF")
3889 TRY(UC_JOINING_GROUP_GAMAL, "GAMAL")
3890 TRY(UC_JOINING_GROUP_HAH, "HAH")
3891 TRY(UC_JOINING_GROUP_HE, "HE")
3892 TRY(UC_JOINING_GROUP_HEH, "HEH")
3893 TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL")
3894 TRY(UC_JOINING_GROUP_HETH, "HETH")
3895 TRY(UC_JOINING_GROUP_KAF, "KAF")
3896 TRY(UC_JOINING_GROUP_KAPH, "KAPH")
3897 TRY(UC_JOINING_GROUP_KHAPH, "KHAPH")
3898 TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH")
3899 TRY(UC_JOINING_GROUP_LAM, "LAM")
3900 TRY(UC_JOINING_GROUP_LAMADH, "LAMADH")
3901 TRY(UC_JOINING_GROUP_MEEM, "MEEM")
3902 TRY(UC_JOINING_GROUP_MIM, "MIM")
3903 TRY(UC_JOINING_GROUP_NOON, "NOON")
3904 TRY(UC_JOINING_GROUP_NUN, "NUN")
3905 TRY(UC_JOINING_GROUP_NYA, "NYA")
3906 TRY(UC_JOINING_GROUP_PE, "PE")
3907 TRY(UC_JOINING_GROUP_QAF, "QAF")
3908 TRY(UC_JOINING_GROUP_QAPH, "QAPH")
3909 TRY(UC_JOINING_GROUP_REH, "REH")
3910 TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE")
3911 TRY(UC_JOINING_GROUP_SAD, "SAD")
3912 TRY(UC_JOINING_GROUP_SADHE, "SADHE")
3913 TRY(UC_JOINING_GROUP_SEEN, "SEEN")
3914 TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH")
3915 TRY(UC_JOINING_GROUP_SHIN, "SHIN")
3916 TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF")
3917 TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW")
3918 TRY(UC_JOINING_GROUP_TAH, "TAH")
3919 TRY(UC_JOINING_GROUP_TAW, "TAW")
3920 TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA")
3921 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL")
3922 TRY(UC_JOINING_GROUP_TETH, "TETH")
3923 TRY(UC_JOINING_GROUP_WAW, "WAW")
3924 TRY(UC_JOINING_GROUP_YEH, "YEH")
3925 TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE")
3926 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL")
3927 TRY(UC_JOINING_GROUP_YUDH, "YUDH")
3928 TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE")
3929 TRY(UC_JOINING_GROUP_ZAIN, "ZAIN")
3930 TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN")
3931 TRY(UC_JOINING_GROUP_ROHINGYA_YEH, "ROHINGYA YEH")
3932 TRY(UC_JOINING_GROUP_STRAIGHT_WAW, "STRAIGHT WAW")
3933 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH, "MANICHAEAN ALEPH")
3934 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH, "MANICHAEAN BETH")
3935 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL, "MANICHAEAN GIMEL")
3936 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH, "MANICHAEAN DALETH")
3937 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW, "MANICHAEAN WAW")
3938 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN, "MANICHAEAN ZAYIN")
3939 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH, "MANICHAEAN HETH")
3940 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH, "MANICHAEAN TETH")
3941 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH, "MANICHAEAN YODH")
3942 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH, "MANICHAEAN KAPH")
3943 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH, "MANICHAEAN LAMEDH")
3944 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, "MANICHAEAN DHAMEDH")
3945 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH, "MANICHAEAN THAMEDH")
3946 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM, "MANICHAEAN MEM")
3947 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN, "MANICHAEAN NUN")
3948 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH, "MANICHAEAN SAMEKH")
3949 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN, "MANICHAEAN AYIN")
3950 TRY(UC_JOINING_GROUP_MANICHAEAN_PE, "MANICHAEAN PE")
3951 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE, "MANICHAEAN SADHE")
3952 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH, "MANICHAEAN QOPH")
3953 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH, "MANICHAEAN RESH")
3954 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW, "MANICHAEAN TAW")
3955 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE, "MANICHAEAN ONE")
3956 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE, "MANICHAEAN FIVE")
3957 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN, "MANICHAEAN TEN")
3958 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY, "MANICHAEAN TWENTY")
3959 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED, "MANICHAEAN HUNDRED")
3960 TRY(UC_JOINING_GROUP_AFRICAN_FEH, "AFRICAN FEH")
3961 TRY(UC_JOINING_GROUP_AFRICAN_QAF, "AFRICAN QAF")
3962 TRY(UC_JOINING_GROUP_AFRICAN_NOON, "AFRICAN NOON")
3963 #undef TRY
3964 else
3966 fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n",
3967 joining_group_name, arabicshaping_filename, lineno);
3968 exit (1);
/* Both values were recognized; record them for code point i.  */
3971 unicode_joining_type[i] = joining_type;
3972 unicode_joining_group[i] = joining_group;
3975 if (ferror (stream) || fclose (stream))
3977 fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename);
3978 exit (1);
3982 /* Convert a Joining_Type value to a C identifier. */
3983 static const char *
3984 joining_type_as_c_identifier (int joining_type)
3986 #define TRY(value) if (joining_type == value) return #value;
3987 TRY(UC_JOINING_TYPE_U)
3988 TRY(UC_JOINING_TYPE_T)
3989 TRY(UC_JOINING_TYPE_C)
3990 TRY(UC_JOINING_TYPE_L)
3991 TRY(UC_JOINING_TYPE_R)
3992 TRY(UC_JOINING_TYPE_D)
3993 #undef TRY
3994 abort ();
3997 static void
3998 output_joining_type_test (const char *filename, const char *version)
4000 FILE *stream;
4001 bool need_comma;
4002 unsigned int ch;
4004 stream = fopen (filename, "w");
4005 if (stream == NULL)
4007 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4008 exit (1);
4011 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4012 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4013 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4014 version);
4016 need_comma = false;
4017 for (ch = 0; ch < 0x110000; ch++)
4019 int value = unicode_joining_type[ch];
4021 if (value != (uint8_t)~(uint8_t)0)
4023 if (need_comma)
4024 fprintf (stream, ",\n");
4025 fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value));
4026 need_comma = true;
4029 if (need_comma)
4030 fprintf (stream, "\n");
4032 if (ferror (stream) || fclose (stream))
4034 fprintf (stderr, "error writing to '%s'\n", filename);
4035 exit (1);
4039 /* Construction of sparse 3-level tables.
   The macros below are the parameters of "3level.h".  The code after the
   include uses 'struct joining_type_table' and the functions
   joining_type_table_init, joining_type_table_add and
   joining_type_table_finalize, which presumably come from that header with
   TABLE as name prefix (not visible here -- confirm in 3level.h).
   ELEMENT is the stored value type; DEFAULT is the all-ones byte, the same
   value that marks "no joining type" in unicode_joining_type[].
   xmalloc and xrealloc are mapped to plain malloc and realloc.  */
4040 #define TABLE joining_type_table
4041 #define ELEMENT uint8_t
4042 #define DEFAULT (uint8_t)~(uint8_t)0
4043 #define xmalloc malloc
4044 #define xrealloc realloc
4045 #include "3level.h"
/* Write to FILENAME the C source of a 3-level lookup table, named
   u_joining_type, built from unicode_joining_type[], preceded by a
   generated-file header mentioning VERSION and by five
   joining_type_header_N macro definitions.
   The level3 values are written packed, 4 bits per code point.
   Exits with status 1 when the file cannot be opened or written.  */
4047 static void
4048 output_joining_type (const char *filename, const char *version)
4050 FILE *stream;
4051 unsigned int ch, i;
4052 struct joining_type_table t;
4053 unsigned int level1_offset, level2_offset, level3_offset;
4054 uint8_t *level3_packed;
4056 stream = fopen (filename, "w");
4057 if (stream == NULL)
4059 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4060 exit (1);
4063 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4064 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4065 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4066 version);
/* p and q are table parameters set before init; below, level3 blocks hold
   1 << p entries and level2 blocks 1 << q entries.  */
4068 t.p = 7;
4069 t.q = 9;
4070 joining_type_table_init (&t);
4072 for (ch = 0; ch < 0x110000; ch++)
4074 uint8_t value = unicode_joining_type[ch];
/* Only the low 4 bits survive the packing below, so every real value must
   fit in them; the all-ones "no entry" value becomes 0x0f there.  */
4076 assert (value == (uint8_t)~(uint8_t)0 || value <= 0x0f);
4078 joining_type_table_add (&t, ch, value);
4081 joining_type_table_finalize (&t);
4083 /* Offsets in t.result, in memory of this process. */
/* As computed here, t.result starts with five uint32_t header words,
   followed by the level1 array, the level2 array (both uint32_t entries)
   and the level3 array.  */
4084 level1_offset =
4085 5 * sizeof (uint32_t);
4086 level2_offset =
4087 5 * sizeof (uint32_t)
4088 + t.level1_size * sizeof (uint32_t);
4089 level3_offset =
4090 5 * sizeof (uint32_t)
4091 + t.level1_size * sizeof (uint32_t)
4092 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as macros.  */
4094 for (i = 0; i < 5; i++)
4095 fprintf (stream, "#define joining_type_header_%d %d\n", i,
4096 ((uint32_t *) t.result)[i]);
4097 fprintf (stream, "static const\n");
4098 fprintf (stream, "struct\n");
4099 fprintf (stream, " {\n");
4100 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4101 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4102 fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size,
4103 (1 << t.p) * 4 / 8);
4104 fprintf (stream, " }\n");
4105 fprintf (stream, "u_joining_type =\n");
4106 fprintf (stream, "{\n");
/* level1: a stored offset of 0 is written as -1; any other offset is
   written as an index into level2, counted in uint32_t units.  Eight
   entries per output line.  */
4107 fprintf (stream, " {");
4108 if (t.level1_size > 8)
4109 fprintf (stream, "\n ");
4110 for (i = 0; i < t.level1_size; i++)
4112 uint32_t offset;
4113 if (i > 0 && (i % 8) == 0)
4114 fprintf (stream, "\n ");
4115 offset = ((uint32_t *) (t.result + level1_offset))[i];
4116 if (offset == 0)
4117 fprintf (stream, " %5d", -1);
4118 else
4119 fprintf (stream, " %5zu",
4120 (offset - level2_offset) / sizeof (uint32_t));
4121 if (i+1 < t.level1_size)
4122 fprintf (stream, ",");
4124 if (t.level1_size > 8)
4125 fprintf (stream, "\n ");
4126 fprintf (stream, " },\n");
/* level2: same scheme; non-zero offsets become indices into the unpacked
   level3 array, counted in uint8_t units.  */
4127 fprintf (stream, " {");
4128 if (t.level2_size << t.q > 8)
4129 fprintf (stream, "\n ");
4130 for (i = 0; i < t.level2_size << t.q; i++)
4132 uint32_t offset;
4133 if (i > 0 && (i % 8) == 0)
4134 fprintf (stream, "\n ");
4135 offset = ((uint32_t *) (t.result + level2_offset))[i];
4136 if (offset == 0)
4137 fprintf (stream, " %5d", -1);
4138 else
4139 fprintf (stream, " %5zu",
4140 (offset - level3_offset) / sizeof (uint8_t));
4141 if (i+1 < t.level2_size << t.q)
4142 fprintf (stream, ",");
4144 if (t.level2_size << t.q > 8)
4145 fprintf (stream, "\n ");
4146 fprintf (stream, " },\n");
4147 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Entry i goes into byte i / 2: even entries into the low 4 bits (k == 0),
   odd entries into the high 4 bits (k == 4).  calloc provides the zeroed
   buffer that the |= relies on; its result is not checked.  */
4148 level3_packed =
4149 (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t));
4150 for (i = 0; i < t.level3_size << t.p; i++)
4152 unsigned int j = (i * 4) / 8;
4153 unsigned int k = (i * 4) % 8;
4154 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
4155 level3_packed[j] |= (value << k);
/* level3: the packed bytes, eight per output line.  */
4157 fprintf (stream, " {");
4158 if ((t.level3_size << t.p) * 4 / 8 > 8)
4159 fprintf (stream, "\n ");
4160 for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
4162 if (i > 0 && (i % 8) == 0)
4163 fprintf (stream, "\n ");
4164 fprintf (stream, " 0x%02x", level3_packed[i]);
4165 if (i+1 < (t.level3_size << t.p) * 4 / 8)
4166 fprintf (stream, ",");
4168 if ((t.level3_size << t.p) * 4 / 8 > 8)
4169 fprintf (stream, "\n ");
4170 fprintf (stream, " }\n");
4171 free (level3_packed);
4172 fprintf (stream, "};\n");
4174 if (ferror (stream) || fclose (stream))
4176 fprintf (stderr, "error writing to '%s'\n", filename);
4177 exit (1);
4181 /* Convert a Joining_Group value to a C identifier. */
4182 static const char *
4183 joining_group_as_c_identifier (int joining_group)
4185 #define TRY(value) if (joining_group == value) return #value;
4186 TRY(UC_JOINING_GROUP_NONE)
4187 TRY(UC_JOINING_GROUP_AIN)
4188 TRY(UC_JOINING_GROUP_ALAPH)
4189 TRY(UC_JOINING_GROUP_ALEF)
4190 TRY(UC_JOINING_GROUP_BEH)
4191 TRY(UC_JOINING_GROUP_BETH)
4192 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE)
4193 TRY(UC_JOINING_GROUP_DAL)
4194 TRY(UC_JOINING_GROUP_DALATH_RISH)
4195 TRY(UC_JOINING_GROUP_E)
4196 TRY(UC_JOINING_GROUP_FARSI_YEH)
4197 TRY(UC_JOINING_GROUP_FE)
4198 TRY(UC_JOINING_GROUP_FEH)
4199 TRY(UC_JOINING_GROUP_FINAL_SEMKATH)
4200 TRY(UC_JOINING_GROUP_GAF)
4201 TRY(UC_JOINING_GROUP_GAMAL)
4202 TRY(UC_JOINING_GROUP_HAH)
4203 TRY(UC_JOINING_GROUP_HE)
4204 TRY(UC_JOINING_GROUP_HEH)
4205 TRY(UC_JOINING_GROUP_HEH_GOAL)
4206 TRY(UC_JOINING_GROUP_HETH)
4207 TRY(UC_JOINING_GROUP_KAF)
4208 TRY(UC_JOINING_GROUP_KAPH)
4209 TRY(UC_JOINING_GROUP_KHAPH)
4210 TRY(UC_JOINING_GROUP_KNOTTED_HEH)
4211 TRY(UC_JOINING_GROUP_LAM)
4212 TRY(UC_JOINING_GROUP_LAMADH)
4213 TRY(UC_JOINING_GROUP_MEEM)
4214 TRY(UC_JOINING_GROUP_MIM)
4215 TRY(UC_JOINING_GROUP_NOON)
4216 TRY(UC_JOINING_GROUP_NUN)
4217 TRY(UC_JOINING_GROUP_NYA)
4218 TRY(UC_JOINING_GROUP_PE)
4219 TRY(UC_JOINING_GROUP_QAF)
4220 TRY(UC_JOINING_GROUP_QAPH)
4221 TRY(UC_JOINING_GROUP_REH)
4222 TRY(UC_JOINING_GROUP_REVERSED_PE)
4223 TRY(UC_JOINING_GROUP_SAD)
4224 TRY(UC_JOINING_GROUP_SADHE)
4225 TRY(UC_JOINING_GROUP_SEEN)
4226 TRY(UC_JOINING_GROUP_SEMKATH)
4227 TRY(UC_JOINING_GROUP_SHIN)
4228 TRY(UC_JOINING_GROUP_SWASH_KAF)
4229 TRY(UC_JOINING_GROUP_SYRIAC_WAW)
4230 TRY(UC_JOINING_GROUP_TAH)
4231 TRY(UC_JOINING_GROUP_TAW)
4232 TRY(UC_JOINING_GROUP_TEH_MARBUTA)
4233 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL)
4234 TRY(UC_JOINING_GROUP_TETH)
4235 TRY(UC_JOINING_GROUP_WAW)
4236 TRY(UC_JOINING_GROUP_YEH)
4237 TRY(UC_JOINING_GROUP_YEH_BARREE)
4238 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL)
4239 TRY(UC_JOINING_GROUP_YUDH)
4240 TRY(UC_JOINING_GROUP_YUDH_HE)
4241 TRY(UC_JOINING_GROUP_ZAIN)
4242 TRY(UC_JOINING_GROUP_ZHAIN)
4243 TRY(UC_JOINING_GROUP_ROHINGYA_YEH)
4244 TRY(UC_JOINING_GROUP_STRAIGHT_WAW)
4245 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH)
4246 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH)
4247 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL)
4248 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH)
4249 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW)
4250 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN)
4251 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH)
4252 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH)
4253 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH)
4254 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH)
4255 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH)
4256 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH)
4257 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH)
4258 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM)
4259 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN)
4260 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH)
4261 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN)
4262 TRY(UC_JOINING_GROUP_MANICHAEAN_PE)
4263 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE)
4264 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH)
4265 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH)
4266 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW)
4267 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE)
4268 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE)
4269 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN)
4270 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY)
4271 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED)
4272 TRY(UC_JOINING_GROUP_AFRICAN_FEH)
4273 TRY(UC_JOINING_GROUP_AFRICAN_QAF)
4274 TRY(UC_JOINING_GROUP_AFRICAN_NOON)
4275 #undef TRY
4276 abort ();
4279 static void
4280 output_joining_group_test (const char *filename, const char *version)
4282 FILE *stream;
4283 bool need_comma;
4284 unsigned int ch;
4286 stream = fopen (filename, "w");
4287 if (stream == NULL)
4289 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4290 exit (1);
4293 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4294 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
4295 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4296 version);
4298 need_comma = false;
4299 for (ch = 0; ch < 0x110000; ch++)
4301 int value = unicode_joining_group[ch];
4303 if (value != UC_JOINING_GROUP_NONE)
4305 if (need_comma)
4306 fprintf (stream, ",\n");
4307 fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value));
4308 need_comma = true;
4311 if (need_comma)
4312 fprintf (stream, "\n");
4314 if (ferror (stream) || fclose (stream))
4316 fprintf (stderr, "error writing to '%s'\n", filename);
4317 exit (1);
4321 /* Construction of sparse 3-level tables.
   The macros below are the parameters of "3level.h".  The code after the
   include uses 'struct joining_group_table' and the functions
   joining_group_table_init, joining_group_table_add and
   joining_group_table_finalize, which presumably come from that header with
   TABLE as name prefix (not visible here -- confirm in 3level.h).
   ELEMENT is the stored value type; DEFAULT is UC_JOINING_GROUP_NONE, the
   value unicode_joining_group[] holds for code points without an entry.
   xmalloc and xrealloc are mapped to plain malloc and realloc.  */
4322 #define TABLE joining_group_table
4323 #define ELEMENT uint8_t
4324 #define DEFAULT UC_JOINING_GROUP_NONE
4325 #define xmalloc malloc
4326 #define xrealloc realloc
4327 #include "3level.h"
4329 static void
4330 output_joining_group (const char *filename, const char *version)
4332 FILE *stream;
4333 unsigned int ch, i;
4334 struct joining_group_table t;
4335 unsigned int level1_offset, level2_offset, level3_offset;
4336 uint16_t *level3_packed;
4338 stream = fopen (filename, "w");
4339 if (stream == NULL)
4341 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4342 exit (1);
4345 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4346 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
4347 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4348 version);
4350 t.p = 7;
4351 t.q = 9;
4352 joining_group_table_init (&t);
4354 for (ch = 0; ch < 0x110000; ch++)
4356 uint8_t value = unicode_joining_group[ch];
4358 assert (value <= 0x7f);
4360 joining_group_table_add (&t, ch, value);
4363 joining_group_table_finalize (&t);
4365 /* Offsets in t.result, in memory of this process. */
4366 level1_offset =
4367 5 * sizeof (uint32_t);
4368 level2_offset =
4369 5 * sizeof (uint32_t)
4370 + t.level1_size * sizeof (uint32_t);
4371 level3_offset =
4372 5 * sizeof (uint32_t)
4373 + t.level1_size * sizeof (uint32_t)
4374 + (t.level2_size << t.q) * sizeof (uint32_t);
4376 for (i = 0; i < 5; i++)
4377 fprintf (stream, "#define joining_group_header_%d %d\n", i,
4378 ((uint32_t *) t.result)[i]);
4379 fprintf (stream, "static const\n");
4380 fprintf (stream, "struct\n");
4381 fprintf (stream, " {\n");
4382 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4383 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4384 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
4385 (1 << t.p) * 7 / 16);
4386 fprintf (stream, " }\n");
4387 fprintf (stream, "u_joining_group =\n");
4388 fprintf (stream, "{\n");
4389 fprintf (stream, " {");
4390 if (t.level1_size > 8)
4391 fprintf (stream, "\n ");
4392 for (i = 0; i < t.level1_size; i++)
4394 uint32_t offset;
4395 if (i > 0 && (i % 8) == 0)
4396 fprintf (stream, "\n ");
4397 offset = ((uint32_t *) (t.result + level1_offset))[i];
4398 if (offset == 0)
4399 fprintf (stream, " %5d", -1);
4400 else
4401 fprintf (stream, " %5zu",
4402 (offset - level2_offset) / sizeof (uint32_t));
4403 if (i+1 < t.level1_size)
4404 fprintf (stream, ",");
4406 if (t.level1_size > 8)
4407 fprintf (stream, "\n ");
4408 fprintf (stream, " },\n");
4409 fprintf (stream, " {");
4410 if (t.level2_size << t.q > 8)
4411 fprintf (stream, "\n ");
4412 for (i = 0; i < t.level2_size << t.q; i++)
4414 uint32_t offset;
4415 if (i > 0 && (i % 8) == 0)
4416 fprintf (stream, "\n ");
4417 offset = ((uint32_t *) (t.result + level2_offset))[i];
4418 if (offset == 0)
4419 fprintf (stream, " %5d", -1);
4420 else
4421 fprintf (stream, " %5zu",
4422 (offset - level3_offset) / sizeof (uint8_t));
4423 if (i+1 < t.level2_size << t.q)
4424 fprintf (stream, ",");
4426 if (t.level2_size << t.q > 8)
4427 fprintf (stream, "\n ");
4428 fprintf (stream, " },\n");
4429 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
4430 not 32-bit units, in order to make the lookup function easier. */
4431 level3_packed =
4432 (uint16_t *)
4433 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
4434 for (i = 0; i < t.level3_size << t.p; i++)
4436 unsigned int j = (i * 7) / 16;
4437 unsigned int k = (i * 7) % 16;
4438 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
4439 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
4440 level3_packed[j] = value & 0xffff;
4441 level3_packed[j+1] = value >> 16;
4443 fprintf (stream, " {");
4444 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
4445 fprintf (stream, "\n ");
4446 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
4448 if (i > 0 && (i % 8) == 0)
4449 fprintf (stream, "\n ");
4450 fprintf (stream, " 0x%04x", level3_packed[i]);
4451 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
4452 fprintf (stream, ",");
4454 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
4455 fprintf (stream, "\n ");
4456 fprintf (stream, " }\n");
4457 free (level3_packed);
4458 fprintf (stream, "};\n");
4460 if (ferror (stream) || fclose (stream))
4462 fprintf (stderr, "error writing to '%s'\n", filename);
4463 exit (1);
4467 /* ========================================================================= */
4469 /* Scripts. */
/* Names of the scripts seen so far, in order of first appearance.  */
static const char *scripts[256];
/* Number of valid entries in scripts[].  */
static unsigned int numscripts;

/* For each code point, the index of its script in scripts[], or
   (uint8_t)~(uint8_t)0 if no script has been assigned to it.  */
static uint8_t unicode_scripts[0x110000];

/* Read SCRIPTS_FILENAME and fill scripts[], numscripts and
   unicode_scripts[] from it.

   Lines that start with '#' are skipped.  Every other line must start
   with either "XXXX..YYYY" or "XXXX" (hexadecimal), followed by a run of
   spaces and semicolons and then the script name, which ends at the next
   space.

   Prints a diagnostic and exits with status 1 if the file cannot be
   opened or read, a line cannot be parsed, or memory runs out.  A code
   point listed under more than one script is reported on stderr and the
   last listing wins.  */
static void
fill_scripts (const char *scripts_filename)
{
  FILE *stream;
  unsigned int i;

  stream = fopen (scripts_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
      exit (1);
    }

  numscripts = 0;

  for (i = 0; i < 0x110000; i++)
    unicode_scripts[i] = (uint8_t)~(uint8_t)0;

  for (;;)
    {
      char buf[200+1];
      unsigned int i1, i2;
      char padding[200+1];
      char scriptname[200+1];
      int script;

      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
        break;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      /* Try the range form first, then the single code point form.  */
      if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
        {
          if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
            {
              fprintf (stderr, "parse error in '%s'\n", scripts_filename);
              exit (1);
            }
          i2 = i1;
        }
      assert (i2 >= i1);
      assert (i2 < 0x110000);

      /* Look the name up among the scripts already known, newest first.  */
      for (script = numscripts - 1; script >= 0; script--)
        if (strcmp (scripts[script], scriptname) == 0)
          break;
      if (script < 0)
        {
          /* New script: store a private copy of its name.  The copy is
             checked, because a NULL entry would crash the strcmp above on
             the next line of input.  */
          size_t namesize = strlen (scriptname) + 1;
          char *namecopy = (char *) malloc (namesize);
          if (namecopy == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          memcpy (namecopy, scriptname, namesize);
          scripts[numscripts] = namecopy;
          script = numscripts;
          numscripts++;
          assert (numscripts != 256);
        }

      for (i = i1; i <= i2; i++)
        {
          if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
            fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
          unicode_scripts[i] = script;
        }
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", scripts_filename);
      exit (1);
    }
}
4546 /* Construction of sparse 3-level tables. */
4547 #define TABLE script_table
4548 #define ELEMENT uint8_t
4549 #define DEFAULT (uint8_t)~(uint8_t)0
4550 #define xmalloc malloc
4551 #define xrealloc realloc
4552 #include "3level.h"
4554 static void
4555 output_scripts (const char *version)
4557 const char *filename = "unictype/scripts.h";
4558 FILE *stream;
4559 unsigned int ch, s, i;
4560 struct script_table t;
4561 unsigned int level1_offset, level2_offset, level3_offset;
4563 typedef struct
4565 const char *lowercase_name;
4567 scriptinfo_t;
4568 scriptinfo_t scriptinfo[256];
4570 stream = fopen (filename, "w");
4571 if (stream == NULL)
4573 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4574 exit (1);
4577 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4578 fprintf (stream, "/* Unicode scripts. */\n");
4579 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4580 version);
4582 for (s = 0; s < numscripts; s++)
4584 char *lcp = strdup (scripts[s]);
4585 char *cp;
4587 for (cp = lcp; *cp != '\0'; cp++)
4588 if (*cp >= 'A' && *cp <= 'Z')
4589 *cp += 'a' - 'A';
4591 scriptinfo[s].lowercase_name = lcp;
4594 for (s = 0; s < numscripts; s++)
4596 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
4597 scriptinfo[s].lowercase_name);
4598 fprintf (stream, "{\n");
4599 i = 0;
4600 for (ch = 0; ch < 0x110000; ch++)
4601 if (unicode_scripts[ch] == s)
4603 unsigned int start;
4604 unsigned int end;
4606 start = ch;
4607 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
4608 ch++;
4609 end = ch;
4611 if (i > 0)
4612 fprintf (stream, ",\n");
4613 if (start == end)
4614 fprintf (stream, " { 0x%04X, 1, 1 }", start);
4615 else
4616 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
4617 start, end);
4618 i++;
4620 fprintf (stream, "\n");
4621 fprintf (stream, "};\n");
4624 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
4625 fprintf (stream, "{\n");
4626 for (s = 0; s < numscripts; s++)
4628 fprintf (stream, " {\n");
4629 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
4630 scriptinfo[s].lowercase_name);
4631 fprintf (stream, " script_%s_intervals,\n",
4632 scriptinfo[s].lowercase_name);
4633 fprintf (stream, " \"%s\"\n", scripts[s]);
4634 fprintf (stream, " }");
4635 if (s+1 < numscripts)
4636 fprintf (stream, ",");
4637 fprintf (stream, "\n");
4639 fprintf (stream, "};\n");
4641 t.p = 7;
4642 t.q = 9;
4643 script_table_init (&t);
4645 for (ch = 0; ch < 0x110000; ch++)
4647 unsigned int s = unicode_scripts[ch];
4648 if (s != (uint8_t)~(uint8_t)0)
4649 script_table_add (&t, ch, s);
4652 script_table_finalize (&t);
4654 /* Offsets in t.result, in memory of this process. */
4655 level1_offset =
4656 5 * sizeof (uint32_t);
4657 level2_offset =
4658 5 * sizeof (uint32_t)
4659 + t.level1_size * sizeof (uint32_t);
4660 level3_offset =
4661 5 * sizeof (uint32_t)
4662 + t.level1_size * sizeof (uint32_t)
4663 + (t.level2_size << t.q) * sizeof (uint32_t);
4665 for (i = 0; i < 5; i++)
4666 fprintf (stream, "#define script_header_%d %d\n", i,
4667 ((uint32_t *) t.result)[i]);
4668 fprintf (stream, "static const\n");
4669 fprintf (stream, "struct\n");
4670 fprintf (stream, " {\n");
4671 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4672 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4673 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
4674 fprintf (stream, " }\n");
4675 fprintf (stream, "u_script =\n");
4676 fprintf (stream, "{\n");
4677 fprintf (stream, " {");
4678 if (t.level1_size > 8)
4679 fprintf (stream, "\n ");
4680 for (i = 0; i < t.level1_size; i++)
4682 uint32_t offset;
4683 if (i > 0 && (i % 8) == 0)
4684 fprintf (stream, "\n ");
4685 offset = ((uint32_t *) (t.result + level1_offset))[i];
4686 if (offset == 0)
4687 fprintf (stream, " %5d", -1);
4688 else
4689 fprintf (stream, " %5zu",
4690 (offset - level2_offset) / sizeof (uint32_t));
4691 if (i+1 < t.level1_size)
4692 fprintf (stream, ",");
4694 if (t.level1_size > 8)
4695 fprintf (stream, "\n ");
4696 fprintf (stream, " },\n");
4697 fprintf (stream, " {");
4698 if (t.level2_size << t.q > 8)
4699 fprintf (stream, "\n ");
4700 for (i = 0; i < t.level2_size << t.q; i++)
4702 uint32_t offset;
4703 if (i > 0 && (i % 8) == 0)
4704 fprintf (stream, "\n ");
4705 offset = ((uint32_t *) (t.result + level2_offset))[i];
4706 if (offset == 0)
4707 fprintf (stream, " %5d", -1);
4708 else
4709 fprintf (stream, " %5zu",
4710 (offset - level3_offset) / sizeof (uint8_t));
4711 if (i+1 < t.level2_size << t.q)
4712 fprintf (stream, ",");
4714 if (t.level2_size << t.q > 8)
4715 fprintf (stream, "\n ");
4716 fprintf (stream, " },\n");
4717 fprintf (stream, " {");
4718 if (t.level3_size << t.p > 8)
4719 fprintf (stream, "\n ");
4720 for (i = 0; i < t.level3_size << t.p; i++)
4722 if (i > 0 && (i % 8) == 0)
4723 fprintf (stream, "\n ");
4724 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
4725 if (i+1 < t.level3_size << t.p)
4726 fprintf (stream, ",");
4728 if (t.level3_size << t.p > 8)
4729 fprintf (stream, "\n ");
4730 fprintf (stream, " }\n");
4731 fprintf (stream, "};\n");
4733 if (ferror (stream) || fclose (stream))
4735 fprintf (stderr, "error writing to '%s'\n", filename);
4736 exit (1);
4740 static void
4741 output_scripts_byname (const char *version)
4743 const char *filename = "unictype/scripts_byname.gperf";
4744 FILE *stream;
4745 unsigned int s;
4747 stream = fopen (filename, "w");
4748 if (stream == NULL)
4750 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4751 exit (1);
4754 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4755 fprintf (stream, "/* Unicode scripts. */\n");
4756 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4757 version);
4758 fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
4759 fprintf (stream, "%%struct-type\n");
4760 fprintf (stream, "%%language=ANSI-C\n");
4761 fprintf (stream, "%%define hash-function-name scripts_hash\n");
4762 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
4763 fprintf (stream, "%%readonly-tables\n");
4764 fprintf (stream, "%%global-table\n");
4765 fprintf (stream, "%%define word-array-name script_names\n");
4766 fprintf (stream, "%%pic\n");
4767 fprintf (stream, "%%define string-pool-name script_stringpool\n");
4768 fprintf (stream, "%%%%\n");
4769 for (s = 0; s < numscripts; s++)
4770 fprintf (stream, "%s, %u\n", scripts[s], s);
4772 if (ferror (stream) || fclose (stream))
4774 fprintf (stderr, "error writing to '%s'\n", filename);
4775 exit (1);
4779 /* ========================================================================= */
4781 /* Blocks. */
/* A named block: the inclusive code point range START..END.  */
typedef struct { unsigned int start; unsigned int end; const char *name; }
  block_t;
/* The blocks read by fill_blocks, in ascending order.  */
static block_t blocks[384];
/* Number of valid entries in blocks[].  */
static unsigned int numblocks;

/* Read BLOCKS_FILENAME and fill blocks[] and numblocks from it.

   Lines that start with '#' are skipped.  Every other line must have the
   form "XXXX..YYYY" (hexadecimal), followed by a run of spaces and
   semicolons and then the block name, which extends to the end of the
   line or to a carriage return.  The ranges must be sorted and must not
   overlap; this is asserted.

   Any previously loaded blocks are discarded first, so the function may
   be called more than once.

   Prints a diagnostic and exits with status 1 if the file cannot be
   opened or read, a line cannot be parsed, or memory runs out.  */
static void
fill_blocks (const char *blocks_filename)
{
  FILE *stream;

  stream = fopen (blocks_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
      exit (1);
    }

  /* Start from an empty table, like fill_scripts does for its counter.  */
  numblocks = 0;

  for (;;)
    {
      char buf[200+1];
      unsigned int i1, i2;
      char padding[200+1];
      char blockname[200+1];
      size_t namesize;
      char *namecopy;

      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
        break;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
        {
          fprintf (stderr, "parse error in '%s'\n", blocks_filename);
          exit (1);
        }

      /* Store a private, checked copy of the name.  */
      namesize = strlen (blockname) + 1;
      namecopy = (char *) malloc (namesize);
      if (namecopy == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }
      memcpy (namecopy, blockname, namesize);

      blocks[numblocks].start = i1;
      blocks[numblocks].end = i2;
      blocks[numblocks].name = namecopy;
      /* It must be sorted. */
      assert (numblocks == 0 || blocks[numblocks-1].end < blocks[numblocks].start);
      numblocks++;
      assert (numblocks != sizeof (blocks) / sizeof (blocks[0]));
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", blocks_filename);
      exit (1);
    }
}
4834 /* Return the smallest block index among the blocks for characters >= ch. */
4835 static unsigned int
4836 block_first_index (unsigned int ch)
4838 /* Binary search. */
4839 unsigned int lo = 0;
4840 unsigned int hi = numblocks;
4841 /* Invariants:
4842 All blocks[i], i < lo, have blocks[i].end < ch,
4843 all blocks[i], i >= hi, have blocks[i].end >= ch. */
4844 while (lo < hi)
4846 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4847 if (blocks[mid].end < ch)
4848 lo = mid + 1;
4849 else
4850 hi = mid;
4852 return hi;
4855 /* Return the largest block index among the blocks for characters <= ch,
4856 plus 1. */
4857 static unsigned int
4858 block_last_index (unsigned int ch)
4860 /* Binary search. */
4861 unsigned int lo = 0;
4862 unsigned int hi = numblocks;
4863 /* Invariants:
4864 All blocks[i], i < lo, have blocks[i].start <= ch,
4865 all blocks[i], i >= hi, have blocks[i].start > ch. */
4866 while (lo < hi)
4868 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4869 if (blocks[mid].start <= ch)
4870 lo = mid + 1;
4871 else
4872 hi = mid;
4874 return hi;
4877 static void
4878 output_blocks (const char *version)
4880 const char *filename = "unictype/blocks.h";
4881 const unsigned int shift = 8; /* bits to shift away for array access */
4882 const unsigned int threshold = 0x28000; /* cut-off table here to save space */
4883 FILE *stream;
4884 unsigned int i;
4885 unsigned int i1;
4887 stream = fopen (filename, "w");
4888 if (stream == NULL)
4890 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4891 exit (1);
4894 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4895 fprintf (stream, "/* Unicode blocks. */\n");
4896 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4897 version);
4899 fprintf (stream, "static const uc_block_t blocks[] =\n");
4900 fprintf (stream, "{\n");
4901 for (i = 0; i < numblocks; i++)
4903 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
4904 blocks[i].end, blocks[i].name);
4905 if (i+1 < numblocks)
4906 fprintf (stream, ",");
4907 fprintf (stream, "\n");
4909 fprintf (stream, "};\n");
4910 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
4911 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
4912 fprintf (stream, "static const uint16_t blocks_level1[%d * 2] =\n",
4913 threshold >> shift);
4914 fprintf (stream, "{\n");
4915 for (i1 = 0; i1 < (threshold >> shift); i1++)
4917 unsigned int first_index = block_first_index (i1 << shift);
4918 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
4919 fprintf (stream, " %3d, %3d", first_index, last_index);
4920 if (i1+1 < (threshold >> shift))
4921 fprintf (stream, ",");
4922 fprintf (stream, "\n");
4924 fprintf (stream, "};\n");
4925 fprintf (stream, "#define blocks_upper_first_index %d\n",
4926 block_first_index (threshold));
4927 fprintf (stream, "#define blocks_upper_last_index %d\n",
4928 block_last_index (0x10FFFF));
4930 if (ferror (stream) || fclose (stream))
4932 fprintf (stderr, "error writing to '%s'\n", filename);
4933 exit (1);
4937 /* ========================================================================= */
4939 /* C and Java syntax. */
/* Identifier syntax categories of a character.  The numeric values are
   what gets stored in the generated identifier syntax tables.  */
enum
{
  /* May appear at the start of an identifier and anywhere after it.  */
  UC_IDENTIFIER_START,
  /* May appear in an identifier, but not as its first character.  */
  UC_IDENTIFIER_VALID,
  /* May not appear in an identifier.  */
  UC_IDENTIFIER_INVALID,
  /* Ignorable inside an identifier (Java only).  */
  UC_IDENTIFIER_IGNORABLE
};
/* Return true if CH is one of the white-space characters of
   ISO C 99 section 6.4.(3).  */
static bool
is_c_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':   /* space */
    case '\t':  /* horizontal tab */
    case '\n':  /* new-line */
    case '\r':  /* new-line */
    case '\v':  /* vertical tab */
    case '\f':  /* form-feed */
      return true;
    default:
      return false;
    }
}
4960 /* ISO C 99 section 6.4.2.1 and appendix D. */
4961 static int
4962 c_ident_category (unsigned int ch)
4964 /* Section 6.4.2.1. */
4965 if (ch >= '0' && ch <= '9')
4966 return UC_IDENTIFIER_VALID;
4967 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4968 return UC_IDENTIFIER_START;
4969 /* Appendix D. */
4970 if (0
4971 /* Latin */
4972 || (ch == 0x00AA)
4973 || (ch == 0x00BA)
4974 || (ch >= 0x00C0 && ch <= 0x00D6)
4975 || (ch >= 0x00D8 && ch <= 0x00F6)
4976 || (ch >= 0x00F8 && ch <= 0x01F5)
4977 || (ch >= 0x01FA && ch <= 0x0217)
4978 || (ch >= 0x0250 && ch <= 0x02A8)
4979 || (ch >= 0x1E00 && ch <= 0x1E9B)
4980 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4981 || (ch == 0x207F)
4982 /* Greek */
4983 || (ch == 0x0386)
4984 || (ch >= 0x0388 && ch <= 0x038A)
4985 || (ch == 0x038C)
4986 || (ch >= 0x038E && ch <= 0x03A1)
4987 || (ch >= 0x03A3 && ch <= 0x03CE)
4988 || (ch >= 0x03D0 && ch <= 0x03D6)
4989 || (ch == 0x03DA)
4990 || (ch == 0x03DC)
4991 || (ch == 0x03DE)
4992 || (ch == 0x03E0)
4993 || (ch >= 0x03E2 && ch <= 0x03F3)
4994 || (ch >= 0x1F00 && ch <= 0x1F15)
4995 || (ch >= 0x1F18 && ch <= 0x1F1D)
4996 || (ch >= 0x1F20 && ch <= 0x1F45)
4997 || (ch >= 0x1F48 && ch <= 0x1F4D)
4998 || (ch >= 0x1F50 && ch <= 0x1F57)
4999 || (ch == 0x1F59)
5000 || (ch == 0x1F5B)
5001 || (ch == 0x1F5D)
5002 || (ch >= 0x1F5F && ch <= 0x1F7D)
5003 || (ch >= 0x1F80 && ch <= 0x1FB4)
5004 || (ch >= 0x1FB6 && ch <= 0x1FBC)
5005 || (ch >= 0x1FC2 && ch <= 0x1FC4)
5006 || (ch >= 0x1FC6 && ch <= 0x1FCC)
5007 || (ch >= 0x1FD0 && ch <= 0x1FD3)
5008 || (ch >= 0x1FD6 && ch <= 0x1FDB)
5009 || (ch >= 0x1FE0 && ch <= 0x1FEC)
5010 || (ch >= 0x1FF2 && ch <= 0x1FF4)
5011 || (ch >= 0x1FF6 && ch <= 0x1FFC)
5012 /* Cyrillic */
5013 || (ch >= 0x0401 && ch <= 0x040C)
5014 || (ch >= 0x040E && ch <= 0x044F)
5015 || (ch >= 0x0451 && ch <= 0x045C)
5016 || (ch >= 0x045E && ch <= 0x0481)
5017 || (ch >= 0x0490 && ch <= 0x04C4)
5018 || (ch >= 0x04C7 && ch <= 0x04C8)
5019 || (ch >= 0x04CB && ch <= 0x04CC)
5020 || (ch >= 0x04D0 && ch <= 0x04EB)
5021 || (ch >= 0x04EE && ch <= 0x04F5)
5022 || (ch >= 0x04F8 && ch <= 0x04F9)
5023 /* Armenian */
5024 || (ch >= 0x0531 && ch <= 0x0556)
5025 || (ch >= 0x0561 && ch <= 0x0587)
5026 /* Hebrew */
5027 || (ch >= 0x05B0 && ch <= 0x05B9)
5028 || (ch >= 0x05BB && ch <= 0x05BD)
5029 || (ch == 0x05BF)
5030 || (ch >= 0x05C1 && ch <= 0x05C2)
5031 || (ch >= 0x05D0 && ch <= 0x05EA)
5032 || (ch >= 0x05F0 && ch <= 0x05F2)
5033 /* Arabic */
5034 || (ch >= 0x0621 && ch <= 0x063A)
5035 || (ch >= 0x0640 && ch <= 0x0652)
5036 || (ch >= 0x0670 && ch <= 0x06B7)
5037 || (ch >= 0x06BA && ch <= 0x06BE)
5038 || (ch >= 0x06C0 && ch <= 0x06CE)
5039 || (ch >= 0x06D0 && ch <= 0x06DC)
5040 || (ch >= 0x06E5 && ch <= 0x06E8)
5041 || (ch >= 0x06EA && ch <= 0x06ED)
5042 /* Devanagari */
5043 || (ch >= 0x0901 && ch <= 0x0903)
5044 || (ch >= 0x0905 && ch <= 0x0939)
5045 || (ch >= 0x093E && ch <= 0x094D)
5046 || (ch >= 0x0950 && ch <= 0x0952)
5047 || (ch >= 0x0958 && ch <= 0x0963)
5048 /* Bengali */
5049 || (ch >= 0x0981 && ch <= 0x0983)
5050 || (ch >= 0x0985 && ch <= 0x098C)
5051 || (ch >= 0x098F && ch <= 0x0990)
5052 || (ch >= 0x0993 && ch <= 0x09A8)
5053 || (ch >= 0x09AA && ch <= 0x09B0)
5054 || (ch == 0x09B2)
5055 || (ch >= 0x09B6 && ch <= 0x09B9)
5056 || (ch >= 0x09BE && ch <= 0x09C4)
5057 || (ch >= 0x09C7 && ch <= 0x09C8)
5058 || (ch >= 0x09CB && ch <= 0x09CD)
5059 || (ch >= 0x09DC && ch <= 0x09DD)
5060 || (ch >= 0x09DF && ch <= 0x09E3)
5061 || (ch >= 0x09F0 && ch <= 0x09F1)
5062 /* Gurmukhi */
5063 || (ch == 0x0A02)
5064 || (ch >= 0x0A05 && ch <= 0x0A0A)
5065 || (ch >= 0x0A0F && ch <= 0x0A10)
5066 || (ch >= 0x0A13 && ch <= 0x0A28)
5067 || (ch >= 0x0A2A && ch <= 0x0A30)
5068 || (ch >= 0x0A32 && ch <= 0x0A33)
5069 || (ch >= 0x0A35 && ch <= 0x0A36)
5070 || (ch >= 0x0A38 && ch <= 0x0A39)
5071 || (ch >= 0x0A3E && ch <= 0x0A42)
5072 || (ch >= 0x0A47 && ch <= 0x0A48)
5073 || (ch >= 0x0A4B && ch <= 0x0A4D)
5074 || (ch >= 0x0A59 && ch <= 0x0A5C)
5075 || (ch == 0x0A5E)
5076 || (ch == 0x0A74)
5077 /* Gujarati */
5078 || (ch >= 0x0A81 && ch <= 0x0A83)
5079 || (ch >= 0x0A85 && ch <= 0x0A8B)
5080 || (ch == 0x0A8D)
5081 || (ch >= 0x0A8F && ch <= 0x0A91)
5082 || (ch >= 0x0A93 && ch <= 0x0AA8)
5083 || (ch >= 0x0AAA && ch <= 0x0AB0)
5084 || (ch >= 0x0AB2 && ch <= 0x0AB3)
5085 || (ch >= 0x0AB5 && ch <= 0x0AB9)
5086 || (ch >= 0x0ABD && ch <= 0x0AC5)
5087 || (ch >= 0x0AC7 && ch <= 0x0AC9)
5088 || (ch >= 0x0ACB && ch <= 0x0ACD)
5089 || (ch == 0x0AD0)
5090 || (ch == 0x0AE0)
5091 /* Oriya */
5092 || (ch >= 0x0B01 && ch <= 0x0B03)
5093 || (ch >= 0x0B05 && ch <= 0x0B0C)
5094 || (ch >= 0x0B0F && ch <= 0x0B10)
5095 || (ch >= 0x0B13 && ch <= 0x0B28)
5096 || (ch >= 0x0B2A && ch <= 0x0B30)
5097 || (ch >= 0x0B32 && ch <= 0x0B33)
5098 || (ch >= 0x0B36 && ch <= 0x0B39)
5099 || (ch >= 0x0B3E && ch <= 0x0B43)
5100 || (ch >= 0x0B47 && ch <= 0x0B48)
5101 || (ch >= 0x0B4B && ch <= 0x0B4D)
5102 || (ch >= 0x0B5C && ch <= 0x0B5D)
5103 || (ch >= 0x0B5F && ch <= 0x0B61)
5104 /* Tamil */
5105 || (ch >= 0x0B82 && ch <= 0x0B83)
5106 || (ch >= 0x0B85 && ch <= 0x0B8A)
5107 || (ch >= 0x0B8E && ch <= 0x0B90)
5108 || (ch >= 0x0B92 && ch <= 0x0B95)
5109 || (ch >= 0x0B99 && ch <= 0x0B9A)
5110 || (ch == 0x0B9C)
5111 || (ch >= 0x0B9E && ch <= 0x0B9F)
5112 || (ch >= 0x0BA3 && ch <= 0x0BA4)
5113 || (ch >= 0x0BA8 && ch <= 0x0BAA)
5114 || (ch >= 0x0BAE && ch <= 0x0BB5)
5115 || (ch >= 0x0BB7 && ch <= 0x0BB9)
5116 || (ch >= 0x0BBE && ch <= 0x0BC2)
5117 || (ch >= 0x0BC6 && ch <= 0x0BC8)
5118 || (ch >= 0x0BCA && ch <= 0x0BCD)
5119 /* Telugu */
5120 || (ch >= 0x0C01 && ch <= 0x0C03)
5121 || (ch >= 0x0C05 && ch <= 0x0C0C)
5122 || (ch >= 0x0C0E && ch <= 0x0C10)
5123 || (ch >= 0x0C12 && ch <= 0x0C28)
5124 || (ch >= 0x0C2A && ch <= 0x0C33)
5125 || (ch >= 0x0C35 && ch <= 0x0C39)
5126 || (ch >= 0x0C3E && ch <= 0x0C44)
5127 || (ch >= 0x0C46 && ch <= 0x0C48)
5128 || (ch >= 0x0C4A && ch <= 0x0C4D)
5129 || (ch >= 0x0C60 && ch <= 0x0C61)
5130 /* Kannada */
5131 || (ch >= 0x0C82 && ch <= 0x0C83)
5132 || (ch >= 0x0C85 && ch <= 0x0C8C)
5133 || (ch >= 0x0C8E && ch <= 0x0C90)
5134 || (ch >= 0x0C92 && ch <= 0x0CA8)
5135 || (ch >= 0x0CAA && ch <= 0x0CB3)
5136 || (ch >= 0x0CB5 && ch <= 0x0CB9)
5137 || (ch >= 0x0CBE && ch <= 0x0CC4)
5138 || (ch >= 0x0CC6 && ch <= 0x0CC8)
5139 || (ch >= 0x0CCA && ch <= 0x0CCD)
5140 || (ch == 0x0CDE)
5141 || (ch >= 0x0CE0 && ch <= 0x0CE1)
5142 /* Malayalam */
5143 || (ch >= 0x0D02 && ch <= 0x0D03)
5144 || (ch >= 0x0D05 && ch <= 0x0D0C)
5145 || (ch >= 0x0D0E && ch <= 0x0D10)
5146 || (ch >= 0x0D12 && ch <= 0x0D28)
5147 || (ch >= 0x0D2A && ch <= 0x0D39)
5148 || (ch >= 0x0D3E && ch <= 0x0D43)
5149 || (ch >= 0x0D46 && ch <= 0x0D48)
5150 || (ch >= 0x0D4A && ch <= 0x0D4D)
5151 || (ch >= 0x0D60 && ch <= 0x0D61)
5152 /* Thai */
5153 || (ch >= 0x0E01 && ch <= 0x0E3A)
5154 || (ch >= 0x0E40 && ch <= 0x0E5B)
5155 /* Lao */
5156 || (ch >= 0x0E81 && ch <= 0x0E82)
5157 || (ch == 0x0E84)
5158 || (ch >= 0x0E87 && ch <= 0x0E88)
5159 || (ch == 0x0E8A)
5160 || (ch == 0x0E8D)
5161 || (ch >= 0x0E94 && ch <= 0x0E97)
5162 || (ch >= 0x0E99 && ch <= 0x0E9F)
5163 || (ch >= 0x0EA1 && ch <= 0x0EA3)
5164 || (ch == 0x0EA5)
5165 || (ch == 0x0EA7)
5166 || (ch >= 0x0EAA && ch <= 0x0EAB)
5167 || (ch >= 0x0EAD && ch <= 0x0EAE)
5168 || (ch >= 0x0EB0 && ch <= 0x0EB9)
5169 || (ch >= 0x0EBB && ch <= 0x0EBD)
5170 || (ch >= 0x0EC0 && ch <= 0x0EC4)
5171 || (ch == 0x0EC6)
5172 || (ch >= 0x0EC8 && ch <= 0x0ECD)
5173 || (ch >= 0x0EDC && ch <= 0x0EDD)
5174 /* Tibetan */
5175 || (ch == 0x0F00)
5176 || (ch >= 0x0F18 && ch <= 0x0F19)
5177 || (ch == 0x0F35)
5178 || (ch == 0x0F37)
5179 || (ch == 0x0F39)
5180 || (ch >= 0x0F3E && ch <= 0x0F47)
5181 || (ch >= 0x0F49 && ch <= 0x0F69)
5182 || (ch >= 0x0F71 && ch <= 0x0F84)
5183 || (ch >= 0x0F86 && ch <= 0x0F8B)
5184 || (ch >= 0x0F90 && ch <= 0x0F95)
5185 || (ch == 0x0F97)
5186 || (ch >= 0x0F99 && ch <= 0x0FAD)
5187 || (ch >= 0x0FB1 && ch <= 0x0FB7)
5188 || (ch == 0x0FB9)
5189 /* Georgian */
5190 || (ch >= 0x10A0 && ch <= 0x10C5)
5191 || (ch >= 0x10D0 && ch <= 0x10F6)
5192 /* Hiragana */
5193 || (ch >= 0x3041 && ch <= 0x3093)
5194 || (ch >= 0x309B && ch <= 0x309C)
5195 /* Katakana */
5196 || (ch >= 0x30A1 && ch <= 0x30F6)
5197 || (ch >= 0x30FB && ch <= 0x30FC)
5198 /* Bopomofo */
5199 || (ch >= 0x3105 && ch <= 0x312C)
5200 /* CJK Unified Ideographs */
5201 || (ch >= 0x4E00 && ch <= 0x9FA5)
5202 /* Hangul */
5203 || (ch >= 0xAC00 && ch <= 0xD7A3)
5204 /* Digits */
5205 || (ch >= 0x0660 && ch <= 0x0669)
5206 || (ch >= 0x06F0 && ch <= 0x06F9)
5207 || (ch >= 0x0966 && ch <= 0x096F)
5208 || (ch >= 0x09E6 && ch <= 0x09EF)
5209 || (ch >= 0x0A66 && ch <= 0x0A6F)
5210 || (ch >= 0x0AE6 && ch <= 0x0AEF)
5211 || (ch >= 0x0B66 && ch <= 0x0B6F)
5212 || (ch >= 0x0BE7 && ch <= 0x0BEF)
5213 || (ch >= 0x0C66 && ch <= 0x0C6F)
5214 || (ch >= 0x0CE6 && ch <= 0x0CEF)
5215 || (ch >= 0x0D66 && ch <= 0x0D6F)
5216 || (ch >= 0x0E50 && ch <= 0x0E59)
5217 || (ch >= 0x0ED0 && ch <= 0x0ED9)
5218 || (ch >= 0x0F20 && ch <= 0x0F33)
5219 /* Special characters */
5220 || (ch == 0x00B5)
5221 || (ch == 0x00B7)
5222 || (ch >= 0x02B0 && ch <= 0x02B8)
5223 || (ch == 0x02BB)
5224 || (ch >= 0x02BD && ch <= 0x02C1)
5225 || (ch >= 0x02D0 && ch <= 0x02D1)
5226 || (ch >= 0x02E0 && ch <= 0x02E4)
5227 || (ch == 0x037A)
5228 || (ch == 0x0559)
5229 || (ch == 0x093D)
5230 || (ch == 0x0B3D)
5231 || (ch == 0x1FBE)
5232 || (ch >= 0x203F && ch <= 0x2040)
5233 || (ch == 0x2102)
5234 || (ch == 0x2107)
5235 || (ch >= 0x210A && ch <= 0x2113)
5236 || (ch == 0x2115)
5237 || (ch >= 0x2118 && ch <= 0x211D)
5238 || (ch == 0x2124)
5239 || (ch == 0x2126)
5240 || (ch == 0x2128)
5241 || (ch >= 0x212A && ch <= 0x2131)
5242 || (ch >= 0x2133 && ch <= 0x2138)
5243 || (ch >= 0x2160 && ch <= 0x2182)
5244 || (ch >= 0x3005 && ch <= 0x3007)
5245 || (ch >= 0x3021 && ch <= 0x3029)
5247 return UC_IDENTIFIER_START;
5248 return UC_IDENTIFIER_INVALID;
/* Return true if CH is a white-space character according to
   The Java Language Specification, 3rd edition, §3.6.
   https://docs.oracle.com/javase/specs/jls/se6/html/lexical.html#3.6 */
static bool
is_java_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':
    case '\t':
    case '\f':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
5260 /* The Java Language Specification, 3rd edition, §3.8.
5261 https://docs.oracle.com/javase/specs/jls/se6/html/lexical.html#3.8
5262 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
5263 static int
5264 java_ident_category (unsigned int ch)
5266 /* FIXME: Check this against Sun's JDK implementation. */
5267 if (is_category_L (ch) /* = Character.isLetter(ch) */
5268 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
5269 || is_category_Sc (ch) /* currency symbol */
5270 || is_category_Pc (ch) /* connector punctuation */
5272 return UC_IDENTIFIER_START;
5273 if (is_category_Nd (ch) /* digit */
5274 || is_category_Mc (ch) /* combining mark */
5275 || is_category_Mn (ch) /* non-spacing mark */
5277 return UC_IDENTIFIER_VALID;
5278 if ((ch >= 0x0000 && ch <= 0x0008)
5279 || (ch >= 0x000E && ch <= 0x001B)
5280 || (ch >= 0x007F && ch <= 0x009F)
5281 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
5283 return UC_IDENTIFIER_IGNORABLE;
5284 return UC_IDENTIFIER_INVALID;
5287 /* Construction of sparse 3-level tables. */
5288 #define TABLE identsyntax_table
5289 #define ELEMENT uint8_t
5290 #define DEFAULT UC_IDENTIFIER_INVALID
5291 #define xmalloc malloc
5292 #define xrealloc realloc
5293 #include "3level.h"
5295 /* Output an identifier syntax categorization in a three-level bitmap. */
5296 static void
5297 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
5299 FILE *stream;
5300 unsigned int ch, i;
5301 struct identsyntax_table t;
5302 unsigned int level1_offset, level2_offset, level3_offset;
5304 stream = fopen (filename, "w");
5305 if (stream == NULL)
5307 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5308 exit (1);
5311 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5312 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
5313 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5314 version);
5316 t.p = 7; /* or 8 */
5317 t.q = 5; /* or 4 */
5318 identsyntax_table_init (&t);
5320 for (ch = 0; ch < 0x110000; ch++)
5322 int syntaxcode = predicate (ch);
5324 assert (syntaxcode <= 0x03);
5326 if (syntaxcode != UC_IDENTIFIER_INVALID)
5327 identsyntax_table_add (&t, ch, syntaxcode);
5330 identsyntax_table_finalize (&t);
5332 /* Offsets in t.result, in memory of this process. */
5333 level1_offset =
5334 5 * sizeof (uint32_t);
5335 level2_offset =
5336 5 * sizeof (uint32_t)
5337 + t.level1_size * sizeof (uint32_t);
5338 level3_offset =
5339 5 * sizeof (uint32_t)
5340 + t.level1_size * sizeof (uint32_t)
5341 + (t.level2_size << t.q) * sizeof (uint32_t);
5343 for (i = 0; i < 5; i++)
5344 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
5345 ((uint32_t *) t.result)[i]);
5346 fprintf (stream, "static const\n");
5347 fprintf (stream, "struct\n");
5348 fprintf (stream, " {\n");
5349 fprintf (stream, " int level1[%zu];\n", t.level1_size);
5350 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
5351 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
5352 (1 << t.p) * 2 / 16);
5353 fprintf (stream, " }\n");
5354 fprintf (stream, "%s =\n", name);
5355 fprintf (stream, "{\n");
5356 fprintf (stream, " {");
5357 if (t.level1_size > 8)
5358 fprintf (stream, "\n ");
5359 for (i = 0; i < t.level1_size; i++)
5361 uint32_t offset;
5362 if (i > 0 && (i % 8) == 0)
5363 fprintf (stream, "\n ");
5364 offset = ((uint32_t *) (t.result + level1_offset))[i];
5365 if (offset == 0)
5366 fprintf (stream, " %5d", -1);
5367 else
5368 fprintf (stream, " %5zu",
5369 (offset - level2_offset) / sizeof (uint32_t));
5370 if (i+1 < t.level1_size)
5371 fprintf (stream, ",");
5373 if (t.level1_size > 8)
5374 fprintf (stream, "\n ");
5375 fprintf (stream, " },\n");
5376 fprintf (stream, " {");
5377 if (t.level2_size << t.q > 8)
5378 fprintf (stream, "\n ");
5379 for (i = 0; i < t.level2_size << t.q; i++)
5381 uint32_t offset;
5382 if (i > 0 && (i % 8) == 0)
5383 fprintf (stream, "\n ");
5384 offset = ((uint32_t *) (t.result + level2_offset))[i];
5385 if (offset == 0)
5386 fprintf (stream, " %5d", -1);
5387 else
5388 fprintf (stream, " %5zu",
5389 (offset - level3_offset) / sizeof (uint8_t));
5390 if (i+1 < t.level2_size << t.q)
5391 fprintf (stream, ",");
5393 if (t.level2_size << t.q > 8)
5394 fprintf (stream, "\n ");
5395 fprintf (stream, " },\n");
5396 /* Pack the level3 array. Each entry needs 2 bits only. */
5397 fprintf (stream, " {");
5398 if ((t.level3_size << t.p) * 2 / 16 > 8)
5399 fprintf (stream, "\n ");
5400 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
5402 if (i > 0 && (i % 8) == 0)
5403 fprintf (stream, "\n ");
5404 fprintf (stream, " 0x%04x",
5405 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
5406 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
5407 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
5408 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
5409 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
5410 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
5411 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
5412 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
5413 if (i+1 < (t.level3_size << t.p) * 2 / 16)
5414 fprintf (stream, ",");
5416 if ((t.level3_size << t.p) * 2 / 16 > 8)
5417 fprintf (stream, "\n ");
5418 fprintf (stream, " }\n");
5419 fprintf (stream, "};\n");
5421 if (ferror (stream) || fclose (stream))
5423 fprintf (stderr, "error writing to '%s'\n", filename);
5424 exit (1);
5428 static void
5429 output_ident_properties (const char *version)
5431 #define PROPERTY(P) \
5432 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
5433 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5434 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
5435 PROPERTY(c_whitespace)
5436 PROPERTY(java_whitespace)
5437 #undef PROPERTY
5439 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
5440 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
5443 /* ========================================================================= */
/* Like ISO C <ctype.h> and <wctype.h>.  Compatible with glibc's
   glibc/localedata/locales/i18n file, generated by
   glibc/localedata/gen-unicode-ctype.c.  */
5449 /* Character mappings. */
5451 static unsigned int
5452 to_upper (unsigned int ch)
5454 if (unicode_attributes[ch].name != NULL
5455 && unicode_attributes[ch].upper != NONE)
5456 return unicode_attributes[ch].upper;
5457 else
5458 return ch;
5461 static unsigned int
5462 to_lower (unsigned int ch)
5464 if (unicode_attributes[ch].name != NULL
5465 && unicode_attributes[ch].lower != NONE)
5466 return unicode_attributes[ch].lower;
5467 else
5468 return ch;
5471 static unsigned int
5472 to_title (unsigned int ch)
5474 if (unicode_attributes[ch].name != NULL
5475 && unicode_attributes[ch].title != NONE)
5476 return unicode_attributes[ch].title;
5477 else
5478 return ch;
5481 /* Character class properties. */
/* A character counts as uppercase when to_lower changes it.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
/* A character counts as lowercase when to_upper changes it.  */
static bool
is_lower (unsigned int ch)
{
  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  if (ch == 0x00DF)
    return true;

  return to_upper (ch) != ch;
}
5497 static bool
5498 is_alpha (unsigned int ch)
5500 return (unicode_attributes[ch].name != NULL
5501 && ((unicode_attributes[ch].category[0] == 'L'
5502 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5503 <U0E2F>, <U0E46> should belong to is_punct. */
5504 && (ch != 0x0E2F) && (ch != 0x0E46))
5505 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5506 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
5507 || (ch == 0x0E31)
5508 || (ch >= 0x0E34 && ch <= 0x0E3A)
5509 || (ch >= 0x0E47 && ch <= 0x0E4E)
5510 /* Avoid warning for <U0345>. */
5511 || (ch == 0x0345)
5512 /* Avoid warnings for <U2160>..<U217F>. */
5513 || (unicode_attributes[ch].category[0] == 'N'
5514 && unicode_attributes[ch].category[1] == 'l')
5515 /* Avoid warnings for <U24B6>..<U24E9>. */
5516 || (unicode_attributes[ch].category[0] == 'S'
5517 && unicode_attributes[ch].category[1] == 'o'
5518 && strstr (unicode_attributes[ch].name, " LETTER ")
5519 != NULL)
5520 /* Consider all the non-ASCII digits as alphabetic.
5521 ISO C 99 forbids us to have them in category "digit",
5522 but we want iswalnum to return true on them. */
5523 || (unicode_attributes[ch].category[0] == 'N'
5524 && unicode_attributes[ch].category[1] == 'd'
5525 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH belongs to the "digit" class, which is restricted to
   the ten ASCII decimal digits.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
     7.25.2.1.5:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  const unsigned int first_digit = 0x0030;
  const unsigned int last_digit = 0x0039;

  return !(ch < first_digit || ch > last_digit);
#endif
}
/* The "alnum" class is the union of the "alpha" and "digit" classes.  */
static bool
is_alnum (unsigned int ch)
{
  if (is_alpha (ch))
    return true;
  return is_digit (ch);
}
5556 static bool
5557 is_blank (unsigned int ch)
5559 return (ch == 0x0009 /* '\t' */
5560 /* Category Zs without mention of "<noBreak>" */
5561 || (unicode_attributes[ch].name != NULL
5562 && unicode_attributes[ch].category[0] == 'Z'
5563 && unicode_attributes[ch].category[1] == 's'
5564 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
5567 static bool
5568 is_space (unsigned int ch)
5570 /* Don't make U+00A0 a space. Non-breaking space means that all programs
5571 should treat it like a punctuation character, not like a space. */
5572 return (ch == 0x0020 /* ' ' */
5573 || ch == 0x000C /* '\f' */
5574 || ch == 0x000A /* '\n' */
5575 || ch == 0x000D /* '\r' */
5576 || ch == 0x0009 /* '\t' */
5577 || ch == 0x000B /* '\v' */
5578 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
5579 || (unicode_attributes[ch].name != NULL
5580 && unicode_attributes[ch].category[0] == 'Z'
5581 && (unicode_attributes[ch].category[1] == 'l'
5582 || unicode_attributes[ch].category[1] == 'p'
5583 || (unicode_attributes[ch].category[1] == 's'
5584 && !strstr (unicode_attributes[ch].decomposition,
5585 "<noBreak>")))));
5588 static bool
5589 is_cntrl (unsigned int ch)
5591 return (unicode_attributes[ch].name != NULL
5592 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
5593 /* Categories Zl and Zp */
5594 || (unicode_attributes[ch].category[0] == 'Z'
5595 && (unicode_attributes[ch].category[1] == 'l'
5596 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH belongs to the "xdigit" class, which is restricted to
   the ASCII hexadecimal digits.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     7.25.2.1.12:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  if (ch >= 0x0030 && ch <= 0x0039) /* 0..9 */
    return true;
  if (ch >= 0x0041 && ch <= 0x0046) /* A..F */
    return true;
  if (ch >= 0x0061 && ch <= 0x0066) /* a..f */
    return true;
  return false;
#endif
}
5621 static bool
5622 is_graph (unsigned int ch)
5624 return (unicode_attributes[ch].name != NULL
5625 && strcmp (unicode_attributes[ch].name, "<control>")
5626 && !is_space (ch));
5629 static bool
5630 is_print (unsigned int ch)
5632 return (unicode_attributes[ch].name != NULL
5633 && strcmp (unicode_attributes[ch].name, "<control>")
5634 /* Categories Zl and Zp */
5635 && !(unicode_attributes[ch].name != NULL
5636 && unicode_attributes[ch].category[0] == 'Z'
5637 && (unicode_attributes[ch].category[1] == 'l'
5638 || unicode_attributes[ch].category[1] == 'p')));
/* Return true if CH belongs to the "punct" class.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  if (!is_graph (ch))
    return false;
  return !(is_alpha (ch) || is_digit (ch));
#endif
}
5654 /* Output all properties. */
5655 static void
5656 output_old_ctype (const char *version)
5658 #define PROPERTY(P) \
5659 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
5660 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5661 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
5662 PROPERTY(alnum)
5663 PROPERTY(alpha)
5664 PROPERTY(cntrl)
5665 PROPERTY(digit)
5666 PROPERTY(graph)
5667 PROPERTY(lower)
5668 PROPERTY(print)
5669 PROPERTY(punct)
5670 PROPERTY(space)
5671 PROPERTY(upper)
5672 PROPERTY(xdigit)
5673 PROPERTY(blank)
5674 #undef PROPERTY
5677 #if 0
5679 static bool
5680 is_combining (unsigned int ch)
5682 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
5683 file. In 3.0.1 it was identical to the union of the general categories
5684 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
5685 PropList.txt file, so we take the latter definition. */
5686 return (unicode_attributes[ch].name != NULL
5687 && unicode_attributes[ch].category[0] == 'M'
5688 && (unicode_attributes[ch].category[1] == 'n'
5689 || unicode_attributes[ch].category[1] == 'c'
5690 || unicode_attributes[ch].category[1] == 'e'));
5693 static bool
5694 is_combining_level3 (unsigned int ch)
5696 return is_combining (ch)
5697 && !(unicode_attributes[ch].combining[0] != '\0'
5698 && unicode_attributes[ch].combining[0] != '0'
5699 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with
   four hex digits below 0x10000, "<Uxxxxxxxx>" with eight otherwise.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];
  const char *format;

  if (i < 0x10000)
    format = "<U%04X>";
  else
    format = "<U%08X>";

  sprintf (buf, format, i);
  return buf;
}
/* Return the UCS symbol range string "<low>..<high>" for a Unicode
   characters interval.  The result lives in a static buffer that the next
   call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];
  size_t len;

  /* Each ucs_symbol result is copied out before ucs_symbol is called
     again.  */
  strcpy (buf, ucs_symbol (low));
  len = strlen (buf);
  strcpy (buf + len, "..");
  strcpy (buf + len + 2, ucs_symbol (high));
  return buf;
}
/* Output a character class (= property) table.
   Writes CLASSNAME followed by the ';'-separated list of symbols and
   symbol ranges for which FUNC is true.  A continuation line is started
   whenever the next item would pass column 75.  */
static void
output_charclass (FILE *stream, const char *classname,
                  bool (*func) (unsigned int))
{
  char table[0x110000];
  const int max_column = 75;
  bool need_semicolon = false;
  int column = 1000; /* larger than max_column: the first item wraps */
  unsigned int i;

  for (i = 0; i < 0x110000; i++)
    table[i] = (int) func (i);

  fprintf (stream, "%s ", classname);

  i = 0;
  while (i < 0x110000)
    {
      unsigned int low, high;
      const char *text;
      size_t textlen;

      if (!table[i])
        {
          i++;
          continue;
        }

      /* Find the maximal run [low, high] of members starting at i.  */
      low = i;
      for (i = low + 1; i < 0x110000 && table[i]; i++)
        ;
      high = i - 1;

      text = (low == high
              ? ucs_symbol (low)
              : ucs_symbol_range (low, high));
      textlen = strlen (text);

      if (need_semicolon)
        {
          fprintf (stream, ";");
          column++;
        }

      if ((size_t) column + textlen > (size_t) max_column)
        {
          fprintf (stream, "/\n ");
          column = 3;
        }

      fprintf (stream, "%s", text);
      column += (int) textlen;
      need_semicolon = true;
    }
  fprintf (stream, "\n");
}
/* Output a character mapping table.
   Writes MAPNAME followed by the ';'-separated list of "(<from>,<to>)"
   pairs for every character that FUNC does not map to itself.  A
   continuation line is started whenever the next item would pass
   column 75.  */
static void
output_charmap (FILE *stream, const char *mapname,
                unsigned int (*func) (unsigned int))
{
  char table[0x110000];
  const int max_column = 75;
  bool need_semicolon = false;
  int column = 1000; /* larger than max_column: the first item wraps */
  unsigned int i;

  for (i = 0; i < 0x110000; i++)
    table[i] = (func (i) != i);

  fprintf (stream, "%s ", mapname);

  for (i = 0; i < 0x110000; i++)
    {
      char buf[25+1];
      int len;

      if (!table[i])
        continue;

      /* Two separate sprintf calls, because each ucs_symbol result has to
         be consumed before ucs_symbol is called again.  */
      len = sprintf (buf, "(%s,", ucs_symbol (i));
      len += sprintf (buf + len, "%s)", ucs_symbol (func (i)));

      if (need_semicolon)
        {
          fprintf (stream, ";");
          column++;
        }

      if (column + len > max_column)
        {
          fprintf (stream, "/\n ");
          column = 3;
        }

      fprintf (stream, "%s", buf);
      column += len;
      need_semicolon = true;
    }
  fprintf (stream, "\n");
}
/* Output the width table.
   The body is empty: nothing is written to STREAM.  */
static void
output_widthmap (FILE *stream)
{
  (void) stream;
}
5837 /* Output the tables to the given file. */
5839 static void
5840 output_tables (const char *filename, const char *version)
5842 FILE *stream;
5843 unsigned int ch;
5845 stream = fopen (filename, "w");
5846 if (stream == NULL)
5848 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5849 exit (1);
5852 fprintf (stream, "escape_char /\n");
5853 fprintf (stream, "comment_char %%\n");
5854 fprintf (stream, "\n");
5855 fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
5856 version);
5857 fprintf (stream, "\n");
5859 fprintf (stream, "LC_IDENTIFICATION\n");
5860 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
5861 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
5862 fprintf (stream, "address \"\"\n");
5863 fprintf (stream, "contact \"\"\n");
5864 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
5865 fprintf (stream, "tel \"\"\n");
5866 fprintf (stream, "fax \"\"\n");
5867 fprintf (stream, "language \"\"\n");
5868 fprintf (stream, "territory \"Earth\"\n");
5869 fprintf (stream, "revision \"%s\"\n", version);
5871 time_t now;
5872 char date[11];
5873 now = time (NULL);
5874 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
5875 fprintf (stream, "date \"%s\"\n", date);
5877 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
5878 fprintf (stream, "END LC_IDENTIFICATION\n");
5879 fprintf (stream, "\n");
5881 /* Verification. */
5882 for (ch = 0; ch < 0x110000; ch++)
5884 /* toupper restriction: "Only characters specified for the keywords
5885 lower and upper shall be specified. */
5886 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5887 fprintf (stderr,
5888 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
5889 ucs_symbol (ch), ch, to_upper (ch));
5891 /* tolower restriction: "Only characters specified for the keywords
5892 lower and upper shall be specified. */
5893 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5894 fprintf (stderr,
5895 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
5896 ucs_symbol (ch), ch, to_lower (ch));
5898 /* alpha restriction: "Characters classified as either upper or lower
5899 shall automatically belong to this class. */
5900 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
5901 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
5903 /* alpha restriction: "No character specified for the keywords cntrl,
5904 digit, punct or space shall be specified." */
5905 if (is_alpha (ch) && is_cntrl (ch))
5906 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
5907 if (is_alpha (ch) && is_digit (ch))
5908 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
5909 if (is_alpha (ch) && is_punct (ch))
5910 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
5911 if (is_alpha (ch) && is_space (ch))
5912 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
5914 /* space restriction: "No character specified for the keywords upper,
5915 lower, alpha, digit, graph or xdigit shall be specified."
5916 upper, lower, alpha already checked above. */
5917 if (is_space (ch) && is_digit (ch))
5918 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
5919 if (is_space (ch) && is_graph (ch))
5920 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
5921 if (is_space (ch) && is_xdigit (ch))
5922 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
5924 /* cntrl restriction: "No character specified for the keywords upper,
5925 lower, alpha, digit, punct, graph, print or xdigit shall be
5926 specified." upper, lower, alpha already checked above. */
5927 if (is_cntrl (ch) && is_digit (ch))
5928 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5929 if (is_cntrl (ch) && is_punct (ch))
5930 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5931 if (is_cntrl (ch) && is_graph (ch))
5932 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5933 if (is_cntrl (ch) && is_print (ch))
5934 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5935 if (is_cntrl (ch) && is_xdigit (ch))
5936 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5938 /* punct restriction: "No character specified for the keywords upper,
5939 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5940 be specified." upper, lower, alpha, cntrl already checked above. */
5941 if (is_punct (ch) && is_digit (ch))
5942 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5943 if (is_punct (ch) && is_xdigit (ch))
5944 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5945 if (is_punct (ch) && (ch == 0x0020))
5946 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5948 /* graph restriction: "No character specified for the keyword cntrl
5949 shall be specified." Already checked above. */
5951 /* print restriction: "No character specified for the keyword cntrl
5952 shall be specified." Already checked above. */
5954 /* graph - print relation: differ only in the <space> character.
5955 How is this possible if there are more than one space character?!
5956 I think susv2/xbd/locale.html should speak of "space characters",
5957 not "space character". */
5958 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5959 fprintf (stderr,
5960 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5961 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5962 fprintf (stderr,
5963 "%s is graph|<space> but not print\n", ucs_symbol (ch));
5966 fprintf (stream, "LC_CTYPE\n");
5967 output_charclass (stream, "upper", is_upper);
5968 output_charclass (stream, "lower", is_lower);
5969 output_charclass (stream, "alpha", is_alpha);
5970 output_charclass (stream, "digit", is_digit);
5971 output_charclass (stream, "outdigit", is_outdigit);
5972 output_charclass (stream, "blank", is_blank);
5973 output_charclass (stream, "space", is_space);
5974 output_charclass (stream, "cntrl", is_cntrl);
5975 output_charclass (stream, "punct", is_punct);
5976 output_charclass (stream, "xdigit", is_xdigit);
5977 output_charclass (stream, "graph", is_graph);
5978 output_charclass (stream, "print", is_print);
5979 output_charclass (stream, "class \"combining\";", is_combining);
5980 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5981 output_charmap (stream, "toupper", to_upper);
5982 output_charmap (stream, "tolower", to_lower);
5983 output_charmap (stream, "map \"totitle\";", to_title);
5984 output_widthmap (stream);
5985 fprintf (stream, "END LC_CTYPE\n");
5987 if (ferror (stream) || fclose (stream))
5989 fprintf (stderr, "error writing to '%s'\n", filename);
5990 exit (1);
5994 #endif
5996 /* ========================================================================= */
/* East Asian width of every code point, as read from the
   EastAsianWidth.txt file.
   Each element is NULL (unassigned) or one of the strings
   "N", "A", "H", "W", "F", "Na".  */
const char *unicode_width[0x110000];
6002 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
6003 file. */
6004 static void
6005 fill_width (const char *width_filename)
6007 unsigned int i, j;
6008 FILE *stream;
6009 char field0[FIELDLEN];
6010 char field1[FIELDLEN];
6011 char field2[FIELDLEN];
6012 int lineno = 0;
6014 for (i = 0; i < 0x110000; i++)
6015 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
6017 stream = fopen (width_filename, "r");
6018 if (stream == NULL)
6020 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
6021 exit (1);
6024 for (;;)
6026 int n;
6027 int c;
6029 lineno++;
6030 c = getc (stream);
6031 if (c == EOF)
6032 break;
6033 if (c == '#')
6035 do c = getc (stream); while (c != EOF && c != '\n');
6036 continue;
6038 ungetc (c, stream);
6039 n = getfield (stream, field0, ';');
6040 n += getfield (stream, field1, ' ');
6041 n += getfield (stream, field2, '\n');
6042 if (n == 0)
6043 break;
6044 if (n != 3)
6046 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
6047 exit (1);
6049 i = strtoul (field0, NULL, 16);
6050 if (strstr (field0, "..") != NULL)
6052 /* Deal with a range. */
6053 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
6054 for (; i <= j; i++)
6055 unicode_width[i] = strdup (field1);
6057 else
6059 /* Single character line. */
6060 unicode_width[i] = strdup (field1);
6064 if (ferror (stream) || fclose (stream))
6066 fprintf (stderr, "error reading from '%s'\n", width_filename);
6067 exit (1);
6071 /* ========================================================================= */
6073 /* Non-spacing attribute and width. */
6075 /* The non-spacing attribute table consists of:
6076 - Non-spacing characters; generated from PropList.txt or
6077 "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
6078 - Format control characters; generated from
6079 "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
6080 - Zero width characters; generated from
6081 "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
6084 static bool
6085 is_nonspacing (unsigned int ch)
6087 return (unicode_attributes[ch].name != NULL
6088 && (get_bidi_category (ch) == UC_BIDI_NSM
6089 || is_category_Cc (ch) || is_category_Cf (ch)
6090 || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0));
/* Write the non-spacing property as C source to FILENAME, as two arrays:
   - nonspacing_table_data: for every 0x200-character block that contains
     at least one non-spacing character, a 64-byte bitmap (bit l of a byte
     stands for the l-th character of its group of 8);
   - nonspacing_table_ind: for every block, the index of its bitmap in
     nonspacing_table_data, or -1 if the block has no bitmap.
   The block at 0xe0000 never gets a bitmap.  The index table ends after
   the last block that has a bitmap, rounded up to a multiple of 8 entries.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_nonspacing_property (const char *filename)
{
  FILE *stream;
  int ind[0x110000 / 0x200];
  unsigned int i;
  unsigned int ind_count; /* one more than the last block with a bitmap */
  unsigned int i_max;
  int next_ind;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* Assign consecutive bitmap indices to the non-trivial blocks.  */
  next_ind = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = false;
      unsigned int ch;

      if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */
        for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
          if (is_nonspacing (ch))
            {
              nontrivial = true;
              break;
            }
      if (nontrivial)
        ind[i] = next_ind++;
      else
        ind[i] = -1;
    }

  fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
           next_ind);
  ind_count = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = (ind[i] >= 0);

      if (nontrivial)
        {
          unsigned int j;

          fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
          for (j = 0; j < 8; j++)
            {
              unsigned int k;

              fprintf (stream, " ");
              for (k = 0; k < 8; k++)
                {
                  unsigned int l;
                  unsigned char bits = 0;

                  for (l = 0; l < 8; l++)
                    {
                      unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;

                      if (is_nonspacing (ch))
                        bits |= 1 << l;
                    }
                  /* No comma after the very last byte of the array.  */
                  fprintf (stream, " 0x%02x%c", bits,
                           ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
                }
              fprintf (stream, " /* 0x%04x-0x%04x */\n",
                       i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
            }
          ind_count = i + 1;
        }
    }
  fprintf (stream, "};\n");

  /* Round the number of needed entries up to a multiple of 8.  Rounding
     the last index instead of the count would drop the last block whenever
     that index is itself a multiple of 8.  The result never exceeds the
     size of ind[], which is a multiple of 8.  */
  i_max = ((ind_count + 8 - 1) / 8) * 8;
  fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
           i_max);
  {
    unsigned int j;

    for (j = 0; j < i_max / 8; j++)
      {
        unsigned int k;

        fprintf (stream, " ");
        for (k = 0; k < 8; k++)
          {
            i = j * 8 + k;
            fprintf (stream, " %2d%c", ind[i],
                     j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ',');
          }
        fprintf (stream, " /* 0x%04x-0x%04x */\n",
                 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
      }
  }
  fprintf (stream, "};\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6198 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'. */
6199 static char
6200 symbolic_width (unsigned int ch)
6202 /* Test for unassigned character. */
6203 if (is_property_unassigned_code_value (ch))
6205 /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
6206 if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
6207 return 'A';
6208 if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
6209 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
6210 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
6211 || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
6212 || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */)
6213 return '2';
6214 return 0;
6216 else
6218 /* Test for non-spacing or control character. */
6219 if (is_category_Cc (ch) && ch < 0x00A0)
6220 return 0;
6221 if (is_nonspacing (ch))
6222 return '0';
6223 /* Test for double-width character. */
6224 if (unicode_width[ch] != NULL
6225 && (strcmp (unicode_width[ch], "W") == 0
6226 || strcmp (unicode_width[ch], "F") == 0))
6227 return '2';
6228 /* Test for half-width character. */
6229 if (unicode_width[ch] != NULL
6230 && strcmp (unicode_width[ch], "H") == 0)
6231 return '1';
6233 /* In ancient CJK encodings, Cyrillic and most other characters are
6234 double-width as well. */
6235 if (ch >= 0x00A1 && ch < 0x10000)
6236 return 'A';
6237 return '1';
/* Write one line for the interval START..END of characters that share the
   symbolic width VALUE.  A one-character interval is written without
   "..".  */
static void
output_width_interval (FILE *stream, unsigned int start, unsigned int end,
                       char value)
{
  if (start == end)
    fprintf (stream, "%04X\t\t%c\n", start, value);
  else
    fprintf (stream, "%04X..%04X\t%c\n", start, end, value);
}

/* Write to FILENAME the symbolic width of all characters, one line per
   interval of equal width.  Characters for which symbolic_width returns 0
   are skipped and do not end an interval.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_width_property_test (const char *filename)
{
  FILE *stream;
  unsigned int interval_start = 0;
  unsigned int interval_end = 0;
  char interval_value = 0; /* 0 means: no interval is open yet */
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    {
      char value = symbolic_width (ch);

      /* Skip Cc control characters and unassigned characters.  */
      if (value == 0)
        continue;

      if (value == interval_value)
        {
          /* Extend the interval.  */
          interval_end = ch;
          continue;
        }

      /* Terminate the interval.  */
      if (interval_value != 0)
        output_width_interval (stream, interval_start, interval_end,
                               interval_value);

      /* Start a new interval.  */
      interval_start = interval_end = ch;
      interval_value = value;
    }

  /* Terminate the last interval.  */
  if (interval_value != 0)
    output_width_interval (stream, interval_start, interval_end,
                           interval_value);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6296 /* ========================================================================= */
6298 /* Line breaking classification.
6299 Updated for Unicode TR #14 revision 26. */
/* Line breaking classes, listed by numeric value.
   Classes deliberately not represented here:
     CR  carriage return - not used here because it's a DOSism
     LF  line feed - not used here because it's a DOSism
     NL  next line - not used here because it's equivalent to LBP_BK
     SG  surrogates - not used here because they are not characters
     CJ  conditional Japanese starter, resolved to NS  */
enum
{
  LBP_WJ  =  0, /* word joiner */
  LBP_GL  =  1, /* non-breaking (glue) */
  LBP_B2  =  2, /* break opportunity before and after */
  LBP_BA  =  3, /* break opportunity after */
  LBP_BB  =  4, /* break opportunity before */
  LBP_HY  =  5, /* hyphen */
  LBP_CL  =  6, /* closing punctuation */
  LBP_CP  =  7, /* closing parenthesis */
  LBP_EX  =  8, /* exclamation/interrogation */
  LBP_IN  =  9, /* inseparable */
  LBP_NS  = 10, /* non starter */
  LBP_OP  = 11, /* opening punctuation */
  LBP_QU  = 12, /* ambiguous quotation */
  LBP_IS  = 13, /* infix separator (numeric) */
  LBP_NU  = 14, /* numeric */
  LBP_PO  = 15, /* postfix (numeric) */
  LBP_PR  = 16, /* prefix (numeric) */
  LBP_SY  = 17, /* symbols allowing breaks */
  LBP_AL  = 18, /* ordinary alphabetic and symbol characters */
  LBP_H2  = 19, /* Hangul LV syllable */
  LBP_H3  = 20, /* Hangul LVT syllable */
  LBP_ID  = 21, /* ideographic */
  LBP_JL  = 22, /* Hangul L Jamo */
  LBP_JV  = 23, /* Hangul V Jamo */
  LBP_JT  = 24, /* Hangul T Jamo */
  LBP_HL  = 25, /* Hebrew letter */
  LBP_RI  = 26, /* regional indicator */
  LBP_ZWJ = 27, /* zero width joiner */
  LBP_EB  = 28, /* emoji base */
  LBP_EM  = 29, /* emoji modifier */

  /* Values >= 30 are resolved at run time. */
  LBP_BK  = 30, /* mandatory break */
  LBP_CM  = 31, /* attached characters and combining marks */
  LBP_ZW  = 32, /* zero width space */
  LBP_SP  = 33, /* space */
  LBP_CB  = 34, /* contingent break opportunity */
  LBP_AI  = 35, /* ambiguous (alphabetic or ideograph) */
  LBP_SA  = 36, /* complex context (South East Asian) */
  LBP_XX  = 37  /* unknown */
};
6349 /* Returns the line breaking classification for ch, as a bit mask. */
6350 static int64_t
6351 get_lbp (unsigned int ch)
6353 int64_t attr = 0;
6355 /* U+20BC..U+20CF are reserved for prefixes. */
6356 if (unicode_attributes[ch].name == NULL && (ch >= 0x20BC && ch <= 0x20CF))
6357 return (int64_t) 1 << LBP_PR;
6359 if (unicode_attributes[ch].name != NULL)
6361 /* mandatory break */
6362 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
6363 || ch == 0x000C /* form feed */
6364 || ch == 0x000B /* line tabulation */
6365 || ch == 0x2028 /* LINE SEPARATOR */
6366 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
6367 attr |= (int64_t) 1 << LBP_BK;
6369 if (ch == 0x2060 /* WORD JOINER */
6370 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
6371 attr |= (int64_t) 1 << LBP_WJ;
6373 /* zero width space */
6374 if (ch == 0x200B /* ZERO WIDTH SPACE */)
6375 attr |= (int64_t) 1 << LBP_ZW;
6377 /* zero width joiner */
6378 if (ch == 0x200D /* ZERO WIDTH JOINER */)
6379 attr |= (int64_t) 1 << LBP_ZWJ;
6381 /* emoji base */
6382 if (ch == 0x261D /* WHITE UP POINTING INDEX */
6383 || ch == 0x26F9 /* PERSON WITH BALL */
6384 || (ch >= 0x270A && ch <= 0x270D) /* RAISED FIST..WRITING HAND */
6385 || ch == 0x1F385 /* FATHER CHRISTMAS */
6386 || (ch >= 0x1F3C3 && ch <= 0x1F3C4) /* RUNNER..SURFER */
6387 || (ch >= 0x1F3CA && ch <= 0x1F3CB) /* SWIMMER..WEIGHT LIFTER */
6388 || (ch >= 0x1F442 && ch <= 0x1F443) /* EAR..NOSE */
6389 || (ch >= 0x1F446 && ch <= 0x1F450) /* WHITE UP POINTING BACKHAND INDEX..OPEN HANDS SIGN */
6390 || (ch >= 0x1F466 && ch <= 0x1F469) /* BOY..WOMAN */
6391 || ch == 0x1F46E /* POLICE OFFICER */
6392 || (ch >= 0x1F470 && ch <= 0x1F478) /* BRIDE WITH VEIL..PRINCESS */
6393 || ch == 0x1F47C /* BABY ANGEL */
6394 || (ch >= 0x1F481 && ch <= 0x1F483) /* INFORMATION DESK PERSON..DANCER */
6395 || (ch >= 0x1F485 && ch <= 0x1F487) /* NAIL POLISH..HAIRCUT */
6396 || ch == 0x1F4AA /* FLEXED BICEPS */
6397 || ch == 0x1F575 /* SLEUTH OR SPY */
6398 || ch == 0x1F57A /* MAN DANCING */
6399 || ch == 0x1F590 /* RAISED HAND WITH FINGERS SPLAYED */
6400 || (ch >= 0x1F595 && ch <= 0x1F596) /* REVERSED HAND WITH MIDDLE FINGER EXTENDED..RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS */
6401 || (ch >= 0x1F645 && ch <= 0x1F647) /* FACE WITH NO GOOD GESTURE..PERSON BOWING DEEPLY */
6402 || (ch >= 0x1F64B && ch <= 0x1F64F) /* HAPPY PERSON RAISING ONE HAND..PERSON WITH FOLDED HANDS */
6403 || ch == 0x1F6A3 /* ROWBOAT */
6404 || (ch >= 0x1F6B4 && ch <= 0x1F6B6) /* BICYCLIST..PEDESTRIAN */
6405 || ch == 0x1F6C0 /* BATH */
6406 || (ch >= 0x1F918 && ch <= 0x1F91E) /* SIGN OF THE HORNS..HAND WITH INDEX AND MIDDLE FINGERS CROSSED */
6407 || ch == 0x1F926 /* FACE PALM */
6408 || ch == 0x1F930 /* PREGNANT WOMAN */
6409 || (ch >= 0x1F933 && ch <= 0x1F939) /* SELFIE..JUGGLING */
6410 || (ch >= 0x1F93C && ch <= 0x1F93E) /* WRESTLERS..HANDBALL */)
6411 attr |= (int64_t) 1 << LBP_EB;
6413 if ((ch >= 0x1F3FB && ch <= 0x1F3FF) /* EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 */)
6414 attr |= (int64_t) 1 << LBP_EM;
6416 /* non-breaking (glue) */
6417 if (ch == 0x00A0 /* NO-BREAK SPACE */
6418 || ch == 0x202F /* NARROW NO-BREAK SPACE */
6419 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
6420 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
6421 || ch == 0x2007 /* FIGURE SPACE */
6422 || ch == 0x2011 /* NON-BREAKING HYPHEN */
6423 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
6424 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
6425 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
6426 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
6427 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6428 || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
6429 || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
6430 attr |= (int64_t) 1 << LBP_GL;
6432 /* space */
6433 if (ch == 0x0020 /* SPACE */)
6434 attr |= (int64_t) 1 << LBP_SP;
6436 /* break opportunity before and after */
6437 if (ch == 0x2014 /* EM DASH */
6438 || ch == 0x2E3A /* TWO-EM DASH */
6439 || ch == 0x2E3B /* THREE-EM DASH */)
6440 attr |= (int64_t) 1 << LBP_B2;
6442 /* break opportunity after */
6443 if (/* Breaking Spaces */
6444 ch == 0x1680 /* OGHAM SPACE MARK */
6445 || ch == 0x2000 /* EN QUAD */
6446 || ch == 0x2001 /* EM QUAD */
6447 || ch == 0x2002 /* EN SPACE */
6448 || ch == 0x2003 /* EM SPACE */
6449 || ch == 0x2004 /* THREE-PER-EM SPACE */
6450 || ch == 0x2005 /* FOUR-PER-EM SPACE */
6451 || ch == 0x2006 /* SIX-PER-EM SPACE */
6452 || ch == 0x2008 /* PUNCTUATION SPACE */
6453 || ch == 0x2009 /* THIN SPACE */
6454 || ch == 0x200A /* HAIR SPACE */
6455 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
6456 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
6457 /* Tabs */
6458 || ch == 0x0009 /* tab */
6459 /* Conditional Hyphens */
6460 || ch == 0x00AD /* SOFT HYPHEN */
6461 /* Breaking Hyphens */
6462 || ch == 0x058A /* ARMENIAN HYPHEN */
6463 || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
6464 || ch == 0x2010 /* HYPHEN */
6465 || ch == 0x2012 /* FIGURE DASH */
6466 || ch == 0x2013 /* EN DASH */
6467 /* Visible Word Dividers */
6468 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
6469 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
6470 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
6471 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
6472 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
6473 || ch == 0x2027 /* HYPHENATION POINT */
6474 || ch == 0x007C /* VERTICAL LINE */
6475 /* Historic Word Separators */
6476 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
6477 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
6478 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
6479 || ch == 0x2056 /* THREE DOT PUNCTUATION */
6480 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
6481 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
6482 || ch == 0x205A /* TWO DOT PUNCTUATION */
6483 || ch == 0x205B /* FOUR DOT MARK */
6484 || ch == 0x205D /* TRICOLON */
6485 || ch == 0x205E /* VERTICAL FOUR DOTS */
6486 || ch == 0x2E19 /* PALM BRANCH */
6487 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
6488 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
6489 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
6490 || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
6491 || ch == 0x2E30 /* RING POINT */
6492 || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
6493 || ch == 0x2E33 /* RAISED DOT */
6494 || ch == 0x2E34 /* RAISED COMMA */
6495 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
6496 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
6497 || ch == 0x10102 /* AEGEAN CHECK MARK */
6498 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
6499 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
6500 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
6501 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
6502 /* Dandas */
6503 || ch == 0x0964 /* DEVANAGARI DANDA */
6504 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
6505 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
6506 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
6507 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
6508 || ch == 0x104B /* MYANMAR SIGN SECTION */
6509 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
6510 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
6511 || ch == 0x17D4 /* KHMER SIGN KHAN */
6512 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
6513 || ch == 0x1B5E /* BALINESE CARIK SIKI */
6514 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
6515 || ch == 0xA8CE /* SAURASHTRA DANDA */
6516 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
6517 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
6518 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
6519 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
6520 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
6521 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
6522 /* Tibetan */
6523 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
6524 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
6525 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
6526 || ch == 0x0FBE /* TIBETAN KU RU KHA */
6527 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
6528 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
6529 /* Other Terminating Punctuation */
6530 || ch == 0x1804 /* MONGOLIAN COLON */
6531 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
6532 || ch == 0x1B5A /* BALINESE PANTI */
6533 || ch == 0x1B5B /* BALINESE PAMADA */
6534 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
6535 || ch == 0x1B60 /* BALINESE PAMENENG */
6536 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
6537 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
6538 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
6539 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
6540 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
6541 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
6542 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
6543 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
6544 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
6545 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
6546 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
6547 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
6548 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
6549 || ch == 0x2E43 /* DASH WITH LEFT UPTURN */
6550 || ch == 0x2E44 /* DOUBLE SUSPENSION MARK */
6551 || ch == 0x2E3C /* STENOGRAPHIC FULL STOP */
6552 || ch == 0x2E3D /* VERTICAL SIX DOTS */
6553 || ch == 0x2E3E /* WIGGLY VERTICAL LINE */
6554 || ch == 0x2E40 /* DOUBLE HYPHEN */
6555 || ch == 0x2E41 /* REVERSED COMMA */
6556 || ch == 0xA60D /* VAI COMMA */
6557 || ch == 0xA60F /* VAI QUESTION MARK */
6558 || ch == 0xA92E /* KAYAH LI SIGN CWI */
6559 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
6560 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
6561 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
6562 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
6563 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
6564 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
6565 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
6566 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6567 || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
6568 || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
6569 || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
6570 || ch == 0xA6F3 /* BAMUM FULL STOP */
6571 || ch == 0xA6F4 /* BAMUM COLON */
6572 || ch == 0xA6F5 /* BAMUM COMMA */
6573 || ch == 0xA6F6 /* BAMUM SEMICOLON */
6574 || ch == 0xA6F7 /* BAMUM QUESTION MARK */
6575 || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
6576 || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
6577 || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
6578 || ch == 0xAAF0 /* MEETEI MAYEK CHEIKHAN */
6579 || ch == 0xAAF1 /* MEETEI MAYEK AHANG KHUDAM */
6580 || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
6581 || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
6582 || (ch >= 0x10AF0 && ch <= 0x10AF5) /* MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS */
6583 || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
6584 || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
6585 || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
6586 || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
6587 || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
6588 || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
6589 || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
6590 || ch == 0x11047 /* BRAHMI DANDA */
6591 || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
6592 || ch == 0x110BE /* KAITHI SECTION MARK */
6593 || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
6594 || ch == 0x110C0 /* KAITHI DANDA */
6595 || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
6596 || ch == 0x11140 /* CHAKMA SECTION MARK */
6597 || ch == 0x11141 /* CHAKMA DANDA */
6598 || ch == 0x11142 /* CHAKMA DOUBLE DANDA */
6599 || ch == 0x11143 /* CHAKMA QUESTION MARK */
6600 || ch == 0x111C5 /* SHARADA DANDA */
6601 || ch == 0x111C6 /* SHARADA DOUBLE DANDA */
6602 || ch == 0x111C8 /* SHARADA SEPARATOR */
6603 || (ch >= 0x111DD && ch <= 0x111DF) /* SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 */
6604 || ch == 0x11238 /* KHOJKI DANDA */
6605 || ch == 0x11239 /* KHOJKI DOUBLE DANDA */
6606 || ch == 0x1123B /* KHOJKI SECTION MARK */
6607 || ch == 0x1123C /* KHOJKI DOUBLE SECTION MARK */
6608 || ch == 0x112A9 /* MULTANI SECTION MARK */
6609 || (ch >= 0x1144B && ch <= 0x1144E) /* NEWA DANDA..NEWA GAP FILLER */
6610 || ch == 0x1145B /* NEWA PLACEHOLDER MARK */
6611 || ch == 0x115C2 /* SIDDHAM DANDA */
6612 || ch == 0x115C3 /* SIDDHAM DOUBLE DANDA */
6613 || (ch >= 0x115C9 && ch <= 0x115D7) /* SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES */
6614 || ch == 0x11641 /* MODI DANDA */
6615 || ch == 0x11642 /* MODI DOUBLE DANDA */
6616 || (ch >= 0x1173C && ch <= 0x1173E) /* AHOM SIGN SMALL SECTION..AHOM SIGN RULAI */
6617 || (ch >= 0x11C41 && ch <= 0x11C45) /* BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 */
6618 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
6619 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
6620 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */
6621 || ch == 0x12474 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON */
6622 || ch == 0x16A6E /* MRO DANDA */
6623 || ch == 0x16A6F /* MRO DOUBLE DANDA */
6624 || ch == 0x16AF5 /* BASSA VAH FULL STOP */
6625 || ch == 0x16B37 /* PAHAWH HMONG SIGN VOS THOM */
6626 || ch == 0x16B38 /* PAHAWH HMONG SIGN VOS TSHAB CEEB */
6627 || ch == 0x16B39 /* PAHAWH HMONG SIGN CIM CHEEM */
6628 || ch == 0x16B44 /* PAHAWH HMONG SIGN XAUS */
6629 || ch == 0x1BC9F /* DUPLOYAN PUNCTUATION CHINOOK FULL STOP */
6630 || (ch >= 0x1DA87 && ch <= 0x1DA8A) /* SIGNWRITING COMMA..SIGNWRITING COLON */)
6631 attr |= (int64_t) 1 << LBP_BA;
6633 /* break opportunity before */
6634 if (ch == 0x00B4 /* ACUTE ACCENT */
6635 || ch == 0x1FFD /* GREEK OXIA */
6636 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
6637 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
6638 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
6639 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
6640 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
6641 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
6642 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
6643 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
6644 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
6645 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
6646 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
6647 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
6648 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
6649 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
6650 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
6651 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
6652 || ch == 0xA8FC /* DEVANAGARI SIGN SIDDHAM */
6653 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */
6654 || ch == 0x11175 /* MAHAJANI SECTION MARK */
6655 || ch == 0x111DB /* SHARADA SIGN SIDDHAM */
6656 || ch == 0x115C1 /* SIDDHAM SIGN SIDDHAM */
6657 || (ch >= 0x11660 && ch <= 0x1166C) /* MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT */
6658 || ch == 0x11C70 /* MARCHEN HEAD MARK */)
6659 attr |= (int64_t) 1 << LBP_BB;
6661 /* hyphen */
6662 if (ch == 0x002D /* HYPHEN-MINUS */)
6663 attr |= (int64_t) 1 << LBP_HY;
6665 /* contingent break opportunity */
6666 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
6667 attr |= (int64_t) 1 << LBP_CB;
6669 /* closing parenthesis */
6670 if (ch == 0x0029 /* RIGHT PARENTHESIS */
6671 || ch == 0x005D /* RIGHT SQUARE BRACKET */)
6672 attr |= (int64_t) 1 << LBP_CP;
6674 /* closing punctuation */
6675 if ((unicode_attributes[ch].category[0] == 'P'
6676 && unicode_attributes[ch].category[1] == 'e'
6677 && !(attr & ((int64_t) 1 << LBP_CP)))
6678 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
6679 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
6680 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
6681 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
6682 || ch == 0xFE50 /* SMALL COMMA */
6683 || ch == 0xFE52 /* SMALL FULL STOP */
6684 || ch == 0xFF0C /* FULLWIDTH COMMA */
6685 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
6686 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
6687 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
6688 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6689 || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
6690 || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
6691 || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
6692 || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
6693 || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
6694 || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
6695 || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
6696 || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */
6697 || ch == 0x145CF /* ANATOLIAN HIEROGLYPH A410A END LOGOGRAM MARK */)
6698 attr |= (int64_t) 1 << LBP_CL;
6700 /* exclamation/interrogation */
6701 if (ch == 0x0021 /* EXCLAMATION MARK */
6702 || ch == 0x003F /* QUESTION MARK */
6703 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
6704 || ch == 0x061B /* ARABIC SEMICOLON */
6705 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
6706 || ch == 0x061F /* ARABIC QUESTION MARK */
6707 || ch == 0x06D4 /* ARABIC FULL STOP */
6708 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
6709 || ch == 0x0F0D /* TIBETAN MARK SHAD */
6710 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
6711 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
6712 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
6713 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
6714 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
6715 || ch == 0x1802 /* MONGOLIAN COMMA */
6716 || ch == 0x1803 /* MONGOLIAN FULL STOP */
6717 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
6718 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
6719 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
6720 || ch == 0x1945 /* LIMBU QUESTION MARK */
6721 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
6722 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
6723 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
6724 || ch == 0x2CFE /* COPTIC FULL STOP */
6725 || ch == 0x2E2E /* REVERSED QUESTION MARK */
6726 || ch == 0xA60E /* VAI FULL STOP */
6727 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
6728 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
6729 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
6730 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
6731 || ch == 0xFE56 /* SMALL QUESTION MARK */
6732 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
6733 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
6734 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */
6735 || ch == 0x115C4 /* SIDDHAM SEPARATOR DOT */
6736 || ch == 0x115C5 /* SIDDHAM SEPARATOR BAR */
6737 || ch == 0x11C71 /* MARCHEN MARK SHAD */)
6738 attr |= (int64_t) 1 << LBP_EX;
6740 /* inseparable */
6741 if (ch == 0x2024 /* ONE DOT LEADER */
6742 || ch == 0x2025 /* TWO DOT LEADER */
6743 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
6744 || ch == 0x22EF /* MIDLINE HORIZONTAL ELLIPSIS */
6745 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */
6746 || ch == 0x10AF6 /* MANICHAEAN PUNCTUATION LINE FILLER */)
6747 attr |= (int64_t) 1 << LBP_IN;
6749 /* non starter */
6750 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
6751 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
6752 || ch == 0x203D /* INTERROBANG */
6753 || ch == 0x2047 /* DOUBLE QUESTION MARK */
6754 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
6755 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
6756 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
6757 || ch == 0x301C /* WAVE DASH */
6758 || ch == 0x303C /* MASU MARK */
6759 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
6760 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
6761 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
6762 || ch == 0x309D /* HIRAGANA ITERATION MARK */
6763 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
6764 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
6765 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
6766 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6767 || ch == 0x30FD /* KATAKANA ITERATION MARK */
6768 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
6769 || ch == 0xA015 /* YI SYLLABLE WU */
6770 || ch == 0xFE54 /* SMALL SEMICOLON */
6771 || ch == 0xFE55 /* SMALL COLON */
6772 || ch == 0xFF1A /* FULLWIDTH COLON */
6773 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
6774 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
6775 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6776 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
6777 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
6778 || ch == 0x16FE0 /* TANGUT ITERATION MARK */
6779 || ch == 0x1F679 /* HEAVY INTERROBANG ORNAMENT */
6780 || ch == 0x1F67A /* SANS-SERIF INTERROBANG ORNAMENT */
6781 || ch == 0x1F67B /* HEAVY SANS-SERIF INTERROBANG ORNAMENT */
6782 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
6783 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
6784 attr |= (int64_t) 1 << LBP_NS;
6786 /* opening punctuation */
6787 if ((unicode_attributes[ch].category[0] == 'P'
6788 && unicode_attributes[ch].category[1] == 's')
6789 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
6790 || ch == 0x00BF /* INVERTED QUESTION MARK */
6791 || ch == 0x2E18 /* INVERTED INTERROBANG */
6792 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6793 || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
6794 || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
6795 || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
6796 || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
6797 || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
6798 || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */
6799 || ch == 0x145CE /* ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK */
6800 || (ch >= 0x1E95E && ch <= 0x1E95F) /* ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK */)
6801 attr |= (int64_t) 1 << LBP_OP;
6803 /* ambiguous quotation */
6804 if ((unicode_attributes[ch].category[0] == 'P'
6805 && (unicode_attributes[ch].category[1] == 'f'
6806 || unicode_attributes[ch].category[1] == 'i'))
6807 || ch == 0x0022 /* QUOTATION MARK */
6808 || ch == 0x0027 /* APOSTROPHE */
6809 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
6810 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
6811 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
6812 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
6813 || ch == 0x275F /* HEAVY LOW SINGLE COMMA QUOTATION MARK ORNAMENT */
6814 || ch == 0x2760 /* HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */
6815 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
6816 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
6817 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
6818 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
6819 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
6820 || ch == 0x2E0B /* RAISED SQUARE */
6821 || ch == 0x1F676 /* SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
6822 || ch == 0x1F677 /* SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
6823 || ch == 0x1F678 /* SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */)
6824 attr |= (int64_t) 1 << LBP_QU;
6826 /* infix separator (numeric) */
6827 if (ch == 0x002C /* COMMA */
6828 || ch == 0x002E /* FULL STOP */
6829 || ch == 0x003A /* COLON */
6830 || ch == 0x003B /* SEMICOLON */
6831 || ch == 0x037E /* GREEK QUESTION MARK */
6832 || ch == 0x0589 /* ARMENIAN FULL STOP */
6833 || ch == 0x060C /* ARABIC COMMA */
6834 || ch == 0x060D /* ARABIC DATE SEPARATOR */
6835 || ch == 0x07F8 /* NKO COMMA */
6836 || ch == 0x2044 /* FRACTION SLASH */
6837 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
6838 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
6839 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
6840 attr |= (int64_t) 1 << LBP_IS;
6842 /* numeric */
6843 if ((unicode_attributes[ch].category[0] == 'N'
6844 && unicode_attributes[ch].category[1] == 'd'
6845 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
6846 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
6847 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
6848 attr |= (int64_t) 1 << LBP_NU;
6850 /* postfix (numeric) */
6851 if (ch == 0x0025 /* PERCENT SIGN */
6852 || ch == 0x00A2 /* CENT SIGN */
6853 || ch == 0x00B0 /* DEGREE SIGN */
6854 || ch == 0x060B /* AFGHANI SIGN */
6855 || ch == 0x066A /* ARABIC PERCENT SIGN */
6856 || ch == 0x2030 /* PER MILLE SIGN */
6857 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
6858 || ch == 0x2032 /* PRIME */
6859 || ch == 0x2033 /* DOUBLE PRIME */
6860 || ch == 0x2034 /* TRIPLE PRIME */
6861 || ch == 0x2035 /* REVERSED PRIME */
6862 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
6863 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
6864 || ch == 0x20A7 /* PESETA SIGN */
6865 || ch == 0x20BB /* NORDIC MARK SIGN */
6866 || ch == 0x2103 /* DEGREE CELSIUS */
6867 || ch == 0x2109 /* DEGREE FAHRENHEIT */
6868 || ch == 0xFDFC /* RIAL SIGN */
6869 || ch == 0xFE6A /* SMALL PERCENT SIGN */
6870 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
6871 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
6872 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6873 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
6874 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
6875 || ch == 0x09F2 /* BENGALI RUPEE MARK */
6876 || ch == 0x09F3 /* BENGALI RUPEE SIGN */
6877 || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
6878 || ch == 0x0D79 /* MALAYALAM DATE MARK */
6879 || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
6880 || ch == 0x20BE /* LARI SIGN */
6881 || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
6882 attr |= (int64_t) 1 << LBP_PO;
6884 /* prefix (numeric) */
6885 if ((unicode_attributes[ch].category[0] == 'S'
6886 && unicode_attributes[ch].category[1] == 'c')
6887 || ch == 0x002B /* PLUS SIGN */
6888 || ch == 0x005C /* REVERSE SOLIDUS */
6889 || ch == 0x00B1 /* PLUS-MINUS SIGN */
6890 || ch == 0x2116 /* NUMERO SIGN */
6891 || ch == 0x2212 /* MINUS SIGN */
6892 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
6893 if (!(attr & ((int64_t) 1 << LBP_PO)))
6894 attr |= (int64_t) 1 << LBP_PR;
6896 /* symbols allowing breaks */
6897 if (ch == 0x002F /* SOLIDUS */)
6898 attr |= (int64_t) 1 << LBP_SY;
6900 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
6901 attr |= (int64_t) 1 << LBP_H2;
6903 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
6904 attr |= (int64_t) 1 << LBP_H3;
6906 if ((ch >= 0x05D0 && ch <= 0x05F2) || ch == 0xFB1D
6907 || (ch >= 0xFB1F && ch <= 0xFB28) || (ch >= 0xFB2A && ch <= 0xFB4F))
6908 attr |= (int64_t) 1 << LBP_HL;
6910 if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
6911 attr |= (int64_t) 1 << LBP_JL;
6913 if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
6914 attr |= (int64_t) 1 << LBP_JV;
6916 if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
6917 attr |= (int64_t) 1 << LBP_JT;
6919 /* regional indicator */
6920 if (ch >= 0x1F1E6 && ch <= 0x1F1FF)
6921 attr |= (int64_t) 1 << LBP_RI;
6923 /* complex context (South East Asian) */
6924 if (((unicode_attributes[ch].category[0] == 'C'
6925 && unicode_attributes[ch].category[1] == 'f')
6926 || (unicode_attributes[ch].category[0] == 'L'
6927 && (unicode_attributes[ch].category[1] == 'm'
6928 || unicode_attributes[ch].category[1] == 'o'))
6929 || (unicode_attributes[ch].category[0] == 'M'
6930 && (unicode_attributes[ch].category[1] == 'c'
6931 || unicode_attributes[ch].category[1] == 'n')
6932 && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
6933 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6934 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
6935 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
6936 || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
6937 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
6938 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
6939 || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
6940 || (ch >= 0xA9E0 && ch <= 0xA9EF) /* Myanmar */
6941 || (ch >= 0xA9FA && ch <= 0xA9FE) /* Myanmar */
6942 || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
6943 || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */
6944 || (ch >= 0x1173A && ch <= 0x1173B) /* Ahom */
6945 || ch == 0x1173F /* Ahom */)
6946 && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
6947 || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
6948 || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
6949 || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
6950 || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
6951 || (ch >= 0xA9E0 && ch <= 0xA9EF) /* Myanmar */
6952 || (ch >= 0xA9FA && ch <= 0xA9FE) /* Myanmar */
6953 || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */
6954 || (ch >= 0x11700 && ch <= 0x11719) /* Ahom */
6955 || (ch >= 0x1171D && ch <= 0x1172B) /* Ahom */
6956 || (ch >= 0x1173A && ch <= 0x1173B) /* Ahom */
6957 || ch == 0x1173F /* Ahom */))
6958 attr |= (int64_t) 1 << LBP_SA;
6960 /* attached characters and combining marks */
6961 if ((unicode_attributes[ch].category[0] == 'M'
6962 && (unicode_attributes[ch].category[1] == 'c'
6963 || unicode_attributes[ch].category[1] == 'e'
6964 || unicode_attributes[ch].category[1] == 'n'))
6965 || (unicode_attributes[ch].category[0] == 'C'
6966 && (unicode_attributes[ch].category[1] == 'c'
6967 || unicode_attributes[ch].category[1] == 'f')
6968 && ch != 0x110BD /* KAITHI NUMBER SIGN */
6969 && ch != 0x08E2 /* ARABIC DISPUTED END OF AYAH */)
6970 || ch == 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
6971 if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW) | ((int64_t) 1 << LBP_ZWJ))))
6972 attr |= (int64_t) 1 << LBP_CM;
6974 /* ideographic */
6975 if (ch == 0x231A /* WATCH */
6976 || ch == 0x231B /* HOURGLASS */
6977 || ch == 0x23F0 /* ALARM CLOCK */
6978 || ch == 0x23F1 /* STOPWATCH */
6979 || ch == 0x23F2 /* TIMER CLOCK */
6980 || ch == 0x23F3 /* HOURGLASS WITH FLOWING SAND */
6981 || ch == 0x2600 /* BLACK SUN WITH RAYS */
6982 || ch == 0x2601 /* CLOUD */
6983 || ch == 0x2602 /* UMBRELLA */
6984 || ch == 0x2603 /* SNOWMAN */
6985 || ch == 0x2614 /* UMBRELLA WITH RAIN DROPS */
6986 || ch == 0x2615 /* HOT BEVERAGE */
6987 || ch == 0x2618 /* SHAMROCK */
6988 || ch == 0x261A /* BLACK LEFT POINTING INDEX */
6989 || ch == 0x261B /* BLACK RIGHT POINTING INDEX */
6990 || ch == 0x261C /* WHITE LEFT POINTING INDEX */
6991 || ch == 0x261D /* WHITE UP POINTING INDEX */
6992 || ch == 0x261E /* WHITE RIGHT POINTING INDEX */
6993 || ch == 0x261F /* WHITE DOWN POINTING INDEX */
6994 || ch == 0x2639 /* WHITE FROWNING FACE */
6995 || ch == 0x263A /* WHITE SMILING FACE */
6996 || ch == 0x263B /* BLACK SMILING FACE */
6997 || ch == 0x2668 /* HOT SPRINGS */
6998 || ch == 0x267F /* WHEELCHAIR SYMBOL */
6999 || ch == 0x26BD /* SOCCER BALL */
7000 || ch == 0x26BE /* BASEBALL */
7001 || ch == 0x26BF /* SQUARED KEY */
7002 || ch == 0x26C0 /* WHITE DRAUGHTS MAN */
7003 || ch == 0x26C1 /* WHITE DRAUGHTS KING */
7004 || ch == 0x26C2 /* BLACK DRAUGHTS MAN */
7005 || ch == 0x26C3 /* BLACK DRAUGHTS KING */
7006 || ch == 0x26C4 /* SNOWMAN WITHOUT SNOW */
7007 || ch == 0x26C5 /* SUN BEHIND CLOUD */
7008 || ch == 0x26C6 /* RAIN */
7009 || ch == 0x26C7 /* BLACK SNOWMAN */
7010 || ch == 0x26C8 /* THUNDER CLOUD AND RAIN */
7011 || ch == 0x26CD /* DISABLED CAR */
7012 || ch == 0x26CF /* PICK */
7013 || ch == 0x26D0 /* CAR SLIDING */
7014 || ch == 0x26D1 /* HELMET WITH WHITE CROSS */
7015 || ch == 0x26D3 /* CHAINS */
7016 || ch == 0x26D4 /* NO ENTRY */
7017 || ch == 0x26D8 /* BLACK LEFT LANE MERGE */
7018 || ch == 0x26D9 /* WHITE LEFT LANE MERGE */
7019 || ch == 0x26DC /* LEFT CLOSED ENTRY */
7020 || ch == 0x26DF /* BLACK TRUCK */
7021 || ch == 0x26E0 /* RESTRICTED LEFT ENTRY-1 */
7022 || ch == 0x26E1 /* RESTRICTED LEFT ENTRY-2 */
7023 || ch == 0x26EA /* CHURCH */
7024 || ch == 0x26F1 /* UMBRELLA ON GROUND */
7025 || ch == 0x26F2 /* FOUNTAIN */
7026 || ch == 0x26F3 /* FLAG IN HOLE */
7027 || ch == 0x26F4 /* FERRY */
7028 || ch == 0x26F5 /* SAILBOAT */
7029 || ch == 0x26F7 /* SKIER */
7030 || ch == 0x26F8 /* ICE SKATE */
7031 || ch == 0x26F9 /* PERSON WITH BALL */
7032 || ch == 0x26FA /* TENT */
7033 || ch == 0x26FD /* FUEL PUMP */
7034 || ch == 0x26FE /* CUP ON BLACK SQUARE */
7035 || ch == 0x26FF /* WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE */
7036 || ch == 0x2700 /* BLACK SAFETY SCISSORS */
7037 || ch == 0x2701 /* UPPER BLADE SCISSORS */
7038 || ch == 0x2702 /* BLACK SCISSORS */
7039 || ch == 0x2703 /* LOWER BLADE SCISSORS */
7040 || ch == 0x2704 /* WHITE SCISSORS */
7041 || ch == 0x2708 /* AIRPLANE */
7042 || ch == 0x2709 /* ENVELOPE */
7043 || ch == 0x270A /* RAISED FIST */
7044 || ch == 0x270B /* RAISED HAND */
7045 || ch == 0x270C /* VICTORY HAND */
7046 || ch == 0x270D /* WRITING HAND */
7047 || ch == 0x2764 /* HEAVY BLACK HEART */
7048 || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
7049 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
7050 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
7051 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
7052 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
7053 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
7054 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
7055 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
7056 || ch == 0xFE62 /* SMALL PLUS SIGN */
7057 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
7058 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
7059 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
7060 || ch == 0xFE66 /* SMALL EQUALS SIGN */
7061 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
7062 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
7063 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
7064 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
7065 || (ch >= 0x3000 && ch <= 0x33FF
7066 && !(attr & (((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
7067 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7068 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
7069 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
7070 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
7071 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
7072 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
7073 || ch == 0xFE45 /* SESAME DOT */
7074 || ch == 0xFE46 /* WHITE SESAME DOT */
7075 || ch == 0xFE49 /* DASHED OVERLINE */
7076 || ch == 0xFE4A /* CENTRELINE OVERLINE */
7077 || ch == 0xFE4B /* WAVY OVERLINE */
7078 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
7079 || ch == 0xFE4D /* DASHED LOW LINE */
7080 || ch == 0xFE4E /* CENTRELINE LOW LINE */
7081 || ch == 0xFE4F /* WAVY LOW LINE */
7082 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
7083 || ch == 0xFE58 /* SMALL EM DASH */
7084 || ch == 0xFE5F /* SMALL NUMBER SIGN */
7085 || ch == 0xFE60 /* SMALL AMPERSAND */
7086 || ch == 0xFE61 /* SMALL ASTERISK */
7087 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
7088 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
7089 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
7090 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
7091 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
7092 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
7093 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
7094 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
7095 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
7096 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
7097 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
7098 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
7099 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
7100 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
7101 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
7102 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
7103 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
7104 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
7105 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
7106 || ch == 0xFF5E /* FULLWIDTH TILDE */
7107 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
7108 || ch == 0xFFE3 /* FULLWIDTH MACRON */
7109 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
7110 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7111 || ch == 0xFF66 /* Halfwidth Katakana */
7112 || (ch >= 0xFF71 && ch <= 0xFF9D) /* Halfwidth Katakana */
7113 || (ch >= 0xFFA0 && ch <= 0xFFBE) /* Halfwidth Hangul */
7114 || (ch >= 0xFFC2 && ch <= 0xFFC7) /* Halfwidth Hangul */
7115 || (ch >= 0xFFCA && ch <= 0xFFCF) /* Halfwidth Hangul */
7116 || (ch >= 0xFFD2 && ch <= 0xFFD7) /* Halfwidth Hangul */
7117 || (ch >= 0xFFDA && ch <= 0xFFDC) /* Halfwidth Hangul */
7118 || (ch >= 0x17000 && ch <= 0x187EC) /* Tangut Ideograph */
7119 || (ch >= 0x18800 && ch <= 0x18AF2) /* Tangut Ideograph */
7120 || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
7121 || (ch >= 0x1F000 && ch <= 0x1F02B) /* Mahjong Tiles */
7122 || (ch >= 0x1F030 && ch <= 0x1F093) /* Domino Tiles */
7123 || (ch >= 0x1F0A0 && ch <= 0x1F0F5) /* Playing Cards */
7124 || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
7125 || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
7126 || (ch >= 0x1F300 && ch <= 0x1F5FF /* Miscellaneous Symbols and Pictographs */
7127 && ch != 0x1F3B5 && ch != 0x1F3B6 && ch != 0x1F3BC
7128 && ch != 0x1F4A0 && ch != 0x1F4A2 && ch != 0x1F4A4
7129 && ch != 0x1F4AF && ch != 0x1F4B1 && ch != 0x1F4B2
7130 && !(ch >= 0x1F39C && ch <= 0x1F39D)
7131 && !(ch >= 0x1F3FB && ch <= 0x1F3FF)
7132 && !(ch >= 0x1F500 && ch <= 0x1F506)
7133 && !(ch >= 0x1F517 && ch <= 0x1F524)
7134 && !(ch >= 0x1F532 && ch <= 0x1F549)
7135 && !(ch >= 0x1F5D4 && ch <= 0x1F5DB)
7136 && !(ch >= 0x1F5F4 && ch <= 0x1F5F9))
7137 || (ch >= 0x1F600 && ch <= 0x1F64F) /* Emoticons */
7138 || (ch >= 0x1F680 && ch <= 0x1F6DF) /* Transport and Map Symbols */
7139 || (ch >= 0x1F6E0 && ch <= 0x1F6EC) /* Transport and Map Symbols */
7140 || (ch >= 0x1F6F0 && ch <= 0x1F6F6) /* Transport and Map Symbols */
7141 || (ch >= 0x1F900 && ch <= 0x1F9FF) /* Supplemental Symbols and Pictographs */
7142 || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
7143 || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */
7144 || (ch >= 0x2B820 && ch <= 0x2CEAF) /* CJK Ideograph Extension E */)
7145 if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_EB))))
7147 /* ambiguous (ideograph) ? */
7148 if ((unicode_width[ch] != NULL
7149 && unicode_width[ch][0] == 'A'
7150 && ch >= 0x2000
7151 && ch != 0x2614
7152 && ch != 0x2615
7153 && ch != 0x261C
7154 && ch != 0x261E
7155 && ch != 0x2668
7156 && ch != 0x26BE
7157 && ch != 0x26BF
7158 && !(ch >= 0x26C4 && ch <= 0x26C8)
7159 && ch != 0x26CD
7160 && ch != 0x26CF
7161 && ch != 0x26D0
7162 && ch != 0x26D1
7163 && ch != 0x26D3
7164 && ch != 0x26D4
7165 && ch != 0x26D8
7166 && ch != 0x26D9
7167 && ch != 0x26DC
7168 && ch != 0x26DF
7169 && ch != 0x26E0
7170 && ch != 0x26E1
7171 && ch != 0x26EA
7172 && !(ch >= 0x26F1 && ch <= 0x26F5)
7173 && !(ch >= 0x26F7 && ch <= 0x26FA)
7174 && !(ch >= 0x26FD && ch <= 0x26FF))
7175 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
7176 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
7177 attr |= (int64_t) 1 << LBP_AI;
7178 else
7179 attr |= (int64_t) 1 << LBP_ID;
7182 /* ordinary alphabetic and symbol characters */
7183 if ((unicode_attributes[ch].category[0] == 'L'
7184 && (unicode_attributes[ch].category[1] == 'u'
7185 || unicode_attributes[ch].category[1] == 'l'
7186 || unicode_attributes[ch].category[1] == 't'
7187 || unicode_attributes[ch].category[1] == 'm'
7188 || unicode_attributes[ch].category[1] == 'o'))
7189 || (unicode_attributes[ch].category[0] == 'S'
7190 && (unicode_attributes[ch].category[1] == 'm'
7191 || unicode_attributes[ch].category[1] == 'k'
7192 || unicode_attributes[ch].category[1] == 'o'))
7193 || (unicode_attributes[ch].category[0] == 'N'
7194 && (unicode_attributes[ch].category[1] == 'l'
7195 || unicode_attributes[ch].category[1] == 'o'))
7196 || (unicode_attributes[ch].category[0] == 'P'
7197 && (unicode_attributes[ch].category[1] == 'c'
7198 || unicode_attributes[ch].category[1] == 'd'
7199 || unicode_attributes[ch].category[1] == 'o'))
7200 || ch == 0x0600 /* ARABIC NUMBER SIGN */
7201 || ch == 0x0601 /* ARABIC SIGN SANAH */
7202 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
7203 || ch == 0x0603 /* ARABIC SIGN SAFHA */
7204 || ch == 0x0604 /* ARABIC SIGN SAMVAT */
7205 || ch == 0x0605 /* ARABIC NUMBER MARK ABOVE */
7206 || ch == 0x06DD /* ARABIC END OF AYAH */
7207 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
7208 || ch == 0x08E2 /* ARABIC DISPUTED END OF AYAH */
7209 || ch == 0x2061 /* FUNCTION APPLICATION */
7210 || ch == 0x2062 /* INVISIBLE TIMES */
7211 || ch == 0x2063 /* INVISIBLE SEPARATOR */
7212 || ch == 0x2064 /* INVISIBLE PLUS */
7213 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7214 || ch == 0x110BD /* KAITHI NUMBER SIGN */)
7215 if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID) | ((int64_t) 1 << LBP_EB) | ((int64_t) 1 << LBP_EM)))
7216 && ch != 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
7218 /* ambiguous (alphabetic) ? */
7219 if ((unicode_width[ch] != NULL
7220 && unicode_width[ch][0] == 'A'
7221 && ch >= 0x2000
7222 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
7223 && ch != 0x2022 /* BULLET */
7224 && ch != 0x203E /* OVERLINE */
7225 && ch != 0x2126 /* OHM SIGN */
7226 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
7227 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
7228 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
7229 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
7230 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
7231 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
7232 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
7233 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
7234 || ch == 0x00A7 /* SECTION SIGN */
7235 || ch == 0x00A8 /* DIAERESIS */
7236 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
7237 || ch == 0x00B2 /* SUPERSCRIPT TWO */
7238 || ch == 0x00B3 /* SUPERSCRIPT THREE */
7239 || ch == 0x00B6 /* PILCROW SIGN */
7240 || ch == 0x00B7 /* MIDDLE DOT */
7241 || ch == 0x00B8 /* CEDILLA */
7242 || ch == 0x00B9 /* SUPERSCRIPT ONE */
7243 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
7244 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
7245 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
7246 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
7247 || ch == 0x00D7 /* MULTIPLICATION SIGN */
7248 || ch == 0x00F7 /* DIVISION SIGN */
7249 || ch == 0x02C7 /* CARON */
7250 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
7251 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
7252 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
7253 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
7254 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
7255 || ch == 0x02D8 /* BREVE */
7256 || ch == 0x02D9 /* DOT ABOVE */
7257 || ch == 0x02DA /* RING ABOVE */
7258 || ch == 0x02DB /* OGONEK */
7259 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
7260 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
7261 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
7262 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7263 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
7264 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
7265 || ch == 0x2616 /* WHITE SHOGI PIECE */
7266 || ch == 0x2617 /* BLACK SHOGI PIECE */
7267 || ch == 0x2757 /* HEAVY EXCLAMATION MARK SYMBOL */
7268 || ch == 0x2B55 /* HEAVY LARGE CIRCLE */
7269 || ch == 0x1F10B /* DINGBAT CIRCLED SANS-SERIF DIGIT ZERO */
7270 || ch == 0x1F18E /* NEGATIVE SQUARED AB */
7271 || (ch >= 0x1F191 && ch <= 0x1F19A) /* SQUARED CL..SQUARED VS */
7272 || ch == 0x1F10C /* DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO */)
7273 attr |= (int64_t) 1 << LBP_AI;
7274 else
7275 attr |= (int64_t) 1 << LBP_AL;
7276 attr &= ~((int64_t) 1 << LBP_CM);
7279 else
7281 /* Unassigned character. */
7282 if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
7283 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
7284 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
7285 || (ch >= 0x1F02C && ch <= 0x1F02F) /* reserved */
7286 || (ch >= 0x1F094 && ch <= 0x1F09F) /* reserved */
7287 || (ch >= 0x1F0AF && ch <= 0x1F0B0) /* reserved */
7288 || ch == 0x1F0C0 /* reserved */
7289 || ch == 0x1F0D0 /* reserved */
7290 || (ch >= 0x1F0F6 && ch <= 0x1F0FF) /* reserved */
7291 || (ch >= 0x1F10D && ch <= 0x1F10F) /* reserved */
7292 || ch == 0x1F12F /* reserved */
7293 || (ch >= 0x1F16C && ch <= 0x1F16F) /* reserved */
7294 || (ch >= 0x1F1AD && ch <= 0x1F1E5) /* reserved */
7295 || (ch >= 0x1F203 && ch <= 0x1F20F) /* reserved */
7296 || (ch >= 0x1F23C && ch <= 0x1F23F) /* reserved */
7297 || (ch >= 0x1F249 && ch <= 0x1F24F) /* reserved */
7298 || (ch >= 0x1F252 && ch <= 0x1F2FF) /* reserved */
7299 || (ch >= 0x1F6D3 && ch <= 0x1F6DF) /* reserved */
7300 || (ch >= 0x1F6ED && ch <= 0x1F6EF) /* reserved */
7301 || (ch >= 0x1F6F7 && ch <= 0x1F6FF) /* reserved */
7302 || (ch >= 0x1F774 && ch <= 0x1F77F) /* reserved */
7303 || (ch >= 0x1F7D5 && ch <= 0x1F7FF) /* reserved */
7304 || (ch >= 0x1F80C && ch <= 0x1F80F) /* reserved */
7305 || (ch >= 0x1F848 && ch <= 0x1F84F) /* reserved */
7306 || (ch >= 0x1F85A && ch <= 0x1F85F) /* reserved */
7307 || (ch >= 0x1F888 && ch <= 0x1F88F) /* reserved */
7308 || (ch >= 0x1F8AE && ch <= 0x1F90F) /* reserved */
7309 || ch == 0x1F91F /* reserved */
7310 || ch == 0x1F93F /* reserved */
7311 || (ch >= 0x1F928 && ch <= 0x1F92F) /* reserved */
7312 || (ch >= 0x1F931 && ch <= 0x1F932) /* reserved */
7313 || (ch >= 0x1F94C && ch <= 0x1F94F) /* reserved */
7314 || (ch >= 0x1F95F && ch <= 0x1F97F) /* reserved */
7315 || (ch >= 0x1F992 && ch <= 0x1F9BF) /* reserved */
7316 || (ch >= 0x1F9C1 && ch <= 0x1FFFD) /* reserved */
7317 || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
7318 || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
7319 Supplementary Ideographic Plane (Plane 2) outside of blocks */
7320 || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
7321 Supplementary Ideographic Plane (Plane 2) outside of blocks */
7322 || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
7323 attr |= (int64_t) 1 << LBP_ID;
7326 if (attr == 0)
7327 /* unknown */
7328 attr |= (int64_t) 1 << LBP_XX;
7330 return attr;
7333 /* Output the line breaking properties in a human readable format. */
7334 static void
7335 debug_output_lbp (FILE *stream)
7337 unsigned int i;
7339 for (i = 0; i < 0x110000; i++)
7341 int64_t attr = get_lbp (i);
7342 if (attr != (int64_t) 1 << LBP_XX)
7344 fprintf (stream, "0x%04X", i);
7345 #define PRINT_BIT(attr,bit) \
7346 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
7347 PRINT_BIT(attr,LBP_BK);
7348 PRINT_BIT(attr,LBP_CM);
7349 PRINT_BIT(attr,LBP_WJ);
7350 PRINT_BIT(attr,LBP_ZW);
7351 PRINT_BIT(attr,LBP_GL);
7352 PRINT_BIT(attr,LBP_SP);
7353 PRINT_BIT(attr,LBP_B2);
7354 PRINT_BIT(attr,LBP_BA);
7355 PRINT_BIT(attr,LBP_BB);
7356 PRINT_BIT(attr,LBP_HY);
7357 PRINT_BIT(attr,LBP_CB);
7358 PRINT_BIT(attr,LBP_CL);
7359 PRINT_BIT(attr,LBP_CP);
7360 PRINT_BIT(attr,LBP_EX);
7361 PRINT_BIT(attr,LBP_IN);
7362 PRINT_BIT(attr,LBP_NS);
7363 PRINT_BIT(attr,LBP_OP);
7364 PRINT_BIT(attr,LBP_QU);
7365 PRINT_BIT(attr,LBP_IS);
7366 PRINT_BIT(attr,LBP_NU);
7367 PRINT_BIT(attr,LBP_PO);
7368 PRINT_BIT(attr,LBP_PR);
7369 PRINT_BIT(attr,LBP_SY);
7370 PRINT_BIT(attr,LBP_AI);
7371 PRINT_BIT(attr,LBP_AL);
7372 PRINT_BIT(attr,LBP_H2);
7373 PRINT_BIT(attr,LBP_H3);
7374 PRINT_BIT(attr,LBP_HL);
7375 PRINT_BIT(attr,LBP_ID);
7376 PRINT_BIT(attr,LBP_JL);
7377 PRINT_BIT(attr,LBP_JV);
7378 PRINT_BIT(attr,LBP_JT);
7379 PRINT_BIT(attr,LBP_RI);
7380 PRINT_BIT(attr,LBP_SA);
7381 PRINT_BIT(attr,LBP_ZWJ);
7382 PRINT_BIT(attr,LBP_EB);
7383 PRINT_BIT(attr,LBP_EM);
7384 PRINT_BIT(attr,LBP_XX);
7385 #undef PRINT_BIT
7386 fprintf (stream, "\n");
/* Creates (or truncates) FILENAME and writes the debug_output_lbp() listing
   into it.  Prints a message to stderr and exits with status 1 if the file
   cannot be opened or if writing or closing it fails.  */
static void
debug_output_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (out);

  /* fclose is only attempted when no write error has been recorded.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7412 /* The line breaking property from the LineBreak.txt file. */
7413 int unicode_org_lbp[0x110000];
7415 /* Stores in unicode_org_lbp[] the line breaking property from the
7416 LineBreak.txt file. */
7417 static void
7418 fill_org_lbp (const char *linebreak_filename)
7420 unsigned int i, j;
7421 FILE *stream;
7422 char field0[FIELDLEN];
7423 char field1[FIELDLEN];
7424 char field2[FIELDLEN];
7425 int lineno = 0;
7427 for (i = 0; i < 0x110000; i++)
7428 unicode_org_lbp[i] = LBP_XX;
7430 stream = fopen (linebreak_filename, "r");
7431 if (stream == NULL)
7433 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
7434 exit (1);
7437 for (;;)
7439 int n;
7440 int c;
7441 int value;
7443 lineno++;
7444 c = getc (stream);
7445 if (c == EOF)
7446 break;
7447 if (c == '#')
7449 do c = getc (stream); while (c != EOF && c != '\n');
7450 continue;
7452 ungetc (c, stream);
7453 n = getfield (stream, field0, ';');
7454 n += getfield (stream, field1, ' ');
7455 n += getfield (stream, field2, '\n');
7456 if (n == 0)
7457 break;
7458 if (n != 3)
7460 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
7461 lineno);
7462 exit (1);
7464 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
7465 if (false) {}
7466 TRY(LBP_BK)
7467 TRY(LBP_CM)
7468 TRY(LBP_WJ)
7469 TRY(LBP_ZW)
7470 TRY(LBP_GL)
7471 TRY(LBP_SP)
7472 TRY(LBP_B2)
7473 TRY(LBP_BA)
7474 TRY(LBP_BB)
7475 TRY(LBP_HY)
7476 TRY(LBP_CB)
7477 TRY(LBP_CL)
7478 TRY(LBP_CP)
7479 TRY(LBP_EX)
7480 TRY(LBP_IN)
7481 TRY(LBP_NS)
7482 TRY(LBP_OP)
7483 TRY(LBP_QU)
7484 TRY(LBP_IS)
7485 TRY(LBP_NU)
7486 TRY(LBP_PO)
7487 TRY(LBP_PR)
7488 TRY(LBP_SY)
7489 TRY(LBP_AI)
7490 TRY(LBP_AL)
7491 TRY(LBP_H2)
7492 TRY(LBP_H3)
7493 TRY(LBP_HL)
7494 TRY(LBP_ID)
7495 TRY(LBP_JL)
7496 TRY(LBP_JV)
7497 TRY(LBP_JT)
7498 TRY(LBP_RI)
7499 TRY(LBP_SA)
7500 TRY(LBP_ZWJ)
7501 TRY(LBP_EB)
7502 TRY(LBP_EM)
7503 TRY(LBP_XX)
7504 #undef TRY
7505 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
7506 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
7507 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
7508 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
7509 else if (strcmp (field1, "CJ") == 0) value = LBP_NS;
7510 else
7512 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
7513 field1, linebreak_filename, lineno);
7514 exit (1);
7516 i = strtoul (field0, NULL, 16);
7517 if (strstr (field0, "..") != NULL)
7519 /* Deal with a range. */
7520 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
7521 for (; i <= j; i++)
7522 unicode_org_lbp[i] = value;
7524 else
7526 /* Single character line. */
7527 unicode_org_lbp[i] = value;
7531 if (ferror (stream) || fclose (stream))
7533 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
7534 exit (1);
7538 /* Output the line breaking properties in a human readable format. */
7539 static void
7540 debug_output_org_lbp (FILE *stream)
7542 unsigned int i;
7544 for (i = 0; i < 0x110000; i++)
7546 int attr = unicode_org_lbp[i];
7547 if (attr != LBP_XX)
7549 fprintf (stream, "0x%04X", i);
7550 #define PRINT_BIT(attr,bit) \
7551 if (attr == bit) fprintf (stream, " " #bit);
7552 PRINT_BIT(attr,LBP_BK);
7553 PRINT_BIT(attr,LBP_CM);
7554 PRINT_BIT(attr,LBP_WJ);
7555 PRINT_BIT(attr,LBP_ZW);
7556 PRINT_BIT(attr,LBP_GL);
7557 PRINT_BIT(attr,LBP_SP);
7558 PRINT_BIT(attr,LBP_B2);
7559 PRINT_BIT(attr,LBP_BA);
7560 PRINT_BIT(attr,LBP_BB);
7561 PRINT_BIT(attr,LBP_HY);
7562 PRINT_BIT(attr,LBP_CB);
7563 PRINT_BIT(attr,LBP_CL);
7564 PRINT_BIT(attr,LBP_CP);
7565 PRINT_BIT(attr,LBP_EX);
7566 PRINT_BIT(attr,LBP_IN);
7567 PRINT_BIT(attr,LBP_NS);
7568 PRINT_BIT(attr,LBP_OP);
7569 PRINT_BIT(attr,LBP_QU);
7570 PRINT_BIT(attr,LBP_IS);
7571 PRINT_BIT(attr,LBP_NU);
7572 PRINT_BIT(attr,LBP_PO);
7573 PRINT_BIT(attr,LBP_PR);
7574 PRINT_BIT(attr,LBP_SY);
7575 PRINT_BIT(attr,LBP_AI);
7576 PRINT_BIT(attr,LBP_AL);
7577 PRINT_BIT(attr,LBP_H2);
7578 PRINT_BIT(attr,LBP_H3);
7579 PRINT_BIT(attr,LBP_HL);
7580 PRINT_BIT(attr,LBP_ID);
7581 PRINT_BIT(attr,LBP_JL);
7582 PRINT_BIT(attr,LBP_JV);
7583 PRINT_BIT(attr,LBP_JT);
7584 PRINT_BIT(attr,LBP_RI);
7585 PRINT_BIT(attr,LBP_SA);
7586 PRINT_BIT(attr,LBP_ZWJ);
7587 PRINT_BIT(attr,LBP_EB);
7588 PRINT_BIT(attr,LBP_EM);
7589 PRINT_BIT(attr,LBP_XX);
7590 #undef PRINT_BIT
7591 fprintf (stream, "\n");
/* Creates (or truncates) FILENAME and writes the debug_output_org_lbp()
   listing into it.  Prints a message to stderr and exits with status 1 if
   the file cannot be opened or if writing or closing it fails.  */
static void
debug_output_org_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (out);

  /* fclose is only attempted when no write error has been recorded.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7617 /* Construction of sparse 3-level tables. */
/* The macros below are defined immediately before "3level.h" is included:
   TABLE is lbp_table, ELEMENT is unsigned char, DEFAULT is LBP_XX, and
   xmalloc/xrealloc are mapped to plain malloc/realloc.
   NOTE(review): presumably 3level.h consumes these macros to generate
   'struct lbp_table' and the lbp_table_init / lbp_table_add /
   lbp_table_finalize functions that output_lbp uses - confirm against
   3level.h.  */
7618 #define TABLE lbp_table
7619 #define ELEMENT unsigned char
7620 #define DEFAULT LBP_XX
7621 #define xmalloc malloc
7622 #define xrealloc realloc
7623 #include "3level.h"
7625 static void
7626 output_lbp (FILE *stream1, FILE *stream2)
7628 unsigned int i;
7629 struct lbp_table t;
7630 unsigned int level1_offset, level2_offset, level3_offset;
7632 t.p = 7;
7633 t.q = 9;
7634 lbp_table_init (&t);
7636 for (i = 0; i < 0x110000; i++)
7638 int64_t attr = get_lbp (i);
7640 /* Now attr should contain exactly one bit. */
7641 assert (attr != 0 && (attr & (attr - 1)) == 0);
7643 if (attr != (int64_t) 1 << LBP_XX)
7645 unsigned int log2_attr;
7646 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
7648 lbp_table_add (&t, i, log2_attr);
7652 lbp_table_finalize (&t);
7654 level1_offset =
7655 5 * sizeof (uint32_t);
7656 level2_offset =
7657 5 * sizeof (uint32_t)
7658 + t.level1_size * sizeof (uint32_t);
7659 level3_offset =
7660 5 * sizeof (uint32_t)
7661 + t.level1_size * sizeof (uint32_t)
7662 + (t.level2_size << t.q) * sizeof (uint32_t);
7664 for (i = 0; i < 5; i++)
7665 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
7666 ((uint32_t *) t.result)[i]);
7667 fprintf (stream1, "\n");
7668 fprintf (stream1, "typedef struct\n");
7669 fprintf (stream1, " {\n");
7670 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
7671 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
7672 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
7673 fprintf (stream1, " }\n");
7674 fprintf (stream1, "lbrkprop_t;\n");
7675 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
7677 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
7678 fprintf (stream2, "{\n");
7679 fprintf (stream2, " {");
7680 if (t.level1_size > 8)
7681 fprintf (stream2, "\n ");
7682 for (i = 0; i < t.level1_size; i++)
7684 uint32_t offset;
7685 if (i > 0 && (i % 8) == 0)
7686 fprintf (stream2, "\n ");
7687 offset = ((uint32_t *) (t.result + level1_offset))[i];
7688 if (offset == 0)
7689 fprintf (stream2, " %5d", -1);
7690 else
7691 fprintf (stream2, " %5zu",
7692 (offset - level2_offset) / sizeof (uint32_t));
7693 if (i+1 < t.level1_size)
7694 fprintf (stream2, ",");
7696 if (t.level1_size > 8)
7697 fprintf (stream2, "\n ");
7698 fprintf (stream2, " },\n");
7699 fprintf (stream2, " {");
7700 if (t.level2_size << t.q > 8)
7701 fprintf (stream2, "\n ");
7702 for (i = 0; i < t.level2_size << t.q; i++)
7704 uint32_t offset;
7705 if (i > 0 && (i % 8) == 0)
7706 fprintf (stream2, "\n ");
7707 offset = ((uint32_t *) (t.result + level2_offset))[i];
7708 if (offset == 0)
7709 fprintf (stream2, " %5d", -1);
7710 else
7711 fprintf (stream2, " %5zu",
7712 (offset - level3_offset) / sizeof (unsigned char));
7713 if (i+1 < t.level2_size << t.q)
7714 fprintf (stream2, ",");
7716 if (t.level2_size << t.q > 8)
7717 fprintf (stream2, "\n ");
7718 fprintf (stream2, " },\n");
7719 fprintf (stream2, " {");
7720 if (t.level3_size << t.p > 8)
7721 fprintf (stream2, "\n ");
7722 for (i = 0; i < t.level3_size << t.p; i++)
7724 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
7725 const char *value_string;
7726 switch (value)
7728 #define CASE(x) case x: value_string = #x; break;
7729 CASE(LBP_BK);
7730 CASE(LBP_CM);
7731 CASE(LBP_WJ);
7732 CASE(LBP_ZW);
7733 CASE(LBP_GL);
7734 CASE(LBP_SP);
7735 CASE(LBP_B2);
7736 CASE(LBP_BA);
7737 CASE(LBP_BB);
7738 CASE(LBP_HY);
7739 CASE(LBP_CB);
7740 CASE(LBP_CL);
7741 CASE(LBP_CP);
7742 CASE(LBP_EX);
7743 CASE(LBP_IN);
7744 CASE(LBP_NS);
7745 CASE(LBP_OP);
7746 CASE(LBP_QU);
7747 CASE(LBP_IS);
7748 CASE(LBP_NU);
7749 CASE(LBP_PO);
7750 CASE(LBP_PR);
7751 CASE(LBP_SY);
7752 CASE(LBP_AI);
7753 CASE(LBP_AL);
7754 CASE(LBP_H2);
7755 CASE(LBP_H3);
7756 CASE(LBP_HL);
7757 CASE(LBP_ID);
7758 CASE(LBP_JL);
7759 CASE(LBP_JV);
7760 CASE(LBP_JT);
7761 CASE(LBP_RI);
7762 CASE(LBP_SA);
7763 CASE(LBP_ZWJ);
7764 CASE(LBP_EB);
7765 CASE(LBP_EM);
7766 CASE(LBP_XX);
7767 #undef CASE
7768 default:
7769 abort ();
7771 if (i > 0 && (i % 8) == 0)
7772 fprintf (stream2, "\n ");
7773 fprintf (stream2, " %s%s", value_string,
7774 (i+1 < t.level3_size << t.p ? "," : ""));
7776 if (t.level3_size << t.p > 8)
7777 fprintf (stream2, "\n ");
7778 fprintf (stream2, " }\n");
7779 fprintf (stream2, "};\n");
/* Writes the line breaking property table as two generated C files:
   FILENAME1 receives what output_lbp() writes to its first stream,
   FILENAME2 what it writes to its second stream.  Both files start with a
   "generated automatically" banner naming the Unicode VERSION, followed by
   a license header.  Prints a message to stderr and exits with status 1 if
   a file cannot be opened or if writing or closing it fails.  */
static void
output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
{
  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  static const char *const license_lines[] =
    {
      "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n",
      "\n",
      " This program is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU General Public License as published by\n",
      " the Free Software Foundation; either version 3 of the License, or\n",
      " (at your option) any later version.\n",
      "\n",
      " This program is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU General Public License\n",
      " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n",
      "\n"
    };
  const size_t license_count = sizeof license_lines / sizeof license_lines[0];
  const char *filenames[2] = { filename1, filename2 };
  FILE *streams[2];
  size_t f;

  /* Open both files before anything is written to either of them.  */
  for (f = 0; f < 2; f++)
    {
      streams[f] = fopen (filenames[f], "w");
      if (streams[f] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[f]);
          exit (1);
        }
    }

  /* Common banner and license header.  */
  for (f = 0; f < 2; f++)
    {
      FILE *out = streams[f];
      size_t k;

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
      fputs ("/* Line breaking properties of Unicode characters. */\n", out);
      fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fputs ("\n", out);

      for (k = 0; k < license_count; k++)
        fputs (license_lines[k], out);
    }

  output_lbp (streams[0], streams[1]);

  for (f = 0; f < 2; f++)
    {
      /* fclose is only attempted when no write error has been recorded.  */
      if (ferror (streams[f]) != 0 || fclose (streams[f]) != 0)
        {
          fprintf (stderr, "error writing to '%s'\n", filenames[f]);
          exit (1);
        }
    }
}
7844 /* ========================================================================= */
7846 /* Word break property.
7847 Updated for Unicode TR #29 revision 17. */
/* Possible values of the Word_Break property, listed in numeric order.
   get_wbp() uses each value as a bit position in its result mask.  */
enum
{
  WBP_OTHER        = 0,
  WBP_KATAKANA     = 1,   /* Katakana */
  WBP_ALETTER      = 2,   /* ALetter */
  WBP_MIDNUMLET    = 3,   /* MidNumLet */
  WBP_MIDLETTER    = 4,   /* MidLetter */
  WBP_MIDNUM       = 5,   /* MidNum */
  WBP_NUMERIC      = 6,   /* Numeric */
  WBP_EXTENDNUMLET = 7,   /* ExtendNumLet */
  WBP_EXTEND       = 8,   /* Extend */
  WBP_FORMAT       = 9,   /* Format */
  WBP_NEWLINE      = 10,  /* Newline */
  WBP_CR           = 11,  /* CR */
  WBP_LF           = 12,  /* LF */
  WBP_RI           = 13,  /* Regional_Indicator */
  WBP_DQ           = 14,  /* Double_Quote */
  WBP_SQ           = 15,  /* Single_Quote */
  WBP_HL           = 16,  /* Hebrew_Letter */
  WBP_ZWJ          = 17,  /* ZWJ */
  WBP_EB           = 18,  /* E_Base */
  WBP_EM           = 19,  /* E_Modifier */
  WBP_GAZ          = 20,  /* Glue_After_Zwj */
  WBP_EBG          = 21   /* E_Base_GAZ */
};
7876 /* Returns the word breaking property for ch, as a bit mask. */
7877 static int
7878 get_wbp (unsigned int ch)
7880 int attr = 0;
7882 if (unicode_attributes[ch].name != NULL)
7884 if (ch == 0x000D)
7885 attr |= 1 << WBP_CR;
7887 if (ch == 0x000A)
7888 attr |= 1 << WBP_LF;
7890 if (ch == 0x000B || ch == 0x000C
7891 || ch == 0x0085
7892 || ch == 0x2028 || ch == 0x2029)
7893 attr |= 1 << WBP_NEWLINE;
7895 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
7896 || ((unicode_properties[ch] >> PROP_OTHER_GRAPHEME_EXTEND) & 1) != 0
7897 || (unicode_attributes[ch].category != NULL
7898 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
7899 attr |= 1 << WBP_EXTEND;
7901 if (unicode_attributes[ch].category != NULL
7902 && strcmp (unicode_attributes[ch].category, "Cf") == 0
7903 && ch != 0x200B && ch != 0x200C && ch != 0x200D
7904 && !(ch >= 0xe0020 && ch <= 0xe007f))
7905 attr |= 1 << WBP_FORMAT;
7907 if ((unicode_scripts[ch] < numscripts
7908 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
7909 || (ch >= 0x3031 && ch <= 0x3035)
7910 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
7911 || ch == 0xFF70)
7912 attr |= 1 << WBP_KATAKANA;
7914 if ((unicode_scripts[ch] < numscripts
7915 && strcmp (scripts[unicode_scripts[ch]], "Hebrew") == 0)
7916 && strcmp (unicode_attributes[ch].category, "Lo") == 0)
7917 attr |= 1 << WBP_HL;
7919 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
7920 || ch == 0x05F3)
7921 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
7922 && (attr & (1 << WBP_KATAKANA)) == 0
7923 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
7924 && !(unicode_scripts[ch] < numscripts
7925 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
7926 && (attr & (1 << WBP_EXTEND)) == 0
7927 && (attr & (1 << WBP_HL)) == 0)
7928 attr |= 1 << WBP_ALETTER;
7930 if (is_WBP_MIDNUMLET (ch))
7931 attr |= 1 << WBP_MIDNUMLET;
7933 if (is_WBP_MIDLETTER (ch))
7934 attr |= 1 << WBP_MIDLETTER;
7936 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
7937 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
7938 || ch == 0xFF1B)
7939 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
7940 attr |= 1 << WBP_MIDNUM;
7942 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
7943 && ch != 0x066C)
7944 attr |= 1 << WBP_NUMERIC;
7946 if ((unicode_attributes[ch].category != NULL
7947 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
7948 || ch == 0x202F /* NARROW NO-BREAK SPACE */)
7949 attr |= 1 << WBP_EXTENDNUMLET;
7951 if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
7952 attr |= 1 << WBP_RI;
7954 if (ch == 0x0022)
7955 attr |= 1 << WBP_DQ;
7957 if (ch == 0x0027)
7958 attr |= 1 << WBP_SQ;
7960 if (ch == 0x200D)
7961 attr |= 1 << WBP_ZWJ;
7963 if (ch >= 0x1F466 && ch <= 0x1F469)
7964 attr |= 1 << WBP_EBG;
7965 else if (((get_lbp (ch) >> LBP_EB) & 1) != 0)
7966 attr |= 1 << WBP_EB;
7968 if (((get_lbp (ch) >> LBP_EM) & 1) != 0)
7969 attr |= 1 << WBP_EM;
7971 if (ch == 0x2764 || ch == 0x1F48B || ch == 0x1F5E8)
7972 attr |= 1 << WBP_GAZ;
7975 if (attr == 0)
7976 /* other */
7977 attr |= 1 << WBP_OTHER;
7979 return attr;
7982 /* Output the word break property in a human readable format. */
7983 static void
7984 debug_output_wbp (FILE *stream)
7986 unsigned int i;
7988 for (i = 0; i < 0x110000; i++)
7990 int attr = get_wbp (i);
7991 if (attr != 1 << WBP_OTHER)
7993 fprintf (stream, "0x%04X", i);
7994 if (attr & (1 << WBP_CR))
7995 fprintf (stream, " CR");
7996 if (attr & (1 << WBP_LF))
7997 fprintf (stream, " LF");
7998 if (attr & (1 << WBP_NEWLINE))
7999 fprintf (stream, " Newline");
8000 if (attr & (1 << WBP_EXTEND))
8001 fprintf (stream, " Extend");
8002 if (attr & (1 << WBP_FORMAT))
8003 fprintf (stream, " Format");
8004 if (attr & (1 << WBP_KATAKANA))
8005 fprintf (stream, " Katakana");
8006 if (attr & (1 << WBP_ALETTER))
8007 fprintf (stream, " ALetter");
8008 if (attr & (1 << WBP_MIDNUMLET))
8009 fprintf (stream, " MidNumLet");
8010 if (attr & (1 << WBP_MIDLETTER))
8011 fprintf (stream, " MidLetter");
8012 if (attr & (1 << WBP_MIDNUM))
8013 fprintf (stream, " MidNum");
8014 if (attr & (1 << WBP_NUMERIC))
8015 fprintf (stream, " Numeric");
8016 if (attr & (1 << WBP_EXTENDNUMLET))
8017 fprintf (stream, " ExtendNumLet");
8018 if (attr & (1 << WBP_RI))
8019 fprintf (stream, " Regional_Indicator");
8020 if (attr & (1 << WBP_DQ))
8021 fprintf (stream, " Double_Quote");
8022 if (attr & (1 << WBP_SQ))
8023 fprintf (stream, " Single_Quote");
8024 if (attr & (1 << WBP_HL))
8025 fprintf (stream, " Hebrew_Letter");
8026 if (attr & (1 << WBP_ZWJ))
8027 fprintf (stream, " ZWJ");
8028 if (attr & (1 << WBP_EB))
8029 fprintf (stream, " E_Base");
8030 if (attr & (1 << WBP_EM))
8031 fprintf (stream, " E_Modifier");
8032 if (attr & (1 << WBP_GAZ))
8033 fprintf (stream, " Glue_After_Zwj");
8034 if (attr & (1 << WBP_EBG))
8035 fprintf (stream, " E_Base_GAZ");
8036 fprintf (stream, "\n");
/* Creates (or truncates) FILENAME and writes the debug_output_wbp() listing
   into it.  Prints a message to stderr and exits with status 1 if the file
   cannot be opened or if writing or closing it fails.  */
static void
debug_output_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_wbp (out);

  /* fclose is only attempted when no write error has been recorded.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8062 /* The word break property from the WordBreakProperty.txt file. */
8063 int unicode_org_wbp[0x110000];
8065 /* Stores in unicode_org_wbp[] the word break property from the
8066 WordBreakProperty.txt file. */
8067 static void
8068 fill_org_wbp (const char *wordbreakproperty_filename)
8070 unsigned int i;
8071 FILE *stream;
8073 for (i = 0; i < 0x110000; i++)
8074 unicode_org_wbp[i] = WBP_OTHER;
8076 stream = fopen (wordbreakproperty_filename, "r");
8077 if (stream == NULL)
8079 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
8080 exit (1);
8083 for (;;)
8085 char buf[200+1];
8086 unsigned int i1, i2;
8087 char padding[200+1];
8088 char propname[200+1];
8089 int propvalue;
8091 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8092 break;
8094 if (buf[0] == '\0' || buf[0] == '#')
8095 continue;
8097 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
8099 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
8101 fprintf (stderr, "parse error in '%s'\n",
8102 wordbreakproperty_filename);
8103 exit (1);
8105 i2 = i1;
8107 #define PROP(name,value) \
8108 if (strcmp (propname, name) == 0) propvalue = value; else
8109 PROP ("CR", WBP_CR)
8110 PROP ("LF", WBP_LF)
8111 PROP ("Newline", WBP_NEWLINE)
8112 PROP ("Extend", WBP_EXTEND)
8113 PROP ("Format", WBP_FORMAT)
8114 PROP ("Katakana", WBP_KATAKANA)
8115 PROP ("ALetter", WBP_ALETTER)
8116 PROP ("MidNumLet", WBP_MIDNUMLET)
8117 PROP ("MidLetter", WBP_MIDLETTER)
8118 PROP ("MidNum", WBP_MIDNUM)
8119 PROP ("Numeric", WBP_NUMERIC)
8120 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
8121 PROP ("Regional_Indicator", WBP_RI)
8122 PROP ("Double_Quote", WBP_DQ)
8123 PROP ("Single_Quote", WBP_SQ)
8124 PROP ("Hebrew_Letter", WBP_HL)
8125 PROP ("ZWJ", WBP_ZWJ)
8126 PROP ("E_Base", WBP_EB)
8127 PROP ("E_Modifier", WBP_EM)
8128 PROP ("Glue_After_Zwj", WBP_GAZ)
8129 PROP ("E_Base_GAZ", WBP_EBG)
8130 #undef PROP
8132 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
8133 wordbreakproperty_filename);
8134 exit (1);
8136 assert (i1 <= i2 && i2 < 0x110000);
8138 for (i = i1; i <= i2; i++)
8139 unicode_org_wbp[i] = propvalue;
8142 if (ferror (stream) || fclose (stream))
8144 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
8145 exit (1);
8149 /* Output the word break property in a human readable format. */
8150 static void
8151 debug_output_org_wbp (FILE *stream)
8153 unsigned int i;
8155 for (i = 0; i < 0x110000; i++)
8157 int propvalue = unicode_org_wbp[i];
8158 if (propvalue != WBP_OTHER)
8160 fprintf (stream, "0x%04X", i);
8161 #define PROP(name,value) \
8162 if (propvalue == value) fprintf (stream, " " name); else
8163 PROP ("CR", WBP_CR)
8164 PROP ("LF", WBP_LF)
8165 PROP ("Newline", WBP_NEWLINE)
8166 PROP ("Extend", WBP_EXTEND)
8167 PROP ("Format", WBP_FORMAT)
8168 PROP ("Katakana", WBP_KATAKANA)
8169 PROP ("ALetter", WBP_ALETTER)
8170 PROP ("MidNumLet", WBP_MIDNUMLET)
8171 PROP ("MidLetter", WBP_MIDLETTER)
8172 PROP ("MidNum", WBP_MIDNUM)
8173 PROP ("Numeric", WBP_NUMERIC)
8174 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
8175 PROP ("Regional_Indicator", WBP_RI)
8176 PROP ("Double_Quote", WBP_DQ)
8177 PROP ("Single_Quote", WBP_SQ)
8178 PROP ("Hebrew_Letter", WBP_HL)
8179 PROP ("ZWJ", WBP_ZWJ)
8180 PROP ("E_Base", WBP_EB)
8181 PROP ("E_Modifier", WBP_EM)
8182 PROP ("Glue_After_Zwj", WBP_GAZ)
8183 PROP ("E_Base_GAZ", WBP_EBG)
8184 #undef PROP
8185 fprintf (stream, " ??");
8186 fprintf (stream, "\n");
/* Creates the file FILENAME and lets debug_output_org_wbp dump the contents
   of unicode_org_wbp[] into it.  If the file cannot be opened, or if writing
   or closing it fails, a message is printed to stderr and the program exits
   with status 1.  */
static void
debug_output_org_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_wbp (out);

  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8212 /* Construction of sparse 3-level tables. */
8213 #define TABLE wbp_table
8214 #define ELEMENT unsigned char
8215 #define DEFAULT WBP_OTHER
8216 #define xmalloc malloc
8217 #define xrealloc realloc
8218 #include "3level.h"
8220 static void
8221 output_wbp (FILE *stream)
8223 unsigned int i;
8224 struct wbp_table t;
8225 unsigned int level1_offset, level2_offset, level3_offset;
8227 t.p = 7;
8228 t.q = 9;
8229 wbp_table_init (&t);
8231 for (i = 0; i < 0x110000; i++)
8233 int attr = get_wbp (i);
8235 /* Now attr should contain exactly one bit. */
8236 assert (attr != 0 && (attr & (attr - 1)) == 0);
8238 if (attr != 1 << WBP_OTHER)
8240 unsigned int log2_attr;
8241 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
8243 wbp_table_add (&t, i, log2_attr);
8247 wbp_table_finalize (&t);
8249 level1_offset =
8250 5 * sizeof (uint32_t);
8251 level2_offset =
8252 5 * sizeof (uint32_t)
8253 + t.level1_size * sizeof (uint32_t);
8254 level3_offset =
8255 5 * sizeof (uint32_t)
8256 + t.level1_size * sizeof (uint32_t)
8257 + (t.level2_size << t.q) * sizeof (uint32_t);
8259 for (i = 0; i < 5; i++)
8260 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
8261 ((uint32_t *) t.result)[i]);
8262 fprintf (stream, "\n");
8263 fprintf (stream, "typedef struct\n");
8264 fprintf (stream, " {\n");
8265 fprintf (stream, " int level1[%zu];\n", t.level1_size);
8266 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
8267 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
8268 fprintf (stream, " }\n");
8269 fprintf (stream, "wbrkprop_t;\n");
8270 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
8271 fprintf (stream, "{\n");
8272 fprintf (stream, " {");
8273 if (t.level1_size > 8)
8274 fprintf (stream, "\n ");
8275 for (i = 0; i < t.level1_size; i++)
8277 uint32_t offset;
8278 if (i > 0 && (i % 8) == 0)
8279 fprintf (stream, "\n ");
8280 offset = ((uint32_t *) (t.result + level1_offset))[i];
8281 if (offset == 0)
8282 fprintf (stream, " %5d", -1);
8283 else
8284 fprintf (stream, " %5zu",
8285 (offset - level2_offset) / sizeof (uint32_t));
8286 if (i+1 < t.level1_size)
8287 fprintf (stream, ",");
8289 if (t.level1_size > 8)
8290 fprintf (stream, "\n ");
8291 fprintf (stream, " },\n");
8292 fprintf (stream, " {");
8293 if (t.level2_size << t.q > 8)
8294 fprintf (stream, "\n ");
8295 for (i = 0; i < t.level2_size << t.q; i++)
8297 uint32_t offset;
8298 if (i > 0 && (i % 8) == 0)
8299 fprintf (stream, "\n ");
8300 offset = ((uint32_t *) (t.result + level2_offset))[i];
8301 if (offset == 0)
8302 fprintf (stream, " %5d", -1);
8303 else
8304 fprintf (stream, " %5zu",
8305 (offset - level3_offset) / sizeof (unsigned char));
8306 if (i+1 < t.level2_size << t.q)
8307 fprintf (stream, ",");
8309 if (t.level2_size << t.q > 8)
8310 fprintf (stream, "\n ");
8311 fprintf (stream, " },\n");
8312 fprintf (stream, " {");
8313 if (t.level3_size << t.p > 4)
8314 fprintf (stream, "\n ");
8315 for (i = 0; i < t.level3_size << t.p; i++)
8317 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
8318 const char *value_string;
8319 switch (value)
8321 #define CASE(x) case x: value_string = #x; break;
8322 CASE(WBP_OTHER);
8323 CASE(WBP_CR);
8324 CASE(WBP_LF);
8325 CASE(WBP_NEWLINE);
8326 CASE(WBP_EXTEND);
8327 CASE(WBP_FORMAT);
8328 CASE(WBP_KATAKANA);
8329 CASE(WBP_ALETTER);
8330 CASE(WBP_MIDNUMLET);
8331 CASE(WBP_MIDLETTER);
8332 CASE(WBP_MIDNUM);
8333 CASE(WBP_NUMERIC);
8334 CASE(WBP_EXTENDNUMLET);
8335 CASE(WBP_RI);
8336 CASE(WBP_DQ);
8337 CASE(WBP_SQ);
8338 CASE(WBP_HL);
8339 CASE(WBP_ZWJ);
8340 CASE(WBP_EB);
8341 CASE(WBP_EM);
8342 CASE(WBP_GAZ);
8343 CASE(WBP_EBG);
8344 #undef CASE
8345 default:
8346 abort ();
8348 if (i > 0 && (i % 4) == 0)
8349 fprintf (stream, "\n ");
8350 fprintf (stream, " %s%s", value_string,
8351 (i+1 < t.level3_size << t.p ? "," : ""));
8353 if (t.level3_size << t.p > 4)
8354 fprintf (stream, "\n ");
8355 fprintf (stream, " }\n");
8356 fprintf (stream, "};\n");
/* Creates the file FILENAME containing a "generated automatically" banner
   that mentions the Unicode version VERSION, a GPL notice, and the word
   break property table written by output_wbp.
   If the file cannot be opened, or if writing or closing it fails, a message
   is printed to stderr and the program exits with status 1.  */
static void
output_wbrk_tables (const char *filename, const char *version)
{
  FILE *stream;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  /* This file holds the word break table, not the line break table.  */
  fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");

  output_wbp (stream);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8405 /* ========================================================================= */
8407 /* Grapheme break property.
8408 Updated for Unicode TR #29 revision 29. */
/* Possible values of the Grapheme_Cluster_Break property.
   The comment on each value gives the property name under which it appears
   in GraphemeBreakProperty.txt.  */
enum
{
  GBP_OTHER       =  0, /* default for code points not listed */
  GBP_CR          =  1, /* CR */
  GBP_LF          =  2, /* LF */
  GBP_CONTROL     =  3, /* Control */
  GBP_EXTEND      =  4, /* Extend */
  GBP_PREPEND     =  5, /* Prepend */
  GBP_SPACINGMARK =  6, /* SpacingMark */
  GBP_L           =  7, /* L */
  GBP_V           =  8, /* V */
  GBP_T           =  9, /* T */
  GBP_LV          = 10, /* LV */
  GBP_LVT         = 11, /* LVT */
  GBP_RI          = 12, /* Regional_Indicator */
  GBP_ZWJ         = 13, /* ZWJ */
  GBP_EB          = 14, /* E_Base */
  GBP_EM          = 15, /* E_Modifier */
  GBP_GAZ         = 16, /* Glue_After_Zwj */
  GBP_EBG         = 17  /* E_Base_GAZ */
};
8433 /* Construction of sparse 3-level tables. */
8434 #define TABLE gbp_table
8435 #define ELEMENT unsigned char
8436 #define DEFAULT GBP_OTHER
8437 #define xmalloc malloc
8438 #define xrealloc realloc
8439 #include "3level.h"
8441 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
8442 int unicode_org_gbp[0x110000];
8444 /* Output the unit test data for the grapheme break property. */
8445 static void
8446 output_gbp_test (const char *filename)
8448 FILE *stream;
8449 bool need_comma;
8450 unsigned int ch;
8452 stream = fopen (filename, "w");
8453 if (stream == NULL)
8455 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8456 exit (1);
8459 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8460 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
8461 fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
8462 fprintf (stream, "\n");
8463 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
8464 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
8465 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
8466 fprintf (stream, " (at your option) any later version.\n");
8467 fprintf (stream, "\n");
8468 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
8469 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
8470 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
8471 fprintf (stream, " GNU General Public License for more details.\n");
8472 fprintf (stream, "\n");
8473 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
8474 fprintf (stream, " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n");
8475 fprintf (stream, "\n");
8477 need_comma = false;
8478 for (ch = 0; ch < 0x110000; ch++)
8480 int gbp = unicode_org_gbp[ch];
8481 const char *gbp_string;
8483 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
8484 ch++;
8486 switch (gbp)
8488 #define CASE(x) case x: gbp_string = #x; break;
8489 CASE (GBP_OTHER)
8490 CASE (GBP_CR)
8491 CASE (GBP_LF)
8492 CASE (GBP_CONTROL)
8493 CASE (GBP_EXTEND)
8494 CASE (GBP_PREPEND)
8495 CASE (GBP_SPACINGMARK)
8496 CASE (GBP_L)
8497 CASE (GBP_V)
8498 CASE (GBP_T)
8499 CASE (GBP_LV)
8500 CASE (GBP_LVT)
8501 CASE (GBP_RI)
8502 CASE (GBP_ZWJ)
8503 CASE (GBP_EB)
8504 CASE (GBP_EM)
8505 CASE (GBP_GAZ)
8506 CASE (GBP_EBG)
8507 #undef CASE
8508 default:
8509 abort ();
8512 if (need_comma)
8513 fprintf (stream, ",\n");
8514 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
8516 need_comma = true;
8518 fprintf (stream, "\n");
8520 if (ferror (stream) || fclose (stream))
8522 fprintf (stderr, "error writing to '%s'\n", filename);
8523 exit (1);
8527 /* Output the per-character grapheme break property table. */
8528 static void
8529 output_gbp_table (const char *filename, const char *version)
8531 FILE *stream;
8532 unsigned int ch, i;
8533 struct gbp_table t;
8534 unsigned int level1_offset, level2_offset, level3_offset;
8536 stream = fopen (filename, "w");
8537 if (stream == NULL)
8539 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8540 exit (1);
8543 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8544 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
8545 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8546 version);
8548 t.p = 7;
8549 t.q = 9;
8550 gbp_table_init (&t);
8552 for (ch = 0; ch < 0x110000; ch++)
8553 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
8555 gbp_table_finalize (&t);
8557 /* Offsets in t.result, in memory of this process. */
8558 level1_offset =
8559 5 * sizeof (uint32_t);
8560 level2_offset =
8561 5 * sizeof (uint32_t)
8562 + t.level1_size * sizeof (uint32_t);
8563 level3_offset =
8564 5 * sizeof (uint32_t)
8565 + t.level1_size * sizeof (uint32_t)
8566 + (t.level2_size << t.q) * sizeof (uint32_t);
8568 for (i = 0; i < 5; i++)
8569 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
8570 ((uint32_t *) t.result)[i]);
8571 fprintf (stream, "static const\n");
8572 fprintf (stream, "struct\n");
8573 fprintf (stream, " {\n");
8574 fprintf (stream, " int level1[%zu];\n", t.level1_size);
8575 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
8576 fprintf (stream, " unsigned char level3[%zu << %d];\n",
8577 t.level3_size, t.p);
8578 fprintf (stream, " }\n");
8579 fprintf (stream, "unigbrkprop =\n");
8580 fprintf (stream, "{\n");
8581 fprintf (stream, " {");
8582 if (t.level1_size > 8)
8583 fprintf (stream, "\n ");
8584 for (i = 0; i < t.level1_size; i++)
8586 uint32_t offset;
8587 if (i > 0 && (i % 8) == 0)
8588 fprintf (stream, "\n ");
8589 offset = ((uint32_t *) (t.result + level1_offset))[i];
8590 if (offset == 0)
8591 fprintf (stream, " %5d", -1);
8592 else
8593 fprintf (stream, " %5zu",
8594 (offset - level2_offset) / sizeof (uint32_t));
8595 if (i+1 < t.level1_size)
8596 fprintf (stream, ",");
8598 if (t.level1_size > 8)
8599 fprintf (stream, "\n ");
8600 fprintf (stream, " },\n");
8601 fprintf (stream, " {");
8602 if (t.level2_size << t.q > 8)
8603 fprintf (stream, "\n ");
8604 for (i = 0; i < t.level2_size << t.q; i++)
8606 uint32_t offset;
8607 if (i > 0 && (i % 8) == 0)
8608 fprintf (stream, "\n ");
8609 offset = ((uint32_t *) (t.result + level2_offset))[i];
8610 if (offset == 0)
8611 fprintf (stream, " %5d", -1);
8612 else
8613 fprintf (stream, " %5zu",
8614 (offset - level3_offset) / sizeof (uint8_t));
8615 if (i+1 < t.level2_size << t.q)
8616 fprintf (stream, ",");
8618 if (t.level2_size << t.q > 8)
8619 fprintf (stream, "\n ");
8620 fprintf (stream, " },\n");
8621 fprintf (stream, " {");
8622 if (t.level3_size << t.p > 4)
8623 fprintf (stream, "\n ");
8624 for (i = 0; i < t.level3_size << t.p; i++)
8626 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
8627 const char *value_string;
8628 switch (value)
8630 #define CASE(x) case x: value_string = #x; break;
8631 CASE (GBP_OTHER)
8632 CASE (GBP_CR)
8633 CASE (GBP_LF)
8634 CASE (GBP_CONTROL)
8635 CASE (GBP_EXTEND)
8636 CASE (GBP_PREPEND)
8637 CASE (GBP_SPACINGMARK)
8638 CASE (GBP_L)
8639 CASE (GBP_V)
8640 CASE (GBP_T)
8641 CASE (GBP_LV)
8642 CASE (GBP_LVT)
8643 CASE (GBP_RI)
8644 CASE (GBP_ZWJ)
8645 CASE (GBP_EB)
8646 CASE (GBP_EM)
8647 CASE (GBP_GAZ)
8648 CASE (GBP_EBG)
8649 #undef CASE
8650 default:
8651 abort ();
8653 if (i > 0 && (i % 4) == 0)
8654 fprintf (stream, "\n ");
8655 fprintf (stream, " %s%s", value_string,
8656 (i+1 < t.level3_size << t.p ? "," : ""));
8658 if (t.level3_size << t.p > 4)
8659 fprintf (stream, "\n ");
8660 fprintf (stream, " }\n");
8661 fprintf (stream, "};\n");
8663 if (ferror (stream) || fclose (stream))
8665 fprintf (stderr, "error writing to '%s'\n", filename);
8666 exit (1);
8670 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
8671 GraphemeBreakProperty.txt file. */
8672 static void
8673 fill_org_gbp (const char *graphemebreakproperty_filename)
8675 unsigned int i;
8676 FILE *stream;
8677 int lineno = 0;
8679 for (i = 0; i < 0x110000; i++)
8680 unicode_org_gbp[i] = GBP_OTHER;
8682 stream = fopen (graphemebreakproperty_filename, "r");
8683 if (stream == NULL)
8685 fprintf (stderr, "error during fopen of '%s'\n",
8686 graphemebreakproperty_filename);
8687 exit (1);
8690 for (;;)
8692 char buf[200+1];
8693 unsigned int i1, i2;
8694 char padding[200+1];
8695 char propname[200+1];
8696 int propvalue;
8698 lineno++;
8699 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8700 break;
8702 if (buf[0] == '\0' || buf[0] == '#')
8703 continue;
8705 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
8707 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
8709 fprintf (stderr, "parse error in '%s'\n",
8710 graphemebreakproperty_filename);
8711 exit (1);
8713 i2 = i1;
8715 #define PROP(name,value) \
8716 if (strcmp (propname, name) == 0) propvalue = value; else
8717 PROP ("CR", GBP_CR)
8718 PROP ("LF", GBP_LF)
8719 PROP ("Control", GBP_CONTROL)
8720 PROP ("Extend", GBP_EXTEND)
8721 PROP ("Prepend", GBP_PREPEND)
8722 PROP ("SpacingMark", GBP_SPACINGMARK)
8723 PROP ("L", GBP_L)
8724 PROP ("V", GBP_V)
8725 PROP ("T", GBP_T)
8726 PROP ("LV", GBP_LV)
8727 PROP ("LVT", GBP_LVT)
8728 PROP ("Regional_Indicator", GBP_RI)
8729 PROP ("ZWJ", GBP_ZWJ)
8730 PROP ("E_Base", GBP_EB)
8731 PROP ("E_Modifier", GBP_EM)
8732 PROP ("Glue_After_Zwj", GBP_GAZ)
8733 PROP ("E_Base_GAZ", GBP_EBG)
8734 #undef PROP
8736 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
8737 graphemebreakproperty_filename, lineno);
8738 exit (1);
8740 assert (i1 <= i2 && i2 < 0x110000);
8742 for (i = i1; i <= i2; i++)
8743 unicode_org_gbp[i] = propvalue;
8746 if (ferror (stream) || fclose (stream))
8748 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
8749 exit (1);
8753 /* ========================================================================= */
8755 /* Composition and decomposition.
8756 Updated for Unicode TR #15 revision 33. */
/* Maximum number of characters into which a single Unicode character can be
   decomposed.  */
#define MAX_DECOMP_LENGTH 18

/* Decomposition types.  Apart from the canonical one, each type corresponds
   to the <tag> that introduces the decomposition field.  */
enum
{
  UC_DECOMP_CANONICAL =  0, /*            Canonical decomposition.  */
  UC_DECOMP_FONT      =  1, /* <font>     A font variant (e.g. a blackletter form).  */
  UC_DECOMP_NOBREAK   =  2, /* <noBreak>  A no-break version of a space or hyphen.  */
  UC_DECOMP_INITIAL   =  3, /* <initial>  An initial presentation form (Arabic).  */
  UC_DECOMP_MEDIAL    =  4, /* <medial>   A medial presentation form (Arabic).  */
  UC_DECOMP_FINAL     =  5, /* <final>    A final presentation form (Arabic).  */
  UC_DECOMP_ISOLATED  =  6, /* <isolated> An isolated presentation form (Arabic).  */
  UC_DECOMP_CIRCLE    =  7, /* <circle>   An encircled form.  */
  UC_DECOMP_SUPER     =  8, /* <super>    A superscript form.  */
  UC_DECOMP_SUB       =  9, /* <sub>      A subscript form.  */
  UC_DECOMP_VERTICAL  = 10, /* <vertical> A vertical layout presentation form.  */
  UC_DECOMP_WIDE      = 11, /* <wide>     A wide (or zenkaku) compatibility character.  */
  UC_DECOMP_NARROW    = 12, /* <narrow>   A narrow (or hankaku) compatibility character.  */
  UC_DECOMP_SMALL     = 13, /* <small>    A small variant form (CNS compatibility).  */
  UC_DECOMP_SQUARE    = 14, /* <square>   A CJK squared font variant.  */
  UC_DECOMP_FRACTION  = 15, /* <fraction> A vulgar fraction form.  */
  UC_DECOMP_COMPAT    = 16  /* <compat>   Otherwise unspecified compatibility character.  */
};
8783 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
8784 decompositions). Return the type, or -1 for none. */
8785 static int
8786 get_decomposition (unsigned int ch,
8787 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
8789 const char *decomposition = unicode_attributes[ch].decomposition;
8791 if (decomposition != NULL && decomposition[0] != '\0')
8793 int type = UC_DECOMP_CANONICAL;
8794 unsigned int length;
8795 char *endptr;
8797 if (decomposition[0] == '<')
8799 const char *rangle;
8800 size_t typelen;
8802 rangle = strchr (decomposition + 1, '>');
8803 assert (rangle != NULL);
8804 typelen = rangle + 1 - decomposition;
8805 #define TYPE(t1,t2) \
8806 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
8807 type = t2; \
8808 else
8809 TYPE ("<font>", UC_DECOMP_FONT)
8810 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
8811 TYPE ("<initial>", UC_DECOMP_INITIAL)
8812 TYPE ("<medial>", UC_DECOMP_MEDIAL)
8813 TYPE ("<final>", UC_DECOMP_FINAL)
8814 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
8815 TYPE ("<circle>", UC_DECOMP_CIRCLE)
8816 TYPE ("<super>", UC_DECOMP_SUPER)
8817 TYPE ("<sub>", UC_DECOMP_SUB)
8818 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
8819 TYPE ("<wide>", UC_DECOMP_WIDE)
8820 TYPE ("<narrow>", UC_DECOMP_NARROW)
8821 TYPE ("<small>", UC_DECOMP_SMALL)
8822 TYPE ("<square>", UC_DECOMP_SQUARE)
8823 TYPE ("<fraction>", UC_DECOMP_FRACTION)
8824 TYPE ("<compat>", UC_DECOMP_COMPAT)
8826 fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
8827 exit (1);
8829 #undef TYPE
8830 decomposition = rangle + 1;
8831 if (decomposition[0] == ' ')
8832 decomposition++;
8834 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
8836 decomposed[length] = strtoul (decomposition, &endptr, 16);
8837 if (endptr == decomposition)
8838 break;
8839 decomposition = endptr;
8840 if (decomposition[0] == ' ')
8841 decomposition++;
8843 /* Make sure that *DECOMPOSITION is not NULL-terminated.
8844 Otherwise MAX_DECOMP_LENGTH is too small. */
8845 assert (*decomposition == '\0');
8847 *lengthp = length;
8848 return type;
8850 else
8851 return -1;
8854 /* Construction of sparse 3-level tables. */
8855 #define TABLE decomp_table
8856 #define ELEMENT uint16_t
8857 #define DEFAULT (uint16_t)(-1)
8858 #define xmalloc malloc
8859 #define xrealloc realloc
8860 #include "3level.h"
8862 static void
8863 output_decomposition (FILE *stream1, FILE *stream2)
8865 struct decomp_table t;
8866 unsigned int level1_offset, level2_offset, level3_offset;
8867 unsigned int offset;
8868 unsigned int ch;
8869 unsigned int i;
8871 t.p = 5;
8872 t.q = 5;
8873 decomp_table_init (&t);
8875 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
8876 fprintf (stream1, "\n");
8877 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
8878 offset = 0;
8880 for (ch = 0; ch < 0x110000; ch++)
8882 unsigned int length;
8883 unsigned int decomposed[MAX_DECOMP_LENGTH];
8884 int type = get_decomposition (ch, &length, decomposed);
8886 if (type >= 0)
8888 assert (offset < (1 << 15));
8889 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
8891 /* Produce length 3-bytes entries. */
8892 /* We would need a special representation of zero-length entries. */
8893 assert (length != 0);
8894 for (i = 0; i < length; i++)
8896 if (offset > 0)
8897 fprintf (stream2, ",");
8898 if ((offset % 4) == 0)
8899 fprintf (stream2, "\n ");
8900 assert (decomposed[i] < (1 << 18));
8901 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
8902 (((i+1 < length ? (1 << 23) : 0)
8903 | (i == 0 ? (type << 18) : 0)
8904 | decomposed[i]) >> 16) & 0xff,
8905 (decomposed[i] >> 8) & 0xff,
8906 decomposed[i] & 0xff);
8907 offset++;
8912 fprintf (stream2, "\n};\n");
8913 fprintf (stream2, "\n");
8915 decomp_table_finalize (&t);
8917 level1_offset =
8918 5 * sizeof (uint32_t);
8919 level2_offset =
8920 5 * sizeof (uint32_t)
8921 + t.level1_size * sizeof (uint32_t);
8922 level3_offset =
8923 5 * sizeof (uint32_t)
8924 + t.level1_size * sizeof (uint32_t)
8925 + (t.level2_size << t.q) * sizeof (uint32_t);
8927 for (i = 0; i < 5; i++)
8928 fprintf (stream1, "#define decomp_header_%d %d\n", i,
8929 ((uint32_t *) t.result)[i]);
8930 fprintf (stream1, "\n");
8931 fprintf (stream1, "typedef struct\n");
8932 fprintf (stream1, " {\n");
8933 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
8934 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
8935 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
8936 fprintf (stream1, " }\n");
8937 fprintf (stream1, "decomp_index_table_t;\n");
8938 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
8939 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
8940 fprintf (stream2, "{\n");
8941 fprintf (stream2, " {");
8942 if (t.level1_size > 8)
8943 fprintf (stream2, "\n ");
8944 for (i = 0; i < t.level1_size; i++)
8946 uint32_t offset;
8947 if (i > 0 && (i % 8) == 0)
8948 fprintf (stream2, "\n ");
8949 offset = ((uint32_t *) (t.result + level1_offset))[i];
8950 if (offset == 0)
8951 fprintf (stream2, " %5d", -1);
8952 else
8953 fprintf (stream2, " %5zu",
8954 (offset - level2_offset) / sizeof (uint32_t));
8955 if (i+1 < t.level1_size)
8956 fprintf (stream2, ",");
8958 if (t.level1_size > 8)
8959 fprintf (stream2, "\n ");
8960 fprintf (stream2, " },\n");
8961 fprintf (stream2, " {");
8962 if (t.level2_size << t.q > 8)
8963 fprintf (stream2, "\n ");
8964 for (i = 0; i < t.level2_size << t.q; i++)
8966 uint32_t offset;
8967 if (i > 0 && (i % 8) == 0)
8968 fprintf (stream2, "\n ");
8969 offset = ((uint32_t *) (t.result + level2_offset))[i];
8970 if (offset == 0)
8971 fprintf (stream2, " %5d", -1);
8972 else
8973 fprintf (stream2, " %5zu",
8974 (offset - level3_offset) / sizeof (uint16_t));
8975 if (i+1 < t.level2_size << t.q)
8976 fprintf (stream2, ",");
8978 if (t.level2_size << t.q > 8)
8979 fprintf (stream2, "\n ");
8980 fprintf (stream2, " },\n");
8981 fprintf (stream2, " {");
8982 if (t.level3_size << t.p > 8)
8983 fprintf (stream2, "\n ");
8984 for (i = 0; i < t.level3_size << t.p; i++)
8986 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
8987 if (i > 0 && (i % 8) == 0)
8988 fprintf (stream2, "\n ");
8989 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
8990 if (i+1 < t.level3_size << t.p)
8991 fprintf (stream2, ",");
8993 if (t.level3_size << t.p > 8)
8994 fprintf (stream2, "\n ");
8995 fprintf (stream2, " }\n");
8996 fprintf (stream2, "};\n");
/* Creates the two decomposition table files.  Both files get the same
   banner, which mentions the Unicode version VERSION; then
   output_decomposition is called with FILENAME1's stream as its first and
   FILENAME2's stream as its second argument.
   If a file cannot be opened, or if writing or closing it fails, a message
   is printed to stderr and the program exits with status 1.  */
static void
output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *filenames[2] = { filename1, filename2 };
  FILE *streams[2];
  size_t k;

  for (k = 0; k < 2; k++)
    {
      streams[k] = fopen (filenames[k], "w");
      if (!streams[k])
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[k]);
          exit (1);
        }
    }

  for (k = 0; k < 2; k++)
    {
      FILE *out = streams[k];

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
      fputs ("/* Decomposition of Unicode characters. */\n", out);
      fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fputs ("\n", out);
    }

  output_decomposition (streams[0], streams[1]);

  for (k = 0; k < 2; k++)
    if (ferror (streams[k]) != 0 || fclose (streams[k]) != 0)
      {
        fprintf (stderr, "error writing to '%s'\n", filenames[k]);
        exit (1);
      }
}
/* The "excluded from composition" property from the CompositionExclusions.txt file.  */
char unicode_composition_exclusions[0x110000];

/* Stores in unicode_composition_exclusions[] the set of code points listed
   in the CompositionExclusions.txt file: 1 for a listed code point, 0 for
   all others.

   Each data line starts with a hexadecimal code point.  Empty lines,
   whitespace-only lines and lines whose first non-blank character is '#'
   are ignored.

   If the file cannot be opened, a line cannot be parsed, or reading fails,
   a message is printed to stderr and the program exits with status 1.  */
static void
fill_composition_exclusions (const char *compositionexclusions_filename)
{
  FILE *stream;
  unsigned int i;
  char buf[200+1];
  int lineno = 0;

  stream = fopen (compositionexclusions_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
      exit (1);
    }

  for (i = 0; i < 0x110000; i++)
    unicode_composition_exclusions[i] = 0;

  /* Read physical lines with fgets.  A scanf-style "%200[^\n]\n" reader
     fails to match on an empty first line (which would end the loop with
     nothing read) and cannot keep track of line numbers.  */
  while (fgets (buf, sizeof (buf), stream) != NULL)
    {
      unsigned int code;
      size_t len = strlen (buf);
      const char *line;

      lineno++;

      if (len > 0 && buf[len - 1] == '\n')
        buf[len - 1] = '\0';
      else
        {
          /* The line did not fit into BUF, or it is an unterminated last
             line.  Drop the remainder, so that it is not mistaken for a
             line of its own.  */
          int c;

          do
            c = getc (stream);
          while (c != EOF && c != '\n');
        }

      /* Skip leading whitespace, then blank lines and comment lines.  */
      line = buf + strspn (buf, " \t\r\f\v");
      if (line[0] == '\0' || line[0] == '#')
        continue;

      if (sscanf (line, "%X", &code) != 1)
        {
          fprintf (stderr, "parse error in %s:%d\n",
                   compositionexclusions_filename, lineno);
          exit (1);
        }
      assert (code < 0x110000);

      unicode_composition_exclusions[code] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
      exit (1);
    }
}
9089 static void
9090 debug_output_composition_tables (const char *filename)
9092 FILE *stream;
9093 unsigned int ch;
9095 stream = fopen (filename, "w");
9096 if (stream == NULL)
9098 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9099 exit (1);
9102 for (ch = 0; ch < 0x110000; ch++)
9104 unsigned int length;
9105 unsigned int decomposed[MAX_DECOMP_LENGTH];
9106 int type = get_decomposition (ch, &length, decomposed);
9108 if (type == UC_DECOMP_CANONICAL
9109 /* Consider only binary decompositions.
9110 Exclude singleton decompositions. */
9111 && length == 2)
9113 unsigned int code1 = decomposed[0];
9114 unsigned int code2 = decomposed[1];
9115 unsigned int combined = ch;
9117 /* Exclude decompositions where the first part is not a starter,
9118 i.e. is not of canonical combining class 0. */
9119 if (strcmp (unicode_attributes[code1].combining, "0") == 0
9120 /* Exclude characters listed in CompositionExclusions.txt. */
9121 && !unicode_composition_exclusions[combined])
9123 /* The combined character must now also be a starter.
9124 Verify this. */
9125 assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
9127 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
9128 code1,
9129 code2,
9130 combined,
9131 unicode_attributes[code2].combining);
9136 if (ferror (stream) || fclose (stream))
9138 fprintf (stderr, "error writing to '%s'\n", filename);
9139 exit (1);
9143 static void
9144 output_composition_tables (const char *filename, const char *version)
9146 FILE *stream;
9147 unsigned int ch;
9149 stream = fopen (filename, "w");
9150 if (stream == NULL)
9152 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9153 exit (1);
9156 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9157 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
9158 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9159 version);
9160 fprintf (stream, "\n");
9162 /* Put a GPL header on it. The gnulib module is under LGPL (although it
9163 still carries the GPL header), and it's gnulib-tool which replaces the
9164 GPL header with an LGPL header. */
9165 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
9166 fprintf (stream, "\n");
9167 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
9168 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
9169 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
9170 fprintf (stream, " (at your option) any later version.\n");
9171 fprintf (stream, "\n");
9172 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
9173 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
9174 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
9175 fprintf (stream, " GNU General Public License for more details.\n");
9176 fprintf (stream, "\n");
9177 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
9178 fprintf (stream, " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n");
9179 fprintf (stream, "\n");
9181 /* The composition table is a set of mappings (code1, code2) -> combined,
9182 with 928 entries,
9183 367 values for code1 (from 0x003C to 0x30FD),
9184 54 values for code2 (from 0x0300 to 0x309A).
9185 For a fixed code1, there are from 1 to 19 possible values for code2.
9186 For a fixed code2, there are from 1 to 117 possible values for code1.
9187 This is a very sparse matrix.
9189 We want an O(1) hash lookup.
9191 We could implement the hash lookup by mapping (code1, code2) to a linear
9192 combination mul1*code1 + mul2*code2, which is then used as an index into
9193 a 3-level table. But this leads to a table of size 37 KB.
9195 We use gperf to implement the hash lookup, giving it the 928 sets of
9196 4 bytes (code1, code2) as input. gperf generates a hash table of size
9197 1527, which is quite good (60% filled). It requires an auxiliary table
9198 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
9200 fprintf (stream, "struct composition_rule { char codes[6]; };\n");
9201 fprintf (stream, "%%struct-type\n");
9202 fprintf (stream, "%%language=ANSI-C\n");
9203 fprintf (stream, "%%define slot-name codes\n");
9204 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
9205 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
9206 fprintf (stream, "%%compare-lengths\n");
9207 fprintf (stream, "%%compare-strncmp\n");
9208 fprintf (stream, "%%readonly-tables\n");
9209 fprintf (stream, "%%omit-struct-type\n");
9210 fprintf (stream, "%%%%\n");
9212 for (ch = 0; ch < 0x110000; ch++)
9214 unsigned int length;
9215 unsigned int decomposed[MAX_DECOMP_LENGTH];
9216 int type = get_decomposition (ch, &length, decomposed);
9218 if (type == UC_DECOMP_CANONICAL
9219 /* Consider only binary decompositions.
9220 Exclude singleton decompositions. */
9221 && length == 2)
9223 unsigned int code1 = decomposed[0];
9224 unsigned int code2 = decomposed[1];
9225 unsigned int combined = ch;
9227 /* Exclude decompositions where the first part is not a starter,
9228 i.e. is not of canonical combining class 0. */
9229 if (strcmp (unicode_attributes[code1].combining, "0") == 0
9230 /* Exclude characters listed in CompositionExclusions.txt. */
9231 && !unicode_composition_exclusions[combined])
9233 /* The combined character must now also be a starter.
9234 Verify this. */
9235 assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
9237 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
9238 (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
9239 (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
9240 combined);
9245 if (ferror (stream) || fclose (stream))
9247 fprintf (stderr, "error writing to '%s'\n", filename);
9248 exit (1);
9252 /* ========================================================================= */
/* Write to FILENAME a test program fragment for a simple character mapping.
   The fragment lists every pair { ch, FUNC (ch) } for which FUNC changes
   the character, and defines MAP(c) as a call to FUNCTION_NAME.
   VERSION is the Unicode version, mentioned in the file header.
   Exits with status 1 if the file cannot be opened or written.  */

static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  FILE *out;
  bool wrote_entry;
  unsigned int ch;

  out = fopen (filename, "w");
  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"
         "/* Test the Unicode character mapping functions.\n"
         " Copyright (C) 2009 Free Software Foundation, Inc.\n"
         "\n"
         " This program is free software: you can redistribute it and/or modify\n"
         " it under the terms of the GNU General Public License as published by\n"
         " the Free Software Foundation; either version 3 of the License, or\n"
         " (at your option) any later version.\n"
         "\n"
         " This program is distributed in the hope that it will be useful,\n"
         " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
         " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
         " GNU General Public License for more details.\n"
         "\n"
         " You should have received a copy of the GNU General Public License\n"
         " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n"
         "\n",
         out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fputs ("\n", out);
  fputs ("#include \"test-mapping-part1.h\"\n", out);
  fputs ("\n", out);

  /* Emit the non-identity pairs, separated by ",\n".  The separator is
     written before every entry except the first, so that the list has no
     trailing comma.  */
  wrote_entry = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int mapped = func (ch);

      if (mapped == ch)
        continue;

      if (wrote_entry)
        fputs (",\n", out);
      fprintf (out, " { 0x%04X, 0x%04X }", ch, mapped);
      wrote_entry = true;
    }
  if (wrote_entry)
    fputs ("\n", out);

  fputs ("\n", out);
  fprintf (out, "#define MAP(c) %s (c)\n", function_name);
  fputs ("#include \"test-mapping-part2.h\"\n", out);

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
9323 /* Construction of sparse 3-level tables. */
9324 #define TABLE mapping_table
9325 #define ELEMENT int32_t
9326 #define DEFAULT 0
9327 #define xmalloc malloc
9328 #define xrealloc realloc
9329 #include "3level.h"
9331 /* Output a simple character mapping table to the given file. */
9333 static void
9334 output_simple_mapping (const char *filename,
9335 unsigned int (*func) (unsigned int),
9336 const char *version)
9338 FILE *stream;
9339 unsigned int ch, i;
9340 struct mapping_table t;
9341 unsigned int level1_offset, level2_offset, level3_offset;
9343 stream = fopen (filename, "w");
9344 if (stream == NULL)
9346 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9347 exit (1);
9350 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9351 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
9352 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9353 version);
9355 t.p = 7;
9356 t.q = 9;
9357 mapping_table_init (&t);
9359 for (ch = 0; ch < 0x110000; ch++)
9361 int value = (int) func (ch) - (int) ch;
9363 mapping_table_add (&t, ch, value);
9366 mapping_table_finalize (&t);
9368 /* Offsets in t.result, in memory of this process. */
9369 level1_offset =
9370 5 * sizeof (uint32_t);
9371 level2_offset =
9372 5 * sizeof (uint32_t)
9373 + t.level1_size * sizeof (uint32_t);
9374 level3_offset =
9375 5 * sizeof (uint32_t)
9376 + t.level1_size * sizeof (uint32_t)
9377 + (t.level2_size << t.q) * sizeof (uint32_t);
9379 for (i = 0; i < 5; i++)
9380 fprintf (stream, "#define mapping_header_%d %d\n", i,
9381 ((uint32_t *) t.result)[i]);
9382 fprintf (stream, "static const\n");
9383 fprintf (stream, "struct\n");
9384 fprintf (stream, " {\n");
9385 fprintf (stream, " int level1[%zu];\n", t.level1_size);
9386 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
9387 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
9388 fprintf (stream, " }\n");
9389 fprintf (stream, "u_mapping =\n");
9390 fprintf (stream, "{\n");
9391 fprintf (stream, " {");
9392 if (t.level1_size > 8)
9393 fprintf (stream, "\n ");
9394 for (i = 0; i < t.level1_size; i++)
9396 uint32_t offset;
9397 if (i > 0 && (i % 8) == 0)
9398 fprintf (stream, "\n ");
9399 offset = ((uint32_t *) (t.result + level1_offset))[i];
9400 if (offset == 0)
9401 fprintf (stream, " %5d", -1);
9402 else
9403 fprintf (stream, " %5zu",
9404 (offset - level2_offset) / sizeof (uint32_t));
9405 if (i+1 < t.level1_size)
9406 fprintf (stream, ",");
9408 if (t.level1_size > 8)
9409 fprintf (stream, "\n ");
9410 fprintf (stream, " },\n");
9411 fprintf (stream, " {");
9412 if (t.level2_size << t.q > 8)
9413 fprintf (stream, "\n ");
9414 for (i = 0; i < t.level2_size << t.q; i++)
9416 uint32_t offset;
9417 if (i > 0 && (i % 8) == 0)
9418 fprintf (stream, "\n ");
9419 offset = ((uint32_t *) (t.result + level2_offset))[i];
9420 if (offset == 0)
9421 fprintf (stream, " %5d", -1);
9422 else
9423 fprintf (stream, " %5zu",
9424 (offset - level3_offset) / sizeof (int32_t));
9425 if (i+1 < t.level2_size << t.q)
9426 fprintf (stream, ",");
9428 if (t.level2_size << t.q > 8)
9429 fprintf (stream, "\n ");
9430 fprintf (stream, " },\n");
9431 fprintf (stream, " {");
9432 if (t.level3_size << t.p > 8)
9433 fprintf (stream, "\n ");
9434 for (i = 0; i < t.level3_size << t.p; i++)
9436 if (i > 0 && (i % 8) == 0)
9437 fprintf (stream, "\n ");
9438 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
9439 if (i+1 < t.level3_size << t.p)
9440 fprintf (stream, ",");
9442 if (t.level3_size << t.p > 8)
9443 fprintf (stream, "\n ");
9444 fprintf (stream, " }\n");
9445 fprintf (stream, "};\n");
9447 if (ferror (stream) || fclose (stream))
9449 fprintf (stderr, "error writing to '%s'\n", filename);
9450 exit (1);
9454 /* ========================================================================= */
/* The contexts in which a special casing rule can apply.
   The negation of a context x is represented by the value -x.  */
enum
{
  SCC_ALWAYS            = 0,
  SCC_FINAL_SIGMA       = 1,
  SCC_AFTER_SOFT_DOTTED = 2,
  SCC_MORE_ABOVE        = 3,
  SCC_BEFORE_DOT        = 4,
  SCC_AFTER_I           = 5
};
/* A special casing rule.  */
struct special_casing_rule
{
  /* The code point the rule applies to.  */
  unsigned int code;
  /* The lower, title, upper and casefold mappings: up to 3 code points
     each, unused trailing elements being 0.  */
  unsigned int lower_mapping[3];
  unsigned int title_mapping[3];
  unsigned int upper_mapping[3];
  unsigned int casefold_mapping[3];
  /* A 2-letter language code, or NULL if the rule is not restricted to a
     language.  */
  const char *language;
  /* One of the SCC_* values, or its negation.  */
  int context;
};

/* The special casing rules: a growable array of pointers, the number of
   elements in use, and the number of elements allocated.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Append NEW_RULE to casing_rules, growing the array as needed.  The
   capacity doubles on each growth and is never less than 16.
   Only the pointer is stored; the rule itself is not copied.
   Exits with status 1 if memory is exhausted.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;

      /* Keep the old array reachable until the reallocation is known to
         have succeeded; storing through a NULL result would crash.  */
      new_rules = realloc (casing_rules, new_allocated * sizeof *new_rules);
      if (new_rules == NULL)
        {
          fprintf (stderr, "memory exhausted\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
9500 /* Stores in casing_rules the special casing rules found in
9501 specialcasing_filename. */
9502 static void
9503 fill_casing_rules (const char *specialcasing_filename)
9505 FILE *stream;
9507 stream = fopen (specialcasing_filename, "r");
9508 if (stream == NULL)
9510 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
9511 exit (1);
9514 casing_rules = NULL;
9515 num_casing_rules = 0;
9516 allocated_casing_rules = 0;
9518 for (;;)
9520 char buf[200+1];
9521 char *scanptr;
9522 char *endptr;
9523 int i;
9525 unsigned int code;
9526 unsigned int lower_mapping[3];
9527 unsigned int title_mapping[3];
9528 unsigned int upper_mapping[3];
9529 char *language;
9530 int context;
9532 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9533 break;
9535 if (buf[0] == '\0' || buf[0] == '#')
9536 continue;
9538 /* Scan code. */
9539 scanptr = buf;
9540 code = strtoul (scanptr, &endptr, 16);
9541 if (endptr == scanptr)
9543 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9544 exit (1);
9546 scanptr = endptr;
9547 if (*scanptr != ';')
9549 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9550 exit (1);
9552 scanptr++;
9554 /* Scan lower mapping. */
9555 for (i = 0; i < 3; i++)
9556 lower_mapping[i] = 0;
9557 for (i = 0; i < 3; i++)
9559 while (*scanptr == ' ')
9560 scanptr++;
9561 if (*scanptr == ';')
9562 break;
9563 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
9564 if (endptr == scanptr)
9566 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9567 exit (1);
9569 scanptr = endptr;
9571 if (*scanptr != ';')
9573 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9574 exit (1);
9576 scanptr++;
9578 /* Scan title mapping. */
9579 for (i = 0; i < 3; i++)
9580 title_mapping[i] = 0;
9581 for (i = 0; i < 3; i++)
9583 while (*scanptr == ' ')
9584 scanptr++;
9585 if (*scanptr == ';')
9586 break;
9587 title_mapping[i] = strtoul (scanptr, &endptr, 16);
9588 if (endptr == scanptr)
9590 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9591 exit (1);
9593 scanptr = endptr;
9595 if (*scanptr != ';')
9597 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9598 exit (1);
9600 scanptr++;
9602 /* Scan upper mapping. */
9603 for (i = 0; i < 3; i++)
9604 upper_mapping[i] = 0;
9605 for (i = 0; i < 3; i++)
9607 while (*scanptr == ' ')
9608 scanptr++;
9609 if (*scanptr == ';')
9610 break;
9611 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
9612 if (endptr == scanptr)
9614 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9615 exit (1);
9617 scanptr = endptr;
9619 if (*scanptr != ';')
9621 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9622 exit (1);
9624 scanptr++;
9626 /* Scan language and context. */
9627 language = NULL;
9628 context = SCC_ALWAYS;
9629 while (*scanptr == ' ')
9630 scanptr++;
9631 if (*scanptr != '\0' && *scanptr != '#')
9633 const char *word_begin = scanptr;
9634 const char *word_end;
9636 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
9637 scanptr++;
9638 word_end = scanptr;
9640 while (*scanptr == ' ')
9641 scanptr++;
9643 if (word_end - word_begin == 2)
9645 language = (char *) malloc ((word_end - word_begin) + 1);
9646 memcpy (language, word_begin, 2);
9647 language[word_end - word_begin] = '\0';
9648 word_begin = word_end = NULL;
9650 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
9652 word_begin = scanptr;
9653 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
9654 scanptr++;
9655 word_end = scanptr;
9659 if (word_end > word_begin)
9661 bool negate = false;
9663 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
9665 word_begin += 4;
9666 negate = true;
9668 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
9669 context = SCC_FINAL_SIGMA;
9670 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
9671 context = SCC_AFTER_SOFT_DOTTED;
9672 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
9673 context = SCC_MORE_ABOVE;
9674 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
9675 context = SCC_BEFORE_DOT;
9676 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
9677 context = SCC_AFTER_I;
9678 else
9680 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
9681 exit (1);
9683 if (negate)
9684 context = - context;
9687 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
9689 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9690 exit (1);
9694 /* Store the rule. */
9696 struct special_casing_rule *new_rule =
9697 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
9698 new_rule->code = code;
9699 new_rule->language = language;
9700 new_rule->context = context;
9701 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
9702 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
9703 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
9705 add_casing_rule (new_rule);
9709 if (ferror (stream) || fclose (stream))
9711 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
9712 exit (1);
/* A casefolding rule.  */
struct casefold_rule
{
  /* The code point the rule applies to.  */
  unsigned int code;
  /* The casefolded form: up to 3 code points, unused trailing elements
     being 0.  */
  unsigned int mapping[3];
  /* A language code, or NULL if the rule is not restricted to a
     language.  */
  const char *language;
};

/* The casefolding rules: a growable array of pointers, the number of
   elements in use, and the number of elements allocated.  */
struct casefold_rule **casefolding_rules;
unsigned int num_casefolding_rules;
unsigned int allocated_casefolding_rules;
9729 /* Stores in casefolding_rules the case folding rules found in
9730 casefolding_filename. */
9731 static void
9732 fill_casefolding_rules (const char *casefolding_filename)
9734 FILE *stream;
9736 stream = fopen (casefolding_filename, "r");
9737 if (stream == NULL)
9739 fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
9740 exit (1);
9743 casefolding_rules = NULL;
9744 num_casefolding_rules = 0;
9745 allocated_casefolding_rules = 0;
9747 for (;;)
9749 char buf[200+1];
9750 char *scanptr;
9751 char *endptr;
9752 int i;
9754 unsigned int code;
9755 char type;
9756 unsigned int mapping[3];
9758 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9759 break;
9761 if (buf[0] == '\0' || buf[0] == '#')
9762 continue;
9764 /* Scan code. */
9765 scanptr = buf;
9766 code = strtoul (scanptr, &endptr, 16);
9767 if (endptr == scanptr)
9769 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9770 exit (1);
9772 scanptr = endptr;
9773 if (*scanptr != ';')
9775 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9776 exit (1);
9778 scanptr++;
9780 /* Scan type. */
9781 while (*scanptr == ' ')
9782 scanptr++;
9784 switch (*scanptr)
9786 case 'C': case 'F': case 'S': case 'T':
9787 type = *scanptr;
9788 break;
9789 default:
9790 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9791 exit (1);
9793 scanptr++;
9794 if (*scanptr != ';')
9796 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9797 exit (1);
9799 scanptr++;
9801 /* Scan casefold mapping. */
9802 for (i = 0; i < 3; i++)
9803 mapping[i] = 0;
9804 for (i = 0; i < 3; i++)
9806 while (*scanptr == ' ')
9807 scanptr++;
9808 if (*scanptr == ';')
9809 break;
9810 mapping[i] = strtoul (scanptr, &endptr, 16);
9811 if (endptr == scanptr)
9813 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9814 exit (1);
9816 scanptr = endptr;
9818 if (*scanptr != ';')
9820 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9821 exit (1);
9823 scanptr++;
9825 /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
9826 if (type != 'S')
9828 const char * const *languages;
9829 unsigned int languages_count;
9831 /* Type 'T' indicates that the rule is applicable to Turkish
9832 languages only. */
9833 if (type == 'T')
9835 static const char * const turkish_languages[] = { "tr", "az" };
9836 languages = turkish_languages;
9837 languages_count = 2;
9839 else
9841 static const char * const all_languages[] = { NULL };
9842 languages = all_languages;
9843 languages_count = 1;
9846 for (i = 0; i < languages_count; i++)
9848 /* Store a new rule. */
9849 struct casefold_rule *new_rule =
9850 (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
9851 new_rule->code = code;
9852 memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
9853 new_rule->language = languages[i];
9855 if (num_casefolding_rules == allocated_casefolding_rules)
9857 allocated_casefolding_rules = 2 * allocated_casefolding_rules;
9858 if (allocated_casefolding_rules < 16)
9859 allocated_casefolding_rules = 16;
9860 casefolding_rules =
9861 (struct casefold_rule **)
9862 realloc (casefolding_rules,
9863 allocated_casefolding_rules * sizeof (struct casefold_rule *));
9865 casefolding_rules[num_casefolding_rules++] = new_rule;
9870 if (ferror (stream) || fclose (stream))
9872 fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
9873 exit (1);
/* Casefold mapping, when it maps to a single character.
   Indexed by code point.  */
unsigned int unicode_casefold[0x110000];

/* Return the single-character casefold mapping of CH.
   CH must be a valid index into unicode_casefold; this is asserted, because
   an out-of-range index would read beyond the end of the table.  */
static unsigned int
to_casefold (unsigned int ch)
{
  assert (ch < sizeof (unicode_casefold) / sizeof (unicode_casefold[0]));
  return unicode_casefold[ch];
}
9886 /* Redistribute the casefolding_rules:
9887 - Rules that map to a single character, language independently, are stored
9888 in unicode_casefold.
9889 - Other rules are merged into casing_rules. */
9890 static void
9891 redistribute_casefolding_rules (void)
9893 unsigned int ch, i, j;
9895 /* Fill unicode_casefold[]. */
9896 for (ch = 0; ch < 0x110000; ch++)
9897 unicode_casefold[ch] = ch;
9898 for (i = 0; i < num_casefolding_rules; i++)
9900 struct casefold_rule *cfrule = casefolding_rules[i];
9902 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
9904 ch = cfrule->code;
9905 assert (ch < 0x110000);
9906 unicode_casefold[ch] = cfrule->mapping[0];
9910 /* Extend the special casing rules by filling in their casefold_mapping[]
9911 field. */
9912 for (j = 0; j < num_casing_rules; j++)
9914 struct special_casing_rule *rule = casing_rules[j];
9915 unsigned int k;
9917 rule->casefold_mapping[0] = to_casefold (rule->code);
9918 for (k = 1; k < 3; k++)
9919 rule->casefold_mapping[k] = 0;
9922 /* Now merge the other casefolding rules into casing_rules. */
9923 for (i = 0; i < num_casefolding_rules; i++)
9925 struct casefold_rule *cfrule = casefolding_rules[i];
9927 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
9929 /* Find a rule that applies to the same code, same language, and it
9930 has context SCC_ALWAYS. At the same time, update all rules that
9931 have the same code and same or more specific language. */
9932 struct special_casing_rule *found_rule = NULL;
9934 for (j = 0; j < num_casing_rules; j++)
9936 struct special_casing_rule *rule = casing_rules[j];
9938 if (rule->code == cfrule->code
9939 && (cfrule->language == NULL
9940 || (rule->language != NULL
9941 && strcmp (rule->language, cfrule->language) == 0)))
9943 memcpy (rule->casefold_mapping, cfrule->mapping,
9944 sizeof (rule->casefold_mapping));
9946 if ((cfrule->language == NULL
9947 ? rule->language == NULL
9948 : rule->language != NULL
9949 && strcmp (rule->language, cfrule->language) == 0)
9950 && rule->context == SCC_ALWAYS)
9952 /* Found it. */
9953 found_rule = rule;
9958 if (found_rule == NULL)
9960 /* Create a new rule. */
9961 struct special_casing_rule *new_rule =
9962 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
9964 /* Try to find a rule that applies to the same code, no language
9965 restriction, and with context SCC_ALWAYS. */
9966 for (j = 0; j < num_casing_rules; j++)
9968 struct special_casing_rule *rule = casing_rules[j];
9970 if (rule->code == cfrule->code
9971 && rule->context == SCC_ALWAYS
9972 && rule->language == NULL)
9974 /* Found it. */
9975 found_rule = rule;
9976 break;
9980 new_rule->code = cfrule->code;
9981 new_rule->language = cfrule->language;
9982 new_rule->context = SCC_ALWAYS;
9983 if (found_rule != NULL)
9985 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
9986 sizeof (new_rule->lower_mapping));
9987 memcpy (new_rule->title_mapping, found_rule->title_mapping,
9988 sizeof (new_rule->title_mapping));
9989 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
9990 sizeof (new_rule->upper_mapping));
9992 else
9994 unsigned int k;
9996 new_rule->lower_mapping[0] = to_lower (cfrule->code);
9997 for (k = 1; k < 3; k++)
9998 new_rule->lower_mapping[k] = 0;
9999 new_rule->title_mapping[0] = to_title (cfrule->code);
10000 for (k = 1; k < 3; k++)
10001 new_rule->title_mapping[k] = 0;
10002 new_rule->upper_mapping[0] = to_upper (cfrule->code);
10003 for (k = 1; k < 3; k++)
10004 new_rule->upper_mapping[k] = 0;
10006 memcpy (new_rule->casefold_mapping, cfrule->mapping,
10007 sizeof (new_rule->casefold_mapping));
10009 add_casing_rule (new_rule);
10015 static int
10016 compare_casing_rules (const void *a, const void *b)
10018 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
10019 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
10020 unsigned int a_code = a_rule->code;
10021 unsigned int b_code = b_rule->code;
10023 if (a_code < b_code)
10024 return -1;
10025 if (a_code > b_code)
10026 return 1;
10028 /* Sort the more specific rules before the more general ones. */
10029 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
10030 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
10033 static void
10034 sort_casing_rules (void)
10036 /* Sort the rules 1. by code, 2. by specificity. */
10037 if (num_casing_rules > 1)
10038 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
10039 compare_casing_rules);
10042 /* Output the special casing rules. */
10043 static void
10044 output_casing_rules (const char *filename, const char *version)
10046 FILE *stream;
10047 unsigned int i, j;
10048 unsigned int minor;
10050 stream = fopen (filename, "w");
10051 if (stream == NULL)
10053 fprintf (stderr, "cannot open '%s' for writing\n", filename);
10054 exit (1);
10057 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10058 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
10059 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10060 version);
10061 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
10062 fprintf (stream, "%%struct-type\n");
10063 fprintf (stream, "%%language=ANSI-C\n");
10064 fprintf (stream, "%%define slot-name code\n");
10065 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
10066 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
10067 fprintf (stream, "%%compare-lengths\n");
10068 fprintf (stream, "%%compare-strncmp\n");
10069 fprintf (stream, "%%readonly-tables\n");
10070 fprintf (stream, "%%omit-struct-type\n");
10071 fprintf (stream, "%%%%\n");
10073 minor = 0;
10074 for (i = 0; i < num_casing_rules; i++)
10076 struct special_casing_rule *rule = casing_rules[i];
10077 int context;
10079 if (i > 0 && rule->code == casing_rules[i - 1]->code)
10080 minor += 1;
10081 else
10082 minor = 0;
10084 if (!(rule->code < 0x10000))
10086 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
10087 exit (1);
10090 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
10091 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
10093 fprintf (stream, "%d, ",
10094 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
10096 context = rule->context;
10097 if (context < 0)
10099 fprintf (stream, "-");
10100 context = - context;
10102 else
10103 fprintf (stream, " ");
10104 switch (context)
10106 case SCC_ALWAYS:
10107 fprintf (stream, "SCC_ALWAYS ");
10108 break;
10109 case SCC_FINAL_SIGMA:
10110 fprintf (stream, "SCC_FINAL_SIGMA ");
10111 break;
10112 case SCC_AFTER_SOFT_DOTTED:
10113 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
10114 break;
10115 case SCC_MORE_ABOVE:
10116 fprintf (stream, "SCC_MORE_ABOVE ");
10117 break;
10118 case SCC_BEFORE_DOT:
10119 fprintf (stream, "SCC_BEFORE_DOT ");
10120 break;
10121 case SCC_AFTER_I:
10122 fprintf (stream, "SCC_AFTER_I ");
10123 break;
10124 default:
10125 abort ();
10127 fprintf (stream, ", ");
10129 if (rule->language != NULL)
10131 assert (strlen (rule->language) == 2);
10132 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
10134 else
10135 fprintf (stream, "{ '\\0', '\\0' }, ");
10137 fprintf (stream, "{ ");
10138 for (j = 0; j < 3; j++)
10140 if (j > 0)
10141 fprintf (stream, ", ");
10142 if (!(rule->upper_mapping[j] < 0x10000))
10144 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
10145 exit (1);
10147 if (rule->upper_mapping[j] != 0)
10148 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
10149 else
10150 fprintf (stream, " 0");
10152 fprintf (stream, " }, { ");
10153 for (j = 0; j < 3; j++)
10155 if (j > 0)
10156 fprintf (stream, ", ");
10157 if (!(rule->lower_mapping[j] < 0x10000))
10159 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
10160 exit (1);
10162 if (rule->lower_mapping[j] != 0)
10163 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
10164 else
10165 fprintf (stream, " 0");
10167 fprintf (stream, " }, { ");
10168 for (j = 0; j < 3; j++)
10170 if (j > 0)
10171 fprintf (stream, ", ");
10172 if (!(rule->title_mapping[j] < 0x10000))
10174 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
10175 exit (1);
10177 if (rule->title_mapping[j] != 0)
10178 fprintf (stream, "0x%04X", rule->title_mapping[j]);
10179 else
10180 fprintf (stream, " 0");
10182 fprintf (stream, " }, { ");
10183 for (j = 0; j < 3; j++)
10185 if (j > 0)
10186 fprintf (stream, ", ");
10187 if (!(rule->casefold_mapping[j] < 0x10000))
10189 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
10190 exit (1);
10192 if (rule->casefold_mapping[j] != 0)
10193 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
10194 else
10195 fprintf (stream, " 0");
10197 fprintf (stream, " }\n");
10200 if (ferror (stream) || fclose (stream))
10202 fprintf (stderr, "error writing to '%s'\n", filename);
10203 exit (1);
10207 /* ========================================================================= */
/* The Unicode standard defines:
     A character is "cased" if it has the Lowercase or Uppercase property
     or has a General_Category value of Titlecase_Letter.
   Return true if CH is cased in this sense.  */
static bool
is_cased (unsigned int ch)
{
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  if (is_category_Lt (ch))
    return true;
  return false;
}
10221 /* Quoting the Unicode standard:
10222 Definition: A character is defined to be "case-ignorable" if it has the
10223 value MidLetter {or the value MidNumLet} for the Word_Break property or
10224 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
10225 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10226 The text marked in braces was added in Unicode 5.1.0, see
10227 <https://www.unicode.org/versions/Unicode5.1.0/> section "Update of
10228 Definition of case-ignorable". */
10229 /* Since this predicate is only used for the "Before C" and "After C"
10230 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
10231 This simplifies the evaluation of the regular expressions
10232 \p{cased} (\p{case-ignorable})* C
10234 C (\p{case-ignorable})* \p{cased}
10236 static bool
10237 is_case_ignorable (unsigned int ch)
10239 return (unicode_org_wbp[ch] == WBP_MIDLETTER
10240 || unicode_org_wbp[ch] == WBP_MIDNUMLET
10241 || is_category_Mn (ch)
10242 || is_category_Me (ch)
10243 || is_category_Cf (ch)
10244 || is_category_Lm (ch)
10245 || is_category_Sk (ch))
10246 && !is_cased (ch);
10249 /* ------------------------------------------------------------------------- */
10251 /* Output all case related properties. */
10252 static void
10253 output_casing_properties (const char *version)
10255 #define PROPERTY(FN,P) \
10256 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
10257 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
10258 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
10259 PROPERTY(cased, cased)
10260 PROPERTY(ignorable, case_ignorable)
10261 #undef PROPERTY
10264 /* ========================================================================= */
10267 main (int argc, char * argv[])
10269 const char *unicodedata_filename;
10270 const char *proplist_filename;
10271 const char *derivedproplist_filename;
10272 const char *arabicshaping_filename;
10273 const char *scripts_filename;
10274 const char *blocks_filename;
10275 const char *proplist30_filename;
10276 const char *eastasianwidth_filename;
10277 const char *linebreak_filename;
10278 const char *wordbreakproperty_filename;
10279 const char *graphemebreakproperty_filename;
10280 const char *compositionexclusions_filename;
10281 const char *specialcasing_filename;
10282 const char *casefolding_filename;
10283 const char *version;
10285 if (argc != 16)
10287 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
10288 argv[0]);
10289 exit (1);
10292 unicodedata_filename = argv[1];
10293 proplist_filename = argv[2];
10294 derivedproplist_filename = argv[3];
10295 arabicshaping_filename = argv[4];
10296 scripts_filename = argv[5];
10297 blocks_filename = argv[6];
10298 proplist30_filename = argv[7];
10299 eastasianwidth_filename = argv[8];
10300 linebreak_filename = argv[9];
10301 wordbreakproperty_filename = argv[10];
10302 graphemebreakproperty_filename = argv[11];
10303 compositionexclusions_filename = argv[12];
10304 specialcasing_filename = argv[13];
10305 casefolding_filename = argv[14];
10306 version = argv[15];
10308 fill_attributes (unicodedata_filename);
10309 clear_properties ();
10310 fill_properties (proplist_filename);
10311 fill_properties (derivedproplist_filename);
10312 fill_properties30 (proplist30_filename);
10313 fill_arabicshaping (arabicshaping_filename);
10314 fill_scripts (scripts_filename);
10315 fill_blocks (blocks_filename);
10316 fill_width (eastasianwidth_filename);
10317 fill_org_lbp (linebreak_filename);
10318 fill_org_wbp (wordbreakproperty_filename);
10319 fill_org_gbp (graphemebreakproperty_filename);
10320 fill_composition_exclusions (compositionexclusions_filename);
10321 fill_casing_rules (specialcasing_filename);
10322 fill_casefolding_rules (casefolding_filename);
10323 redistribute_casefolding_rules ();
10324 sort_casing_rules ();
10326 output_categories (version);
10327 output_category ("unictype/categ_of.h", version);
10328 output_combclass ("unictype/combiningclass.h", version);
10329 output_bidi_category ("unictype/bidi_of.h", version);
10330 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
10331 output_decimal_digit ("unictype/decdigit.h", version);
10332 output_digit_test ("../tests/unictype/test-digit.h", version);
10333 output_digit ("unictype/digit.h", version);
10334 output_numeric_test ("../tests/unictype/test-numeric.h", version);
10335 output_numeric ("unictype/numeric.h", version);
10336 output_mirror ("unictype/mirror.h", version);
10337 output_properties (version);
10338 output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version);
10339 output_joining_type ("unictype/joiningtype_of.h", version);
10340 output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version);
10341 output_joining_group ("unictype/joininggroup_of.h", version);
10343 output_scripts (version);
10344 output_scripts_byname (version);
10345 output_blocks (version);
10346 output_ident_properties (version);
10347 output_nonspacing_property ("uniwidth/width.c.part");
10348 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
10349 output_old_ctype (version);
10351 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
10352 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
10353 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
10355 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
10356 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
10357 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
10359 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
10360 output_gbp_table ("unigbrk/gbrkprop.h", version);
10362 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
10363 debug_output_composition_tables ("uninorm/composition.txt");
10364 output_composition_tables ("uninorm/composition-table.gperf", version);
10366 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
10367 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
10368 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
10369 output_simple_mapping ("unicase/toupper.h", to_upper, version);
10370 output_simple_mapping ("unicase/tolower.h", to_lower, version);
10371 output_simple_mapping ("unicase/totitle.h", to_title, version);
10372 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
10373 output_casing_rules ("unicase/special-casing-table.gperf", version);
10374 output_casing_properties (version);
10376 return 0;
10380 * Local Variables:
10381 * coding: utf-8
10382 * compile-command: "\
10383 * gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \\
10384 * ./gen-uni-tables \\
10385 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/UnicodeData.txt \\
10386 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/PropList.txt \\
10387 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/DerivedCoreProperties.txt \\
10388 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/ArabicShaping.txt \\
10389 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/Scripts.txt \\
10390 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/Blocks.txt \\
10391 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \\
10392 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/EastAsianWidth.txt \\
10393 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/LineBreak.txt \\
10394 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/auxiliary/WordBreakProperty.txt \\
10395 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \\
10396 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/CompositionExclusions.txt \\
10397 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/SpecialCasing.txt \\
10398 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/CaseFolding.txt \\
10399 * 9.0.0 \\
10400 * && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \\
10401 * && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt"
10402 * End: