havelib: Support overriding the result of AC_LIB_PREPARE_MULTILIB.
[gnulib.git] / lib / gen-uni-tables.c
blobf704376a580bfb5154bac8d198c75ab0a70b1eba
1 /* Generate Unicode conforming character classification tables and
2 line break properties tables and word break property tables and
3 decomposition/composition and case mapping tables from a UnicodeData file.
4 Copyright (C) 2000-2002, 2004, 2007-2017 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 /* Usage example:
21 $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
22 /usr/local/share/Unidata/PropList.txt \
23 /usr/local/share/Unidata/DerivedCoreProperties.txt \
24 /usr/local/share/Unidata/ArabicShaping.txt \
25 /usr/local/share/Unidata/Scripts.txt \
26 /usr/local/share/Unidata/Blocks.txt \
27 /usr/local/share/Unidata/PropList-3.0.1.txt \
28 /usr/local/share/Unidata/EastAsianWidth.txt \
29 /usr/local/share/Unidata/LineBreak.txt \
30 /usr/local/share/Unidata/WordBreakProperty.txt \
31 /usr/local/share/Unidata/GraphemeBreakProperty.txt \
32 /usr/local/share/Unidata/CompositionExclusions.txt \
33 /usr/local/share/Unidata/SpecialCasing.txt \
34 /usr/local/share/Unidata/CaseFolding.txt \
35 8.0.0
38 #include <assert.h>
39 #include <stdbool.h>
40 #include <stdint.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <time.h>
/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
48 /* ========================================================================= */
50 /* Reading UnicodeData.txt. */
51 /* See UCD.html. */
/* The attributes of one code point, i.e. the columns of one line of
   UnicodeData.txt.  */
struct unicode_attribute
{
  /* Character name.  */
  const char *name;
  /* General category.  */
  const char *category;
  /* Canonical combining class.  */
  const char *combining;
  /* Bidirectional category.  */
  const char *bidi;
  /* Character decomposition mapping.  */
  const char *decomposition;
  /* Decimal digit value.  */
  const char *decdigit;
  /* Digit value.  */
  const char *digit;
  /* Numeric value.  */
  const char *numeric;
  /* Whether the character is mirrored.  */
  bool mirrored;
  /* Old Unicode 1.0 name.  */
  const char *oldname;
  /* Comment.  */
  const char *comment;
  /* Uppercase mapping.  */
  unsigned int upper;
  /* Lowercase mapping.  */
  unsigned int lower;
  /* Titlecase mapping.  */
  unsigned int title;
};
/* Representation of missing fields: "" for string fields, NONE (all bits
   set) for character fields.  */
#define NONE (~0U)
76 /* The entire contents of the UnicodeData.txt file. */
77 struct unicode_attribute unicode_attributes [0x110000];
79 /* Stores in unicode_attributes[i] the values from the given fields. */
80 static void
81 fill_attribute (unsigned int i,
82 const char *field1, const char *field2,
83 const char *field3, const char *field4,
84 const char *field5, const char *field6,
85 const char *field7, const char *field8,
86 const char *field9, const char *field10,
87 const char *field11, const char *field12,
88 const char *field13, const char *field14)
90 struct unicode_attribute * uni;
92 if (i >= 0x110000)
94 fprintf (stderr, "index too large\n");
95 exit (1);
97 if (strcmp (field2, "Cs") == 0)
98 /* Surrogates are UTF-16 artifacts, not real characters. Ignore them. */
99 return;
100 uni = &unicode_attributes[i];
101 /* Copy the strings. */
102 uni->name = strdup (field1);
103 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
104 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
105 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
106 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
107 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
108 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
109 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
110 uni->mirrored = (field9[0] == 'Y');
111 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
112 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
113 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
114 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
115 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Size, in bytes, of the buffers that receive one field of the
   UnicodeData.txt file.  */
#define FIELDLEN 160

/* Reads the next field from STREAM into BUFFER, which has room for
   FIELDLEN bytes.  The field ends at the character DELIM, which is
   consumed but not stored.  Carriage returns are skipped.
   Returns 1 when a field was read and BUFFER is NUL-terminated, or 0 when
   the end of the stream was reached before DELIM; BUFFER is then not
   terminated.  Exits with status 1 when the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int length = 0;

  for (;;)
    {
      int c = getc (stream);

      if (c == EOF)
        return 0;
      if (c == delim)
        break;

      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Append c, refusing fields that are too long for the buffer.  */
      length++;
      if (length >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      buffer[length - 1] = c;
    }

  buffer[length] = '\0';
  return 1;
}
153 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
154 file. */
155 static void
156 fill_attributes (const char *unicodedata_filename)
158 unsigned int i, j;
159 FILE *stream;
160 char field0[FIELDLEN];
161 char field1[FIELDLEN];
162 char field2[FIELDLEN];
163 char field3[FIELDLEN];
164 char field4[FIELDLEN];
165 char field5[FIELDLEN];
166 char field6[FIELDLEN];
167 char field7[FIELDLEN];
168 char field8[FIELDLEN];
169 char field9[FIELDLEN];
170 char field10[FIELDLEN];
171 char field11[FIELDLEN];
172 char field12[FIELDLEN];
173 char field13[FIELDLEN];
174 char field14[FIELDLEN];
175 int lineno = 0;
177 for (i = 0; i < 0x110000; i++)
178 unicode_attributes[i].name = NULL;
180 stream = fopen (unicodedata_filename, "r");
181 if (stream == NULL)
183 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
184 exit (1);
187 for (;;)
189 int n;
191 lineno++;
192 n = getfield (stream, field0, ';');
193 n += getfield (stream, field1, ';');
194 n += getfield (stream, field2, ';');
195 n += getfield (stream, field3, ';');
196 n += getfield (stream, field4, ';');
197 n += getfield (stream, field5, ';');
198 n += getfield (stream, field6, ';');
199 n += getfield (stream, field7, ';');
200 n += getfield (stream, field8, ';');
201 n += getfield (stream, field9, ';');
202 n += getfield (stream, field10, ';');
203 n += getfield (stream, field11, ';');
204 n += getfield (stream, field12, ';');
205 n += getfield (stream, field13, ';');
206 n += getfield (stream, field14, '\n');
207 if (n == 0)
208 break;
209 if (n != 15)
211 fprintf (stderr, "short line in '%s':%d\n",
212 unicodedata_filename, lineno);
213 exit (1);
215 i = strtoul (field0, NULL, 16);
216 if (field1[0] == '<'
217 && strlen (field1) >= 9
218 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
220 /* Deal with a range. */
221 lineno++;
222 n = getfield (stream, field0, ';');
223 n += getfield (stream, field1, ';');
224 n += getfield (stream, field2, ';');
225 n += getfield (stream, field3, ';');
226 n += getfield (stream, field4, ';');
227 n += getfield (stream, field5, ';');
228 n += getfield (stream, field6, ';');
229 n += getfield (stream, field7, ';');
230 n += getfield (stream, field8, ';');
231 n += getfield (stream, field9, ';');
232 n += getfield (stream, field10, ';');
233 n += getfield (stream, field11, ';');
234 n += getfield (stream, field12, ';');
235 n += getfield (stream, field13, ';');
236 n += getfield (stream, field14, '\n');
237 if (n != 15)
239 fprintf (stderr, "missing end range in '%s':%d\n",
240 unicodedata_filename, lineno);
241 exit (1);
243 if (!(field1[0] == '<'
244 && strlen (field1) >= 8
245 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
247 fprintf (stderr, "missing end range in '%s':%d\n",
248 unicodedata_filename, lineno);
249 exit (1);
251 field1[strlen (field1) - 7] = '\0';
252 j = strtoul (field0, NULL, 16);
253 for (; i <= j; i++)
254 fill_attribute (i, field1+1, field2, field3, field4, field5,
255 field6, field7, field8, field9, field10,
256 field11, field12, field13, field14);
258 else
260 /* Single character line */
261 fill_attribute (i, field1, field2, field3, field4, field5,
262 field6, field7, field8, field9, field10,
263 field11, field12, field13, field14);
267 if (ferror (stream) || fclose (stream))
269 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
270 exit (1);
274 /* ========================================================================= */
276 /* General category. */
277 /* See Unicode 3.0 book, section 4.5,
278 UCD.html. */
280 static bool
281 is_category_L (unsigned int ch)
283 return (unicode_attributes[ch].name != NULL
284 && unicode_attributes[ch].category[0] == 'L');
287 static bool
288 is_category_LC (unsigned int ch)
290 /* See PropertyValueAliases.txt. */
291 return (unicode_attributes[ch].name != NULL
292 && unicode_attributes[ch].category[0] == 'L'
293 && (unicode_attributes[ch].category[1] == 'u'
294 || unicode_attributes[ch].category[1] == 'l'
295 || unicode_attributes[ch].category[1] == 't'));
298 static bool
299 is_category_Lu (unsigned int ch)
301 return (unicode_attributes[ch].name != NULL
302 && unicode_attributes[ch].category[0] == 'L'
303 && unicode_attributes[ch].category[1] == 'u');
306 static bool
307 is_category_Ll (unsigned int ch)
309 return (unicode_attributes[ch].name != NULL
310 && unicode_attributes[ch].category[0] == 'L'
311 && unicode_attributes[ch].category[1] == 'l');
314 static bool
315 is_category_Lt (unsigned int ch)
317 return (unicode_attributes[ch].name != NULL
318 && unicode_attributes[ch].category[0] == 'L'
319 && unicode_attributes[ch].category[1] == 't');
322 static bool
323 is_category_Lm (unsigned int ch)
325 return (unicode_attributes[ch].name != NULL
326 && unicode_attributes[ch].category[0] == 'L'
327 && unicode_attributes[ch].category[1] == 'm');
330 static bool
331 is_category_Lo (unsigned int ch)
333 return (unicode_attributes[ch].name != NULL
334 && unicode_attributes[ch].category[0] == 'L'
335 && unicode_attributes[ch].category[1] == 'o');
338 static bool
339 is_category_M (unsigned int ch)
341 return (unicode_attributes[ch].name != NULL
342 && unicode_attributes[ch].category[0] == 'M');
345 static bool
346 is_category_Mn (unsigned int ch)
348 return (unicode_attributes[ch].name != NULL
349 && unicode_attributes[ch].category[0] == 'M'
350 && unicode_attributes[ch].category[1] == 'n');
353 static bool
354 is_category_Mc (unsigned int ch)
356 return (unicode_attributes[ch].name != NULL
357 && unicode_attributes[ch].category[0] == 'M'
358 && unicode_attributes[ch].category[1] == 'c');
361 static bool
362 is_category_Me (unsigned int ch)
364 return (unicode_attributes[ch].name != NULL
365 && unicode_attributes[ch].category[0] == 'M'
366 && unicode_attributes[ch].category[1] == 'e');
369 static bool
370 is_category_N (unsigned int ch)
372 return (unicode_attributes[ch].name != NULL
373 && unicode_attributes[ch].category[0] == 'N');
376 static bool
377 is_category_Nd (unsigned int ch)
379 return (unicode_attributes[ch].name != NULL
380 && unicode_attributes[ch].category[0] == 'N'
381 && unicode_attributes[ch].category[1] == 'd');
384 static bool
385 is_category_Nl (unsigned int ch)
387 return (unicode_attributes[ch].name != NULL
388 && unicode_attributes[ch].category[0] == 'N'
389 && unicode_attributes[ch].category[1] == 'l');
392 static bool
393 is_category_No (unsigned int ch)
395 return (unicode_attributes[ch].name != NULL
396 && unicode_attributes[ch].category[0] == 'N'
397 && unicode_attributes[ch].category[1] == 'o');
400 static bool
401 is_category_P (unsigned int ch)
403 return (unicode_attributes[ch].name != NULL
404 && unicode_attributes[ch].category[0] == 'P');
407 static bool
408 is_category_Pc (unsigned int ch)
410 return (unicode_attributes[ch].name != NULL
411 && unicode_attributes[ch].category[0] == 'P'
412 && unicode_attributes[ch].category[1] == 'c');
415 static bool
416 is_category_Pd (unsigned int ch)
418 return (unicode_attributes[ch].name != NULL
419 && unicode_attributes[ch].category[0] == 'P'
420 && unicode_attributes[ch].category[1] == 'd');
423 static bool
424 is_category_Ps (unsigned int ch)
426 return (unicode_attributes[ch].name != NULL
427 && unicode_attributes[ch].category[0] == 'P'
428 && unicode_attributes[ch].category[1] == 's');
431 static bool
432 is_category_Pe (unsigned int ch)
434 return (unicode_attributes[ch].name != NULL
435 && unicode_attributes[ch].category[0] == 'P'
436 && unicode_attributes[ch].category[1] == 'e');
439 static bool
440 is_category_Pi (unsigned int ch)
442 return (unicode_attributes[ch].name != NULL
443 && unicode_attributes[ch].category[0] == 'P'
444 && unicode_attributes[ch].category[1] == 'i');
447 static bool
448 is_category_Pf (unsigned int ch)
450 return (unicode_attributes[ch].name != NULL
451 && unicode_attributes[ch].category[0] == 'P'
452 && unicode_attributes[ch].category[1] == 'f');
455 static bool
456 is_category_Po (unsigned int ch)
458 return (unicode_attributes[ch].name != NULL
459 && unicode_attributes[ch].category[0] == 'P'
460 && unicode_attributes[ch].category[1] == 'o');
463 static bool
464 is_category_S (unsigned int ch)
466 return (unicode_attributes[ch].name != NULL
467 && unicode_attributes[ch].category[0] == 'S');
470 static bool
471 is_category_Sm (unsigned int ch)
473 return (unicode_attributes[ch].name != NULL
474 && unicode_attributes[ch].category[0] == 'S'
475 && unicode_attributes[ch].category[1] == 'm');
478 static bool
479 is_category_Sc (unsigned int ch)
481 return (unicode_attributes[ch].name != NULL
482 && unicode_attributes[ch].category[0] == 'S'
483 && unicode_attributes[ch].category[1] == 'c');
486 static bool
487 is_category_Sk (unsigned int ch)
489 return (unicode_attributes[ch].name != NULL
490 && unicode_attributes[ch].category[0] == 'S'
491 && unicode_attributes[ch].category[1] == 'k');
494 static bool
495 is_category_So (unsigned int ch)
497 return (unicode_attributes[ch].name != NULL
498 && unicode_attributes[ch].category[0] == 'S'
499 && unicode_attributes[ch].category[1] == 'o');
502 static bool
503 is_category_Z (unsigned int ch)
505 return (unicode_attributes[ch].name != NULL
506 && unicode_attributes[ch].category[0] == 'Z');
509 static bool
510 is_category_Zs (unsigned int ch)
512 return (unicode_attributes[ch].name != NULL
513 && unicode_attributes[ch].category[0] == 'Z'
514 && unicode_attributes[ch].category[1] == 's');
517 static bool
518 is_category_Zl (unsigned int ch)
520 return (unicode_attributes[ch].name != NULL
521 && unicode_attributes[ch].category[0] == 'Z'
522 && unicode_attributes[ch].category[1] == 'l');
525 static bool
526 is_category_Zp (unsigned int ch)
528 return (unicode_attributes[ch].name != NULL
529 && unicode_attributes[ch].category[0] == 'Z'
530 && unicode_attributes[ch].category[1] == 'p');
533 static bool
534 is_category_C (unsigned int ch)
536 return (unicode_attributes[ch].name == NULL
537 || unicode_attributes[ch].category[0] == 'C');
540 static bool
541 is_category_Cc (unsigned int ch)
543 return (unicode_attributes[ch].name != NULL
544 && unicode_attributes[ch].category[0] == 'C'
545 && unicode_attributes[ch].category[1] == 'c');
548 static bool
549 is_category_Cf (unsigned int ch)
551 return (unicode_attributes[ch].name != NULL
552 && unicode_attributes[ch].category[0] == 'C'
553 && unicode_attributes[ch].category[1] == 'f');
556 static bool
557 is_category_Cs (unsigned int ch)
559 return (ch >= 0xd800 && ch < 0xe000);
562 static bool
563 is_category_Co (unsigned int ch)
565 return (unicode_attributes[ch].name != NULL
566 && unicode_attributes[ch].category[0] == 'C'
567 && unicode_attributes[ch].category[1] == 'o');
570 static bool
571 is_category_Cn (unsigned int ch)
573 return (unicode_attributes[ch].name == NULL
574 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to the file FILENAME one line per maximal run of consecutive
   code points for which PREDICATE is true: "0xFIRST..0xLAST" for a run of
   several code points, "0xCODE" for a single one.  (Listing every code
   point on its own line would yield huge text output.)
   Exits with status 1 if the file cannot be written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream = fopen (filename, "w");
  unsigned int start;

  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  start = 0;
  while (start < 0x110000)
    {
      unsigned int end;

      if (!predicate (start))
        {
          start++;
          continue;
        }

      /* Extend the run as far as the predicate holds.  */
      end = start;
      while (end + 1 < 0x110000 && predicate (end + 1))
        end++;

      if (start < end)
        fprintf (stream, "0x%04X..0x%04X\n", start, end);
      else
        fprintf (stream, "0x%04X\n", start);

      start = end + 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to the file FILENAME a C source fragment: a fixed preamble, one
   "{ first, last }" initializer per maximal run of code points for which
   PREDICATE is true, and a definition of the macro PREDICATE(c) as
   EXPRESSION, between the includes of test-predicate-part1.h and
   test-predicate-part2.h.
   Exits with status 1 if the file cannot be written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  static const char *const preamble[] =
    {
      "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n",
      "/* Test the Unicode character type functions.\n",
      " Copyright (C) 2007 Free Software Foundation, Inc.\n",
      "\n",
      " This program is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU General Public License as published by\n",
      " the Free Software Foundation; either version 3 of the License, or\n",
      " (at your option) any later version.\n",
      "\n",
      " This program is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU General Public License\n",
      " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n",
      "\n",
      "#include \"test-predicate-part1.h\"\n",
      "\n"
    };
  FILE *stream;
  size_t line;
  size_t nranges = 0;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (line = 0; line < sizeof (preamble) / sizeof (preamble[0]); line++)
    fputs (preamble[line], stream);

  /* The ranges are separated by ",\n"; the last one is followed by a
     plain newline.  */
  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int first;

      if (!predicate (ch))
        continue;

      first = ch;
      while (ch + 1 < 0x110000 && predicate (ch + 1))
        ch++;

      if (nranges > 0)
        fputs (",\n", stream);
      fprintf (stream, " { 0x%04X, 0x%04X }", first, ch);
      nranges++;
    }
  if (nranges > 0)
    fputs ("\n", stream);

  fputs ("\n", stream);
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fputs ("#include \"test-predicate-part2.h\"\n", stream);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
685 /* Construction of sparse 3-level tables. */
686 #define TABLE predicate_table
687 #define xmalloc malloc
688 #define xrealloc realloc
689 #include "3levelbit.h"
691 /* Output a boolean property in a three-level bitmap. */
692 static void
693 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
695 FILE *stream;
696 unsigned int ch, i;
697 struct predicate_table t;
698 unsigned int level1_offset, level2_offset, level3_offset;
700 stream = fopen (filename, "w");
701 if (stream == NULL)
703 fprintf (stderr, "cannot open '%s' for writing\n", filename);
704 exit (1);
707 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
708 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
709 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
710 version);
712 t.p = 4; /* or: 5 */
713 t.q = 7; /* or: 6 */
714 predicate_table_init (&t);
716 for (ch = 0; ch < 0x110000; ch++)
717 if (predicate (ch))
718 predicate_table_add (&t, ch);
720 predicate_table_finalize (&t);
722 /* Offsets in t.result, in memory of this process. */
723 level1_offset =
724 5 * sizeof (uint32_t);
725 level2_offset =
726 5 * sizeof (uint32_t)
727 + t.level1_size * sizeof (uint32_t);
728 level3_offset =
729 5 * sizeof (uint32_t)
730 + t.level1_size * sizeof (uint32_t)
731 + (t.level2_size << t.q) * sizeof (uint32_t);
733 for (i = 0; i < 5; i++)
734 if (i != 1)
735 fprintf (stream, "#define header_%d %d\n", i,
736 ((uint32_t *) t.result)[i]);
738 fprintf (stream, "static const\n");
739 fprintf (stream, "struct\n");
740 fprintf (stream, " {\n");
741 fprintf (stream, " int header[1];\n");
742 fprintf (stream, " int level1[%zu];\n", t.level1_size);
743 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
744 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
745 fprintf (stream, " }\n");
746 fprintf (stream, "%s =\n", name);
747 fprintf (stream, "{\n");
748 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
749 fprintf (stream, " {");
750 if (t.level1_size > 1)
751 fprintf (stream, "\n ");
752 for (i = 0; i < t.level1_size; i++)
754 uint32_t offset;
755 if (i > 0 && (i % 1) == 0)
756 fprintf (stream, "\n ");
757 offset = ((uint32_t *) (t.result + level1_offset))[i];
758 if (offset == 0)
759 fprintf (stream, " %5d", -1);
760 else
761 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
762 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
763 if (i+1 < t.level1_size)
764 fprintf (stream, ",");
766 if (t.level1_size > 1)
767 fprintf (stream, "\n ");
768 fprintf (stream, " },\n");
769 fprintf (stream, " {");
770 if (t.level2_size << t.q > 1)
771 fprintf (stream, "\n ");
772 for (i = 0; i < t.level2_size << t.q; i++)
774 uint32_t offset;
775 if (i > 0 && (i % 1) == 0)
776 fprintf (stream, "\n ");
777 offset = ((uint32_t *) (t.result + level2_offset))[i];
778 if (offset == 0)
779 fprintf (stream, " %5d", -1);
780 else
781 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
782 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
783 if (i+1 < t.level2_size << t.q)
784 fprintf (stream, ",");
786 if (t.level2_size << t.q > 1)
787 fprintf (stream, "\n ");
788 fprintf (stream, " },\n");
789 fprintf (stream, " {");
790 if (t.level3_size << t.p > 4)
791 fprintf (stream, "\n ");
792 for (i = 0; i < t.level3_size << t.p; i++)
794 if (i > 0 && (i % 4) == 0)
795 fprintf (stream, "\n ");
796 fprintf (stream, " 0x%08X",
797 ((uint32_t *) (t.result + level3_offset))[i]);
798 if (i+1 < t.level3_size << t.p)
799 fprintf (stream, ",");
801 if (t.level3_size << t.p > 4)
802 fprintf (stream, "\n ");
803 fprintf (stream, " }\n");
804 fprintf (stream, "};\n");
806 if (ferror (stream) || fclose (stream))
808 fprintf (stderr, "error writing to '%s'\n", filename);
809 exit (1);
813 /* Output all categories. */
814 static void
815 output_categories (const char *version)
817 #define CATEGORY(C) \
818 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
819 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
820 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
821 CATEGORY (L)
822 CATEGORY (LC)
823 CATEGORY (Lu)
824 CATEGORY (Ll)
825 CATEGORY (Lt)
826 CATEGORY (Lm)
827 CATEGORY (Lo)
828 CATEGORY (M)
829 CATEGORY (Mn)
830 CATEGORY (Mc)
831 CATEGORY (Me)
832 CATEGORY (N)
833 CATEGORY (Nd)
834 CATEGORY (Nl)
835 CATEGORY (No)
836 CATEGORY (P)
837 CATEGORY (Pc)
838 CATEGORY (Pd)
839 CATEGORY (Ps)
840 CATEGORY (Pe)
841 CATEGORY (Pi)
842 CATEGORY (Pf)
843 CATEGORY (Po)
844 CATEGORY (S)
845 CATEGORY (Sm)
846 CATEGORY (Sc)
847 CATEGORY (Sk)
848 CATEGORY (So)
849 CATEGORY (Z)
850 CATEGORY (Zs)
851 CATEGORY (Zl)
852 CATEGORY (Zp)
853 CATEGORY (C)
854 CATEGORY (Cc)
855 CATEGORY (Cf)
856 CATEGORY (Cs)
857 CATEGORY (Co)
858 CATEGORY (Cn)
859 #undef CATEGORY
/* Bit masks for the general categories.  Each two-letter category owns
   one bit; each one-letter category, and LC, is the union of the bits of
   its members.  */
enum
{
  /* Letters.  */
  UC_CATEGORY_MASK_Lu = 1 << 0,
  UC_CATEGORY_MASK_Ll = 1 << 1,
  UC_CATEGORY_MASK_Lt = 1 << 2,
  UC_CATEGORY_MASK_Lm = 1 << 3,
  UC_CATEGORY_MASK_Lo = 1 << 4,
  UC_CATEGORY_MASK_LC = (UC_CATEGORY_MASK_Lu | UC_CATEGORY_MASK_Ll
                         | UC_CATEGORY_MASK_Lt),
  UC_CATEGORY_MASK_L  = (UC_CATEGORY_MASK_LC | UC_CATEGORY_MASK_Lm
                         | UC_CATEGORY_MASK_Lo),
  /* Marks.  */
  UC_CATEGORY_MASK_Mn = 1 << 5,
  UC_CATEGORY_MASK_Mc = 1 << 6,
  UC_CATEGORY_MASK_Me = 1 << 7,
  UC_CATEGORY_MASK_M  = (UC_CATEGORY_MASK_Mn | UC_CATEGORY_MASK_Mc
                         | UC_CATEGORY_MASK_Me),
  /* Numbers.  */
  UC_CATEGORY_MASK_Nd = 1 << 8,
  UC_CATEGORY_MASK_Nl = 1 << 9,
  UC_CATEGORY_MASK_No = 1 << 10,
  UC_CATEGORY_MASK_N  = (UC_CATEGORY_MASK_Nd | UC_CATEGORY_MASK_Nl
                         | UC_CATEGORY_MASK_No),
  /* Punctuation.  */
  UC_CATEGORY_MASK_Pc = 1 << 11,
  UC_CATEGORY_MASK_Pd = 1 << 12,
  UC_CATEGORY_MASK_Ps = 1 << 13,
  UC_CATEGORY_MASK_Pe = 1 << 14,
  UC_CATEGORY_MASK_Pi = 1 << 15,
  UC_CATEGORY_MASK_Pf = 1 << 16,
  UC_CATEGORY_MASK_Po = 1 << 17,
  UC_CATEGORY_MASK_P  = (UC_CATEGORY_MASK_Pc | UC_CATEGORY_MASK_Pd
                         | UC_CATEGORY_MASK_Ps | UC_CATEGORY_MASK_Pe
                         | UC_CATEGORY_MASK_Pi | UC_CATEGORY_MASK_Pf
                         | UC_CATEGORY_MASK_Po),
  /* Symbols.  */
  UC_CATEGORY_MASK_Sm = 1 << 18,
  UC_CATEGORY_MASK_Sc = 1 << 19,
  UC_CATEGORY_MASK_Sk = 1 << 20,
  UC_CATEGORY_MASK_So = 1 << 21,
  UC_CATEGORY_MASK_S  = (UC_CATEGORY_MASK_Sm | UC_CATEGORY_MASK_Sc
                         | UC_CATEGORY_MASK_Sk | UC_CATEGORY_MASK_So),
  /* Separators.  */
  UC_CATEGORY_MASK_Zs = 1 << 22,
  UC_CATEGORY_MASK_Zl = 1 << 23,
  UC_CATEGORY_MASK_Zp = 1 << 24,
  UC_CATEGORY_MASK_Z  = (UC_CATEGORY_MASK_Zs | UC_CATEGORY_MASK_Zl
                         | UC_CATEGORY_MASK_Zp),
  /* Others.  */
  UC_CATEGORY_MASK_Cc = 1 << 25,
  UC_CATEGORY_MASK_Cf = 1 << 26,
  UC_CATEGORY_MASK_Cs = 1 << 27,
  UC_CATEGORY_MASK_Co = 1 << 28,
  UC_CATEGORY_MASK_Cn = 1 << 29,
  UC_CATEGORY_MASK_C  = (UC_CATEGORY_MASK_Cc | UC_CATEGORY_MASK_Cf
                         | UC_CATEGORY_MASK_Cs | UC_CATEGORY_MASK_Co
                         | UC_CATEGORY_MASK_Cn)
};
904 static int
905 general_category_byname (const char *category_name)
907 if (category_name[0] != '\0'
908 && (category_name[1] == '\0' || category_name[2] == '\0'))
909 switch (category_name[0])
911 case 'L':
912 switch (category_name[1])
914 case '\0': return UC_CATEGORY_MASK_L;
915 case 'C': return UC_CATEGORY_MASK_LC;
916 case 'u': return UC_CATEGORY_MASK_Lu;
917 case 'l': return UC_CATEGORY_MASK_Ll;
918 case 't': return UC_CATEGORY_MASK_Lt;
919 case 'm': return UC_CATEGORY_MASK_Lm;
920 case 'o': return UC_CATEGORY_MASK_Lo;
922 break;
923 case 'M':
924 switch (category_name[1])
926 case '\0': return UC_CATEGORY_MASK_M;
927 case 'n': return UC_CATEGORY_MASK_Mn;
928 case 'c': return UC_CATEGORY_MASK_Mc;
929 case 'e': return UC_CATEGORY_MASK_Me;
931 break;
932 case 'N':
933 switch (category_name[1])
935 case '\0': return UC_CATEGORY_MASK_N;
936 case 'd': return UC_CATEGORY_MASK_Nd;
937 case 'l': return UC_CATEGORY_MASK_Nl;
938 case 'o': return UC_CATEGORY_MASK_No;
940 break;
941 case 'P':
942 switch (category_name[1])
944 case '\0': return UC_CATEGORY_MASK_P;
945 case 'c': return UC_CATEGORY_MASK_Pc;
946 case 'd': return UC_CATEGORY_MASK_Pd;
947 case 's': return UC_CATEGORY_MASK_Ps;
948 case 'e': return UC_CATEGORY_MASK_Pe;
949 case 'i': return UC_CATEGORY_MASK_Pi;
950 case 'f': return UC_CATEGORY_MASK_Pf;
951 case 'o': return UC_CATEGORY_MASK_Po;
953 break;
954 case 'S':
955 switch (category_name[1])
957 case '\0': return UC_CATEGORY_MASK_S;
958 case 'm': return UC_CATEGORY_MASK_Sm;
959 case 'c': return UC_CATEGORY_MASK_Sc;
960 case 'k': return UC_CATEGORY_MASK_Sk;
961 case 'o': return UC_CATEGORY_MASK_So;
963 break;
964 case 'Z':
965 switch (category_name[1])
967 case '\0': return UC_CATEGORY_MASK_Z;
968 case 's': return UC_CATEGORY_MASK_Zs;
969 case 'l': return UC_CATEGORY_MASK_Zl;
970 case 'p': return UC_CATEGORY_MASK_Zp;
972 break;
973 case 'C':
974 switch (category_name[1])
976 case '\0': return UC_CATEGORY_MASK_C;
977 case 'c': return UC_CATEGORY_MASK_Cc;
978 case 'f': return UC_CATEGORY_MASK_Cf;
979 case 's': return UC_CATEGORY_MASK_Cs;
980 case 'o': return UC_CATEGORY_MASK_Co;
981 case 'n': return UC_CATEGORY_MASK_Cn;
983 break;
985 /* Invalid category name. */
986 abort ();
989 /* Construction of sparse 3-level tables. */
990 #define TABLE category_table
991 #define ELEMENT uint8_t
992 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
993 #define xmalloc malloc
994 #define xrealloc realloc
995 #include "3level.h"
997 /* Output the per-character category table. */
998 static void
999 output_category (const char *filename, const char *version)
1001 FILE *stream;
1002 unsigned int ch, i;
1003 struct category_table t;
1004 unsigned int level1_offset, level2_offset, level3_offset;
1005 uint16_t *level3_packed;
1007 stream = fopen (filename, "w");
1008 if (stream == NULL)
1010 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1011 exit (1);
1014 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1015 fprintf (stream, "/* Categories of Unicode characters. */\n");
1016 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1017 version);
1019 t.p = 7;
1020 t.q = 9;
1021 category_table_init (&t);
1023 for (ch = 0; ch < 0x110000; ch++)
1025 int value;
1026 unsigned int log2_value;
1028 if (is_category_Cs (ch))
1029 value = UC_CATEGORY_MASK_Cs;
1030 else if (unicode_attributes[ch].name != NULL)
1031 value = general_category_byname (unicode_attributes[ch].category);
1032 else
1033 continue;
1035 /* Now value should contain exactly one bit. */
1036 assert (value != 0 && (value & (value - 1)) == 0);
1038 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1040 assert (log2_value <= 0x1f);
1042 category_table_add (&t, ch, log2_value);
1045 category_table_finalize (&t);
1047 /* Offsets in t.result, in memory of this process. */
1048 level1_offset =
1049 5 * sizeof (uint32_t);
1050 level2_offset =
1051 5 * sizeof (uint32_t)
1052 + t.level1_size * sizeof (uint32_t);
1053 level3_offset =
1054 5 * sizeof (uint32_t)
1055 + t.level1_size * sizeof (uint32_t)
1056 + (t.level2_size << t.q) * sizeof (uint32_t);
1058 for (i = 0; i < 5; i++)
1059 fprintf (stream, "#define category_header_%d %d\n", i,
1060 ((uint32_t *) t.result)[i]);
1061 fprintf (stream, "static const\n");
1062 fprintf (stream, "struct\n");
1063 fprintf (stream, " {\n");
1064 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1065 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1066 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1067 (1 << t.p) * 5 / 16);
1068 fprintf (stream, " }\n");
1069 fprintf (stream, "u_category =\n");
1070 fprintf (stream, "{\n");
1071 fprintf (stream, " {");
1072 if (t.level1_size > 8)
1073 fprintf (stream, "\n ");
1074 for (i = 0; i < t.level1_size; i++)
1076 uint32_t offset;
1077 if (i > 0 && (i % 8) == 0)
1078 fprintf (stream, "\n ");
1079 offset = ((uint32_t *) (t.result + level1_offset))[i];
1080 if (offset == 0)
1081 fprintf (stream, " %5d", -1);
1082 else
1083 fprintf (stream, " %5zu",
1084 (offset - level2_offset) / sizeof (uint32_t));
1085 if (i+1 < t.level1_size)
1086 fprintf (stream, ",");
1088 if (t.level1_size > 8)
1089 fprintf (stream, "\n ");
1090 fprintf (stream, " },\n");
1091 fprintf (stream, " {");
1092 if (t.level2_size << t.q > 8)
1093 fprintf (stream, "\n ");
1094 for (i = 0; i < t.level2_size << t.q; i++)
1096 uint32_t offset;
1097 if (i > 0 && (i % 8) == 0)
1098 fprintf (stream, "\n ");
1099 offset = ((uint32_t *) (t.result + level2_offset))[i];
1100 if (offset == 0)
1101 fprintf (stream, " %5d", -1);
1102 else
1103 fprintf (stream, " %5zu",
1104 (offset - level3_offset) / sizeof (uint8_t));
1105 if (i+1 < t.level2_size << t.q)
1106 fprintf (stream, ",");
1108 if (t.level2_size << t.q > 8)
1109 fprintf (stream, "\n ");
1110 fprintf (stream, " },\n");
1111 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1112 not 32-bit units, in order to make the lookup function easier. */
1113 level3_packed =
1114 (uint16_t *)
1115 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1116 for (i = 0; i < t.level3_size << t.p; i++)
1118 unsigned int j = (i * 5) / 16;
1119 unsigned int k = (i * 5) % 16;
1120 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1121 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1122 level3_packed[j] = value & 0xffff;
1123 level3_packed[j+1] = value >> 16;
1125 fprintf (stream, " {");
1126 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1127 fprintf (stream, "\n ");
1128 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1130 if (i > 0 && (i % 8) == 0)
1131 fprintf (stream, "\n ");
1132 fprintf (stream, " 0x%04x", level3_packed[i]);
1133 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1134 fprintf (stream, ",");
1136 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1137 fprintf (stream, "\n ");
1138 fprintf (stream, " }\n");
1139 free (level3_packed);
1140 fprintf (stream, "};\n");
1142 if (ferror (stream) || fclose (stream))
1144 fprintf (stderr, "error writing to '%s'\n", filename);
1145 exit (1);
1149 /* ========================================================================= */
1151 /* Canonical combining class. */
1152 /* See Unicode 3.0 book, section 4.2,
1153 UCD.html. */
1155 /* Construction of sparse 3-level tables. */
1156 #define TABLE combclass_table
1157 #define ELEMENT uint8_t
1158 #define DEFAULT 0
1159 #define xmalloc malloc
1160 #define xrealloc realloc
1161 #include "3level.h"
1163 /* Output the per-character combining class table. */
1164 static void
1165 output_combclass (const char *filename, const char *version)
1167 FILE *stream;
1168 unsigned int ch, i;
1169 struct combclass_table t;
1170 unsigned int level1_offset, level2_offset, level3_offset;
1172 stream = fopen (filename, "w");
1173 if (stream == NULL)
1175 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1176 exit (1);
1179 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1180 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1181 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1182 version);
1184 t.p = 7;
1185 t.q = 9;
1186 combclass_table_init (&t);
1188 for (ch = 0; ch < 0x110000; ch++)
1189 if (unicode_attributes[ch].name != NULL)
1191 int value = atoi (unicode_attributes[ch].combining);
1192 assert (value >= 0 && value <= 255);
1193 combclass_table_add (&t, ch, value);
1196 combclass_table_finalize (&t);
1198 /* Offsets in t.result, in memory of this process. */
1199 level1_offset =
1200 5 * sizeof (uint32_t);
1201 level2_offset =
1202 5 * sizeof (uint32_t)
1203 + t.level1_size * sizeof (uint32_t);
1204 level3_offset =
1205 5 * sizeof (uint32_t)
1206 + t.level1_size * sizeof (uint32_t)
1207 + (t.level2_size << t.q) * sizeof (uint32_t);
1209 for (i = 0; i < 5; i++)
1210 fprintf (stream, "#define combclass_header_%d %d\n", i,
1211 ((uint32_t *) t.result)[i]);
1212 fprintf (stream, "static const\n");
1213 fprintf (stream, "struct\n");
1214 fprintf (stream, " {\n");
1215 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1216 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1217 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1218 fprintf (stream, " }\n");
1219 fprintf (stream, "u_combclass =\n");
1220 fprintf (stream, "{\n");
1221 fprintf (stream, " {");
1222 if (t.level1_size > 8)
1223 fprintf (stream, "\n ");
1224 for (i = 0; i < t.level1_size; i++)
1226 uint32_t offset;
1227 if (i > 0 && (i % 8) == 0)
1228 fprintf (stream, "\n ");
1229 offset = ((uint32_t *) (t.result + level1_offset))[i];
1230 if (offset == 0)
1231 fprintf (stream, " %5d", -1);
1232 else
1233 fprintf (stream, " %5zu",
1234 (offset - level2_offset) / sizeof (uint32_t));
1235 if (i+1 < t.level1_size)
1236 fprintf (stream, ",");
1238 if (t.level1_size > 8)
1239 fprintf (stream, "\n ");
1240 fprintf (stream, " },\n");
1241 fprintf (stream, " {");
1242 if (t.level2_size << t.q > 8)
1243 fprintf (stream, "\n ");
1244 for (i = 0; i < t.level2_size << t.q; i++)
1246 uint32_t offset;
1247 if (i > 0 && (i % 8) == 0)
1248 fprintf (stream, "\n ");
1249 offset = ((uint32_t *) (t.result + level2_offset))[i];
1250 if (offset == 0)
1251 fprintf (stream, " %5d", -1);
1252 else
1253 fprintf (stream, " %5zu",
1254 (offset - level3_offset) / sizeof (uint8_t));
1255 if (i+1 < t.level2_size << t.q)
1256 fprintf (stream, ",");
1258 if (t.level2_size << t.q > 8)
1259 fprintf (stream, "\n ");
1260 fprintf (stream, " },\n");
1261 fprintf (stream, " {");
1262 if (t.level3_size << t.p > 8)
1263 fprintf (stream, "\n ");
1264 for (i = 0; i < t.level3_size << t.p; i++)
1266 if (i > 0 && (i % 8) == 0)
1267 fprintf (stream, "\n ");
1268 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1269 if (i+1 < t.level3_size << t.p)
1270 fprintf (stream, ",");
1272 if (t.level3_size << t.p > 8)
1273 fprintf (stream, "\n ");
1274 fprintf (stream, " }\n");
1275 fprintf (stream, "};\n");
1277 if (ferror (stream) || fclose (stream))
1279 fprintf (stderr, "error writing to '%s'\n", filename);
1280 exit (1);
1284 /* ========================================================================= */
1286 /* Bidirectional category. */
1287 /* See Unicode 3.0 book, section 4.3,
1288 UCD.html. */
/* Bidi categories.  The enumerators are numbered consecutively from 0 in
   the order listed here.  */
enum
{
  UC_BIDI_L,   /* Left-to-Right */
  UC_BIDI_LRE, /* Left-to-Right Embedding */
  UC_BIDI_LRO, /* Left-to-Right Override */
  UC_BIDI_R,   /* Right-to-Left */
  UC_BIDI_AL,  /* Right-to-Left Arabic */
  UC_BIDI_RLE, /* Right-to-Left Embedding */
  UC_BIDI_RLO, /* Right-to-Left Override */
  UC_BIDI_PDF, /* Pop Directional Format */
  UC_BIDI_EN,  /* European Number */
  UC_BIDI_ES,  /* European Number Separator */
  UC_BIDI_ET,  /* European Number Terminator */
  UC_BIDI_AN,  /* Arabic Number */
  UC_BIDI_CS,  /* Common Number Separator */
  UC_BIDI_NSM, /* Non-Spacing Mark */
  UC_BIDI_BN,  /* Boundary Neutral */
  UC_BIDI_B,   /* Paragraph Separator */
  UC_BIDI_S,   /* Segment Separator */
  UC_BIDI_WS,  /* Whitespace */
  UC_BIDI_ON,  /* Other Neutral */
  UC_BIDI_LRI, /* Left-to-Right Isolate */
  UC_BIDI_RLI, /* Right-to-Left Isolate */
  UC_BIDI_FSI, /* First Strong Isolate */
  UC_BIDI_PDI  /* Pop Directional Isolate */
};

/* Return the UC_BIDI_* value whose abbreviation is exactly CATEGORY_NAME
   (case-sensitive, no surrounding whitespace).
   Calls abort () if CATEGORY_NAME is not one of the known abbreviations.  */
static int
bidi_category_byname (const char *category_name)
{
  static const struct
    {
      const char *abbrev;
      int category;
    }
  known[] =
    {
      { "AL",  UC_BIDI_AL },
      { "AN",  UC_BIDI_AN },
      { "B",   UC_BIDI_B },
      { "BN",  UC_BIDI_BN },
      { "CS",  UC_BIDI_CS },
      { "EN",  UC_BIDI_EN },
      { "ES",  UC_BIDI_ES },
      { "ET",  UC_BIDI_ET },
      { "FSI", UC_BIDI_FSI },
      { "L",   UC_BIDI_L },
      { "LRE", UC_BIDI_LRE },
      { "LRO", UC_BIDI_LRO },
      { "LRI", UC_BIDI_LRI },
      { "NSM", UC_BIDI_NSM },
      { "ON",  UC_BIDI_ON },
      { "PDF", UC_BIDI_PDF },
      { "PDI", UC_BIDI_PDI },
      { "R",   UC_BIDI_R },
      { "RLE", UC_BIDI_RLE },
      { "RLO", UC_BIDI_RLO },
      { "RLI", UC_BIDI_RLI },
      { "S",   UC_BIDI_S },
      { "WS",  UC_BIDI_WS }
    };
  size_t k;

  for (k = 0; k < sizeof (known) / sizeof (known[0]); k++)
    if (strcmp (category_name, known[k].abbrev) == 0)
      return known[k].category;

  /* Invalid bidi category name.  */
  abort ();
}
1492 static int
1493 get_bidi_category (unsigned int ch)
1495 if (unicode_attributes[ch].name != NULL)
1496 return bidi_category_byname (unicode_attributes[ch].bidi);
1497 else
1499 /* The bidi category of unassigned characters depends on the range.
1500 See UTR #9 and DerivedBidiClass.txt. */
1501 if ((ch >= 0x0590 && ch <= 0x05FF)
1502 || (ch >= 0x07FB && ch <= 0x08FF)
1503 || (ch >= 0xFB37 && ch <= 0xFB45)
1504 || (ch >= 0x10800 && ch <= 0x10FFF))
1505 return UC_BIDI_R;
1506 else if ((ch >= 0x0600 && ch <= 0x07BF)
1507 || (ch >= 0x2064 && ch <= 0x2069)
1508 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1509 || (ch >= 0xFDFE && ch <= 0xFEFE))
1510 return UC_BIDI_AL;
1511 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1512 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1513 || (ch & 0xFFFF) == 0xFFFE
1514 || (ch & 0xFFFF) == 0xFFFF
1515 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1516 return UC_BIDI_BN;
1517 else
1518 return UC_BIDI_L;
1522 /* Construction of sparse 3-level tables. */
1523 #define TABLE bidi_category_table
1524 #define ELEMENT uint8_t
1525 #define DEFAULT UC_BIDI_L
1526 #define xmalloc malloc
1527 #define xrealloc realloc
1528 #include "3level.h"
1530 /* Output the per-character bidi category table. */
1531 static void
1532 output_bidi_category (const char *filename, const char *version)
1534 FILE *stream;
1535 unsigned int ch, i;
1536 struct bidi_category_table t;
1537 unsigned int level1_offset, level2_offset, level3_offset;
1538 uint16_t *level3_packed;
1540 stream = fopen (filename, "w");
1541 if (stream == NULL)
1543 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1544 exit (1);
1547 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1548 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1549 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1550 version);
1552 t.p = 7;
1553 t.q = 9;
1554 bidi_category_table_init (&t);
1556 for (ch = 0; ch < 0x110000; ch++)
1558 int value = get_bidi_category (ch);
1560 assert (value <= 0x1f);
1562 bidi_category_table_add (&t, ch, value);
1565 bidi_category_table_finalize (&t);
1567 /* Offsets in t.result, in memory of this process. */
1568 level1_offset =
1569 5 * sizeof (uint32_t);
1570 level2_offset =
1571 5 * sizeof (uint32_t)
1572 + t.level1_size * sizeof (uint32_t);
1573 level3_offset =
1574 5 * sizeof (uint32_t)
1575 + t.level1_size * sizeof (uint32_t)
1576 + (t.level2_size << t.q) * sizeof (uint32_t);
1578 for (i = 0; i < 5; i++)
1579 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1580 ((uint32_t *) t.result)[i]);
1581 fprintf (stream, "static const\n");
1582 fprintf (stream, "struct\n");
1583 fprintf (stream, " {\n");
1584 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1585 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1586 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1587 (1 << t.p) * 5 / 16);
1588 fprintf (stream, " }\n");
1589 fprintf (stream, "u_bidi_category =\n");
1590 fprintf (stream, "{\n");
1591 fprintf (stream, " {");
1592 if (t.level1_size > 8)
1593 fprintf (stream, "\n ");
1594 for (i = 0; i < t.level1_size; i++)
1596 uint32_t offset;
1597 if (i > 0 && (i % 8) == 0)
1598 fprintf (stream, "\n ");
1599 offset = ((uint32_t *) (t.result + level1_offset))[i];
1600 if (offset == 0)
1601 fprintf (stream, " %5d", -1);
1602 else
1603 fprintf (stream, " %5zu",
1604 (offset - level2_offset) / sizeof (uint32_t));
1605 if (i+1 < t.level1_size)
1606 fprintf (stream, ",");
1608 if (t.level1_size > 8)
1609 fprintf (stream, "\n ");
1610 fprintf (stream, " },\n");
1611 fprintf (stream, " {");
1612 if (t.level2_size << t.q > 8)
1613 fprintf (stream, "\n ");
1614 for (i = 0; i < t.level2_size << t.q; i++)
1616 uint32_t offset;
1617 if (i > 0 && (i % 8) == 0)
1618 fprintf (stream, "\n ");
1619 offset = ((uint32_t *) (t.result + level2_offset))[i];
1620 if (offset == 0)
1621 fprintf (stream, " %5d", -1);
1622 else
1623 fprintf (stream, " %5zu",
1624 (offset - level3_offset) / sizeof (uint8_t));
1625 if (i+1 < t.level2_size << t.q)
1626 fprintf (stream, ",");
1628 if (t.level2_size << t.q > 8)
1629 fprintf (stream, "\n ");
1630 fprintf (stream, " },\n");
1631 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1632 not 32-bit units, in order to make the lookup function easier. */
1633 level3_packed =
1634 (uint16_t *)
1635 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1636 for (i = 0; i < t.level3_size << t.p; i++)
1638 unsigned int j = (i * 5) / 16;
1639 unsigned int k = (i * 5) % 16;
1640 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1641 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1642 level3_packed[j] = value & 0xffff;
1643 level3_packed[j+1] = value >> 16;
1645 fprintf (stream, " {");
1646 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1647 fprintf (stream, "\n ");
1648 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1650 if (i > 0 && (i % 8) == 0)
1651 fprintf (stream, "\n ");
1652 fprintf (stream, " 0x%04x", level3_packed[i]);
1653 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1654 fprintf (stream, ",");
1656 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1657 fprintf (stream, "\n ");
1658 fprintf (stream, " }\n");
1659 free (level3_packed);
1660 fprintf (stream, "};\n");
1662 if (ferror (stream) || fclose (stream))
1664 fprintf (stderr, "error writing to '%s'\n", filename);
1665 exit (1);
1669 /* ========================================================================= */
1671 /* Decimal digit value. */
1672 /* See Unicode 3.0 book, section 4.6. */
1674 static int
1675 get_decdigit_value (unsigned int ch)
1677 if (unicode_attributes[ch].name != NULL
1678 && unicode_attributes[ch].decdigit[0] != '\0')
1679 return atoi (unicode_attributes[ch].decdigit);
1680 return -1;
1683 /* Construction of sparse 3-level tables. */
1684 #define TABLE decdigit_table
1685 #define ELEMENT uint8_t
1686 #define DEFAULT 0
1687 #define xmalloc malloc
1688 #define xrealloc realloc
1689 #include "3level.h"
/* Write the unit test data for the decimal digit value table to FILENAME:
   one "{ code point, value }" entry for every code point that has a decimal
   digit value, separated by ",\n".
   VERSION is copied into the header comment of the generated file.
   Exits with status 1 if FILENAME cannot be opened or written.  */
static void
output_decimal_digit_test (const char *filename, const char *version)
{
  FILE *out;
  unsigned int uc;
  unsigned int emitted = 0;

  out = fopen (filename, "w");
  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
  fputs ("/* Decimal digit values of Unicode characters. */\n", out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);

  for (uc = 0; uc < 0x110000; uc++)
    {
      int value = get_decdigit_value (uc);

      assert (value >= -1 && value < 10);

      if (value < 0)
        continue;

      /* Separate entries; no separator before the first one.  */
      if (emitted > 0)
        fputs (",\n", out);
      fprintf (out, " { 0x%04X, %d }", uc, value);
      emitted++;
    }
  /* Terminate the last entry's line, if there was any entry at all.  */
  if (emitted > 0)
    fputs ("\n", out);

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1736 /* Output the per-character decimal digit value table. */
1737 static void
1738 output_decimal_digit (const char *filename, const char *version)
1740 FILE *stream;
1741 unsigned int ch, i;
1742 struct decdigit_table t;
1743 unsigned int level1_offset, level2_offset, level3_offset;
1745 stream = fopen (filename, "w");
1746 if (stream == NULL)
1748 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1749 exit (1);
1752 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1753 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1754 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1755 version);
1757 t.p = 7;
1758 t.q = 9;
1759 decdigit_table_init (&t);
1761 for (ch = 0; ch < 0x110000; ch++)
1763 int value = 1 + get_decdigit_value (ch);
1765 assert (value >= 0 && value <= 10);
1767 decdigit_table_add (&t, ch, value);
1770 decdigit_table_finalize (&t);
1772 /* Offsets in t.result, in memory of this process. */
1773 level1_offset =
1774 5 * sizeof (uint32_t);
1775 level2_offset =
1776 5 * sizeof (uint32_t)
1777 + t.level1_size * sizeof (uint32_t);
1778 level3_offset =
1779 5 * sizeof (uint32_t)
1780 + t.level1_size * sizeof (uint32_t)
1781 + (t.level2_size << t.q) * sizeof (uint32_t);
1783 for (i = 0; i < 5; i++)
1784 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1785 ((uint32_t *) t.result)[i]);
1786 fprintf (stream, "static const\n");
1787 fprintf (stream, "struct\n");
1788 fprintf (stream, " {\n");
1789 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1790 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1791 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1792 t.p - 1);
1793 fprintf (stream, " }\n");
1794 fprintf (stream, "u_decdigit =\n");
1795 fprintf (stream, "{\n");
1796 fprintf (stream, " {");
1797 if (t.level1_size > 8)
1798 fprintf (stream, "\n ");
1799 for (i = 0; i < t.level1_size; i++)
1801 uint32_t offset;
1802 if (i > 0 && (i % 8) == 0)
1803 fprintf (stream, "\n ");
1804 offset = ((uint32_t *) (t.result + level1_offset))[i];
1805 if (offset == 0)
1806 fprintf (stream, " %5d", -1);
1807 else
1808 fprintf (stream, " %5zu",
1809 (offset - level2_offset) / sizeof (uint32_t));
1810 if (i+1 < t.level1_size)
1811 fprintf (stream, ",");
1813 if (t.level1_size > 8)
1814 fprintf (stream, "\n ");
1815 fprintf (stream, " },\n");
1816 fprintf (stream, " {");
1817 if (t.level2_size << t.q > 8)
1818 fprintf (stream, "\n ");
1819 for (i = 0; i < t.level2_size << t.q; i++)
1821 uint32_t offset;
1822 if (i > 0 && (i % 8) == 0)
1823 fprintf (stream, "\n ");
1824 offset = ((uint32_t *) (t.result + level2_offset))[i];
1825 if (offset == 0)
1826 fprintf (stream, " %5d", -1);
1827 else
1828 fprintf (stream, " %5zu",
1829 (offset - level3_offset) / sizeof (uint8_t));
1830 if (i+1 < t.level2_size << t.q)
1831 fprintf (stream, ",");
1833 if (t.level2_size << t.q > 8)
1834 fprintf (stream, "\n ");
1835 fprintf (stream, " },\n");
1836 /* Pack the level3 array. Each entry needs 4 bits only. */
1837 fprintf (stream, " {");
1838 if (t.level3_size << (t.p - 1) > 8)
1839 fprintf (stream, "\n ");
1840 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1842 if (i > 0 && (i % 8) == 0)
1843 fprintf (stream, "\n ");
1844 fprintf (stream, " 0x%02x",
1845 ((uint8_t *) (t.result + level3_offset))[2*i]
1846 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1847 if (i+1 < t.level3_size << (t.p - 1))
1848 fprintf (stream, ",");
1850 if (t.level3_size << (t.p - 1) > 8)
1851 fprintf (stream, "\n ");
1852 fprintf (stream, " }\n");
1853 fprintf (stream, "};\n");
1855 if (ferror (stream) || fclose (stream))
1857 fprintf (stderr, "error writing to '%s'\n", filename);
1858 exit (1);
1862 /* ========================================================================= */
1864 /* Digit value. */
1865 /* See Unicode 3.0 book, section 4.6. */
1867 static int
1868 get_digit_value (unsigned int ch)
1870 if (unicode_attributes[ch].name != NULL
1871 && unicode_attributes[ch].digit[0] != '\0')
1872 return atoi (unicode_attributes[ch].digit);
1873 return -1;
/* Write the unit test data for the digit value table to FILENAME:
   one "{ code point, value }" entry for every code point that has a digit
   value, separated by ",\n".
   VERSION is copied into the header comment of the generated file.
   Exits with status 1 if FILENAME cannot be opened or written.  */
static void
output_digit_test (const char *filename, const char *version)
{
  FILE *out;
  unsigned int uc;
  unsigned int emitted = 0;

  out = fopen (filename, "w");
  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
  fputs ("/* Digit values of Unicode characters. */\n", out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);

  for (uc = 0; uc < 0x110000; uc++)
    {
      int value = get_digit_value (uc);

      assert (value >= -1 && value < 10);

      if (value < 0)
        continue;

      /* Separate entries; no separator before the first one.  */
      if (emitted > 0)
        fputs (",\n", out);
      fprintf (out, " { 0x%04X, %d }", uc, value);
      emitted++;
    }
  /* Terminate the last entry's line, if there was any entry at all.  */
  if (emitted > 0)
    fputs ("\n", out);

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1921 /* Output the per-character digit value table. */
1922 static void
1923 output_digit (const char *filename, const char *version)
1925 FILE *stream;
1926 unsigned int ch, i;
1927 struct decdigit_table t;
1928 unsigned int level1_offset, level2_offset, level3_offset;
1930 stream = fopen (filename, "w");
1931 if (stream == NULL)
1933 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1934 exit (1);
1937 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1938 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1939 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1940 version);
1942 t.p = 7;
1943 t.q = 9;
1944 decdigit_table_init (&t);
1946 for (ch = 0; ch < 0x110000; ch++)
1948 int value = 1 + get_digit_value (ch);
1950 assert (value >= 0 && value <= 10);
1952 decdigit_table_add (&t, ch, value);
1955 decdigit_table_finalize (&t);
1957 /* Offsets in t.result, in memory of this process. */
1958 level1_offset =
1959 5 * sizeof (uint32_t);
1960 level2_offset =
1961 5 * sizeof (uint32_t)
1962 + t.level1_size * sizeof (uint32_t);
1963 level3_offset =
1964 5 * sizeof (uint32_t)
1965 + t.level1_size * sizeof (uint32_t)
1966 + (t.level2_size << t.q) * sizeof (uint32_t);
1968 for (i = 0; i < 5; i++)
1969 fprintf (stream, "#define digit_header_%d %d\n", i,
1970 ((uint32_t *) t.result)[i]);
1971 fprintf (stream, "static const\n");
1972 fprintf (stream, "struct\n");
1973 fprintf (stream, " {\n");
1974 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1975 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1976 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1977 t.p - 1);
1978 fprintf (stream, " }\n");
1979 fprintf (stream, "u_digit =\n");
1980 fprintf (stream, "{\n");
1981 fprintf (stream, " {");
1982 if (t.level1_size > 8)
1983 fprintf (stream, "\n ");
1984 for (i = 0; i < t.level1_size; i++)
1986 uint32_t offset;
1987 if (i > 0 && (i % 8) == 0)
1988 fprintf (stream, "\n ");
1989 offset = ((uint32_t *) (t.result + level1_offset))[i];
1990 if (offset == 0)
1991 fprintf (stream, " %5d", -1);
1992 else
1993 fprintf (stream, " %5zu",
1994 (offset - level2_offset) / sizeof (uint32_t));
1995 if (i+1 < t.level1_size)
1996 fprintf (stream, ",");
1998 if (t.level1_size > 8)
1999 fprintf (stream, "\n ");
2000 fprintf (stream, " },\n");
2001 fprintf (stream, " {");
2002 if (t.level2_size << t.q > 8)
2003 fprintf (stream, "\n ");
2004 for (i = 0; i < t.level2_size << t.q; i++)
2006 uint32_t offset;
2007 if (i > 0 && (i % 8) == 0)
2008 fprintf (stream, "\n ");
2009 offset = ((uint32_t *) (t.result + level2_offset))[i];
2010 if (offset == 0)
2011 fprintf (stream, " %5d", -1);
2012 else
2013 fprintf (stream, " %5zu",
2014 (offset - level3_offset) / sizeof (uint8_t));
2015 if (i+1 < t.level2_size << t.q)
2016 fprintf (stream, ",");
2018 if (t.level2_size << t.q > 8)
2019 fprintf (stream, "\n ");
2020 fprintf (stream, " },\n");
2021 /* Pack the level3 array. Each entry needs 4 bits only. */
2022 fprintf (stream, " {");
2023 if (t.level3_size << (t.p - 1) > 8)
2024 fprintf (stream, "\n ");
2025 for (i = 0; i < t.level3_size << (t.p - 1); i++)
2027 if (i > 0 && (i % 8) == 0)
2028 fprintf (stream, "\n ");
2029 fprintf (stream, " 0x%02x",
2030 ((uint8_t *) (t.result + level3_offset))[2*i]
2031 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
2032 if (i+1 < t.level3_size << (t.p - 1))
2033 fprintf (stream, ",");
2035 if (t.level3_size << (t.p - 1) > 8)
2036 fprintf (stream, "\n ");
2037 fprintf (stream, " }\n");
2038 fprintf (stream, "};\n");
2040 if (ferror (stream) || fclose (stream))
2042 fprintf (stderr, "error writing to '%s'\n", filename);
2043 exit (1);
2047 /* ========================================================================= */
2049 /* Numeric value. */
2050 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as a fraction.  In this file the pair
   { 0, 0 } is used to mean "no numeric value".  */
typedef struct
{
  int numerator;   /* integer before the '/' (or the whole value) */
  int denominator; /* integer after the '/', or 1 if there is none */
}
uc_fraction_t;
2054 static uc_fraction_t
2055 get_numeric_value (unsigned int ch)
2057 uc_fraction_t value;
2059 if (unicode_attributes[ch].name != NULL
2060 && unicode_attributes[ch].numeric[0] != '\0')
2062 const char *str = unicode_attributes[ch].numeric;
2063 /* str is of the form "integer" or "integer/posinteger". */
2064 value.numerator = atoi (str);
2065 if (strchr (str, '/') != NULL)
2066 value.denominator = atoi (strchr (str, '/') + 1);
2067 else
2068 value.denominator = 1;
2070 else
2072 value.numerator = 0;
2073 value.denominator = 0;
2075 return value;
2078 /* Output the unit test for the per-character numeric value table. */
2079 static void
2080 output_numeric_test (const char *filename, const char *version)
2082 FILE *stream;
2083 bool need_comma;
2084 unsigned int ch;
2086 stream = fopen (filename, "w");
2087 if (stream == NULL)
2089 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2090 exit (1);
2093 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2094 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2095 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2096 version);
2098 need_comma = false;
2099 for (ch = 0; ch < 0x110000; ch++)
2101 uc_fraction_t value = get_numeric_value (ch);
2103 if (value.numerator != 0 || value.denominator != 0)
2105 if (need_comma)
2106 fprintf (stream, ",\n");
2107 fprintf (stream, " { 0x%04X, %d, %d }",
2108 ch, value.numerator, value.denominator);
2109 need_comma = true;
2112 if (need_comma)
2113 fprintf (stream, "\n");
2115 if (ferror (stream) || fclose (stream))
2117 fprintf (stderr, "error writing to '%s'\n", filename);
2118 exit (1);
2122 /* Construction of sparse 3-level tables. */
2123 #define TABLE numeric_table
2124 #define ELEMENT uint8_t
2125 #define DEFAULT 0
2126 #define xmalloc malloc
2127 #define xrealloc realloc
2128 #include "3level.h"
2130 /* Output the per-character numeric value table. */
2131 static void
2132 output_numeric (const char *filename, const char *version)
2134 FILE *stream;
2135 uc_fraction_t fractions[160];
2136 unsigned int nfractions;
2137 unsigned int ch, i, j;
2138 struct numeric_table t;
2139 unsigned int level1_offset, level2_offset, level3_offset;
2140 uint16_t *level3_packed;
2142 stream = fopen (filename, "w");
2143 if (stream == NULL)
2145 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2146 exit (1);
2149 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2150 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2151 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2152 version);
2154 /* Create table of occurring fractions. */
2155 nfractions = 0;
2156 for (ch = 0; ch < 0x110000; ch++)
2158 uc_fraction_t value = get_numeric_value (ch);
2160 for (i = 0; i < nfractions; i++)
2161 if (value.numerator == fractions[i].numerator
2162 && value.denominator == fractions[i].denominator)
2163 break;
2164 if (i == nfractions)
2166 assert (nfractions != SIZEOF (fractions));
2167 for (i = 0; i < nfractions; i++)
2168 if (value.denominator < fractions[i].denominator
2169 || (value.denominator == fractions[i].denominator
2170 && value.numerator < fractions[i].numerator))
2171 break;
2172 for (j = nfractions; j > i; j--)
2173 fractions[j] = fractions[j - 1];
2174 fractions[i] = value;
2175 nfractions++;
2179 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2180 nfractions);
2181 fprintf (stream, "{\n");
2182 for (i = 0; i < nfractions; i++)
2184 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2185 fractions[i].denominator);
2186 if (i+1 < nfractions)
2187 fprintf (stream, ",");
2188 fprintf (stream, "\n");
2190 fprintf (stream, "};\n");
2192 t.p = 7;
2193 t.q = 9;
2194 numeric_table_init (&t);
2196 for (ch = 0; ch < 0x110000; ch++)
2198 uc_fraction_t value = get_numeric_value (ch);
2200 for (i = 0; i < nfractions; i++)
2201 if (value.numerator == fractions[i].numerator
2202 && value.denominator == fractions[i].denominator)
2203 break;
2204 assert (i != nfractions);
2206 numeric_table_add (&t, ch, i);
2209 numeric_table_finalize (&t);
2211 /* Offsets in t.result, in memory of this process. */
2212 level1_offset =
2213 5 * sizeof (uint32_t);
2214 level2_offset =
2215 5 * sizeof (uint32_t)
2216 + t.level1_size * sizeof (uint32_t);
2217 level3_offset =
2218 5 * sizeof (uint32_t)
2219 + t.level1_size * sizeof (uint32_t)
2220 + (t.level2_size << t.q) * sizeof (uint32_t);
2222 for (i = 0; i < 5; i++)
2223 fprintf (stream, "#define numeric_header_%d %d\n", i,
2224 ((uint32_t *) t.result)[i]);
2225 fprintf (stream, "static const\n");
2226 fprintf (stream, "struct\n");
2227 fprintf (stream, " {\n");
2228 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2229 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2230 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2231 (1 << t.p) * 8 / 16);
2232 fprintf (stream, " }\n");
2233 fprintf (stream, "u_numeric =\n");
2234 fprintf (stream, "{\n");
2235 fprintf (stream, " {");
2236 if (t.level1_size > 8)
2237 fprintf (stream, "\n ");
2238 for (i = 0; i < t.level1_size; i++)
2240 uint32_t offset;
2241 if (i > 0 && (i % 8) == 0)
2242 fprintf (stream, "\n ");
2243 offset = ((uint32_t *) (t.result + level1_offset))[i];
2244 if (offset == 0)
2245 fprintf (stream, " %5d", -1);
2246 else
2247 fprintf (stream, " %5zu",
2248 (offset - level2_offset) / sizeof (uint32_t));
2249 if (i+1 < t.level1_size)
2250 fprintf (stream, ",");
2252 if (t.level1_size > 8)
2253 fprintf (stream, "\n ");
2254 fprintf (stream, " },\n");
2255 fprintf (stream, " {");
2256 if (t.level2_size << t.q > 8)
2257 fprintf (stream, "\n ");
2258 for (i = 0; i < t.level2_size << t.q; i++)
2260 uint32_t offset;
2261 if (i > 0 && (i % 8) == 0)
2262 fprintf (stream, "\n ");
2263 offset = ((uint32_t *) (t.result + level2_offset))[i];
2264 if (offset == 0)
2265 fprintf (stream, " %5d", -1);
2266 else
2267 fprintf (stream, " %5zu",
2268 (offset - level3_offset) / sizeof (uint8_t));
2269 if (i+1 < t.level2_size << t.q)
2270 fprintf (stream, ",");
2272 if (t.level2_size << t.q > 8)
2273 fprintf (stream, "\n ");
2274 fprintf (stream, " },\n");
2275 /* Pack the level3 array. Each entry needs 8 bits only. Use 16-bit units,
2276 not 32-bit units, in order to make the lookup function easier. */
2277 level3_packed =
2278 (uint16_t *)
2279 calloc ((t.level3_size << t.p) * 8 / 16 + 1, sizeof (uint16_t));
2280 for (i = 0; i < t.level3_size << t.p; i++)
2282 unsigned int j = (i * 8) / 16;
2283 unsigned int k = (i * 8) % 16;
2284 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2285 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2286 level3_packed[j] = value & 0xffff;
2287 level3_packed[j+1] = value >> 16;
2289 fprintf (stream, " {");
2290 if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2291 fprintf (stream, "\n ");
2292 for (i = 0; i < (t.level3_size << t.p) * 8 / 16 + 1; i++)
2294 if (i > 0 && (i % 8) == 0)
2295 fprintf (stream, "\n ");
2296 fprintf (stream, " 0x%04x", level3_packed[i]);
2297 if (i+1 < (t.level3_size << t.p) * 8 / 16 + 1)
2298 fprintf (stream, ",");
2300 if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2301 fprintf (stream, "\n ");
2302 fprintf (stream, " }\n");
2303 free (level3_packed);
2304 fprintf (stream, "};\n");
2306 if (ferror (stream) || fclose (stream))
2308 fprintf (stderr, "error writing to '%s'\n", filename);
2309 exit (1);
2313 /* ========================================================================= */
2315 /* Mirrored. */
2316 /* See Unicode 3.0 book, section 4.7,
2317 UAX #9. */
/* List of mirrored character pairs.  This is a subset of the characters
   having the BidiMirrored property.
   Each row holds two code points; get_mirror_value maps either member of a
   row to the other one.  */
static unsigned int mirror_pairs[][2] =
{
  { 0x0028, 0x0029 },
  { 0x003C, 0x003E },
  { 0x005B, 0x005D },
  { 0x007B, 0x007D },
  { 0x00AB, 0x00BB },
  { 0x2039, 0x203A },
  { 0x2045, 0x2046 },
  { 0x207D, 0x207E },
  { 0x208D, 0x208E },
  { 0x2208, 0x220B },
  { 0x220A, 0x220D },
  { 0x223C, 0x223D },
  { 0x2243, 0x22CD },
  { 0x2252, 0x2253 },
  { 0x2254, 0x2255 },
  { 0x2264, 0x2265 },
  { 0x2266, 0x2267 },
  { 0x226A, 0x226B },
  { 0x2276, 0x2277 },
  { 0x2278, 0x2279 },
  { 0x227A, 0x227B },
  { 0x227C, 0x227D },
  { 0x2282, 0x2283 },
  { 0x2286, 0x2287 },
  { 0x228F, 0x2290 },
  { 0x2291, 0x2292 },
  { 0x22A2, 0x22A3 },
  { 0x22B0, 0x22B1 },
  { 0x22B2, 0x22B3 },
  { 0x22B4, 0x22B5 },
  { 0x22B6, 0x22B7 },
  { 0x22C9, 0x22CA },
  { 0x22CB, 0x22CC },
  { 0x22D0, 0x22D1 },
  { 0x22D6, 0x22D7 },
  { 0x22D8, 0x22D9 },
  { 0x22DA, 0x22DB },
  { 0x22DC, 0x22DD },
  { 0x22DE, 0x22DF },
  { 0x22F0, 0x22F1 },
  { 0x2308, 0x2309 },
  { 0x230A, 0x230B },
  { 0x2329, 0x232A },
  { 0x3008, 0x3009 },
  { 0x300A, 0x300B },
  { 0x300C, 0x300D },
  { 0x300E, 0x300F },
  { 0x3010, 0x3011 },
  { 0x3014, 0x3015 },
  { 0x3016, 0x3017 },
  { 0x3018, 0x3019 },
  { 0x301A, 0x301B }
};
2377 static int
2378 get_mirror_value (unsigned int ch)
2380 bool mirrored;
2381 unsigned int mirror_char;
2382 unsigned int i;
2384 mirrored = (unicode_attributes[ch].name != NULL
2385 && unicode_attributes[ch].mirrored);
2386 mirror_char = 0xfffd;
2387 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2388 if (ch == mirror_pairs[i][0])
2390 mirror_char = mirror_pairs[i][1];
2391 break;
2393 else if (ch == mirror_pairs[i][1])
2395 mirror_char = mirror_pairs[i][0];
2396 break;
2398 if (mirrored)
2399 return (int) mirror_char - (int) ch;
2400 else
2402 assert (mirror_char == 0xfffd);
2403 return 0;
2407 /* Construction of sparse 3-level tables. */
2408 #define TABLE mirror_table
2409 #define ELEMENT int32_t
2410 #define DEFAULT 0
2411 #define xmalloc malloc
2412 #define xrealloc realloc
2413 #include "3level.h"
2415 /* Output the per-character mirror table. */
2416 static void
2417 output_mirror (const char *filename, const char *version)
2419 FILE *stream;
2420 unsigned int ch, i;
2421 struct mirror_table t;
2422 unsigned int level1_offset, level2_offset, level3_offset;
2424 stream = fopen (filename, "w");
2425 if (stream == NULL)
2427 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2428 exit (1);
2431 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2432 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2433 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2434 version);
2436 t.p = 7;
2437 t.q = 9;
2438 mirror_table_init (&t);
2440 for (ch = 0; ch < 0x110000; ch++)
2442 int value = get_mirror_value (ch);
2444 mirror_table_add (&t, ch, value);
2447 mirror_table_finalize (&t);
2449 /* Offsets in t.result, in memory of this process. */
2450 level1_offset =
2451 5 * sizeof (uint32_t);
2452 level2_offset =
2453 5 * sizeof (uint32_t)
2454 + t.level1_size * sizeof (uint32_t);
2455 level3_offset =
2456 5 * sizeof (uint32_t)
2457 + t.level1_size * sizeof (uint32_t)
2458 + (t.level2_size << t.q) * sizeof (uint32_t);
2460 for (i = 0; i < 5; i++)
2461 fprintf (stream, "#define mirror_header_%d %d\n", i,
2462 ((uint32_t *) t.result)[i]);
2463 fprintf (stream, "static const\n");
2464 fprintf (stream, "struct\n");
2465 fprintf (stream, " {\n");
2466 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2467 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2468 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2469 fprintf (stream, " }\n");
2470 fprintf (stream, "u_mirror =\n");
2471 fprintf (stream, "{\n");
2472 fprintf (stream, " {");
2473 if (t.level1_size > 8)
2474 fprintf (stream, "\n ");
2475 for (i = 0; i < t.level1_size; i++)
2477 uint32_t offset;
2478 if (i > 0 && (i % 8) == 0)
2479 fprintf (stream, "\n ");
2480 offset = ((uint32_t *) (t.result + level1_offset))[i];
2481 if (offset == 0)
2482 fprintf (stream, " %5d", -1);
2483 else
2484 fprintf (stream, " %5zu",
2485 (offset - level2_offset) / sizeof (uint32_t));
2486 if (i+1 < t.level1_size)
2487 fprintf (stream, ",");
2489 if (t.level1_size > 8)
2490 fprintf (stream, "\n ");
2491 fprintf (stream, " },\n");
2492 fprintf (stream, " {");
2493 if (t.level2_size << t.q > 8)
2494 fprintf (stream, "\n ");
2495 for (i = 0; i < t.level2_size << t.q; i++)
2497 uint32_t offset;
2498 if (i > 0 && (i % 8) == 0)
2499 fprintf (stream, "\n ");
2500 offset = ((uint32_t *) (t.result + level2_offset))[i];
2501 if (offset == 0)
2502 fprintf (stream, " %5d", -1);
2503 else
2504 fprintf (stream, " %5zu",
2505 (offset - level3_offset) / sizeof (int32_t));
2506 if (i+1 < t.level2_size << t.q)
2507 fprintf (stream, ",");
2509 if (t.level2_size << t.q > 8)
2510 fprintf (stream, "\n ");
2511 fprintf (stream, " },\n");
2512 fprintf (stream, " {");
2513 if (t.level3_size << t.p > 8)
2514 fprintf (stream, "\n ");
2515 for (i = 0; i < t.level3_size << t.p; i++)
2517 if (i > 0 && (i % 8) == 0)
2518 fprintf (stream, "\n ");
2519 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2520 if (i+1 < t.level3_size << t.p)
2521 fprintf (stream, ",");
2523 if (t.level3_size << t.p > 8)
2524 fprintf (stream, "\n ");
2525 fprintf (stream, " }\n");
2526 fprintf (stream, "};\n");
2528 if (ferror (stream) || fclose (stream))
2530 fprintf (stderr, "error writing to '%s'\n", filename);
2531 exit (1);
2535 /* ========================================================================= */
2537 /* Particular values of the word break property. */
/* Return true if CH is one of the code points listed here as having the
   word break property value MidNumLet.  */
static bool
is_WBP_MIDNUMLET (unsigned int ch)
{
  return (ch == 0x002E || ch == 0x2018 || ch == 0x2019
          || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E);
}
/* Return true if CH is one of the code points listed here as having the
   word break property value MidLetter.  */
static bool
is_WBP_MIDLETTER (unsigned int ch)
{
  return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
          || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A
          || ch == 0x02D7);
}
2554 /* ========================================================================= */
2556 /* Properties. */
/* Reading PropList.txt and DerivedCoreProperties.txt.  */
/* Each enumerator is the bit position of one property inside an element of
   unicode_properties[].  There are more than 32 of them, which is why the
   bit masks are built with 1ULL.  */
enum
{
  /* PropList.txt */
  PROP_WHITE_SPACE,
  PROP_BIDI_CONTROL,
  PROP_JOIN_CONTROL,
  PROP_DASH,
  PROP_HYPHEN,
  PROP_QUOTATION_MARK,
  PROP_TERMINAL_PUNCTUATION,
  PROP_OTHER_MATH,
  PROP_HEX_DIGIT,
  PROP_ASCII_HEX_DIGIT,
  PROP_OTHER_ALPHABETIC,
  PROP_IDEOGRAPHIC,
  PROP_DIACRITIC,
  PROP_EXTENDER,
  PROP_OTHER_LOWERCASE,
  PROP_OTHER_UPPERCASE,
  PROP_NONCHARACTER_CODE_POINT,
  PROP_OTHER_GRAPHEME_EXTEND,
  PROP_IDS_BINARY_OPERATOR,
  PROP_IDS_TRINARY_OPERATOR,
  PROP_RADICAL,
  PROP_UNIFIED_IDEOGRAPH,
  PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_DEPRECATED,
  PROP_SOFT_DOTTED,
  PROP_LOGICAL_ORDER_EXCEPTION,
  PROP_OTHER_ID_START,
  PROP_OTHER_ID_CONTINUE,
  PROP_STERM,
  PROP_VARIATION_SELECTOR,
  PROP_PATTERN_WHITE_SPACE,
  PROP_PATTERN_SYNTAX,
  /* DerivedCoreProperties.txt */
  PROP_MATH,
  PROP_ALPHABETIC,
  PROP_LOWERCASE,
  PROP_UPPERCASE,
  PROP_CASED,
  PROP_CASE_IGNORABLE,
  PROP_CHANGES_WHEN_LOWERCASED,
  PROP_CHANGES_WHEN_UPPERCASED,
  PROP_CHANGES_WHEN_TITLECASED,
  PROP_CHANGES_WHEN_CASEFOLDED,
  PROP_CHANGES_WHEN_CASEMAPPED,
  PROP_ID_START,
  PROP_ID_CONTINUE,
  PROP_XID_START,
  PROP_XID_CONTINUE,
  PROP_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_GRAPHEME_EXTEND,
  PROP_GRAPHEME_BASE,
  PROP_GRAPHEME_LINK
};
/* Property bit set of every code point: bit N of unicode_properties[ch] is
   set when CH has the property whose PROP_* enumerator has value N.  */
unsigned long long unicode_properties[0x110000];

/* Reset the property bit set of every code point to empty.  */
static void
clear_properties (void)
{
  unsigned int i;

  for (i = 0; i < 0x110000; i++)
    unicode_properties[i] = 0;
}
2626 /* Stores in unicode_properties[] the properties from the
2627 PropList.txt or DerivedCoreProperties.txt file. */
2628 static void
2629 fill_properties (const char *proplist_filename)
2631 unsigned int i;
2632 FILE *stream;
2634 stream = fopen (proplist_filename, "r");
2635 if (stream == NULL)
2637 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2638 exit (1);
2641 for (;;)
2643 char buf[200+1];
2644 unsigned int i1, i2;
2645 char padding[200+1];
2646 char propname[200+1];
2647 unsigned int propvalue;
2649 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2650 break;
2652 if (buf[0] == '\0' || buf[0] == '#')
2653 continue;
2655 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2657 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2659 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
2660 exit (1);
2662 i2 = i1;
2664 #define PROP(name,value) \
2665 if (strcmp (propname, name) == 0) propvalue = value; else
2666 /* PropList.txt */
2667 PROP ("White_Space", PROP_WHITE_SPACE)
2668 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2669 PROP ("Join_Control", PROP_JOIN_CONTROL)
2670 PROP ("Dash", PROP_DASH)
2671 PROP ("Hyphen", PROP_HYPHEN)
2672 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2673 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2674 PROP ("Other_Math", PROP_OTHER_MATH)
2675 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2676 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2677 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2678 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2679 PROP ("Diacritic", PROP_DIACRITIC)
2680 PROP ("Extender", PROP_EXTENDER)
2681 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2682 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2683 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2684 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2685 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2686 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2687 PROP ("Radical", PROP_RADICAL)
2688 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2689 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2690 PROP ("Deprecated", PROP_DEPRECATED)
2691 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2692 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2693 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2694 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2695 PROP ("STerm", PROP_STERM)
2696 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2697 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2698 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2699 /* DerivedCoreProperties.txt */
2700 PROP ("Math", PROP_MATH)
2701 PROP ("Alphabetic", PROP_ALPHABETIC)
2702 PROP ("Lowercase", PROP_LOWERCASE)
2703 PROP ("Uppercase", PROP_UPPERCASE)
2704 PROP ("Cased", PROP_CASED)
2705 PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
2706 PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
2707 PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
2708 PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
2709 PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
2710 PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
2711 PROP ("ID_Start", PROP_ID_START)
2712 PROP ("ID_Continue", PROP_ID_CONTINUE)
2713 PROP ("XID_Start", PROP_XID_START)
2714 PROP ("XID_Continue", PROP_XID_CONTINUE)
2715 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2716 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2717 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2718 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2719 #undef PROP
2721 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
2722 proplist_filename);
2723 exit (1);
2725 assert (i1 <= i2 && i2 < 0x110000);
2727 for (i = i1; i <= i2; i++)
2728 unicode_properties[i] |= 1ULL << propvalue;
2731 if (ferror (stream) || fclose (stream))
2733 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2734 exit (1);
/* Stores in array the given property from the Unicode 3.0 PropList.txt
   file.
   ARRAY is first cleared.  The file is then scanned for the first line
   containing PROPERTY_NAME; the lines after it, up to end of file or a line
   starting with '*', must begin with "XXXX" or "XXXX..YYYY" (4 hex digits
   each) and cause the corresponding elements of ARRAY to be set to 1.
   Exits with status 1 if the file cannot be opened or read, if
   PROPERTY_NAME is not found, or if a line cannot be parsed.  */
static void
fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
{
  unsigned int i;
  FILE *stream;
  char buf[100+1];

  for (i = 0; i < 0x110000; i++)
    array[i] = 0;

  stream = fopen (proplist_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the "Property dump for: ..." line.  */
  do
    {
      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        {
          fprintf (stderr, "no property found in '%s'\n", proplist_filename);
          exit (1);
        }
    }
  while (strstr (buf, property_name) == NULL);

  for (;;)
    {
      unsigned int i1, i2;

      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        break;
      /* A line starting with '*' ends the current property dump.  */
      if (buf[0] == '*')
        break;
      if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
        {
          if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
            {
              fprintf (stderr, "parse error in property in '%s'\n",
                       proplist_filename);
              exit (1);
            }
        }
      else if (strlen (buf) >= 4)
        {
          if (sscanf (buf, "%4X", &i1) < 1)
            {
              fprintf (stderr, "parse error in property in '%s'\n",
                       proplist_filename);
              exit (1);
            }
          i2 = i1;
        }
      else
        {
          fprintf (stderr, "parse error in property in '%s'\n",
                   proplist_filename);
          exit (1);
        }
      /* %4X limits both values to 0xFFFF, so only the ordering can fail.  */
      assert (i1 <= i2 && i2 < 0x110000);
      for (i = i1; i <= i2; i++)
        array[i] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
/* Properties from Unicode 3.0 PropList.txt file.  */

/* The paired punctuation property from the PropList.txt file.  */
char unicode_pairedpunctuation[0x110000];

/* The left of pair property from the PropList.txt file.  */
char unicode_leftofpair[0x110000];

/* Fill unicode_pairedpunctuation[] and unicode_leftofpair[] from the
   sections of PROPLIST30_FILENAME whose heading contains
   "(Paired Punctuation)" and "(Left of Pair)" respectively.  */
static void
fill_properties30 (const char *proplist30_filename)
{
  fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
  fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
}
2828 /* ------------------------------------------------------------------------- */
2830 /* See PropList.txt, UCD.html. */
2831 static bool
2832 is_property_white_space (unsigned int ch)
2834 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2837 /* See Unicode 3.0 book, section 4.10,
2838 PropList.txt, UCD.html,
2839 DerivedCoreProperties.txt, UCD.html. */
2840 static bool
2841 is_property_alphabetic (unsigned int ch)
2843 bool result1 =
2844 is_category_L (ch)
2845 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2846 /* For some reason, the following are listed as having property
2847 Alphabetic but not as having property Other_Alphabetic. */
2848 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2849 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2850 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2851 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2852 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2853 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2854 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2855 || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
2856 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2857 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2858 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2859 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2860 || (ch >= 0x12400 && ch <= 0x1246E); /* CUNEIFORM NUMERIC SIGNS */
2861 bool result2 =
2862 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2864 assert (result1 == result2);
2865 return result1;
2868 /* See PropList.txt, UCD.html. */
2869 static bool
2870 is_property_other_alphabetic (unsigned int ch)
2872 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2875 /* See PropList.txt, UCD.html. */
2876 static bool
2877 is_property_not_a_character (unsigned int ch)
2879 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2882 /* See PropList.txt, UCD.html,
2883 DerivedCoreProperties.txt, UCD.html. */
2884 static bool
2885 is_property_default_ignorable_code_point (unsigned int ch)
2887 bool result1 =
2888 (is_category_Cf (ch)
2889 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2890 && !((ch >= 0x0600 && ch <= 0x0605) || ch == 0x06DD || ch == 0x070F)
2891 /* For some reason, the following are not listed as having property
2892 Default_Ignorable_Code_Point. */
2893 && !(ch == 0x110BD))
2894 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2895 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2896 bool result2 =
2897 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2899 assert (result1 == result2);
2900 return result1;
2903 /* See PropList.txt, UCD.html. */
2904 static bool
2905 is_property_other_default_ignorable_code_point (unsigned int ch)
2907 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2910 /* See PropList.txt, UCD.html. */
2911 static bool
2912 is_property_deprecated (unsigned int ch)
2914 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2917 /* See PropList.txt, UCD.html. */
2918 static bool
2919 is_property_logical_order_exception (unsigned int ch)
2921 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2924 /* See PropList.txt, UCD.html. */
2925 static bool
2926 is_property_variation_selector (unsigned int ch)
2928 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* See PropList-3.0.1.txt.  */
/* Return true if CH lies in one of the three private use ranges.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  return (ch >= 0xE000 && ch <= 0xF8FF)
         || (ch >= 0xF0000 && ch <= 0xFFFFD)
         || (ch >= 0x100000 && ch <= 0x10FFFD);
}
/* See PropList-3.0.1.txt.  */
/* Return true if CH has category Cn and is not a noncharacter.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  return (is_category_Cn (ch) && !is_property_not_a_character (ch));
}
2948 /* See PropList.txt, UCD.html,
2949 DerivedCoreProperties.txt, UCD.html. */
2950 static bool
2951 is_property_uppercase (unsigned int ch)
2953 bool result1 =
2954 is_category_Lu (ch)
2955 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2956 bool result2 =
2957 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2959 assert (result1 == result2);
2960 return result1;
2963 /* See PropList.txt, UCD.html. */
2964 static bool
2965 is_property_other_uppercase (unsigned int ch)
2967 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2970 /* See PropList.txt, UCD.html,
2971 DerivedCoreProperties.txt, UCD.html. */
2972 static bool
2973 is_property_lowercase (unsigned int ch)
2975 bool result1 =
2976 is_category_Ll (ch)
2977 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2978 bool result2 =
2979 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2981 assert (result1 == result2);
2982 return result1;
2985 /* See PropList.txt, UCD.html. */
2986 static bool
2987 is_property_other_lowercase (unsigned int ch)
2989 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* See PropList-3.0.1.txt.  */
/* Return true if CH has category Lt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  return is_category_Lt (ch);
}
2999 /* See DerivedCoreProperties.txt. */
3000 static bool
3001 is_property_cased (unsigned int ch)
3003 bool result1 = (is_property_lowercase (ch)
3004 || is_property_uppercase (ch)
3005 || is_category_Lt (ch));
3006 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
3008 assert (result1 == result2);
3009 return result1;
3012 /* See DerivedCoreProperties.txt. */
3013 static bool
3014 is_property_case_ignorable (unsigned int ch)
3016 bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
3017 || ch == 0x0027
3018 || is_category_Mn (ch)
3019 || is_category_Me (ch)
3020 || is_category_Cf (ch)
3021 || is_category_Lm (ch)
3022 || is_category_Sk (ch));
3023 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
3025 assert (result1 == result2);
3026 return result1;
3029 /* See DerivedCoreProperties.txt. */
3030 static bool
3031 is_property_changes_when_lowercased (unsigned int ch)
3033 bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
3034 bool result2 = (unicode_attributes[ch].name != NULL
3035 && unicode_attributes[ch].lower != NONE
3036 && unicode_attributes[ch].lower != ch);
3038 assert (result1 == result2);
3039 return result1;
3042 /* See DerivedCoreProperties.txt. */
3043 static bool
3044 is_property_changes_when_uppercased (unsigned int ch)
3046 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
3049 /* See DerivedCoreProperties.txt. */
3050 static bool
3051 is_property_changes_when_titlecased (unsigned int ch)
3053 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
3056 /* See DerivedCoreProperties.txt. */
3057 static bool
3058 is_property_changes_when_casefolded (unsigned int ch)
3060 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
3063 /* See DerivedCoreProperties.txt. */
3064 static bool
3065 is_property_changes_when_casemapped (unsigned int ch)
3067 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
3070 /* See PropList.txt, UCD.html. */
3071 static bool
3072 is_property_soft_dotted (unsigned int ch)
3074 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
3077 /* See DerivedCoreProperties.txt, UCD.html. */
3078 static bool
3079 is_property_id_start (unsigned int ch)
3081 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
3084 /* See PropList.txt, UCD.html. */
3085 static bool
3086 is_property_other_id_start (unsigned int ch)
3088 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
3091 /* See DerivedCoreProperties.txt, UCD.html. */
3092 static bool
3093 is_property_id_continue (unsigned int ch)
3095 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
3098 /* See PropList.txt, UCD.html. */
3099 static bool
3100 is_property_other_id_continue (unsigned int ch)
3102 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
3105 /* See DerivedCoreProperties.txt, UCD.html. */
3106 static bool
3107 is_property_xid_start (unsigned int ch)
3109 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
3112 /* See DerivedCoreProperties.txt, UCD.html. */
3113 static bool
3114 is_property_xid_continue (unsigned int ch)
3116 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
3119 /* See PropList.txt, UCD.html. */
3120 static bool
3121 is_property_pattern_white_space (unsigned int ch)
3123 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
3126 /* See PropList.txt, UCD.html. */
3127 static bool
3128 is_property_pattern_syntax (unsigned int ch)
3130 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
3133 /* See PropList.txt, UCD.html. */
3134 static bool
3135 is_property_join_control (unsigned int ch)
3137 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
3140 /* See DerivedCoreProperties.txt, UCD.html. */
3141 static bool
3142 is_property_grapheme_base (unsigned int ch)
3144 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3147 /* See DerivedCoreProperties.txt, UCD.html. */
3148 static bool
3149 is_property_grapheme_extend (unsigned int ch)
3151 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3154 /* See PropList.txt, UCD.html. */
3155 static bool
3156 is_property_other_grapheme_extend (unsigned int ch)
3158 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3161 /* See DerivedCoreProperties.txt, UCD.html. */
3162 static bool
3163 is_property_grapheme_link (unsigned int ch)
3165 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3168 /* See PropList.txt, UCD.html. */
3169 static bool
3170 is_property_bidi_control (unsigned int ch)
3172 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3175 /* See PropList-3.0.1.txt. */
3176 static bool
3177 is_property_bidi_left_to_right (unsigned int ch)
3179 return (get_bidi_category (ch) == UC_BIDI_L);
3182 /* See PropList-3.0.1.txt. */
3183 static bool
3184 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3186 return (get_bidi_category (ch) == UC_BIDI_R);
3189 /* See PropList-3.0.1.txt. */
3190 static bool
3191 is_property_bidi_arabic_right_to_left (unsigned int ch)
3193 return (get_bidi_category (ch) == UC_BIDI_AL);
3196 /* See PropList-3.0.1.txt. */
3197 static bool
3198 is_property_bidi_european_digit (unsigned int ch)
3200 return (get_bidi_category (ch) == UC_BIDI_EN);
3203 /* See PropList-3.0.1.txt. */
3204 static bool
3205 is_property_bidi_eur_num_separator (unsigned int ch)
3207 return (get_bidi_category (ch) == UC_BIDI_ES);
3210 /* See PropList-3.0.1.txt. */
3211 static bool
3212 is_property_bidi_eur_num_terminator (unsigned int ch)
3214 return (get_bidi_category (ch) == UC_BIDI_ET);
3217 /* See PropList-3.0.1.txt. */
3218 static bool
3219 is_property_bidi_arabic_digit (unsigned int ch)
3221 return (get_bidi_category (ch) == UC_BIDI_AN);
3224 /* See PropList-3.0.1.txt. */
3225 static bool
3226 is_property_bidi_common_separator (unsigned int ch)
3228 return (get_bidi_category (ch) == UC_BIDI_CS);
3231 /* See PropList-3.0.1.txt. */
3232 static bool
3233 is_property_bidi_block_separator (unsigned int ch)
3235 return (get_bidi_category (ch) == UC_BIDI_B);
3238 /* See PropList-3.0.1.txt. */
3239 static bool
3240 is_property_bidi_segment_separator (unsigned int ch)
3242 return (get_bidi_category (ch) == UC_BIDI_S);
3245 /* See PropList-3.0.1.txt. */
3246 static bool
3247 is_property_bidi_whitespace (unsigned int ch)
3249 return (get_bidi_category (ch) == UC_BIDI_WS);
3252 /* See PropList-3.0.1.txt. */
3253 static bool
3254 is_property_bidi_non_spacing_mark (unsigned int ch)
3256 return (get_bidi_category (ch) == UC_BIDI_NSM);
3259 /* See PropList-3.0.1.txt. */
3260 static bool
3261 is_property_bidi_boundary_neutral (unsigned int ch)
3263 return (get_bidi_category (ch) == UC_BIDI_BN);
3266 /* See PropList-3.0.1.txt. */
3267 static bool
3268 is_property_bidi_pdf (unsigned int ch)
3270 return (get_bidi_category (ch) == UC_BIDI_PDF);
3273 /* See PropList-3.0.1.txt. */
3274 static bool
3275 is_property_bidi_embedding_or_override (unsigned int ch)
3277 int category = get_bidi_category (ch);
3278 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3279 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3282 /* See PropList-3.0.1.txt. */
3283 static bool
3284 is_property_bidi_other_neutral (unsigned int ch)
3286 return (get_bidi_category (ch) == UC_BIDI_ON);
3289 /* See PropList.txt, UCD.html. */
3290 static bool
3291 is_property_hex_digit (unsigned int ch)
3293 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3296 /* See PropList.txt, UCD.html. */
3297 static bool
3298 is_property_ascii_hex_digit (unsigned int ch)
3300 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3303 /* See Unicode 3.0 book, section 4.10,
3304 PropList.txt, UCD.html. */
3305 static bool
3306 is_property_ideographic (unsigned int ch)
3308 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3311 /* See PropList.txt, UCD.html. */
3312 static bool
3313 is_property_unified_ideograph (unsigned int ch)
3315 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3318 /* See PropList.txt, UCD.html. */
3319 static bool
3320 is_property_radical (unsigned int ch)
3322 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3325 /* See PropList.txt, UCD.html. */
3326 static bool
3327 is_property_ids_binary_operator (unsigned int ch)
3329 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3332 /* See PropList.txt, UCD.html. */
3333 static bool
3334 is_property_ids_trinary_operator (unsigned int ch)
3336 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3339 /* See PropList-3.0.1.txt. */
3340 static bool
3341 is_property_zero_width (unsigned int ch)
3343 return is_category_Cf (ch)
3344 || (unicode_attributes[ch].name != NULL
3345 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* See PropList-3.0.1.txt.  */
/* Return true if CH has category Zs.  */
static bool
is_property_space (unsigned int ch)
{
  return is_category_Zs (ch);
}
/* See PropList-3.0.1.txt.  */
/* Return true if CH is one of the non-breaking characters listed below.  */
static bool
is_property_non_break (unsigned int ch)
{
  /* This is exactly the set of characters having line breaking
     property GL.  */
  return (ch == 0x00A0 /* NO-BREAK SPACE */
          || ch == 0x034F /* COMBINING GRAPHEME JOINER */
          || ch == 0x035C /* COMBINING DOUBLE BREVE BELOW */
          || ch == 0x035D /* COMBINING DOUBLE BREVE */
          || ch == 0x035E /* COMBINING DOUBLE MACRON */
          || ch == 0x035F /* COMBINING DOUBLE MACRON BELOW */
          || ch == 0x0360 /* COMBINING DOUBLE TILDE */
          || ch == 0x0361 /* COMBINING DOUBLE INVERTED BREVE */
          || ch == 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
          || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
          || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
          || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
          || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
          || ch == 0x2007 /* FIGURE SPACE */
          || ch == 0x2011 /* NON-BREAKING HYPHEN */
          || ch == 0x202F /* NARROW NO-BREAK SPACE */);
}
3379 /* See PropList-3.0.1.txt. */
3380 static bool
3381 is_property_iso_control (unsigned int ch)
3383 bool result1 =
3384 (unicode_attributes[ch].name != NULL
3385 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3386 bool result2 =
3387 is_category_Cc (ch);
3389 assert (result1 == result2);
3390 return result1;
3393 /* See PropList-3.0.1.txt. */
3394 static bool
3395 is_property_format_control (unsigned int ch)
3397 return (is_category_Cf (ch)
3398 && get_bidi_category (ch) == UC_BIDI_BN
3399 && !is_property_join_control (ch)
3400 && ch != 0xFEFF);
3403 /* See PropList.txt, UCD.html. */
3404 static bool
3405 is_property_dash (unsigned int ch)
3407 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3410 /* See PropList.txt, UCD.html. */
3411 static bool
3412 is_property_hyphen (unsigned int ch)
3414 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* See PropList-3.0.1.txt.
   Thin wrapper: the result is whatever is_category_P reports for CH. */
static bool
is_property_punctuation (unsigned int ch)
{
  bool in_category_P = is_category_P (ch);

  return in_category_P;
}
/* See PropList-3.0.1.txt.
   Thin wrapper: the result is whatever is_category_Zl reports for CH. */
static bool
is_property_line_separator (unsigned int ch)
{
  bool in_category_Zl = is_category_Zl (ch);

  return in_category_Zl;
}
/* See PropList-3.0.1.txt.
   Thin wrapper: the result is whatever is_category_Zp reports for CH. */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  bool in_category_Zp = is_category_Zp (ch);

  return in_category_Zp;
}
3438 /* See PropList.txt, UCD.html. */
3439 static bool
3440 is_property_quotation_mark (unsigned int ch)
3442 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3445 /* See PropList.txt, UCD.html. */
3446 static bool
3447 is_property_sentence_terminal (unsigned int ch)
3449 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3452 /* See PropList.txt, UCD.html. */
3453 static bool
3454 is_property_terminal_punctuation (unsigned int ch)
3456 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* See PropList-3.0.1.txt.
   Thin wrapper: the result is whatever is_category_Sc reports for CH. */
static bool
is_property_currency_symbol (unsigned int ch)
{
  bool in_category_Sc = is_category_Sc (ch);

  return in_category_Sc;
}
3466 /* See Unicode 3.0 book, section 4.9,
3467 PropList.txt, UCD.html,
3468 DerivedCoreProperties.txt, UCD.html. */
3469 static bool
3470 is_property_math (unsigned int ch)
3472 bool result1 =
3473 is_category_Sm (ch)
3474 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3475 bool result2 =
3476 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3478 assert (result1 == result2);
3479 return result1;
3482 /* See PropList.txt, UCD.html. */
3483 static bool
3484 is_property_other_math (unsigned int ch)
3486 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3489 /* See PropList-3.0.1.txt. */
3490 static bool
3491 is_property_paired_punctuation (unsigned int ch)
3493 return unicode_pairedpunctuation[ch];
3496 /* See PropList-3.0.1.txt. */
3497 static bool
3498 is_property_left_of_pair (unsigned int ch)
3500 return unicode_leftofpair[ch];
3503 /* See PropList-3.0.1.txt. */
3504 static bool
3505 is_property_combining (unsigned int ch)
3507 return (unicode_attributes[ch].name != NULL
3508 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3509 || is_category_Mc (ch)
3510 || is_category_Me (ch)
3511 || is_category_Mn (ch)));
#if 0 /* same as is_property_bidi_non_spacing_mark */
/* See PropList-3.0.1.txt.
   Disabled: true when CH has a name in unicode_attributes[] and its bidi
   category is UC_BIDI_NSM. */
static bool
is_property_non_spacing (unsigned int ch)
{
  if (unicode_attributes[ch].name == NULL)
    return false;
  return get_bidi_category (ch) == UC_BIDI_NSM;
}
#endif
3524 /* See PropList-3.0.1.txt. */
3525 static bool
3526 is_property_composite (unsigned int ch)
3528 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3529 logical in some sense. */
3530 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3531 return true;
3532 if (unicode_attributes[ch].name != NULL
3533 && unicode_attributes[ch].decomposition != NULL)
3535 /* Test whether the decomposition contains more than one character,
3536 and the first is not a space. */
3537 const char *decomp = unicode_attributes[ch].decomposition;
3538 if (decomp[0] == '<')
3540 decomp = strchr (decomp, '>') + 1;
3541 if (decomp[0] == ' ')
3542 decomp++;
3544 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
3546 return false;
/* See PropList-3.0.1.txt.
   Thin wrapper: the result is whatever is_category_Nd reports for CH. */
static bool
is_property_decimal_digit (unsigned int ch)
{
  bool in_category_Nd = is_category_Nd (ch);

  return in_category_Nd;
}
3556 /* See PropList-3.0.1.txt. */
3557 static bool
3558 is_property_numeric (unsigned int ch)
3560 return ((get_numeric_value (ch)).denominator > 0)
3561 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3562 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3565 /* See PropList.txt, UCD.html. */
3566 static bool
3567 is_property_diacritic (unsigned int ch)
3569 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3572 /* See PropList.txt, UCD.html. */
3573 static bool
3574 is_property_extender (unsigned int ch)
3576 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3579 /* See PropList-3.0.1.txt. */
3580 static bool
3581 is_property_ignorable_control (unsigned int ch)
3583 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3584 || is_category_Cf (ch))
3585 && ch != 0x0000;
3588 /* ------------------------------------------------------------------------- */
3590 /* Output all properties. */
3591 static void
3592 output_properties (const char *version)
3594 #define PROPERTY(P) \
3595 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3596 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3597 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3598 PROPERTY(white_space)
3599 PROPERTY(alphabetic)
3600 PROPERTY(other_alphabetic)
3601 PROPERTY(not_a_character)
3602 PROPERTY(default_ignorable_code_point)
3603 PROPERTY(other_default_ignorable_code_point)
3604 PROPERTY(deprecated)
3605 PROPERTY(logical_order_exception)
3606 PROPERTY(variation_selector)
3607 PROPERTY(private_use)
3608 PROPERTY(unassigned_code_value)
3609 PROPERTY(uppercase)
3610 PROPERTY(other_uppercase)
3611 PROPERTY(lowercase)
3612 PROPERTY(other_lowercase)
3613 PROPERTY(titlecase)
3614 PROPERTY(cased)
3615 PROPERTY(case_ignorable)
3616 PROPERTY(changes_when_lowercased)
3617 PROPERTY(changes_when_uppercased)
3618 PROPERTY(changes_when_titlecased)
3619 PROPERTY(changes_when_casefolded)
3620 PROPERTY(changes_when_casemapped)
3621 PROPERTY(soft_dotted)
3622 PROPERTY(id_start)
3623 PROPERTY(other_id_start)
3624 PROPERTY(id_continue)
3625 PROPERTY(other_id_continue)
3626 PROPERTY(xid_start)
3627 PROPERTY(xid_continue)
3628 PROPERTY(pattern_white_space)
3629 PROPERTY(pattern_syntax)
3630 PROPERTY(join_control)
3631 PROPERTY(grapheme_base)
3632 PROPERTY(grapheme_extend)
3633 PROPERTY(other_grapheme_extend)
3634 PROPERTY(grapheme_link)
3635 PROPERTY(bidi_control)
3636 PROPERTY(bidi_left_to_right)
3637 PROPERTY(bidi_hebrew_right_to_left)
3638 PROPERTY(bidi_arabic_right_to_left)
3639 PROPERTY(bidi_european_digit)
3640 PROPERTY(bidi_eur_num_separator)
3641 PROPERTY(bidi_eur_num_terminator)
3642 PROPERTY(bidi_arabic_digit)
3643 PROPERTY(bidi_common_separator)
3644 PROPERTY(bidi_block_separator)
3645 PROPERTY(bidi_segment_separator)
3646 PROPERTY(bidi_whitespace)
3647 PROPERTY(bidi_non_spacing_mark)
3648 PROPERTY(bidi_boundary_neutral)
3649 PROPERTY(bidi_pdf)
3650 PROPERTY(bidi_embedding_or_override)
3651 PROPERTY(bidi_other_neutral)
3652 PROPERTY(hex_digit)
3653 PROPERTY(ascii_hex_digit)
3654 PROPERTY(ideographic)
3655 PROPERTY(unified_ideograph)
3656 PROPERTY(radical)
3657 PROPERTY(ids_binary_operator)
3658 PROPERTY(ids_trinary_operator)
3659 PROPERTY(zero_width)
3660 PROPERTY(space)
3661 PROPERTY(non_break)
3662 PROPERTY(iso_control)
3663 PROPERTY(format_control)
3664 PROPERTY(dash)
3665 PROPERTY(hyphen)
3666 PROPERTY(punctuation)
3667 PROPERTY(line_separator)
3668 PROPERTY(paragraph_separator)
3669 PROPERTY(quotation_mark)
3670 PROPERTY(sentence_terminal)
3671 PROPERTY(terminal_punctuation)
3672 PROPERTY(currency_symbol)
3673 PROPERTY(math)
3674 PROPERTY(other_math)
3675 PROPERTY(paired_punctuation)
3676 PROPERTY(left_of_pair)
3677 PROPERTY(combining)
3678 PROPERTY(composite)
3679 PROPERTY(decimal_digit)
3680 PROPERTY(numeric)
3681 PROPERTY(diacritic)
3682 PROPERTY(extender)
3683 PROPERTY(ignorable_control)
3684 #undef PROPERTY
3687 /* ========================================================================= */
3689 /* Arabic Shaping. */
/* Joining_Type values.  The numbering is spelled out; it is the same as
   the implicit numbering of an unadorned enum list. */
enum
{
  UC_JOINING_TYPE_U = 0, /* Non_Joining */
  UC_JOINING_TYPE_T = 1, /* Transparent */
  UC_JOINING_TYPE_C = 2, /* Join_Causing */
  UC_JOINING_TYPE_L = 3, /* Left_Joining */
  UC_JOINING_TYPE_R = 4, /* Right_Joining */
  UC_JOINING_TYPE_D = 5  /* Dual_Joining */
};

/* One joining type per code point in the range 0..0x10FFFF. */
static uint8_t unicode_joining_type[0x110000];
/* Joining_Group values.  Each comment gives the property value name the
   constant stands for.  The constants are numbered consecutively from 0
   (UC_JOINING_GROUP_NONE). */
enum
{
  UC_JOINING_GROUP_NONE, /* No_Joining_Group */
  UC_JOINING_GROUP_AIN, /* Ain */
  UC_JOINING_GROUP_ALAPH, /* Alaph */
  UC_JOINING_GROUP_ALEF, /* Alef */
  UC_JOINING_GROUP_BEH, /* Beh */
  UC_JOINING_GROUP_BETH, /* Beth */
  UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
  UC_JOINING_GROUP_DAL, /* Dal */
  UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
  UC_JOINING_GROUP_E, /* E */
  UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
  UC_JOINING_GROUP_FE, /* Fe */
  UC_JOINING_GROUP_FEH, /* Feh */
  UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
  UC_JOINING_GROUP_GAF, /* Gaf */
  UC_JOINING_GROUP_GAMAL, /* Gamal */
  UC_JOINING_GROUP_HAH, /* Hah */
  UC_JOINING_GROUP_HE, /* He */
  UC_JOINING_GROUP_HEH, /* Heh */
  UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
  UC_JOINING_GROUP_HETH, /* Heth */
  UC_JOINING_GROUP_KAF, /* Kaf */
  UC_JOINING_GROUP_KAPH, /* Kaph */
  UC_JOINING_GROUP_KHAPH, /* Khaph */
  UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
  UC_JOINING_GROUP_LAM, /* Lam */
  UC_JOINING_GROUP_LAMADH, /* Lamadh */
  UC_JOINING_GROUP_MEEM, /* Meem */
  UC_JOINING_GROUP_MIM, /* Mim */
  UC_JOINING_GROUP_NOON, /* Noon */
  UC_JOINING_GROUP_NUN, /* Nun */
  UC_JOINING_GROUP_NYA, /* Nya */
  UC_JOINING_GROUP_PE, /* Pe */
  UC_JOINING_GROUP_QAF, /* Qaf */
  UC_JOINING_GROUP_QAPH, /* Qaph */
  UC_JOINING_GROUP_REH, /* Reh */
  UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
  UC_JOINING_GROUP_SAD, /* Sad */
  UC_JOINING_GROUP_SADHE, /* Sadhe */
  UC_JOINING_GROUP_SEEN, /* Seen */
  UC_JOINING_GROUP_SEMKATH, /* Semkath */
  UC_JOINING_GROUP_SHIN, /* Shin */
  UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
  UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
  UC_JOINING_GROUP_TAH, /* Tah */
  UC_JOINING_GROUP_TAW, /* Taw */
  UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
  UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
  UC_JOINING_GROUP_TETH, /* Teth */
  UC_JOINING_GROUP_WAW, /* Waw */
  UC_JOINING_GROUP_YEH, /* Yeh */
  UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
  UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
  UC_JOINING_GROUP_YUDH, /* Yudh */
  UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
  UC_JOINING_GROUP_ZAIN, /* Zain */
  UC_JOINING_GROUP_ZHAIN, /* Zhain */
  UC_JOINING_GROUP_ROHINGYA_YEH, /* Rohingya_Yeh */
  UC_JOINING_GROUP_STRAIGHT_WAW, /* Straight_Waw */
  UC_JOINING_GROUP_MANICHAEAN_ALEPH, /* Manichaean_Aleph */
  UC_JOINING_GROUP_MANICHAEAN_BETH, /* Manichaean_Beth */
  UC_JOINING_GROUP_MANICHAEAN_GIMEL, /* Manichaean_Gimel */
  UC_JOINING_GROUP_MANICHAEAN_DALETH, /* Manichaean_Daleth */
  UC_JOINING_GROUP_MANICHAEAN_WAW, /* Manichaean_Waw */
  UC_JOINING_GROUP_MANICHAEAN_ZAYIN, /* Manichaean_Zayin */
  UC_JOINING_GROUP_MANICHAEAN_HETH, /* Manichaean_Heth */
  UC_JOINING_GROUP_MANICHAEAN_TETH, /* Manichaean_Teth */
  UC_JOINING_GROUP_MANICHAEAN_YODH, /* Manichaean_Yodh */
  UC_JOINING_GROUP_MANICHAEAN_KAPH, /* Manichaean_Kaph */
  UC_JOINING_GROUP_MANICHAEAN_LAMEDH, /* Manichaean_Lamedh */
  UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, /* Manichaean_Dhamedh */
  UC_JOINING_GROUP_MANICHAEAN_THAMEDH, /* Manichaean_Thamedh */
  UC_JOINING_GROUP_MANICHAEAN_MEM, /* Manichaean_Mem */
  UC_JOINING_GROUP_MANICHAEAN_NUN, /* Manichaean_Nun */
  UC_JOINING_GROUP_MANICHAEAN_SAMEKH, /* Manichaean_Samekh */
  UC_JOINING_GROUP_MANICHAEAN_AYIN, /* Manichaean_Ayin */
  UC_JOINING_GROUP_MANICHAEAN_PE, /* Manichaean_Pe */
  UC_JOINING_GROUP_MANICHAEAN_SADHE, /* Manichaean_Sadhe */
  UC_JOINING_GROUP_MANICHAEAN_QOPH, /* Manichaean_Qoph */
  UC_JOINING_GROUP_MANICHAEAN_RESH, /* Manichaean_Resh */
  UC_JOINING_GROUP_MANICHAEAN_TAW, /* Manichaean_Taw */
  UC_JOINING_GROUP_MANICHAEAN_ONE, /* Manichaean_One */
  UC_JOINING_GROUP_MANICHAEAN_FIVE, /* Manichaean_Five */
  UC_JOINING_GROUP_MANICHAEAN_TEN, /* Manichaean_Ten */
  UC_JOINING_GROUP_MANICHAEAN_TWENTY, /* Manichaean_Twenty */
  UC_JOINING_GROUP_MANICHAEAN_HUNDRED /* Manichaean_Hundred */
};

/* One joining group per code point in the range 0..0x10FFFF. */
static uint8_t unicode_joining_group[0x110000];
3795 static void
3796 fill_arabicshaping (const char *arabicshaping_filename)
3798 FILE *stream;
3799 unsigned int i;
3800 int lineno;
3802 stream = fopen (arabicshaping_filename, "r");
3803 if (stream == NULL)
3805 fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename);
3806 exit (1);
3809 for (i = 0; i < 0x110000; i++)
3811 unicode_joining_type[i] = (uint8_t)~(uint8_t)0;
3812 unicode_joining_group[i] = UC_JOINING_GROUP_NONE;
3815 lineno = 0;
3816 for (;;)
3818 char buf[100+1];
3819 char separator1[100+1];
3820 char padding1[100+1];
3821 char schematic_name[100+1];
3822 char separator2[100+1];
3823 char padding2[100+1];
3824 char joining_type_name[100+1];
3825 char separator3[100+1];
3826 char padding3[100+1];
3827 char joining_group_name[100+1];
3828 int joining_type;
3829 int joining_group;
3831 lineno++;
3832 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
3833 break;
3835 if (buf[0] == '\0' || buf[0] == '#')
3836 continue;
3838 if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]",
3839 &i, separator1, padding1, schematic_name, separator2,
3840 padding2, joining_type_name, separator3, padding3,
3841 joining_group_name) != 10)
3843 fprintf (stderr, "parse error in '%s':%d\n",
3844 arabicshaping_filename, lineno);
3845 exit (1);
3847 assert (i < 0x110000);
3849 #define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
3850 if (false) {}
3851 TRY(UC_JOINING_TYPE_U)
3852 TRY(UC_JOINING_TYPE_T)
3853 TRY(UC_JOINING_TYPE_C)
3854 TRY(UC_JOINING_TYPE_L)
3855 TRY(UC_JOINING_TYPE_R)
3856 TRY(UC_JOINING_TYPE_D)
3857 #undef TRY
3858 else
3860 fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n",
3861 joining_type_name, arabicshaping_filename, lineno);
3862 exit (1);
3865 /* Remove trailing spaces. */
3866 while (joining_group_name[0] != '\0'
3867 && joining_group_name[strlen (joining_group_name) - 1] == ' ')
3868 joining_group_name[strlen (joining_group_name) - 1] = '\0';
3870 #define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
3871 if (false) {}
3872 TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group")
3873 TRY(UC_JOINING_GROUP_AIN, "AIN")
3874 TRY(UC_JOINING_GROUP_ALAPH, "ALAPH")
3875 TRY(UC_JOINING_GROUP_ALEF, "ALEF")
3876 TRY(UC_JOINING_GROUP_BEH, "BEH")
3877 TRY(UC_JOINING_GROUP_BETH, "BETH")
3878 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE")
3879 TRY(UC_JOINING_GROUP_DAL, "DAL")
3880 TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH")
3881 TRY(UC_JOINING_GROUP_E, "E")
3882 TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH")
3883 TRY(UC_JOINING_GROUP_FE, "FE")
3884 TRY(UC_JOINING_GROUP_FEH, "FEH")
3885 TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH")
3886 TRY(UC_JOINING_GROUP_GAF, "GAF")
3887 TRY(UC_JOINING_GROUP_GAMAL, "GAMAL")
3888 TRY(UC_JOINING_GROUP_HAH, "HAH")
3889 TRY(UC_JOINING_GROUP_HE, "HE")
3890 TRY(UC_JOINING_GROUP_HEH, "HEH")
3891 TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL")
3892 TRY(UC_JOINING_GROUP_HETH, "HETH")
3893 TRY(UC_JOINING_GROUP_KAF, "KAF")
3894 TRY(UC_JOINING_GROUP_KAPH, "KAPH")
3895 TRY(UC_JOINING_GROUP_KHAPH, "KHAPH")
3896 TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH")
3897 TRY(UC_JOINING_GROUP_LAM, "LAM")
3898 TRY(UC_JOINING_GROUP_LAMADH, "LAMADH")
3899 TRY(UC_JOINING_GROUP_MEEM, "MEEM")
3900 TRY(UC_JOINING_GROUP_MIM, "MIM")
3901 TRY(UC_JOINING_GROUP_NOON, "NOON")
3902 TRY(UC_JOINING_GROUP_NUN, "NUN")
3903 TRY(UC_JOINING_GROUP_NYA, "NYA")
3904 TRY(UC_JOINING_GROUP_PE, "PE")
3905 TRY(UC_JOINING_GROUP_QAF, "QAF")
3906 TRY(UC_JOINING_GROUP_QAPH, "QAPH")
3907 TRY(UC_JOINING_GROUP_REH, "REH")
3908 TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE")
3909 TRY(UC_JOINING_GROUP_SAD, "SAD")
3910 TRY(UC_JOINING_GROUP_SADHE, "SADHE")
3911 TRY(UC_JOINING_GROUP_SEEN, "SEEN")
3912 TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH")
3913 TRY(UC_JOINING_GROUP_SHIN, "SHIN")
3914 TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF")
3915 TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW")
3916 TRY(UC_JOINING_GROUP_TAH, "TAH")
3917 TRY(UC_JOINING_GROUP_TAW, "TAW")
3918 TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA")
3919 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL")
3920 TRY(UC_JOINING_GROUP_TETH, "TETH")
3921 TRY(UC_JOINING_GROUP_WAW, "WAW")
3922 TRY(UC_JOINING_GROUP_YEH, "YEH")
3923 TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE")
3924 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL")
3925 TRY(UC_JOINING_GROUP_YUDH, "YUDH")
3926 TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE")
3927 TRY(UC_JOINING_GROUP_ZAIN, "ZAIN")
3928 TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN")
3929 TRY(UC_JOINING_GROUP_ROHINGYA_YEH, "ROHINGYA YEH")
3930 TRY(UC_JOINING_GROUP_STRAIGHT_WAW, "STRAIGHT WAW")
3931 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH, "MANICHAEAN ALEPH")
3932 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH, "MANICHAEAN BETH")
3933 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL, "MANICHAEAN GIMEL")
3934 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH, "MANICHAEAN DALETH")
3935 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW, "MANICHAEAN WAW")
3936 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN, "MANICHAEAN ZAYIN")
3937 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH, "MANICHAEAN HETH")
3938 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH, "MANICHAEAN TETH")
3939 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH, "MANICHAEAN YODH")
3940 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH, "MANICHAEAN KAPH")
3941 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH, "MANICHAEAN LAMEDH")
3942 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, "MANICHAEAN DHAMEDH")
3943 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH, "MANICHAEAN THAMEDH")
3944 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM, "MANICHAEAN MEM")
3945 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN, "MANICHAEAN NUN")
3946 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH, "MANICHAEAN SAMEKH")
3947 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN, "MANICHAEAN AYIN")
3948 TRY(UC_JOINING_GROUP_MANICHAEAN_PE, "MANICHAEAN PE")
3949 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE, "MANICHAEAN SADHE")
3950 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH, "MANICHAEAN QOPH")
3951 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH, "MANICHAEAN RESH")
3952 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW, "MANICHAEAN TAW")
3953 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE, "MANICHAEAN ONE")
3954 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE, "MANICHAEAN FIVE")
3955 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN, "MANICHAEAN TEN")
3956 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY, "MANICHAEAN TWENTY")
3957 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED, "MANICHAEAN HUNDRED")
3958 #undef TRY
3959 else
3961 fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n",
3962 joining_group_name, arabicshaping_filename, lineno);
3963 exit (1);
3966 unicode_joining_type[i] = joining_type;
3967 unicode_joining_group[i] = joining_group;
3970 if (ferror (stream) || fclose (stream))
3972 fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename);
3973 exit (1);
3977 /* Convert a Joining_Type value to a C identifier. */
3978 static const char *
3979 joining_type_as_c_identifier (int joining_type)
3981 #define TRY(value) if (joining_type == value) return #value;
3982 TRY(UC_JOINING_TYPE_U)
3983 TRY(UC_JOINING_TYPE_T)
3984 TRY(UC_JOINING_TYPE_C)
3985 TRY(UC_JOINING_TYPE_L)
3986 TRY(UC_JOINING_TYPE_R)
3987 TRY(UC_JOINING_TYPE_D)
3988 #undef TRY
3989 abort ();
3992 static void
3993 output_joining_type_test (const char *filename, const char *version)
3995 FILE *stream;
3996 bool need_comma;
3997 unsigned int ch;
3999 stream = fopen (filename, "w");
4000 if (stream == NULL)
4002 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4003 exit (1);
4006 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4007 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4008 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4009 version);
4011 need_comma = false;
4012 for (ch = 0; ch < 0x110000; ch++)
4014 int value = unicode_joining_type[ch];
4016 if (value != (uint8_t)~(uint8_t)0)
4018 if (need_comma)
4019 fprintf (stream, ",\n");
4020 fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value));
4021 need_comma = true;
4024 if (need_comma)
4025 fprintf (stream, "\n");
4027 if (ferror (stream) || fclose (stream))
4029 fprintf (stderr, "error writing to '%s'\n", filename);
4030 exit (1);
4034 /* Construction of sparse 3-level tables. */
4035 #define TABLE joining_type_table
4036 #define ELEMENT uint8_t
4037 #define DEFAULT (uint8_t)~(uint8_t)0
4038 #define xmalloc malloc
4039 #define xrealloc realloc
4040 #include "3level.h"
4042 static void
4043 output_joining_type (const char *filename, const char *version)
4045 FILE *stream;
4046 unsigned int ch, i;
4047 struct joining_type_table t;
4048 unsigned int level1_offset, level2_offset, level3_offset;
4049 uint8_t *level3_packed;
4051 stream = fopen (filename, "w");
4052 if (stream == NULL)
4054 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4055 exit (1);
4058 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4059 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4060 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4061 version);
4063 t.p = 7;
4064 t.q = 9;
4065 joining_type_table_init (&t);
4067 for (ch = 0; ch < 0x110000; ch++)
4069 uint8_t value = unicode_joining_type[ch];
4071 assert (value == (uint8_t)~(uint8_t)0 || value <= 0x0f);
4073 joining_type_table_add (&t, ch, value);
4076 joining_type_table_finalize (&t);
4078 /* Offsets in t.result, in memory of this process. */
4079 level1_offset =
4080 5 * sizeof (uint32_t);
4081 level2_offset =
4082 5 * sizeof (uint32_t)
4083 + t.level1_size * sizeof (uint32_t);
4084 level3_offset =
4085 5 * sizeof (uint32_t)
4086 + t.level1_size * sizeof (uint32_t)
4087 + (t.level2_size << t.q) * sizeof (uint32_t);
4089 for (i = 0; i < 5; i++)
4090 fprintf (stream, "#define joining_type_header_%d %d\n", i,
4091 ((uint32_t *) t.result)[i]);
4092 fprintf (stream, "static const\n");
4093 fprintf (stream, "struct\n");
4094 fprintf (stream, " {\n");
4095 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4096 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4097 fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size,
4098 (1 << t.p) * 4 / 8);
4099 fprintf (stream, " }\n");
4100 fprintf (stream, "u_joining_type =\n");
4101 fprintf (stream, "{\n");
4102 fprintf (stream, " {");
4103 if (t.level1_size > 8)
4104 fprintf (stream, "\n ");
4105 for (i = 0; i < t.level1_size; i++)
4107 uint32_t offset;
4108 if (i > 0 && (i % 8) == 0)
4109 fprintf (stream, "\n ");
4110 offset = ((uint32_t *) (t.result + level1_offset))[i];
4111 if (offset == 0)
4112 fprintf (stream, " %5d", -1);
4113 else
4114 fprintf (stream, " %5zu",
4115 (offset - level2_offset) / sizeof (uint32_t));
4116 if (i+1 < t.level1_size)
4117 fprintf (stream, ",");
4119 if (t.level1_size > 8)
4120 fprintf (stream, "\n ");
4121 fprintf (stream, " },\n");
4122 fprintf (stream, " {");
4123 if (t.level2_size << t.q > 8)
4124 fprintf (stream, "\n ");
4125 for (i = 0; i < t.level2_size << t.q; i++)
4127 uint32_t offset;
4128 if (i > 0 && (i % 8) == 0)
4129 fprintf (stream, "\n ");
4130 offset = ((uint32_t *) (t.result + level2_offset))[i];
4131 if (offset == 0)
4132 fprintf (stream, " %5d", -1);
4133 else
4134 fprintf (stream, " %5zu",
4135 (offset - level3_offset) / sizeof (uint8_t));
4136 if (i+1 < t.level2_size << t.q)
4137 fprintf (stream, ",");
4139 if (t.level2_size << t.q > 8)
4140 fprintf (stream, "\n ");
4141 fprintf (stream, " },\n");
4142 /* Pack the level3 array. Each entry needs 4 bits only. */
4143 level3_packed =
4144 (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t));
4145 for (i = 0; i < t.level3_size << t.p; i++)
4147 unsigned int j = (i * 4) / 8;
4148 unsigned int k = (i * 4) % 8;
4149 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
4150 level3_packed[j] |= (value << k);
4152 fprintf (stream, " {");
4153 if ((t.level3_size << t.p) * 4 / 8 > 8)
4154 fprintf (stream, "\n ");
4155 for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
4157 if (i > 0 && (i % 8) == 0)
4158 fprintf (stream, "\n ");
4159 fprintf (stream, " 0x%02x", level3_packed[i]);
4160 if (i+1 < (t.level3_size << t.p) * 4 / 8)
4161 fprintf (stream, ",");
4163 if ((t.level3_size << t.p) * 4 / 8 > 8)
4164 fprintf (stream, "\n ");
4165 fprintf (stream, " }\n");
4166 free (level3_packed);
4167 fprintf (stream, "};\n");
4169 if (ferror (stream) || fclose (stream))
4171 fprintf (stderr, "error writing to '%s'\n", filename);
4172 exit (1);
4176 /* Convert a Joining_Group value to a C identifier. */
4177 static const char *
4178 joining_group_as_c_identifier (int joining_group)
4180 #define TRY(value) if (joining_group == value) return #value;
4181 TRY(UC_JOINING_GROUP_NONE)
4182 TRY(UC_JOINING_GROUP_AIN)
4183 TRY(UC_JOINING_GROUP_ALAPH)
4184 TRY(UC_JOINING_GROUP_ALEF)
4185 TRY(UC_JOINING_GROUP_BEH)
4186 TRY(UC_JOINING_GROUP_BETH)
4187 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE)
4188 TRY(UC_JOINING_GROUP_DAL)
4189 TRY(UC_JOINING_GROUP_DALATH_RISH)
4190 TRY(UC_JOINING_GROUP_E)
4191 TRY(UC_JOINING_GROUP_FARSI_YEH)
4192 TRY(UC_JOINING_GROUP_FE)
4193 TRY(UC_JOINING_GROUP_FEH)
4194 TRY(UC_JOINING_GROUP_FINAL_SEMKATH)
4195 TRY(UC_JOINING_GROUP_GAF)
4196 TRY(UC_JOINING_GROUP_GAMAL)
4197 TRY(UC_JOINING_GROUP_HAH)
4198 TRY(UC_JOINING_GROUP_HE)
4199 TRY(UC_JOINING_GROUP_HEH)
4200 TRY(UC_JOINING_GROUP_HEH_GOAL)
4201 TRY(UC_JOINING_GROUP_HETH)
4202 TRY(UC_JOINING_GROUP_KAF)
4203 TRY(UC_JOINING_GROUP_KAPH)
4204 TRY(UC_JOINING_GROUP_KHAPH)
4205 TRY(UC_JOINING_GROUP_KNOTTED_HEH)
4206 TRY(UC_JOINING_GROUP_LAM)
4207 TRY(UC_JOINING_GROUP_LAMADH)
4208 TRY(UC_JOINING_GROUP_MEEM)
4209 TRY(UC_JOINING_GROUP_MIM)
4210 TRY(UC_JOINING_GROUP_NOON)
4211 TRY(UC_JOINING_GROUP_NUN)
4212 TRY(UC_JOINING_GROUP_NYA)
4213 TRY(UC_JOINING_GROUP_PE)
4214 TRY(UC_JOINING_GROUP_QAF)
4215 TRY(UC_JOINING_GROUP_QAPH)
4216 TRY(UC_JOINING_GROUP_REH)
4217 TRY(UC_JOINING_GROUP_REVERSED_PE)
4218 TRY(UC_JOINING_GROUP_SAD)
4219 TRY(UC_JOINING_GROUP_SADHE)
4220 TRY(UC_JOINING_GROUP_SEEN)
4221 TRY(UC_JOINING_GROUP_SEMKATH)
4222 TRY(UC_JOINING_GROUP_SHIN)
4223 TRY(UC_JOINING_GROUP_SWASH_KAF)
4224 TRY(UC_JOINING_GROUP_SYRIAC_WAW)
4225 TRY(UC_JOINING_GROUP_TAH)
4226 TRY(UC_JOINING_GROUP_TAW)
4227 TRY(UC_JOINING_GROUP_TEH_MARBUTA)
4228 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL)
4229 TRY(UC_JOINING_GROUP_TETH)
4230 TRY(UC_JOINING_GROUP_WAW)
4231 TRY(UC_JOINING_GROUP_YEH)
4232 TRY(UC_JOINING_GROUP_YEH_BARREE)
4233 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL)
4234 TRY(UC_JOINING_GROUP_YUDH)
4235 TRY(UC_JOINING_GROUP_YUDH_HE)
4236 TRY(UC_JOINING_GROUP_ZAIN)
4237 TRY(UC_JOINING_GROUP_ZHAIN)
4238 TRY(UC_JOINING_GROUP_ROHINGYA_YEH)
4239 TRY(UC_JOINING_GROUP_STRAIGHT_WAW)
4240 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH)
4241 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH)
4242 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL)
4243 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH)
4244 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW)
4245 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN)
4246 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH)
4247 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH)
4248 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH)
4249 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH)
4250 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH)
4251 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH)
4252 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH)
4253 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM)
4254 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN)
4255 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH)
4256 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN)
4257 TRY(UC_JOINING_GROUP_MANICHAEAN_PE)
4258 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE)
4259 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH)
4260 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH)
4261 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW)
4262 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE)
4263 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE)
4264 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN)
4265 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY)
4266 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED)
4267 #undef TRY
4268 abort ();
4271 static void
4272 output_joining_group_test (const char *filename, const char *version)
4274 FILE *stream;
4275 bool need_comma;
4276 unsigned int ch;
4278 stream = fopen (filename, "w");
4279 if (stream == NULL)
4281 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4282 exit (1);
4285 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4286 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
4287 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4288 version);
4290 need_comma = false;
4291 for (ch = 0; ch < 0x110000; ch++)
4293 int value = unicode_joining_group[ch];
4295 if (value != UC_JOINING_GROUP_NONE)
4297 if (need_comma)
4298 fprintf (stream, ",\n");
4299 fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value));
4300 need_comma = true;
4303 if (need_comma)
4304 fprintf (stream, "\n");
4306 if (ferror (stream) || fclose (stream))
4308 fprintf (stderr, "error writing to '%s'\n", filename);
4309 exit (1);
4313 /* Construction of sparse 3-level tables. */
4314 #define TABLE joining_group_table
4315 #define ELEMENT uint8_t
4316 #define DEFAULT UC_JOINING_GROUP_NONE
4317 #define xmalloc malloc
4318 #define xrealloc realloc
4319 #include "3level.h"
4321 static void
4322 output_joining_group (const char *filename, const char *version)
4324 FILE *stream;
4325 unsigned int ch, i;
4326 struct joining_group_table t;
4327 unsigned int level1_offset, level2_offset, level3_offset;
4328 uint16_t *level3_packed;
4330 stream = fopen (filename, "w");
4331 if (stream == NULL)
4333 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4334 exit (1);
4337 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4338 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
4339 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4340 version);
4342 t.p = 7;
4343 t.q = 9;
4344 joining_group_table_init (&t);
4346 for (ch = 0; ch < 0x110000; ch++)
4348 uint8_t value = unicode_joining_group[ch];
4350 assert (value <= 0x7f);
4352 joining_group_table_add (&t, ch, value);
4355 joining_group_table_finalize (&t);
4357 /* Offsets in t.result, in memory of this process. */
4358 level1_offset =
4359 5 * sizeof (uint32_t);
4360 level2_offset =
4361 5 * sizeof (uint32_t)
4362 + t.level1_size * sizeof (uint32_t);
4363 level3_offset =
4364 5 * sizeof (uint32_t)
4365 + t.level1_size * sizeof (uint32_t)
4366 + (t.level2_size << t.q) * sizeof (uint32_t);
4368 for (i = 0; i < 5; i++)
4369 fprintf (stream, "#define joining_group_header_%d %d\n", i,
4370 ((uint32_t *) t.result)[i]);
4371 fprintf (stream, "static const\n");
4372 fprintf (stream, "struct\n");
4373 fprintf (stream, " {\n");
4374 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4375 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4376 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
4377 (1 << t.p) * 7 / 16);
4378 fprintf (stream, " }\n");
4379 fprintf (stream, "u_joining_group =\n");
4380 fprintf (stream, "{\n");
4381 fprintf (stream, " {");
4382 if (t.level1_size > 8)
4383 fprintf (stream, "\n ");
4384 for (i = 0; i < t.level1_size; i++)
4386 uint32_t offset;
4387 if (i > 0 && (i % 8) == 0)
4388 fprintf (stream, "\n ");
4389 offset = ((uint32_t *) (t.result + level1_offset))[i];
4390 if (offset == 0)
4391 fprintf (stream, " %5d", -1);
4392 else
4393 fprintf (stream, " %5zu",
4394 (offset - level2_offset) / sizeof (uint32_t));
4395 if (i+1 < t.level1_size)
4396 fprintf (stream, ",");
4398 if (t.level1_size > 8)
4399 fprintf (stream, "\n ");
4400 fprintf (stream, " },\n");
4401 fprintf (stream, " {");
4402 if (t.level2_size << t.q > 8)
4403 fprintf (stream, "\n ");
4404 for (i = 0; i < t.level2_size << t.q; i++)
4406 uint32_t offset;
4407 if (i > 0 && (i % 8) == 0)
4408 fprintf (stream, "\n ");
4409 offset = ((uint32_t *) (t.result + level2_offset))[i];
4410 if (offset == 0)
4411 fprintf (stream, " %5d", -1);
4412 else
4413 fprintf (stream, " %5zu",
4414 (offset - level3_offset) / sizeof (uint8_t));
4415 if (i+1 < t.level2_size << t.q)
4416 fprintf (stream, ",");
4418 if (t.level2_size << t.q > 8)
4419 fprintf (stream, "\n ");
4420 fprintf (stream, " },\n");
4421 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
4422 not 32-bit units, in order to make the lookup function easier. */
4423 level3_packed =
4424 (uint16_t *)
4425 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
4426 for (i = 0; i < t.level3_size << t.p; i++)
4428 unsigned int j = (i * 7) / 16;
4429 unsigned int k = (i * 7) % 16;
4430 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
4431 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
4432 level3_packed[j] = value & 0xffff;
4433 level3_packed[j+1] = value >> 16;
4435 fprintf (stream, " {");
4436 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
4437 fprintf (stream, "\n ");
4438 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
4440 if (i > 0 && (i % 8) == 0)
4441 fprintf (stream, "\n ");
4442 fprintf (stream, " 0x%04x", level3_packed[i]);
4443 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
4444 fprintf (stream, ",");
4446 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
4447 fprintf (stream, "\n ");
4448 fprintf (stream, " }\n");
4449 free (level3_packed);
4450 fprintf (stream, "};\n");
4452 if (ferror (stream) || fclose (stream))
4454 fprintf (stderr, "error writing to '%s'\n", filename);
4455 exit (1);
4459 /* ========================================================================= */
4461 /* Scripts. */
/* Names of the scripts registered so far; a code point's entry in
   unicode_scripts[] is an index into this array.  */
static const char *scripts[256];

/* Number of entries of scripts[] that are in use.  */
static unsigned int numscripts;

/* Script index of every code point, one byte per code point.  */
static uint8_t unicode_scripts[0x110000];
4468 static void
4469 fill_scripts (const char *scripts_filename)
4471 FILE *stream;
4472 unsigned int i;
4474 stream = fopen (scripts_filename, "r");
4475 if (stream == NULL)
4477 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
4478 exit (1);
4481 numscripts = 0;
4483 for (i = 0; i < 0x110000; i++)
4484 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
4486 for (;;)
4488 char buf[200+1];
4489 unsigned int i1, i2;
4490 char padding[200+1];
4491 char scriptname[200+1];
4492 int script;
4494 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4495 break;
4497 if (buf[0] == '\0' || buf[0] == '#')
4498 continue;
4500 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
4502 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
4504 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
4505 exit (1);
4507 i2 = i1;
4509 assert (i2 >= i1);
4510 assert (i2 < 0x110000);
4512 for (script = numscripts - 1; script >= 0; script--)
4513 if (strcmp (scripts[script], scriptname) == 0)
4514 break;
4515 if (script < 0)
4517 scripts[numscripts] = strdup (scriptname);
4518 script = numscripts;
4519 numscripts++;
4520 assert (numscripts != 256);
4523 for (i = i1; i <= i2; i++)
4525 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
4526 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
4527 unicode_scripts[i] = script;
4531 if (ferror (stream) || fclose (stream))
4533 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
4534 exit (1);
4538 /* Construction of sparse 3-level tables. */
4539 #define TABLE script_table
4540 #define ELEMENT uint8_t
4541 #define DEFAULT (uint8_t)~(uint8_t)0
4542 #define xmalloc malloc
4543 #define xrealloc realloc
4544 #include "3level.h"
4546 static void
4547 output_scripts (const char *version)
4549 const char *filename = "unictype/scripts.h";
4550 FILE *stream;
4551 unsigned int ch, s, i;
4552 struct script_table t;
4553 unsigned int level1_offset, level2_offset, level3_offset;
4555 typedef struct
4557 const char *lowercase_name;
4559 scriptinfo_t;
4560 scriptinfo_t scriptinfo[256];
4562 stream = fopen (filename, "w");
4563 if (stream == NULL)
4565 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4566 exit (1);
4569 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4570 fprintf (stream, "/* Unicode scripts. */\n");
4571 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4572 version);
4574 for (s = 0; s < numscripts; s++)
4576 char *lcp = strdup (scripts[s]);
4577 char *cp;
4579 for (cp = lcp; *cp != '\0'; cp++)
4580 if (*cp >= 'A' && *cp <= 'Z')
4581 *cp += 'a' - 'A';
4583 scriptinfo[s].lowercase_name = lcp;
4586 for (s = 0; s < numscripts; s++)
4588 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
4589 scriptinfo[s].lowercase_name);
4590 fprintf (stream, "{\n");
4591 i = 0;
4592 for (ch = 0; ch < 0x110000; ch++)
4593 if (unicode_scripts[ch] == s)
4595 unsigned int start;
4596 unsigned int end;
4598 start = ch;
4599 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
4600 ch++;
4601 end = ch;
4603 if (i > 0)
4604 fprintf (stream, ",\n");
4605 if (start == end)
4606 fprintf (stream, " { 0x%04X, 1, 1 }", start);
4607 else
4608 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
4609 start, end);
4610 i++;
4612 fprintf (stream, "\n");
4613 fprintf (stream, "};\n");
4616 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
4617 fprintf (stream, "{\n");
4618 for (s = 0; s < numscripts; s++)
4620 fprintf (stream, " {\n");
4621 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
4622 scriptinfo[s].lowercase_name);
4623 fprintf (stream, " script_%s_intervals,\n",
4624 scriptinfo[s].lowercase_name);
4625 fprintf (stream, " \"%s\"\n", scripts[s]);
4626 fprintf (stream, " }");
4627 if (s+1 < numscripts)
4628 fprintf (stream, ",");
4629 fprintf (stream, "\n");
4631 fprintf (stream, "};\n");
4633 t.p = 7;
4634 t.q = 9;
4635 script_table_init (&t);
4637 for (ch = 0; ch < 0x110000; ch++)
4639 unsigned int s = unicode_scripts[ch];
4640 if (s != (uint8_t)~(uint8_t)0)
4641 script_table_add (&t, ch, s);
4644 script_table_finalize (&t);
4646 /* Offsets in t.result, in memory of this process. */
4647 level1_offset =
4648 5 * sizeof (uint32_t);
4649 level2_offset =
4650 5 * sizeof (uint32_t)
4651 + t.level1_size * sizeof (uint32_t);
4652 level3_offset =
4653 5 * sizeof (uint32_t)
4654 + t.level1_size * sizeof (uint32_t)
4655 + (t.level2_size << t.q) * sizeof (uint32_t);
4657 for (i = 0; i < 5; i++)
4658 fprintf (stream, "#define script_header_%d %d\n", i,
4659 ((uint32_t *) t.result)[i]);
4660 fprintf (stream, "static const\n");
4661 fprintf (stream, "struct\n");
4662 fprintf (stream, " {\n");
4663 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4664 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4665 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
4666 fprintf (stream, " }\n");
4667 fprintf (stream, "u_script =\n");
4668 fprintf (stream, "{\n");
4669 fprintf (stream, " {");
4670 if (t.level1_size > 8)
4671 fprintf (stream, "\n ");
4672 for (i = 0; i < t.level1_size; i++)
4674 uint32_t offset;
4675 if (i > 0 && (i % 8) == 0)
4676 fprintf (stream, "\n ");
4677 offset = ((uint32_t *) (t.result + level1_offset))[i];
4678 if (offset == 0)
4679 fprintf (stream, " %5d", -1);
4680 else
4681 fprintf (stream, " %5zu",
4682 (offset - level2_offset) / sizeof (uint32_t));
4683 if (i+1 < t.level1_size)
4684 fprintf (stream, ",");
4686 if (t.level1_size > 8)
4687 fprintf (stream, "\n ");
4688 fprintf (stream, " },\n");
4689 fprintf (stream, " {");
4690 if (t.level2_size << t.q > 8)
4691 fprintf (stream, "\n ");
4692 for (i = 0; i < t.level2_size << t.q; i++)
4694 uint32_t offset;
4695 if (i > 0 && (i % 8) == 0)
4696 fprintf (stream, "\n ");
4697 offset = ((uint32_t *) (t.result + level2_offset))[i];
4698 if (offset == 0)
4699 fprintf (stream, " %5d", -1);
4700 else
4701 fprintf (stream, " %5zu",
4702 (offset - level3_offset) / sizeof (uint8_t));
4703 if (i+1 < t.level2_size << t.q)
4704 fprintf (stream, ",");
4706 if (t.level2_size << t.q > 8)
4707 fprintf (stream, "\n ");
4708 fprintf (stream, " },\n");
4709 fprintf (stream, " {");
4710 if (t.level3_size << t.p > 8)
4711 fprintf (stream, "\n ");
4712 for (i = 0; i < t.level3_size << t.p; i++)
4714 if (i > 0 && (i % 8) == 0)
4715 fprintf (stream, "\n ");
4716 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
4717 if (i+1 < t.level3_size << t.p)
4718 fprintf (stream, ",");
4720 if (t.level3_size << t.p > 8)
4721 fprintf (stream, "\n ");
4722 fprintf (stream, " }\n");
4723 fprintf (stream, "};\n");
4725 if (ferror (stream) || fclose (stream))
4727 fprintf (stderr, "error writing to '%s'\n", filename);
4728 exit (1);
4732 static void
4733 output_scripts_byname (const char *version)
4735 const char *filename = "unictype/scripts_byname.gperf";
4736 FILE *stream;
4737 unsigned int s;
4739 stream = fopen (filename, "w");
4740 if (stream == NULL)
4742 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4743 exit (1);
4746 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4747 fprintf (stream, "/* Unicode scripts. */\n");
4748 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4749 version);
4750 fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
4751 fprintf (stream, "%%struct-type\n");
4752 fprintf (stream, "%%language=ANSI-C\n");
4753 fprintf (stream, "%%define hash-function-name scripts_hash\n");
4754 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
4755 fprintf (stream, "%%readonly-tables\n");
4756 fprintf (stream, "%%global-table\n");
4757 fprintf (stream, "%%define word-array-name script_names\n");
4758 fprintf (stream, "%%pic\n");
4759 fprintf (stream, "%%define string-pool-name script_stringpool\n");
4760 fprintf (stream, "%%%%\n");
4761 for (s = 0; s < numscripts; s++)
4762 fprintf (stream, "%s, %u\n", scripts[s], s);
4764 if (ferror (stream) || fclose (stream))
4766 fprintf (stderr, "error writing to '%s'\n", filename);
4767 exit (1);
4771 /* ========================================================================= */
4773 /* Blocks. */
/* One block: the code point range START..END (both inclusive) and the
   block's name.  */
typedef struct
{
  unsigned int start;
  unsigned int end;
  const char *name;
}
block_t;

/* The blocks read so far, in ascending code point order.  */
static block_t blocks[384];

/* Number of entries of blocks[] that are in use.  */
static unsigned int numblocks;
4780 static void
4781 fill_blocks (const char *blocks_filename)
4783 FILE *stream;
4785 stream = fopen (blocks_filename, "r");
4786 if (stream == NULL)
4788 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
4789 exit (1);
4792 for (;;)
4794 char buf[200+1];
4795 unsigned int i1, i2;
4796 char padding[200+1];
4797 char blockname[200+1];
4799 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4800 break;
4802 if (buf[0] == '\0' || buf[0] == '#')
4803 continue;
4805 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
4807 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
4808 exit (1);
4810 blocks[numblocks].start = i1;
4811 blocks[numblocks].end = i2;
4812 blocks[numblocks].name = strdup (blockname);
4813 /* It must be sorted. */
4814 assert (numblocks == 0 || blocks[numblocks-1].end < blocks[numblocks].start);
4815 numblocks++;
4816 assert (numblocks != SIZEOF (blocks));
4819 if (ferror (stream) || fclose (stream))
4821 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
4822 exit (1);
4826 /* Return the smallest block index among the blocks for characters >= ch. */
4827 static unsigned int
4828 block_first_index (unsigned int ch)
4830 /* Binary search. */
4831 unsigned int lo = 0;
4832 unsigned int hi = numblocks;
4833 /* Invariants:
4834 All blocks[i], i < lo, have blocks[i].end < ch,
4835 all blocks[i], i >= hi, have blocks[i].end >= ch. */
4836 while (lo < hi)
4838 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4839 if (blocks[mid].end < ch)
4840 lo = mid + 1;
4841 else
4842 hi = mid;
4844 return hi;
4847 /* Return the largest block index among the blocks for characters <= ch,
4848 plus 1. */
4849 static unsigned int
4850 block_last_index (unsigned int ch)
4852 /* Binary search. */
4853 unsigned int lo = 0;
4854 unsigned int hi = numblocks;
4855 /* Invariants:
4856 All blocks[i], i < lo, have blocks[i].start <= ch,
4857 all blocks[i], i >= hi, have blocks[i].start > ch. */
4858 while (lo < hi)
4860 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4861 if (blocks[mid].start <= ch)
4862 lo = mid + 1;
4863 else
4864 hi = mid;
4866 return hi;
4869 static void
4870 output_blocks (const char *version)
4872 const char *filename = "unictype/blocks.h";
4873 const unsigned int shift = 8; /* bits to shift away for array access */
4874 const unsigned int threshold = 0x28000; /* cut-off table here to save space */
4875 FILE *stream;
4876 unsigned int i;
4877 unsigned int i1;
4879 stream = fopen (filename, "w");
4880 if (stream == NULL)
4882 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4883 exit (1);
4886 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4887 fprintf (stream, "/* Unicode blocks. */\n");
4888 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4889 version);
4891 fprintf (stream, "static const uc_block_t blocks[] =\n");
4892 fprintf (stream, "{\n");
4893 for (i = 0; i < numblocks; i++)
4895 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
4896 blocks[i].end, blocks[i].name);
4897 if (i+1 < numblocks)
4898 fprintf (stream, ",");
4899 fprintf (stream, "\n");
4901 fprintf (stream, "};\n");
4902 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
4903 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
4904 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
4905 threshold >> shift);
4906 fprintf (stream, "{\n");
4907 for (i1 = 0; i1 < (threshold >> shift); i1++)
4909 unsigned int first_index = block_first_index (i1 << shift);
4910 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
4911 fprintf (stream, " %3d, %3d", first_index, last_index);
4912 if (i1+1 < (threshold >> shift))
4913 fprintf (stream, ",");
4914 fprintf (stream, "\n");
4916 fprintf (stream, "};\n");
4917 fprintf (stream, "#define blocks_upper_first_index %d\n",
4918 block_first_index (threshold));
4919 fprintf (stream, "#define blocks_upper_last_index %d\n",
4920 block_last_index (0x10FFFF));
4922 if (ferror (stream) || fclose (stream))
4924 fprintf (stderr, "error writing to '%s'\n", filename);
4925 exit (1);
4929 /* ========================================================================= */
4931 /* C and Java syntax. */
/* Identifier syntax categories of a character.  The values are stored in
   2 bits per character by output_ident_category, so they must stay within
   0..3.  */
enum
{
  /* May begin an identifier and may also appear later in it.  */
  UC_IDENTIFIER_START = 0,
  /* May appear in an identifier, but not as its first character.  */
  UC_IDENTIFIER_VALID = 1,
  /* May not appear in an identifier.  */
  UC_IDENTIFIER_INVALID = 2,
  /* Ignorable inside an identifier (Java only).  */
  UC_IDENTIFIER_IGNORABLE = 3
};
/* Return true if CH is a white-space character in the sense of
   ISO C 99 section 6.4.(3): space, horizontal tab, new-line (line feed or
   carriage return), vertical tab, or form-feed.  */
static bool
is_c_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':   /* space */
    case '\t':  /* horizontal tab */
    case '\n':  /* new-line */
    case '\r':  /* new-line */
    case '\v':  /* vertical tab */
    case '\f':  /* form-feed */
      return true;
    default:
      return false;
    }
}
4952 /* ISO C 99 section 6.4.2.1 and appendix D. */
4953 static int
4954 c_ident_category (unsigned int ch)
4956 /* Section 6.4.2.1. */
4957 if (ch >= '0' && ch <= '9')
4958 return UC_IDENTIFIER_VALID;
4959 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4960 return UC_IDENTIFIER_START;
4961 /* Appendix D. */
4962 if (0
4963 /* Latin */
4964 || (ch == 0x00AA)
4965 || (ch == 0x00BA)
4966 || (ch >= 0x00C0 && ch <= 0x00D6)
4967 || (ch >= 0x00D8 && ch <= 0x00F6)
4968 || (ch >= 0x00F8 && ch <= 0x01F5)
4969 || (ch >= 0x01FA && ch <= 0x0217)
4970 || (ch >= 0x0250 && ch <= 0x02A8)
4971 || (ch >= 0x1E00 && ch <= 0x1E9B)
4972 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4973 || (ch == 0x207F)
4974 /* Greek */
4975 || (ch == 0x0386)
4976 || (ch >= 0x0388 && ch <= 0x038A)
4977 || (ch == 0x038C)
4978 || (ch >= 0x038E && ch <= 0x03A1)
4979 || (ch >= 0x03A3 && ch <= 0x03CE)
4980 || (ch >= 0x03D0 && ch <= 0x03D6)
4981 || (ch == 0x03DA)
4982 || (ch == 0x03DC)
4983 || (ch == 0x03DE)
4984 || (ch == 0x03E0)
4985 || (ch >= 0x03E2 && ch <= 0x03F3)
4986 || (ch >= 0x1F00 && ch <= 0x1F15)
4987 || (ch >= 0x1F18 && ch <= 0x1F1D)
4988 || (ch >= 0x1F20 && ch <= 0x1F45)
4989 || (ch >= 0x1F48 && ch <= 0x1F4D)
4990 || (ch >= 0x1F50 && ch <= 0x1F57)
4991 || (ch == 0x1F59)
4992 || (ch == 0x1F5B)
4993 || (ch == 0x1F5D)
4994 || (ch >= 0x1F5F && ch <= 0x1F7D)
4995 || (ch >= 0x1F80 && ch <= 0x1FB4)
4996 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4997 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4998 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4999 || (ch >= 0x1FD0 && ch <= 0x1FD3)
5000 || (ch >= 0x1FD6 && ch <= 0x1FDB)
5001 || (ch >= 0x1FE0 && ch <= 0x1FEC)
5002 || (ch >= 0x1FF2 && ch <= 0x1FF4)
5003 || (ch >= 0x1FF6 && ch <= 0x1FFC)
5004 /* Cyrillic */
5005 || (ch >= 0x0401 && ch <= 0x040C)
5006 || (ch >= 0x040E && ch <= 0x044F)
5007 || (ch >= 0x0451 && ch <= 0x045C)
5008 || (ch >= 0x045E && ch <= 0x0481)
5009 || (ch >= 0x0490 && ch <= 0x04C4)
5010 || (ch >= 0x04C7 && ch <= 0x04C8)
5011 || (ch >= 0x04CB && ch <= 0x04CC)
5012 || (ch >= 0x04D0 && ch <= 0x04EB)
5013 || (ch >= 0x04EE && ch <= 0x04F5)
5014 || (ch >= 0x04F8 && ch <= 0x04F9)
5015 /* Armenian */
5016 || (ch >= 0x0531 && ch <= 0x0556)
5017 || (ch >= 0x0561 && ch <= 0x0587)
5018 /* Hebrew */
5019 || (ch >= 0x05B0 && ch <= 0x05B9)
5020 || (ch >= 0x05BB && ch <= 0x05BD)
5021 || (ch == 0x05BF)
5022 || (ch >= 0x05C1 && ch <= 0x05C2)
5023 || (ch >= 0x05D0 && ch <= 0x05EA)
5024 || (ch >= 0x05F0 && ch <= 0x05F2)
5025 /* Arabic */
5026 || (ch >= 0x0621 && ch <= 0x063A)
5027 || (ch >= 0x0640 && ch <= 0x0652)
5028 || (ch >= 0x0670 && ch <= 0x06B7)
5029 || (ch >= 0x06BA && ch <= 0x06BE)
5030 || (ch >= 0x06C0 && ch <= 0x06CE)
5031 || (ch >= 0x06D0 && ch <= 0x06DC)
5032 || (ch >= 0x06E5 && ch <= 0x06E8)
5033 || (ch >= 0x06EA && ch <= 0x06ED)
5034 /* Devanagari */
5035 || (ch >= 0x0901 && ch <= 0x0903)
5036 || (ch >= 0x0905 && ch <= 0x0939)
5037 || (ch >= 0x093E && ch <= 0x094D)
5038 || (ch >= 0x0950 && ch <= 0x0952)
5039 || (ch >= 0x0958 && ch <= 0x0963)
5040 /* Bengali */
5041 || (ch >= 0x0981 && ch <= 0x0983)
5042 || (ch >= 0x0985 && ch <= 0x098C)
5043 || (ch >= 0x098F && ch <= 0x0990)
5044 || (ch >= 0x0993 && ch <= 0x09A8)
5045 || (ch >= 0x09AA && ch <= 0x09B0)
5046 || (ch == 0x09B2)
5047 || (ch >= 0x09B6 && ch <= 0x09B9)
5048 || (ch >= 0x09BE && ch <= 0x09C4)
5049 || (ch >= 0x09C7 && ch <= 0x09C8)
5050 || (ch >= 0x09CB && ch <= 0x09CD)
5051 || (ch >= 0x09DC && ch <= 0x09DD)
5052 || (ch >= 0x09DF && ch <= 0x09E3)
5053 || (ch >= 0x09F0 && ch <= 0x09F1)
5054 /* Gurmukhi */
5055 || (ch == 0x0A02)
5056 || (ch >= 0x0A05 && ch <= 0x0A0A)
5057 || (ch >= 0x0A0F && ch <= 0x0A10)
5058 || (ch >= 0x0A13 && ch <= 0x0A28)
5059 || (ch >= 0x0A2A && ch <= 0x0A30)
5060 || (ch >= 0x0A32 && ch <= 0x0A33)
5061 || (ch >= 0x0A35 && ch <= 0x0A36)
5062 || (ch >= 0x0A38 && ch <= 0x0A39)
5063 || (ch >= 0x0A3E && ch <= 0x0A42)
5064 || (ch >= 0x0A47 && ch <= 0x0A48)
5065 || (ch >= 0x0A4B && ch <= 0x0A4D)
5066 || (ch >= 0x0A59 && ch <= 0x0A5C)
5067 || (ch == 0x0A5E)
5068 || (ch == 0x0A74)
5069 /* Gujarati */
5070 || (ch >= 0x0A81 && ch <= 0x0A83)
5071 || (ch >= 0x0A85 && ch <= 0x0A8B)
5072 || (ch == 0x0A8D)
5073 || (ch >= 0x0A8F && ch <= 0x0A91)
5074 || (ch >= 0x0A93 && ch <= 0x0AA8)
5075 || (ch >= 0x0AAA && ch <= 0x0AB0)
5076 || (ch >= 0x0AB2 && ch <= 0x0AB3)
5077 || (ch >= 0x0AB5 && ch <= 0x0AB9)
5078 || (ch >= 0x0ABD && ch <= 0x0AC5)
5079 || (ch >= 0x0AC7 && ch <= 0x0AC9)
5080 || (ch >= 0x0ACB && ch <= 0x0ACD)
5081 || (ch == 0x0AD0)
5082 || (ch == 0x0AE0)
5083 /* Oriya */
5084 || (ch >= 0x0B01 && ch <= 0x0B03)
5085 || (ch >= 0x0B05 && ch <= 0x0B0C)
5086 || (ch >= 0x0B0F && ch <= 0x0B10)
5087 || (ch >= 0x0B13 && ch <= 0x0B28)
5088 || (ch >= 0x0B2A && ch <= 0x0B30)
5089 || (ch >= 0x0B32 && ch <= 0x0B33)
5090 || (ch >= 0x0B36 && ch <= 0x0B39)
5091 || (ch >= 0x0B3E && ch <= 0x0B43)
5092 || (ch >= 0x0B47 && ch <= 0x0B48)
5093 || (ch >= 0x0B4B && ch <= 0x0B4D)
5094 || (ch >= 0x0B5C && ch <= 0x0B5D)
5095 || (ch >= 0x0B5F && ch <= 0x0B61)
5096 /* Tamil */
5097 || (ch >= 0x0B82 && ch <= 0x0B83)
5098 || (ch >= 0x0B85 && ch <= 0x0B8A)
5099 || (ch >= 0x0B8E && ch <= 0x0B90)
5100 || (ch >= 0x0B92 && ch <= 0x0B95)
5101 || (ch >= 0x0B99 && ch <= 0x0B9A)
5102 || (ch == 0x0B9C)
5103 || (ch >= 0x0B9E && ch <= 0x0B9F)
5104 || (ch >= 0x0BA3 && ch <= 0x0BA4)
5105 || (ch >= 0x0BA8 && ch <= 0x0BAA)
5106 || (ch >= 0x0BAE && ch <= 0x0BB5)
5107 || (ch >= 0x0BB7 && ch <= 0x0BB9)
5108 || (ch >= 0x0BBE && ch <= 0x0BC2)
5109 || (ch >= 0x0BC6 && ch <= 0x0BC8)
5110 || (ch >= 0x0BCA && ch <= 0x0BCD)
5111 /* Telugu */
5112 || (ch >= 0x0C01 && ch <= 0x0C03)
5113 || (ch >= 0x0C05 && ch <= 0x0C0C)
5114 || (ch >= 0x0C0E && ch <= 0x0C10)
5115 || (ch >= 0x0C12 && ch <= 0x0C28)
5116 || (ch >= 0x0C2A && ch <= 0x0C33)
5117 || (ch >= 0x0C35 && ch <= 0x0C39)
5118 || (ch >= 0x0C3E && ch <= 0x0C44)
5119 || (ch >= 0x0C46 && ch <= 0x0C48)
5120 || (ch >= 0x0C4A && ch <= 0x0C4D)
5121 || (ch >= 0x0C60 && ch <= 0x0C61)
5122 /* Kannada */
5123 || (ch >= 0x0C82 && ch <= 0x0C83)
5124 || (ch >= 0x0C85 && ch <= 0x0C8C)
5125 || (ch >= 0x0C8E && ch <= 0x0C90)
5126 || (ch >= 0x0C92 && ch <= 0x0CA8)
5127 || (ch >= 0x0CAA && ch <= 0x0CB3)
5128 || (ch >= 0x0CB5 && ch <= 0x0CB9)
5129 || (ch >= 0x0CBE && ch <= 0x0CC4)
5130 || (ch >= 0x0CC6 && ch <= 0x0CC8)
5131 || (ch >= 0x0CCA && ch <= 0x0CCD)
5132 || (ch == 0x0CDE)
5133 || (ch >= 0x0CE0 && ch <= 0x0CE1)
5134 /* Malayalam */
5135 || (ch >= 0x0D02 && ch <= 0x0D03)
5136 || (ch >= 0x0D05 && ch <= 0x0D0C)
5137 || (ch >= 0x0D0E && ch <= 0x0D10)
5138 || (ch >= 0x0D12 && ch <= 0x0D28)
5139 || (ch >= 0x0D2A && ch <= 0x0D39)
5140 || (ch >= 0x0D3E && ch <= 0x0D43)
5141 || (ch >= 0x0D46 && ch <= 0x0D48)
5142 || (ch >= 0x0D4A && ch <= 0x0D4D)
5143 || (ch >= 0x0D60 && ch <= 0x0D61)
5144 /* Thai */
5145 || (ch >= 0x0E01 && ch <= 0x0E3A)
5146 || (ch >= 0x0E40 && ch <= 0x0E5B)
5147 /* Lao */
5148 || (ch >= 0x0E81 && ch <= 0x0E82)
5149 || (ch == 0x0E84)
5150 || (ch >= 0x0E87 && ch <= 0x0E88)
5151 || (ch == 0x0E8A)
5152 || (ch == 0x0E8D)
5153 || (ch >= 0x0E94 && ch <= 0x0E97)
5154 || (ch >= 0x0E99 && ch <= 0x0E9F)
5155 || (ch >= 0x0EA1 && ch <= 0x0EA3)
5156 || (ch == 0x0EA5)
5157 || (ch == 0x0EA7)
5158 || (ch >= 0x0EAA && ch <= 0x0EAB)
5159 || (ch >= 0x0EAD && ch <= 0x0EAE)
5160 || (ch >= 0x0EB0 && ch <= 0x0EB9)
5161 || (ch >= 0x0EBB && ch <= 0x0EBD)
5162 || (ch >= 0x0EC0 && ch <= 0x0EC4)
5163 || (ch == 0x0EC6)
5164 || (ch >= 0x0EC8 && ch <= 0x0ECD)
5165 || (ch >= 0x0EDC && ch <= 0x0EDD)
5166 /* Tibetan */
5167 || (ch == 0x0F00)
5168 || (ch >= 0x0F18 && ch <= 0x0F19)
5169 || (ch == 0x0F35)
5170 || (ch == 0x0F37)
5171 || (ch == 0x0F39)
5172 || (ch >= 0x0F3E && ch <= 0x0F47)
5173 || (ch >= 0x0F49 && ch <= 0x0F69)
5174 || (ch >= 0x0F71 && ch <= 0x0F84)
5175 || (ch >= 0x0F86 && ch <= 0x0F8B)
5176 || (ch >= 0x0F90 && ch <= 0x0F95)
5177 || (ch == 0x0F97)
5178 || (ch >= 0x0F99 && ch <= 0x0FAD)
5179 || (ch >= 0x0FB1 && ch <= 0x0FB7)
5180 || (ch == 0x0FB9)
5181 /* Georgian */
5182 || (ch >= 0x10A0 && ch <= 0x10C5)
5183 || (ch >= 0x10D0 && ch <= 0x10F6)
5184 /* Hiragana */
5185 || (ch >= 0x3041 && ch <= 0x3093)
5186 || (ch >= 0x309B && ch <= 0x309C)
5187 /* Katakana */
5188 || (ch >= 0x30A1 && ch <= 0x30F6)
5189 || (ch >= 0x30FB && ch <= 0x30FC)
5190 /* Bopomofo */
5191 || (ch >= 0x3105 && ch <= 0x312C)
5192 /* CJK Unified Ideographs */
5193 || (ch >= 0x4E00 && ch <= 0x9FA5)
5194 /* Hangul */
5195 || (ch >= 0xAC00 && ch <= 0xD7A3)
5196 /* Digits */
5197 || (ch >= 0x0660 && ch <= 0x0669)
5198 || (ch >= 0x06F0 && ch <= 0x06F9)
5199 || (ch >= 0x0966 && ch <= 0x096F)
5200 || (ch >= 0x09E6 && ch <= 0x09EF)
5201 || (ch >= 0x0A66 && ch <= 0x0A6F)
5202 || (ch >= 0x0AE6 && ch <= 0x0AEF)
5203 || (ch >= 0x0B66 && ch <= 0x0B6F)
5204 || (ch >= 0x0BE7 && ch <= 0x0BEF)
5205 || (ch >= 0x0C66 && ch <= 0x0C6F)
5206 || (ch >= 0x0CE6 && ch <= 0x0CEF)
5207 || (ch >= 0x0D66 && ch <= 0x0D6F)
5208 || (ch >= 0x0E50 && ch <= 0x0E59)
5209 || (ch >= 0x0ED0 && ch <= 0x0ED9)
5210 || (ch >= 0x0F20 && ch <= 0x0F33)
5211 /* Special characters */
5212 || (ch == 0x00B5)
5213 || (ch == 0x00B7)
5214 || (ch >= 0x02B0 && ch <= 0x02B8)
5215 || (ch == 0x02BB)
5216 || (ch >= 0x02BD && ch <= 0x02C1)
5217 || (ch >= 0x02D0 && ch <= 0x02D1)
5218 || (ch >= 0x02E0 && ch <= 0x02E4)
5219 || (ch == 0x037A)
5220 || (ch == 0x0559)
5221 || (ch == 0x093D)
5222 || (ch == 0x0B3D)
5223 || (ch == 0x1FBE)
5224 || (ch >= 0x203F && ch <= 0x2040)
5225 || (ch == 0x2102)
5226 || (ch == 0x2107)
5227 || (ch >= 0x210A && ch <= 0x2113)
5228 || (ch == 0x2115)
5229 || (ch >= 0x2118 && ch <= 0x211D)
5230 || (ch == 0x2124)
5231 || (ch == 0x2126)
5232 || (ch == 0x2128)
5233 || (ch >= 0x212A && ch <= 0x2131)
5234 || (ch >= 0x2133 && ch <= 0x2138)
5235 || (ch >= 0x2160 && ch <= 0x2182)
5236 || (ch >= 0x3005 && ch <= 0x3007)
5237 || (ch >= 0x3021 && ch <= 0x3029)
5239 return UC_IDENTIFIER_START;
5240 return UC_IDENTIFIER_INVALID;
/* Return true if CH is white space in the sense of The Java Language
   Specification, 3rd edition, §3.6
   (http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710):
   space, horizontal tab, form feed, line feed, or carriage return.  */
static bool
is_java_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':
    case '\t':
    case '\f':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
5252 /* The Java Language Specification, 3rd edition, §3.8.
5253 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
5254 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
5255 static int
5256 java_ident_category (unsigned int ch)
5258 /* FIXME: Check this against Sun's JDK implementation. */
5259 if (is_category_L (ch) /* = Character.isLetter(ch) */
5260 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
5261 || is_category_Sc (ch) /* currency symbol */
5262 || is_category_Pc (ch) /* connector punctuation */
5264 return UC_IDENTIFIER_START;
5265 if (is_category_Nd (ch) /* digit */
5266 || is_category_Mc (ch) /* combining mark */
5267 || is_category_Mn (ch) /* non-spacing mark */
5269 return UC_IDENTIFIER_VALID;
5270 if ((ch >= 0x0000 && ch <= 0x0008)
5271 || (ch >= 0x000E && ch <= 0x001B)
5272 || (ch >= 0x007F && ch <= 0x009F)
5273 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
5275 return UC_IDENTIFIER_IGNORABLE;
5276 return UC_IDENTIFIER_INVALID;
5279 /* Construction of sparse 3-level tables. */
5280 #define TABLE identsyntax_table
5281 #define ELEMENT uint8_t
5282 #define DEFAULT UC_IDENTIFIER_INVALID
5283 #define xmalloc malloc
5284 #define xrealloc realloc
5285 #include "3level.h"
5287 /* Output an identifier syntax categorization in a three-level bitmap. */
5288 static void
5289 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
5291 FILE *stream;
5292 unsigned int ch, i;
5293 struct identsyntax_table t;
5294 unsigned int level1_offset, level2_offset, level3_offset;
5296 stream = fopen (filename, "w");
5297 if (stream == NULL)
5299 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5300 exit (1);
5303 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5304 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
5305 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5306 version);
5308 t.p = 7; /* or 8 */
5309 t.q = 5; /* or 4 */
5310 identsyntax_table_init (&t);
5312 for (ch = 0; ch < 0x110000; ch++)
5314 int syntaxcode = predicate (ch);
5316 assert (syntaxcode <= 0x03);
5318 if (syntaxcode != UC_IDENTIFIER_INVALID)
5319 identsyntax_table_add (&t, ch, syntaxcode);
5322 identsyntax_table_finalize (&t);
5324 /* Offsets in t.result, in memory of this process. */
5325 level1_offset =
5326 5 * sizeof (uint32_t);
5327 level2_offset =
5328 5 * sizeof (uint32_t)
5329 + t.level1_size * sizeof (uint32_t);
5330 level3_offset =
5331 5 * sizeof (uint32_t)
5332 + t.level1_size * sizeof (uint32_t)
5333 + (t.level2_size << t.q) * sizeof (uint32_t);
5335 for (i = 0; i < 5; i++)
5336 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
5337 ((uint32_t *) t.result)[i]);
5338 fprintf (stream, "static const\n");
5339 fprintf (stream, "struct\n");
5340 fprintf (stream, " {\n");
5341 fprintf (stream, " int level1[%zu];\n", t.level1_size);
5342 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
5343 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
5344 (1 << t.p) * 2 / 16);
5345 fprintf (stream, " }\n");
5346 fprintf (stream, "%s =\n", name);
5347 fprintf (stream, "{\n");
5348 fprintf (stream, " {");
5349 if (t.level1_size > 8)
5350 fprintf (stream, "\n ");
5351 for (i = 0; i < t.level1_size; i++)
5353 uint32_t offset;
5354 if (i > 0 && (i % 8) == 0)
5355 fprintf (stream, "\n ");
5356 offset = ((uint32_t *) (t.result + level1_offset))[i];
5357 if (offset == 0)
5358 fprintf (stream, " %5d", -1);
5359 else
5360 fprintf (stream, " %5zu",
5361 (offset - level2_offset) / sizeof (uint32_t));
5362 if (i+1 < t.level1_size)
5363 fprintf (stream, ",");
5365 if (t.level1_size > 8)
5366 fprintf (stream, "\n ");
5367 fprintf (stream, " },\n");
5368 fprintf (stream, " {");
5369 if (t.level2_size << t.q > 8)
5370 fprintf (stream, "\n ");
5371 for (i = 0; i < t.level2_size << t.q; i++)
5373 uint32_t offset;
5374 if (i > 0 && (i % 8) == 0)
5375 fprintf (stream, "\n ");
5376 offset = ((uint32_t *) (t.result + level2_offset))[i];
5377 if (offset == 0)
5378 fprintf (stream, " %5d", -1);
5379 else
5380 fprintf (stream, " %5zu",
5381 (offset - level3_offset) / sizeof (uint8_t));
5382 if (i+1 < t.level2_size << t.q)
5383 fprintf (stream, ",");
5385 if (t.level2_size << t.q > 8)
5386 fprintf (stream, "\n ");
5387 fprintf (stream, " },\n");
5388 /* Pack the level3 array. Each entry needs 2 bits only. */
5389 fprintf (stream, " {");
5390 if ((t.level3_size << t.p) * 2 / 16 > 8)
5391 fprintf (stream, "\n ");
5392 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
5394 if (i > 0 && (i % 8) == 0)
5395 fprintf (stream, "\n ");
5396 fprintf (stream, " 0x%04x",
5397 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
5398 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
5399 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
5400 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
5401 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
5402 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
5403 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
5404 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
5405 if (i+1 < (t.level3_size << t.p) * 2 / 16)
5406 fprintf (stream, ",");
5408 if ((t.level3_size << t.p) * 2 / 16 > 8)
5409 fprintf (stream, "\n ");
5410 fprintf (stream, " }\n");
5411 fprintf (stream, "};\n");
5413 if (ferror (stream) || fclose (stream))
5415 fprintf (stderr, "error writing to '%s'\n", filename);
5416 exit (1);
5420 static void
5421 output_ident_properties (const char *version)
5423 #define PROPERTY(P) \
5424 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
5425 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5426 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
5427 PROPERTY(c_whitespace)
5428 PROPERTY(java_whitespace)
5429 #undef PROPERTY
5431 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
5432 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
5435 /* ========================================================================= */
5437 /* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's
5438 glibc/localedata/locales/i18n file, generated by
5439 glibc/localedata/gen-unicode-ctype.c. */
5441 /* Character mappings. */
5443 static unsigned int
5444 to_upper (unsigned int ch)
5446 if (unicode_attributes[ch].name != NULL
5447 && unicode_attributes[ch].upper != NONE)
5448 return unicode_attributes[ch].upper;
5449 else
5450 return ch;
5453 static unsigned int
5454 to_lower (unsigned int ch)
5456 if (unicode_attributes[ch].name != NULL
5457 && unicode_attributes[ch].lower != NONE)
5458 return unicode_attributes[ch].lower;
5459 else
5460 return ch;
5463 static unsigned int
5464 to_title (unsigned int ch)
5466 if (unicode_attributes[ch].name != NULL
5467 && unicode_attributes[ch].title != NONE)
5468 return unicode_attributes[ch].title;
5469 else
5470 return ch;
5473 /* Character class properties. */
/* CH is classified as uppercase exactly when to_lower changes it.  */
static bool
is_upper (unsigned int ch)
{
  const unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
/* CH is classified as lowercase when to_upper changes it.  U+00DF is
   added by hand.  */
static bool
is_lower (unsigned int ch)
{
  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  if (ch == 0x00DF)
    return true;

  return to_upper (ch) != ch;
}
5489 static bool
5490 is_alpha (unsigned int ch)
5492 return (unicode_attributes[ch].name != NULL
5493 && ((unicode_attributes[ch].category[0] == 'L'
5494 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5495 <U0E2F>, <U0E46> should belong to is_punct. */
5496 && (ch != 0x0E2F) && (ch != 0x0E46))
5497 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5498 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
5499 || (ch == 0x0E31)
5500 || (ch >= 0x0E34 && ch <= 0x0E3A)
5501 || (ch >= 0x0E47 && ch <= 0x0E4E)
5502 /* Avoid warning for <U0345>. */
5503 || (ch == 0x0345)
5504 /* Avoid warnings for <U2160>..<U217F>. */
5505 || (unicode_attributes[ch].category[0] == 'N'
5506 && unicode_attributes[ch].category[1] == 'l')
5507 /* Avoid warnings for <U24B6>..<U24E9>. */
5508 || (unicode_attributes[ch].category[0] == 'S'
5509 && unicode_attributes[ch].category[1] == 'o'
5510 && strstr (unicode_attributes[ch].name, " LETTER ")
5511 != NULL)
5512 /* Consider all the non-ASCII digits as alphabetic.
5513 ISO C 99 forbids us to have them in category "digit",
5514 but we want iswalnum to return true on them. */
5515 || (unicode_attributes[ch].category[0] == 'N'
5516 && unicode_attributes[ch].category[1] == 'd'
5517 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Tells whether CH is one of the ten ASCII decimal digits.

   SUSV2 gives us some freedom for the "digit" category, but ISO C 99
   takes it away:
   7.25.2.1.5:
      The iswdigit function tests for any wide character that corresponds
      to a decimal-digit character (as defined in 5.2.1).
   5.2.1:
      the 10 decimal digits 0 1 2 3 4 5 6 7 8 9

   The disabled alternative accepted every named character of category Nd.
   Note that U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without a
   zero; with that definition a <0> would have to be added in front of them
   by hand.  */
static bool
is_digit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
/* "alnum" is the union of the "alpha" and "digit" classes.  */
static bool
is_alnum (unsigned int ch)
{
  if (is_alpha (ch))
    return true;
  return is_digit (ch);
}
5548 static bool
5549 is_blank (unsigned int ch)
5551 return (ch == 0x0009 /* '\t' */
5552 /* Category Zs without mention of "<noBreak>" */
5553 || (unicode_attributes[ch].name != NULL
5554 && unicode_attributes[ch].category[0] == 'Z'
5555 && unicode_attributes[ch].category[1] == 's'
5556 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
5559 static bool
5560 is_space (unsigned int ch)
5562 /* Don't make U+00A0 a space. Non-breaking space means that all programs
5563 should treat it like a punctuation character, not like a space. */
5564 return (ch == 0x0020 /* ' ' */
5565 || ch == 0x000C /* '\f' */
5566 || ch == 0x000A /* '\n' */
5567 || ch == 0x000D /* '\r' */
5568 || ch == 0x0009 /* '\t' */
5569 || ch == 0x000B /* '\v' */
5570 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
5571 || (unicode_attributes[ch].name != NULL
5572 && unicode_attributes[ch].category[0] == 'Z'
5573 && (unicode_attributes[ch].category[1] == 'l'
5574 || unicode_attributes[ch].category[1] == 'p'
5575 || (unicode_attributes[ch].category[1] == 's'
5576 && !strstr (unicode_attributes[ch].decomposition,
5577 "<noBreak>")))));
5580 static bool
5581 is_cntrl (unsigned int ch)
5583 return (unicode_attributes[ch].name != NULL
5584 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
5585 /* Categories Zl and Zp */
5586 || (unicode_attributes[ch].category[0] == 'Z'
5587 && (unicode_attributes[ch].category[1] == 'l'
5588 || unicode_attributes[ch].category[1] == 'p'))));
/* Tells whether CH is an ASCII hexadecimal digit.

   SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
   takes it away:
   7.25.2.1.12:
      The iswxdigit function tests for any wide character that corresponds
      to a hexadecimal-digit character (as defined in 6.4.4.1).
   6.4.4.1:
      hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F

   The disabled alternative used is_digit (ch) in place of the explicit
   0..9 range test.  */
static bool
is_xdigit (unsigned int ch)
{
  if (ch >= 0x0030 && ch <= 0x0039) /* 0..9 */
    return true;
  if (ch >= 0x0041 && ch <= 0x0046) /* A..F */
    return true;
  return ch >= 0x0061 && ch <= 0x0066; /* a..f */
}
5613 static bool
5614 is_graph (unsigned int ch)
5616 return (unicode_attributes[ch].name != NULL
5617 && strcmp (unicode_attributes[ch].name, "<control>")
5618 && !is_space (ch));
5621 static bool
5622 is_print (unsigned int ch)
5624 return (unicode_attributes[ch].name != NULL
5625 && strcmp (unicode_attributes[ch].name, "<control>")
5626 /* Categories Zl and Zp */
5627 && !(unicode_attributes[ch].name != NULL
5628 && unicode_attributes[ch].category[0] == 'Z'
5629 && (unicode_attributes[ch].category[1] == 'l'
5630 || unicode_attributes[ch].category[1] == 'p')));
/* Tells whether CH belongs to the "punct" class.
   The traditional POSIX definition of punctuation is every graphic,
   non-alphanumeric character.  The disabled alternative accepted exactly
   the named characters whose category starts with 'P'.  */
static bool
is_punct (unsigned int ch)
{
  if (!is_graph (ch))
    return false;
  if (is_alpha (ch))
    return false;
  return !is_digit (ch);
}
5646 /* Output all properties. */
5647 static void
5648 output_old_ctype (const char *version)
5650 #define PROPERTY(P) \
5651 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
5652 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5653 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
5654 PROPERTY(alnum)
5655 PROPERTY(alpha)
5656 PROPERTY(cntrl)
5657 PROPERTY(digit)
5658 PROPERTY(graph)
5659 PROPERTY(lower)
5660 PROPERTY(print)
5661 PROPERTY(punct)
5662 PROPERTY(space)
5663 PROPERTY(upper)
5664 PROPERTY(xdigit)
5665 PROPERTY(blank)
5666 #undef PROPERTY
5669 #if 0
5671 static bool
5672 is_combining (unsigned int ch)
5674 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
5675 file. In 3.0.1 it was identical to the union of the general categories
5676 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
5677 PropList.txt file, so we take the latter definition. */
5678 return (unicode_attributes[ch].name != NULL
5679 && unicode_attributes[ch].category[0] == 'M'
5680 && (unicode_attributes[ch].category[1] == 'n'
5681 || unicode_attributes[ch].category[1] == 'c'
5682 || unicode_attributes[ch].category[1] == 'e'));
5685 static bool
5686 is_combining_level3 (unsigned int ch)
5688 return is_combining (ch)
5689 && !(unicode_attributes[ch].combining[0] != '\0'
5690 && unicode_attributes[ch].combining[0] != '0'
5691 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<U" followed by
   four uppercase hex digits for I below 0x10000, or by eight otherwise,
   and a closing ">".  The result lives in a static buffer that the next
   call overwrites.  */
static const char *
ucs_symbol (unsigned int i)
{
  /* Longest form: "<U" + 8 hex digits + ">" = 11 characters, plus NUL.  */
  static char buf[11+1];

  if (i < 0x10000)
    sprintf (buf, "<U%04X>", i);
  else
    sprintf (buf, "<U%08X>", i);
  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval:
   the symbol of LOW, "..", then the symbol of HIGH.  The result lives in a
   static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Two symbols of at most 11 characters each, "..", and the NUL.  */
  static char buf[24+1];
  char *tail;

  /* ucs_symbol reuses one static buffer, so the first symbol has to be
     copied out before the second one is produced.  */
  strcpy (buf, ucs_symbol (low));
  tail = buf + strlen (buf);
  strcpy (tail, "..");
  strcpy (tail + 2, ucs_symbol (high));
  return buf;
}
/* Output a character class (= property) table.
   Writes CLASSNAME followed by the ';'-separated list of maximal runs of
   code points for which FUNC returns true.  A run of one is written as a
   single symbol, longer runs as "<low>..<high>".  A line is continued with
   "/" once it would grow past column 75.  */
static void
output_charclass (FILE *stream, const char *classname,
                  bool (*func) (unsigned int))
{
  char table[0x110000];
  const int max_column = 75;
  bool need_semicolon = false;
  /* Starting far beyond max_column forces a line break before the first
     item.  */
  int column = 1000;
  unsigned int i;

  /* Evaluate the predicate once per code point.  */
  for (i = 0; i < 0x110000; i++)
    table[i] = func (i) ? 1 : 0;

  fprintf (stream, "%s ", classname);

  i = 0;
  while (i < 0x110000)
    {
      unsigned int low, high;
      char buf[25];
      size_t len;

      if (!table[i])
        {
          i++;
          continue;
        }

      /* Find the end of the run of members that starts at i.  */
      low = i;
      high = low;
      while (high + 1 < 0x110000 && table[high + 1])
        high++;
      i = high + 1;

      strcpy (buf, (low == high
                    ? ucs_symbol (low)
                    : ucs_symbol_range (low, high)));
      len = strlen (buf);

      if (need_semicolon)
        {
          fprintf (stream, ";");
          column++;
        }

      /* NOTE(review): column restarts at 3 after a break; confirm this
         matches the width of the continuation prefix written here.  */
      if (column + len > max_column)
        {
          fprintf (stream, "/\n ");
          column = 3;
        }

      fprintf (stream, "%s", buf);
      column += len;
      need_semicolon = true;
    }
  fprintf (stream, "\n");
}
/* Output a character mapping table.
   Writes MAPNAME followed by the ';'-separated list of "(<from>,<to>)"
   pairs for every code point that FUNC does not map to itself.  A line is
   continued with "/" once it would grow past column 75.  */
static void
output_charmap (FILE *stream, const char *mapname,
                unsigned int (*func) (unsigned int))
{
  char table[0x110000];
  const int max_column = 75;
  bool need_semicolon = false;
  /* Starting far beyond max_column forces a line break before the first
     item.  */
  int column = 1000;
  unsigned int i;

  /* Mark the code points that are actually mapped to something else.  */
  for (i = 0; i < 0x110000; i++)
    table[i] = (func (i) != i);

  fprintf (stream, "%s ", mapname);

  for (i = 0; i < 0x110000; i++)
    {
      /* "(" + symbol + "," + symbol + ")" is at most 25 characters.  */
      char buf[25+1];
      size_t len;

      if (!table[i])
        continue;

      /* ucs_symbol returns a static buffer, so each symbol is appended
         before the next one is requested.  */
      strcpy (buf, "(");
      strcat (buf, ucs_symbol (i));
      strcat (buf, ",");
      strcat (buf, ucs_symbol (func (i)));
      strcat (buf, ")");
      len = strlen (buf);

      if (need_semicolon)
        {
          fprintf (stream, ";");
          column++;
        }

      if (column + len > max_column)
        {
          fprintf (stream, "/\n ");
          column = 3;
        }

      fprintf (stream, "%s", buf);
      column += len;
      need_semicolon = true;
    }
  fprintf (stream, "\n");
}
/* Output the width table.
   The body is empty: nothing is written to STREAM.  */
static void
output_widthmap (FILE *stream)
{
  (void) stream;
}
5829 /* Output the tables to the given file. */
5831 static void
5832 output_tables (const char *filename, const char *version)
5834 FILE *stream;
5835 unsigned int ch;
5837 stream = fopen (filename, "w");
5838 if (stream == NULL)
5840 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5841 exit (1);
5844 fprintf (stream, "escape_char /\n");
5845 fprintf (stream, "comment_char %%\n");
5846 fprintf (stream, "\n");
5847 fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
5848 version);
5849 fprintf (stream, "\n");
5851 fprintf (stream, "LC_IDENTIFICATION\n");
5852 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
5853 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
5854 fprintf (stream, "address \"\"\n");
5855 fprintf (stream, "contact \"\"\n");
5856 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
5857 fprintf (stream, "tel \"\"\n");
5858 fprintf (stream, "fax \"\"\n");
5859 fprintf (stream, "language \"\"\n");
5860 fprintf (stream, "territory \"Earth\"\n");
5861 fprintf (stream, "revision \"%s\"\n", version);
5863 time_t now;
5864 char date[11];
5865 now = time (NULL);
5866 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
5867 fprintf (stream, "date \"%s\"\n", date);
5869 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
5870 fprintf (stream, "END LC_IDENTIFICATION\n");
5871 fprintf (stream, "\n");
5873 /* Verification. */
5874 for (ch = 0; ch < 0x110000; ch++)
5876 /* toupper restriction: "Only characters specified for the keywords
5877 lower and upper shall be specified. */
5878 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5879 fprintf (stderr,
5880 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
5881 ucs_symbol (ch), ch, to_upper (ch));
5883 /* tolower restriction: "Only characters specified for the keywords
5884 lower and upper shall be specified. */
5885 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5886 fprintf (stderr,
5887 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
5888 ucs_symbol (ch), ch, to_lower (ch));
5890 /* alpha restriction: "Characters classified as either upper or lower
5891 shall automatically belong to this class. */
5892 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
5893 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
5895 /* alpha restriction: "No character specified for the keywords cntrl,
5896 digit, punct or space shall be specified." */
5897 if (is_alpha (ch) && is_cntrl (ch))
5898 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
5899 if (is_alpha (ch) && is_digit (ch))
5900 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
5901 if (is_alpha (ch) && is_punct (ch))
5902 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
5903 if (is_alpha (ch) && is_space (ch))
5904 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
5906 /* space restriction: "No character specified for the keywords upper,
5907 lower, alpha, digit, graph or xdigit shall be specified."
5908 upper, lower, alpha already checked above. */
5909 if (is_space (ch) && is_digit (ch))
5910 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
5911 if (is_space (ch) && is_graph (ch))
5912 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
5913 if (is_space (ch) && is_xdigit (ch))
5914 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
5916 /* cntrl restriction: "No character specified for the keywords upper,
5917 lower, alpha, digit, punct, graph, print or xdigit shall be
5918 specified." upper, lower, alpha already checked above. */
5919 if (is_cntrl (ch) && is_digit (ch))
5920 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5921 if (is_cntrl (ch) && is_punct (ch))
5922 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5923 if (is_cntrl (ch) && is_graph (ch))
5924 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5925 if (is_cntrl (ch) && is_print (ch))
5926 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5927 if (is_cntrl (ch) && is_xdigit (ch))
5928 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5930 /* punct restriction: "No character specified for the keywords upper,
5931 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5932 be specified." upper, lower, alpha, cntrl already checked above. */
5933 if (is_punct (ch) && is_digit (ch))
5934 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5935 if (is_punct (ch) && is_xdigit (ch))
5936 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5937 if (is_punct (ch) && (ch == 0x0020))
5938 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5940 /* graph restriction: "No character specified for the keyword cntrl
5941 shall be specified." Already checked above. */
5943 /* print restriction: "No character specified for the keyword cntrl
5944 shall be specified." Already checked above. */
5946 /* graph - print relation: differ only in the <space> character.
5947 How is this possible if there are more than one space character?!
5948 I think susv2/xbd/locale.html should speak of "space characters",
5949 not "space character". */
5950 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5951 fprintf (stderr,
5952 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5953 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5954 fprintf (stderr,
5955 "%s is graph|<space> but not print\n", ucs_symbol (ch));
5958 fprintf (stream, "LC_CTYPE\n");
5959 output_charclass (stream, "upper", is_upper);
5960 output_charclass (stream, "lower", is_lower);
5961 output_charclass (stream, "alpha", is_alpha);
5962 output_charclass (stream, "digit", is_digit);
5963 output_charclass (stream, "outdigit", is_outdigit);
5964 output_charclass (stream, "blank", is_blank);
5965 output_charclass (stream, "space", is_space);
5966 output_charclass (stream, "cntrl", is_cntrl);
5967 output_charclass (stream, "punct", is_punct);
5968 output_charclass (stream, "xdigit", is_xdigit);
5969 output_charclass (stream, "graph", is_graph);
5970 output_charclass (stream, "print", is_print);
5971 output_charclass (stream, "class \"combining\";", is_combining);
5972 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5973 output_charmap (stream, "toupper", to_upper);
5974 output_charmap (stream, "tolower", to_lower);
5975 output_charmap (stream, "map \"totitle\";", to_title);
5976 output_widthmap (stream);
5977 fprintf (stream, "END LC_CTYPE\n");
5979 if (ferror (stream) || fclose (stream))
5981 fprintf (stderr, "error writing to '%s'\n", filename);
5982 exit (1);
5986 #endif
5988 /* ========================================================================= */
/* East Asian width property of every code point, as read from the
   EastAsianWidth.txt file.  An element is either NULL (unassigned) or one
   of the strings "N", "A", "H", "W", "F", "Na".  */
const char *unicode_width[0x110000];
5994 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
5995 file. */
5996 static void
5997 fill_width (const char *width_filename)
5999 unsigned int i, j;
6000 FILE *stream;
6001 char field0[FIELDLEN];
6002 char field1[FIELDLEN];
6003 char field2[FIELDLEN];
6004 int lineno = 0;
6006 for (i = 0; i < 0x110000; i++)
6007 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
6009 stream = fopen (width_filename, "r");
6010 if (stream == NULL)
6012 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
6013 exit (1);
6016 for (;;)
6018 int n;
6019 int c;
6021 lineno++;
6022 c = getc (stream);
6023 if (c == EOF)
6024 break;
6025 if (c == '#')
6027 do c = getc (stream); while (c != EOF && c != '\n');
6028 continue;
6030 ungetc (c, stream);
6031 n = getfield (stream, field0, ';');
6032 n += getfield (stream, field1, ' ');
6033 n += getfield (stream, field2, '\n');
6034 if (n == 0)
6035 break;
6036 if (n != 3)
6038 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
6039 exit (1);
6041 i = strtoul (field0, NULL, 16);
6042 if (strstr (field0, "..") != NULL)
6044 /* Deal with a range. */
6045 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
6046 for (; i <= j; i++)
6047 unicode_width[i] = strdup (field1);
6049 else
6051 /* Single character line. */
6052 unicode_width[i] = strdup (field1);
6056 if (ferror (stream) || fclose (stream))
6058 fprintf (stderr, "error reading from '%s'\n", width_filename);
6059 exit (1);
6063 /* ========================================================================= */
6065 /* Non-spacing attribute and width. */
/* The non-spacing attribute table consists of:
     - Non-spacing characters; generated from PropList.txt or
       "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
     - Format control characters; generated from
       "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
     - Zero width characters; generated from
       "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 */
6076 static bool
6077 is_nonspacing (unsigned int ch)
6079 return (unicode_attributes[ch].name != NULL
6080 && (get_bidi_category (ch) == UC_BIDI_NSM
6081 || is_category_Cc (ch) || is_category_Cf (ch)
6082 || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0));
/* Writes the two C arrays that describe the non-spacing property to
   FILENAME.

   The code space is split into blocks of 0x200 code points.
   - nonspacing_table_data holds a 64-byte bitmap for each block that
     contains at least one non-spacing character.  Bit L of a byte stands
     for the code point at offset L within that byte's group of eight.
   - nonspacing_table_ind maps a block number to the index of its bitmap
     in nonspacing_table_data, or -1 when the block has none.  Its length
     is rounded up to a multiple of 8, so that it is printed in full rows.

   The block starting at 0xe0000 never gets a bitmap.

   Exits with status 1 if the file cannot be opened or written.  */
static void
output_nonspacing_property (const char *filename)
{
  FILE *stream;
  int ind[0x110000 / 0x200];
  unsigned int i;
  unsigned int ind_len;
  int next_ind;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* Pass 1: number the blocks that need a bitmap.  */
  next_ind = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = false;
      unsigned int ch;

      if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code.  */
        for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
          if (is_nonspacing (ch))
            {
              nontrivial = true;
              break;
            }
      if (nontrivial)
        ind[i] = next_ind++;
      else
        ind[i] = -1;
    }

  /* Pass 2: emit the bitmaps, eight rows of eight bytes per block.  */
  fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
           next_ind);
  ind_len = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = (ind[i] >= 0);

      if (nontrivial)
        {
          unsigned int j;

          fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
          for (j = 0; j < 8; j++)
            {
              unsigned int k;

              fprintf (stream, " ");
              for (k = 0; k < 8; k++)
                {
                  unsigned int l;
                  unsigned char bits = 0;

                  for (l = 0; l < 8; l++)
                    {
                      unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;

                      if (is_nonspacing (ch))
                        bits |= 1 << l;
                    }
                  /* No comma after the very last byte of the array.  */
                  fprintf (stream, " 0x%02x%c", bits,
                           ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
                }
              fprintf (stream, " /* 0x%04x-0x%04x */\n",
                       i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
            }
          /* The index table must reach at least up to this block, so its
             length is one more than the block number.  */
          ind_len = i + 1;
        }
    }
  fprintf (stream, "};\n");

  /* Round the number of entries (not the last index) up to a multiple of
     8.  Rounding the last index instead would drop the final block's
     entry whenever that index is itself a multiple of 8.  The result
     cannot exceed the size of ind[], which is a multiple of 8.  */
  ind_len = ((ind_len + 8 - 1) / 8) * 8;
  fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
           ind_len);
  {
    unsigned int j;

    for (j = 0; j < ind_len / 8; j++)
      {
        unsigned int k;

        fprintf (stream, " ");
        for (k = 0; k < 8; k++)
          {
            i = j * 8 + k;
            /* No comma after the very last entry of the array.  */
            fprintf (stream, " %2d%c", ind[i],
                     j == ind_len / 8 - 1 && k == 8 - 1 ? ' ' : ',');
          }
        fprintf (stream, " /* 0x%04x-0x%04x */\n",
                 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
      }
  }
  fprintf (stream, "};\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6190 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'. */
6191 static char
6192 symbolic_width (unsigned int ch)
6194 /* Test for unassigned character. */
6195 if (is_property_unassigned_code_value (ch))
6197 /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
6198 if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
6199 return 'A';
6200 if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
6201 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
6202 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
6203 || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
6204 || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */)
6205 return '2';
6206 return 0;
6208 else
6210 /* Test for non-spacing or control character. */
6211 if (is_category_Cc (ch) && ch < 0x00A0)
6212 return 0;
6213 if (is_nonspacing (ch))
6214 return '0';
6215 /* Test for double-width character. */
6216 if (unicode_width[ch] != NULL
6217 && (strcmp (unicode_width[ch], "W") == 0
6218 || strcmp (unicode_width[ch], "F") == 0))
6219 return '2';
6220 /* Test for half-width character. */
6221 if (unicode_width[ch] != NULL
6222 && strcmp (unicode_width[ch], "H") == 0)
6223 return '1';
6225 /* In ancient CJK encodings, Cyrillic and most other characters are
6226 double-width as well. */
6227 if (ch >= 0x00A1 && ch < 0x10000)
6228 return 'A';
6229 return '1';
/* Writes one line for the interval START..END whose symbolic width is
   VALUE: a single code point, or "START..END", followed by the width.  */
static void
print_width_interval (FILE *stream, unsigned int start, unsigned int end,
                      char value)
{
  if (end == start)
    fprintf (stream, "%04X\t\t%c\n", start, value);
  else
    fprintf (stream, "%04X..%04X\t%c\n", start, end, value);
}

/* Writes to FILENAME the symbolic width of all code points, compressed
   into intervals of equal width.  Code points for which symbolic_width
   returns 0 are skipped without ending the current interval.  Exits with
   status 1 if the file cannot be opened or written.  */
static void
output_width_property_test (const char *filename)
{
  FILE *stream;
  unsigned int first = 0;
  unsigned int last = 0;
  /* 0 means that no interval has been started yet.  */
  char current = 0;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    {
      const char value = symbolic_width (ch);

      /* skip Cc control characters and unassigned characters */
      if (value == 0)
        continue;

      if (value == current)
        {
          /* Extend the interval.  */
          last = ch;
          continue;
        }

      /* Terminate the interval.  */
      if (current != 0)
        print_width_interval (stream, first, last, current);

      /* Start a new interval.  */
      first = last = ch;
      current = value;
    }

  /* Terminate the last interval.  */
  if (current != 0)
    print_width_interval (stream, first, last, current);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6288 /* ========================================================================= */
6290 /* Line breaking classification.
6291 Updated for Unicode TR #14 revision 26. */
/* Line breaking classes.  Each constant is used as a bit position in the
   mask returned by get_lbp.  The classes are listed in a fixed order with
   hand-assigned numbers; classes that are not used here are kept as
   comments.  */
enum
{
  /* Values >= 27 are resolved at run time.  */
  LBP_BK = 27,  /* mandatory break */
  /* LBP_CR:       carriage return - not used here because it's a DOSism */
  /* LBP_LF:       line feed - not used here because it's a DOSism */
  LBP_CM = 28,  /* attached characters and combining marks */
  /* LBP_NL:       next line - not used here because it's equivalent to LBP_BK */
  /* LBP_SG:       surrogates - not used here because they are not characters */
  LBP_WJ =  0,  /* word joiner */
  LBP_ZW = 29,  /* zero width space */
  LBP_GL =  1,  /* non-breaking (glue) */
  LBP_SP = 30,  /* space */
  LBP_B2 =  2,  /* break opportunity before and after */
  LBP_BA =  3,  /* break opportunity after */
  LBP_BB =  4,  /* break opportunity before */
  LBP_HY =  5,  /* hyphen */
  LBP_CB = 31,  /* contingent break opportunity */
  LBP_CL =  6,  /* closing punctuation */
  LBP_CP =  7,  /* closing parenthesis */
  LBP_EX =  8,  /* exclamation/interrogation */
  LBP_IN =  9,  /* inseparable */
  LBP_NS = 10,  /* non starter */
  LBP_OP = 11,  /* opening punctuation */
  LBP_QU = 12,  /* ambiguous quotation */
  LBP_IS = 13,  /* infix separator (numeric) */
  LBP_NU = 14,  /* numeric */
  LBP_PO = 15,  /* postfix (numeric) */
  LBP_PR = 16,  /* prefix (numeric) */
  LBP_SY = 17,  /* symbols allowing breaks */
  LBP_AI = 32,  /* ambiguous (alphabetic or ideograph) */
  LBP_AL = 18,  /* ordinary alphabetic and symbol characters */
  /* LBP_CJ:       conditional Japanese starter, resolved to NS */
  LBP_H2 = 19,  /* Hangul LV syllable */
  LBP_H3 = 20,  /* Hangul LVT syllable */
  LBP_HL = 25,  /* Hebrew letter */
  LBP_ID = 21,  /* ideographic */
  LBP_JL = 22,  /* Hangul L Jamo */
  LBP_JV = 23,  /* Hangul V Jamo */
  LBP_JT = 24,  /* Hangul T Jamo */
  LBP_RI = 26,  /* regional indicator */
  LBP_SA = 33,  /* complex context (South East Asian) */
  LBP_XX = 34   /* unknown */
};
6338 /* Returns the line breaking classification for ch, as a bit mask. */
6339 static int64_t
6340 get_lbp (unsigned int ch)
6342 int64_t attr = 0;
6344 /* U+20BC..U+20CF are reserved for prefixes. */
6345 if (unicode_attributes[ch].name == NULL && (ch >= 0x20BC && ch <= 0x20CF))
6346 return (int64_t) 1 << LBP_PR;
6348 if (unicode_attributes[ch].name != NULL)
6350 /* mandatory break */
6351 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
6352 || ch == 0x000C /* form feed */
6353 || ch == 0x000B /* line tabulation */
6354 || ch == 0x2028 /* LINE SEPARATOR */
6355 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
6356 attr |= (int64_t) 1 << LBP_BK;
6358 if (ch == 0x2060 /* WORD JOINER */
6359 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
6360 attr |= (int64_t) 1 << LBP_WJ;
6362 /* zero width space */
6363 if (ch == 0x200B /* ZERO WIDTH SPACE */)
6364 attr |= (int64_t) 1 << LBP_ZW;
6366 /* non-breaking (glue) */
6367 if (ch == 0x00A0 /* NO-BREAK SPACE */
6368 || ch == 0x202F /* NARROW NO-BREAK SPACE */
6369 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
6370 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
6371 || ch == 0x2007 /* FIGURE SPACE */
6372 || ch == 0x2011 /* NON-BREAKING HYPHEN */
6373 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
6374 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
6375 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
6376 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
6377 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6378 || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
6379 || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
6380 attr |= (int64_t) 1 << LBP_GL;
6382 /* space */
6383 if (ch == 0x0020 /* SPACE */)
6384 attr |= (int64_t) 1 << LBP_SP;
6386 /* break opportunity before and after */
6387 if (ch == 0x2014 /* EM DASH */
6388 || ch == 0x2E3A /* TWO-EM DASH */
6389 || ch == 0x2E3B /* THREE-EM DASH */)
6390 attr |= (int64_t) 1 << LBP_B2;
6392 /* break opportunity after */
6393 if (/* Breaking Spaces */
6394 ch == 0x1680 /* OGHAM SPACE MARK */
6395 || ch == 0x2000 /* EN QUAD */
6396 || ch == 0x2001 /* EM QUAD */
6397 || ch == 0x2002 /* EN SPACE */
6398 || ch == 0x2003 /* EM SPACE */
6399 || ch == 0x2004 /* THREE-PER-EM SPACE */
6400 || ch == 0x2005 /* FOUR-PER-EM SPACE */
6401 || ch == 0x2006 /* SIX-PER-EM SPACE */
6402 || ch == 0x2008 /* PUNCTUATION SPACE */
6403 || ch == 0x2009 /* THIN SPACE */
6404 || ch == 0x200A /* HAIR SPACE */
6405 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
6406 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
6407 /* Tabs */
6408 || ch == 0x0009 /* tab */
6409 /* Conditional Hyphens */
6410 || ch == 0x00AD /* SOFT HYPHEN */
6411 /* Breaking Hyphens */
6412 || ch == 0x058A /* ARMENIAN HYPHEN */
6413 || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
6414 || ch == 0x2010 /* HYPHEN */
6415 || ch == 0x2012 /* FIGURE DASH */
6416 || ch == 0x2013 /* EN DASH */
6417 /* Visible Word Dividers */
6418 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
6419 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
6420 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
6421 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
6422 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
6423 || ch == 0x2027 /* HYPHENATION POINT */
6424 || ch == 0x007C /* VERTICAL LINE */
6425 /* Historic Word Separators */
6426 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
6427 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
6428 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
6429 || ch == 0x2056 /* THREE DOT PUNCTUATION */
6430 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
6431 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
6432 || ch == 0x205A /* TWO DOT PUNCTUATION */
6433 || ch == 0x205B /* FOUR DOT MARK */
6434 || ch == 0x205D /* TRICOLON */
6435 || ch == 0x205E /* VERTICAL FOUR DOTS */
6436 || ch == 0x2E19 /* PALM BRANCH */
6437 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
6438 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
6439 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
6440 || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
6441 || ch == 0x2E30 /* RING POINT */
6442 || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
6443 || ch == 0x2E33 /* RAISED DOT */
6444 || ch == 0x2E34 /* RAISED COMMA */
6445 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
6446 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
6447 || ch == 0x10102 /* AEGEAN CHECK MARK */
6448 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
6449 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
6450 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
6451 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
6452 /* Dandas */
6453 || ch == 0x0964 /* DEVANAGARI DANDA */
6454 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
6455 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
6456 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
6457 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
6458 || ch == 0x104B /* MYANMAR SIGN SECTION */
6459 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
6460 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
6461 || ch == 0x17D4 /* KHMER SIGN KHAN */
6462 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
6463 || ch == 0x1B5E /* BALINESE CARIK SIKI */
6464 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
6465 || ch == 0xA8CE /* SAURASHTRA DANDA */
6466 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
6467 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
6468 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
6469 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
6470 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
6471 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
6472 /* Tibetan */
6473 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
6474 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
6475 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
6476 || ch == 0x0FBE /* TIBETAN KU RU KHA */
6477 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
6478 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
6479 /* Other Terminating Punctuation */
6480 || ch == 0x1804 /* MONGOLIAN COLON */
6481 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
6482 || ch == 0x1B5A /* BALINESE PANTI */
6483 || ch == 0x1B5B /* BALINESE PAMADA */
6484 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
6485 || ch == 0x1B60 /* BALINESE PAMENENG */
6486 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
6487 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
6488 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
6489 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
6490 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
6491 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
6492 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
6493 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
6494 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
6495 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
6496 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
6497 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
6498 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
6499 || ch == 0x2E3C /* STENOGRAPHIC FULL STOP */
6500 || ch == 0x2E3D /* VERTICAL SIX DOTS */
6501 || ch == 0x2E3E /* WIGGLY VERTICAL LINE */
6502 || ch == 0x2E40 /* DOUBLE HYPHEN */
6503 || ch == 0x2E41 /* REVERSED COMMA */
6504 || ch == 0xA60D /* VAI COMMA */
6505 || ch == 0xA60F /* VAI QUESTION MARK */
6506 || ch == 0xA92E /* KAYAH LI SIGN CWI */
6507 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
6508 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
6509 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
6510 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
6511 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
6512 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
6513 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
6514 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6515 || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
6516 || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
6517 || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
6518 || ch == 0xA6F3 /* BAMUM FULL STOP */
6519 || ch == 0xA6F4 /* BAMUM COLON */
6520 || ch == 0xA6F5 /* BAMUM COMMA */
6521 || ch == 0xA6F6 /* BAMUM SEMICOLON */
6522 || ch == 0xA6F7 /* BAMUM QUESTION MARK */
6523 || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
6524 || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
6525 || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
6526 || ch == 0xAAF0 /* MEETEI MAYEK CHEIKHAN */
6527 || ch == 0xAAF1 /* MEETEI MAYEK AHANG KHUDAM */
6528 || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
6529 || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
6530 || (ch >= 0x10AF0 && ch <= 0x10AF5) /* MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS */
6531 || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
6532 || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
6533 || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
6534 || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
6535 || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
6536 || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
6537 || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
6538 || ch == 0x11047 /* BRAHMI DANDA */
6539 || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
6540 || ch == 0x110BE /* KAITHI SECTION MARK */
6541 || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
6542 || ch == 0x110C0 /* KAITHI DANDA */
6543 || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
6544 || ch == 0x11140 /* CHAKMA SECTION MARK */
6545 || ch == 0x11141 /* CHAKMA DANDA */
6546 || ch == 0x11142 /* CHAKMA DOUBLE DANDA */
6547 || ch == 0x11143 /* CHAKMA QUESTION MARK */
6548 || ch == 0x111C5 /* SHARADA DANDA */
6549 || ch == 0x111C6 /* SHARADA DOUBLE DANDA */
6550 || ch == 0x111C8 /* SHARADA SEPARATOR */
6551 || (ch >= 0x111DD && ch <= 0x111DF) /* SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 */
6552 || ch == 0x11238 /* KHOJKI DANDA */
6553 || ch == 0x11239 /* KHOJKI DOUBLE DANDA */
6554 || ch == 0x1123B /* KHOJKI SECTION MARK */
6555 || ch == 0x1123C /* KHOJKI DOUBLE SECTION MARK */
6556 || ch == 0x112A9 /* MULTANI SECTION MARK */
6557 || ch == 0x115C2 /* SIDDHAM DANDA */
6558 || ch == 0x115C3 /* SIDDHAM DOUBLE DANDA */
6559 || (ch >= 0x115C9 && ch <= 0x115D7) /* SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES */
6560 || ch == 0x11641 /* MODI DANDA */
6561 || ch == 0x11642 /* MODI DOUBLE DANDA */
6562 || (ch >= 0x1173C && ch <= 0x1173E) /* AHOM SIGN SMALL SECTION..AHOM SIGN RULAI */
6563 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
6564 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
6565 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */
6566 || ch == 0x12474 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON */
6567 || ch == 0x16A6E /* MRO DANDA */
6568 || ch == 0x16A6F /* MRO DOUBLE DANDA */
6569 || ch == 0x16AF5 /* BASSA VAH FULL STOP */
6570 || ch == 0x16B37 /* PAHAWH HMONG SIGN VOS THOM */
6571 || ch == 0x16B38 /* PAHAWH HMONG SIGN VOS TSHAB CEEB */
6572 || ch == 0x16B39 /* PAHAWH HMONG SIGN CIM CHEEM */
6573 || ch == 0x16B44 /* PAHAWH HMONG SIGN XAUS */
6574 || ch == 0x1BC9F /* DUPLOYAN PUNCTUATION CHINOOK FULL STOP */
6575 || (ch >= 0x1DA87 && ch <= 0x1DA8A) /* SIGNWRITING COMMA..SIGNWRITING COLON */)
6576 attr |= (int64_t) 1 << LBP_BA;
6578 /* break opportunity before */
6579 if (ch == 0x00B4 /* ACUTE ACCENT */
6580 || ch == 0x1FFD /* GREEK OXIA */
6581 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
6582 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
6583 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
6584 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
6585 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
6586 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
6587 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
6588 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
6589 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
6590 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
6591 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
6592 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
6593 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
6594 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
6595 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
6596 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
6597 || ch == 0xA8FC /* DEVANAGARI SIGN SIDDHAM */
6598 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */
6599 || ch == 0x11175 /* MAHAJANI SECTION MARK */
6600 || ch == 0x111DB /* SHARADA SIGN SIDDHAM */
6601 || ch == 0x115C1 /* SIDDHAM SIGN SIDDHAM */)
6602 attr |= (int64_t) 1 << LBP_BB;
6604 /* hyphen */
6605 if (ch == 0x002D /* HYPHEN-MINUS */)
6606 attr |= (int64_t) 1 << LBP_HY;
6608 /* contingent break opportunity */
6609 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
6610 attr |= (int64_t) 1 << LBP_CB;
6612 /* closing parenthesis */
6613 if (ch == 0x0029 /* RIGHT PARENTHESIS */
6614 || ch == 0x005D /* RIGHT SQUARE BRACKET */)
6615 attr |= (int64_t) 1 << LBP_CP;
6617 /* closing punctuation */
6618 if ((unicode_attributes[ch].category[0] == 'P'
6619 && unicode_attributes[ch].category[1] == 'e'
6620 && !(attr & ((int64_t) 1 << LBP_CP)))
6621 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
6622 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
6623 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
6624 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
6625 || ch == 0xFE50 /* SMALL COMMA */
6626 || ch == 0xFE52 /* SMALL FULL STOP */
6627 || ch == 0xFF0C /* FULLWIDTH COMMA */
6628 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
6629 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
6630 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
6631 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6632 || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
6633 || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
6634 || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
6635 || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
6636 || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
6637 || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
6638 || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
6639 || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */
6640 || ch == 0x145CF /* ANATOLIAN HIEROGLYPH A410A END LOGOGRAM MARK */)
6641 attr |= (int64_t) 1 << LBP_CL;
6643 /* exclamation/interrogation */
6644 if (ch == 0x0021 /* EXCLAMATION MARK */
6645 || ch == 0x003F /* QUESTION MARK */
6646 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
6647 || ch == 0x061B /* ARABIC SEMICOLON */
6648 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
6649 || ch == 0x061F /* ARABIC QUESTION MARK */
6650 || ch == 0x06D4 /* ARABIC FULL STOP */
6651 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
6652 || ch == 0x0F0D /* TIBETAN MARK SHAD */
6653 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
6654 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
6655 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
6656 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
6657 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
6658 || ch == 0x1802 /* MONGOLIAN COMMA */
6659 || ch == 0x1803 /* MONGOLIAN FULL STOP */
6660 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
6661 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
6662 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
6663 || ch == 0x1945 /* LIMBU QUESTION MARK */
6664 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
6665 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
6666 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
6667 || ch == 0x2CFE /* COPTIC FULL STOP */
6668 || ch == 0x2E2E /* REVERSED QUESTION MARK */
6669 || ch == 0xA60E /* VAI FULL STOP */
6670 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
6671 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
6672 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
6673 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
6674 || ch == 0xFE56 /* SMALL QUESTION MARK */
6675 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
6676 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
6677 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */
6678 || ch == 0x115C4 /* SIDDHAM SEPARATOR DOT */
6679 || ch == 0x115C5 /* SIDDHAM SEPARATOR BAR */)
6680 attr |= (int64_t) 1 << LBP_EX;
6682 /* inseparable */
6683 if (ch == 0x2024 /* ONE DOT LEADER */
6684 || ch == 0x2025 /* TWO DOT LEADER */
6685 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
6686 || ch == 0x22EF /* MIDLINE HORIZONTAL ELLIPSIS */
6687 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */
6688 || ch == 0x10AF6 /* MANICHAEAN PUNCTUATION LINE FILLER */)
6689 attr |= (int64_t) 1 << LBP_IN;
6691 /* non starter */
6692 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
6693 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
6694 || ch == 0x203D /* INTERROBANG */
6695 || ch == 0x2047 /* DOUBLE QUESTION MARK */
6696 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
6697 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
6698 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
6699 || ch == 0x301C /* WAVE DASH */
6700 || ch == 0x303C /* MASU MARK */
6701 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
6702 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
6703 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
6704 || ch == 0x309D /* HIRAGANA ITERATION MARK */
6705 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
6706 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
6707 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
6708 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6709 || ch == 0x30FD /* KATAKANA ITERATION MARK */
6710 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
6711 || ch == 0xA015 /* YI SYLLABLE WU */
6712 || ch == 0xFE54 /* SMALL SEMICOLON */
6713 || ch == 0xFE55 /* SMALL COLON */
6714 || ch == 0xFF1A /* FULLWIDTH COLON */
6715 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
6716 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
6717 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6718 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
6719 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
6720 || ch == 0x1F679 /* HEAVY INTERROBANG ORNAMENT */
6721 || ch == 0x1F67A /* SANS-SERIF INTERROBANG ORNAMENT */
6722 || ch == 0x1F67B /* HEAVY SANS-SERIF INTERROBANG ORNAMENT */
6723 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
6724 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
6725 attr |= (int64_t) 1 << LBP_NS;
6727 /* opening punctuation */
6728 if ((unicode_attributes[ch].category[0] == 'P'
6729 && unicode_attributes[ch].category[1] == 's')
6730 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
6731 || ch == 0x00BF /* INVERTED QUESTION MARK */
6732 || ch == 0x2E18 /* INVERTED INTERROBANG */
6733 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6734 || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
6735 || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
6736 || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
6737 || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
6738 || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
6739 || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */
6740 || ch == 0x145CE /* ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK */)
6741 attr |= (int64_t) 1 << LBP_OP;
6743 /* ambiguous quotation */
6744 if ((unicode_attributes[ch].category[0] == 'P'
6745 && (unicode_attributes[ch].category[1] == 'f'
6746 || unicode_attributes[ch].category[1] == 'i'))
6747 || ch == 0x0022 /* QUOTATION MARK */
6748 || ch == 0x0027 /* APOSTROPHE */
6749 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
6750 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
6751 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
6752 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
6753 || ch == 0x275F /* HEAVY LOW SINGLE COMMA QUOTATION MARK ORNAMENT */
6754 || ch == 0x2760 /* HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */
6755 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
6756 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
6757 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
6758 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
6759 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
6760 || ch == 0x2E0B /* RAISED SQUARE */
6761 || ch == 0x1F676 /* SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
6762 || ch == 0x1F677 /* SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
6763 || ch == 0x1F678 /* SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */)
6764 attr |= (int64_t) 1 << LBP_QU;
6766 /* infix separator (numeric) */
6767 if (ch == 0x002C /* COMMA */
6768 || ch == 0x002E /* FULL STOP */
6769 || ch == 0x003A /* COLON */
6770 || ch == 0x003B /* SEMICOLON */
6771 || ch == 0x037E /* GREEK QUESTION MARK */
6772 || ch == 0x0589 /* ARMENIAN FULL STOP */
6773 || ch == 0x060C /* ARABIC COMMA */
6774 || ch == 0x060D /* ARABIC DATE SEPARATOR */
6775 || ch == 0x07F8 /* NKO COMMA */
6776 || ch == 0x2044 /* FRACTION SLASH */
6777 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
6778 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
6779 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
6780 attr |= (int64_t) 1 << LBP_IS;
6782 /* numeric */
6783 if ((unicode_attributes[ch].category[0] == 'N'
6784 && unicode_attributes[ch].category[1] == 'd'
6785 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
6786 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
6787 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
6788 attr |= (int64_t) 1 << LBP_NU;
6790 /* postfix (numeric) */
6791 if (ch == 0x0025 /* PERCENT SIGN */
6792 || ch == 0x00A2 /* CENT SIGN */
6793 || ch == 0x00B0 /* DEGREE SIGN */
6794 || ch == 0x060B /* AFGHANI SIGN */
6795 || ch == 0x066A /* ARABIC PERCENT SIGN */
6796 || ch == 0x2030 /* PER MILLE SIGN */
6797 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
6798 || ch == 0x2032 /* PRIME */
6799 || ch == 0x2033 /* DOUBLE PRIME */
6800 || ch == 0x2034 /* TRIPLE PRIME */
6801 || ch == 0x2035 /* REVERSED PRIME */
6802 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
6803 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
6804 || ch == 0x20A7 /* PESETA SIGN */
6805 || ch == 0x20BB /* NORDIC MARK SIGN */
6806 || ch == 0x2103 /* DEGREE CELSIUS */
6807 || ch == 0x2109 /* DEGREE FAHRENHEIT */
6808 || ch == 0xFDFC /* RIAL SIGN */
6809 || ch == 0xFE6A /* SMALL PERCENT SIGN */
6810 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
6811 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
6812 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6813 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
6814 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
6815 || ch == 0x09F2 /* BENGALI RUPEE MARK */
6816 || ch == 0x09F3 /* BENGALI RUPEE SIGN */
6817 || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
6818 || ch == 0x0D79 /* MALAYALAM DATE MARK */
6819 || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
6820 || ch == 0x20BE /* LARI SIGN */
6821 || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
6822 attr |= (int64_t) 1 << LBP_PO;
6824 /* prefix (numeric) */
6825 if ((unicode_attributes[ch].category[0] == 'S'
6826 && unicode_attributes[ch].category[1] == 'c')
6827 || ch == 0x002B /* PLUS SIGN */
6828 || ch == 0x005C /* REVERSE SOLIDUS */
6829 || ch == 0x00B1 /* PLUS-MINUS SIGN */
6830 || ch == 0x2116 /* NUMERO SIGN */
6831 || ch == 0x2212 /* MINUS SIGN */
6832 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
6833 if (!(attr & ((int64_t) 1 << LBP_PO)))
6834 attr |= (int64_t) 1 << LBP_PR;
6836 /* symbols allowing breaks */
6837 if (ch == 0x002F /* SOLIDUS */)
6838 attr |= (int64_t) 1 << LBP_SY;
6840 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
6841 attr |= (int64_t) 1 << LBP_H2;
6843 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
6844 attr |= (int64_t) 1 << LBP_H3;
6846 if ((ch >= 0x05D0 && ch <= 0x05F2) || ch == 0xFB1D
6847 || (ch >= 0xFB1F && ch <= 0xFB28) || (ch >= 0xFB2A && ch <= 0xFB4F))
6848 attr |= (int64_t) 1 << LBP_HL;
6850 if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
6851 attr |= (int64_t) 1 << LBP_JL;
6853 if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
6854 attr |= (int64_t) 1 << LBP_JV;
6856 if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
6857 attr |= (int64_t) 1 << LBP_JT;
6859 /* regional indicator */
6860 if (ch >= 0x1F1E6 && ch <= 0x1F1FF)
6861 attr |= (int64_t) 1 << LBP_RI;
6863 /* complex context (South East Asian) */
6864 if (((unicode_attributes[ch].category[0] == 'C'
6865 && unicode_attributes[ch].category[1] == 'f')
6866 || (unicode_attributes[ch].category[0] == 'L'
6867 && (unicode_attributes[ch].category[1] == 'm'
6868 || unicode_attributes[ch].category[1] == 'o'))
6869 || (unicode_attributes[ch].category[0] == 'M'
6870 && (unicode_attributes[ch].category[1] == 'c'
6871 || unicode_attributes[ch].category[1] == 'n')
6872 && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
6873 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6874 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
6875 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
6876 || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
6877 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
6878 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
6879 || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
6880 || (ch >= 0xA9E0 && ch <= 0xA9EF) /* Myanmar */
6881 || (ch >= 0xA9FA && ch <= 0xA9FE) /* Myanmar */
6882 || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
6883 || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */
6884 || (ch >= 0x1173A && ch <= 0x1173B) /* Ahom */
6885 || ch == 0x1173F /* Ahom */)
6886 && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
6887 || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
6888 || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
6889 || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
6890 || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
6891 || (ch >= 0xA9E0 && ch <= 0xA9EF) /* Myanmar */
6892 || (ch >= 0xA9FA && ch <= 0xA9FE) /* Myanmar */
6893 || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */
6894 || (ch >= 0x11700 && ch <= 0x11719) /* Ahom */
6895 || (ch >= 0x1171D && ch <= 0x1172B) /* Ahom */
6896 || (ch >= 0x1173A && ch <= 0x1173B) /* Ahom */
6897 || ch == 0x1173F /* Ahom */))
6898 attr |= (int64_t) 1 << LBP_SA;
6900 /* attached characters and combining marks */
6901 if ((unicode_attributes[ch].category[0] == 'M'
6902 && (unicode_attributes[ch].category[1] == 'c'
6903 || unicode_attributes[ch].category[1] == 'e'
6904 || unicode_attributes[ch].category[1] == 'n'))
6905 || (unicode_attributes[ch].category[0] == 'C'
6906 && (unicode_attributes[ch].category[1] == 'c'
6907 || unicode_attributes[ch].category[1] == 'f')
6908 && ch != 0x110BD /* KAITHI NUMBER SIGN */)
6909 || ch == 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
6910 if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
6911 attr |= (int64_t) 1 << LBP_CM;
6913 /* ideographic */
6914 if (ch == 0x231A /* WATCH */
6915 || ch == 0x231B /* HOURGLASS */
6916 || ch == 0x23F0 /* ALARM CLOCK */
6917 || ch == 0x23F1 /* STOPWATCH */
6918 || ch == 0x23F2 /* TIMER CLOCK */
6919 || ch == 0x23F3 /* HOURGLASS WITH FLOWING SAND */
6920 || ch == 0x2600 /* BLACK SUN WITH RAYS */
6921 || ch == 0x2601 /* CLOUD */
6922 || ch == 0x2602 /* UMBRELLA */
6923 || ch == 0x2603 /* SNOWMAN */
6924 || ch == 0x2614 /* UMBRELLA WITH RAIN DROPS */
6925 || ch == 0x2615 /* HOT BEVERAGE */
6926 || ch == 0x2618 /* SHAMROCK */
6927 || ch == 0x261A /* BLACK LEFT POINTING INDEX */
6928 || ch == 0x261B /* BLACK RIGHT POINTING INDEX */
6929 || ch == 0x261C /* WHITE LEFT POINTING INDEX */
6930 || ch == 0x261D /* WHITE UP POINTING INDEX */
6931 || ch == 0x261E /* WHITE RIGHT POINTING INDEX */
6932 || ch == 0x261F /* WHITE DOWN POINTING INDEX */
6933 || ch == 0x2639 /* WHITE FROWNING FACE */
6934 || ch == 0x263A /* WHITE SMILING FACE */
6935 || ch == 0x263B /* BLACK SMILING FACE */
6936 || ch == 0x2668 /* HOT SPRINGS */
6937 || ch == 0x267F /* WHEELCHAIR SYMBOL */
6938 || ch == 0x26BD /* SOCCER BALL */
6939 || ch == 0x26BE /* BASEBALL */
6940 || ch == 0x26BF /* SQUARED KEY */
6941 || ch == 0x26C0 /* WHITE DRAUGHTS MAN */
6942 || ch == 0x26C1 /* WHITE DRAUGHTS KING */
6943 || ch == 0x26C2 /* BLACK DRAUGHTS MAN */
6944 || ch == 0x26C3 /* BLACK DRAUGHTS KING */
6945 || ch == 0x26C4 /* SNOWMAN WITHOUT SNOW */
6946 || ch == 0x26C5 /* SUN BEHIND CLOUD */
6947 || ch == 0x26C6 /* RAIN */
6948 || ch == 0x26C7 /* BLACK SNOWMAN */
6949 || ch == 0x26C8 /* THUNDER CLOUD AND RAIN */
6950 || ch == 0x26CD /* DISABLED CAR */
6951 || ch == 0x26CF /* PICK */
6952 || ch == 0x26D0 /* CAR SLIDING */
6953 || ch == 0x26D1 /* HELMET WITH WHITE CROSS */
6954 || ch == 0x26D3 /* CHAINS */
6955 || ch == 0x26D4 /* NO ENTRY */
6956 || ch == 0x26D8 /* BLACK LEFT LANE MERGE */
6957 || ch == 0x26D9 /* WHITE LEFT LANE MERGE */
6958 || ch == 0x26DC /* LEFT CLOSED ENTRY */
6959 || ch == 0x26DF /* BLACK TRUCK */
6960 || ch == 0x26E0 /* RESTRICTED LEFT ENTRY-1 */
6961 || ch == 0x26E1 /* RESTRICTED LEFT ENTRY-2 */
6962 || ch == 0x26EA /* CHURCH */
6963 || ch == 0x26F1 /* UMBRELLA ON GROUND */
6964 || ch == 0x26F2 /* FOUNTAIN */
6965 || ch == 0x26F3 /* FLAG IN HOLE */
6966 || ch == 0x26F4 /* FERRY */
6967 || ch == 0x26F5 /* SAILBOAT */
6968 || ch == 0x26F7 /* SKIER */
6969 || ch == 0x26F8 /* ICE SKATE */
6970 || ch == 0x26F9 /* PERSON WITH BALL */
6971 || ch == 0x26FA /* TENT */
6972 || ch == 0x26FD /* FUEL PUMP */
6973 || ch == 0x26FE /* CUP ON BLACK SQUARE */
6974 || ch == 0x26FF /* WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE */
6975 || ch == 0x2700 /* BLACK SAFETY SCISSORS */
6976 || ch == 0x2701 /* UPPER BLADE SCISSORS */
6977 || ch == 0x2702 /* BLACK SCISSORS */
6978 || ch == 0x2703 /* LOWER BLADE SCISSORS */
6979 || ch == 0x2704 /* WHITE SCISSORS */
6980 || ch == 0x2708 /* AIRPLANE */
6981 || ch == 0x2709 /* ENVELOPE */
6982 || ch == 0x270A /* RAISED FIST */
6983 || ch == 0x270B /* RAISED HAND */
6984 || ch == 0x270C /* VICTORY HAND */
6985 || ch == 0x270D /* WRITING HAND */
6986 || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
6987 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
6988 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
6989 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
6990 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
6991 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
6992 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
6993 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
6994 || ch == 0xFE62 /* SMALL PLUS SIGN */
6995 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
6996 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
6997 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
6998 || ch == 0xFE66 /* SMALL EQUALS SIGN */
6999 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
7000 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
7001 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
7002 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
7003 || (ch >= 0x3000 && ch <= 0x33FF
7004 && !(attr & (((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
7005 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7006 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
7007 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
7008 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
7009 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
7010 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
7011 || ch == 0xFE45 /* SESAME DOT */
7012 || ch == 0xFE46 /* WHITE SESAME DOT */
7013 || ch == 0xFE49 /* DASHED OVERLINE */
7014 || ch == 0xFE4A /* CENTRELINE OVERLINE */
7015 || ch == 0xFE4B /* WAVY OVERLINE */
7016 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
7017 || ch == 0xFE4D /* DASHED LOW LINE */
7018 || ch == 0xFE4E /* CENTRELINE LOW LINE */
7019 || ch == 0xFE4F /* WAVY LOW LINE */
7020 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
7021 || ch == 0xFE58 /* SMALL EM DASH */
7022 || ch == 0xFE5F /* SMALL NUMBER SIGN */
7023 || ch == 0xFE60 /* SMALL AMPERSAND */
7024 || ch == 0xFE61 /* SMALL ASTERISK */
7025 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
7026 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
7027 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
7028 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
7029 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
7030 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
7031 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
7032 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
7033 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
7034 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
7035 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
7036 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
7037 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
7038 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
7039 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
7040 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
7041 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
7042 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
7043 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
7044 || ch == 0xFF5E /* FULLWIDTH TILDE */
7045 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
7046 || ch == 0xFFE3 /* FULLWIDTH MACRON */
7047 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
7048 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7049 || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
7050 || (ch >= 0x1F000 && ch <= 0x1F02B) /* Mahjong Tiles */
7051 || (ch >= 0x1F030 && ch <= 0x1F093) /* Domino Tiles */
7052 || (ch >= 0x1F0A0 && ch <= 0x1F0F5) /* Playing Cards */
7053 || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
7054 || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
7055 || (ch >= 0x1F300 && ch <= 0x1F5FF /* Miscellaneous Symbols and Pictographs */
7056 && ch != 0x1F3B5 && ch != 0x1F3B6 && ch != 0x1F3BC
7057 && ch != 0x1F4A0 && ch != 0x1F4A2 && ch != 0x1F4A4
7058 && ch != 0x1F4AF && ch != 0x1F4B1 && ch != 0x1F4B2
7059 && !(ch >= 0x1F39C && ch <= 0x1F39D)
7060 && !(ch >= 0x1F3FB && ch <= 0x1F3FF)
7061 && !(ch >= 0x1F500 && ch <= 0x1F506)
7062 && !(ch >= 0x1F517 && ch <= 0x1F524)
7063 && !(ch >= 0x1F532 && ch <= 0x1F549)
7064 && !(ch >= 0x1F5D4 && ch <= 0x1F5DB)
7065 && !(ch >= 0x1F5F4 && ch <= 0x1F5F9))
7066 || (ch >= 0x1F600 && ch <= 0x1F64F) /* Emoticons */
7067 || (ch >= 0x1F680 && ch <= 0x1F6D0) /* Transport and Map Symbols */
7068 || (ch >= 0x1F6E0 && ch <= 0x1F6EC) /* Transport and Map Symbols */
7069 || (ch >= 0x1F6F0 && ch <= 0x1F6F3) /* Transport and Map Symbols */
7070 || (ch >= 0x1F900 && ch <= 0x1F9FF) /* Supplemental Symbols and Pictographs */
7071 || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
7072 || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */
7073 || (ch >= 0x2B820 && ch <= 0x2CEAF) /* CJK Ideograph Extension E */)
7074 if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
7076 /* ambiguous (ideograph) ? */
7077 if ((unicode_width[ch] != NULL
7078 && unicode_width[ch][0] == 'A'
7079 && ch >= 0x2000
7080 && ch != 0x2614
7081 && ch != 0x2615
7082 && ch != 0x261C
7083 && ch != 0x261E
7084 && ch != 0x2668
7085 && ch != 0x26BE
7086 && ch != 0x26BF
7087 && !(ch >= 0x26C4 && ch <= 0x26C8)
7088 && ch != 0x26CD
7089 && ch != 0x26CF
7090 && ch != 0x26D0
7091 && ch != 0x26D1
7092 && ch != 0x26D3
7093 && ch != 0x26D4
7094 && ch != 0x26D8
7095 && ch != 0x26D9
7096 && ch != 0x26DC
7097 && ch != 0x26DF
7098 && ch != 0x26E0
7099 && ch != 0x26E1
7100 && ch != 0x26EA
7101 && !(ch >= 0x26F1 && ch <= 0x26F5)
7102 && !(ch >= 0x26F7 && ch <= 0x26FA)
7103 && !(ch >= 0x26FD && ch <= 0x26FF))
7104 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
7105 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
7106 attr |= (int64_t) 1 << LBP_AI;
7107 else
7108 attr |= (int64_t) 1 << LBP_ID;
7111 /* ordinary alphabetic and symbol characters */
7112 if ((unicode_attributes[ch].category[0] == 'L'
7113 && (unicode_attributes[ch].category[1] == 'u'
7114 || unicode_attributes[ch].category[1] == 'l'
7115 || unicode_attributes[ch].category[1] == 't'
7116 || unicode_attributes[ch].category[1] == 'm'
7117 || unicode_attributes[ch].category[1] == 'o'))
7118 || (unicode_attributes[ch].category[0] == 'S'
7119 && (unicode_attributes[ch].category[1] == 'm'
7120 || unicode_attributes[ch].category[1] == 'k'
7121 || unicode_attributes[ch].category[1] == 'o'))
7122 || (unicode_attributes[ch].category[0] == 'N'
7123 && (unicode_attributes[ch].category[1] == 'l'
7124 || unicode_attributes[ch].category[1] == 'o'))
7125 || (unicode_attributes[ch].category[0] == 'P'
7126 && (unicode_attributes[ch].category[1] == 'c'
7127 || unicode_attributes[ch].category[1] == 'd'
7128 || unicode_attributes[ch].category[1] == 'o'))
7129 || ch == 0x0600 /* ARABIC NUMBER SIGN */
7130 || ch == 0x0601 /* ARABIC SIGN SANAH */
7131 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
7132 || ch == 0x0603 /* ARABIC SIGN SAFHA */
7133 || ch == 0x0604 /* ARABIC SIGN SAMVAT */
7134 || ch == 0x0605 /* ARABIC NUMBER MARK ABOVE */
7135 || ch == 0x06DD /* ARABIC END OF AYAH */
7136 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
7137 || ch == 0x2061 /* FUNCTION APPLICATION */
7138 || ch == 0x2062 /* INVISIBLE TIMES */
7139 || ch == 0x2063 /* INVISIBLE SEPARATOR */
7140 || ch == 0x2064 /* INVISIBLE PLUS */
7141 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7142 || ch == 0x110BD /* KAITHI NUMBER SIGN */)
7143 if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID)))
7144 && ch != 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
7146 /* ambiguous (alphabetic) ? */
7147 if ((unicode_width[ch] != NULL
7148 && unicode_width[ch][0] == 'A'
7149 && ch >= 0x2000
7150 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
7151 && ch != 0x2022 /* BULLET */
7152 && ch != 0x203E /* OVERLINE */
7153 && ch != 0x2126 /* OHM SIGN */
7154 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
7155 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
7156 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
7157 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
7158 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
7159 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
7160 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
7161 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
7162 || ch == 0x00A7 /* SECTION SIGN */
7163 || ch == 0x00A8 /* DIAERESIS */
7164 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
7165 || ch == 0x00B2 /* SUPERSCRIPT TWO */
7166 || ch == 0x00B3 /* SUPERSCRIPT THREE */
7167 || ch == 0x00B6 /* PILCROW SIGN */
7168 || ch == 0x00B7 /* MIDDLE DOT */
7169 || ch == 0x00B8 /* CEDILLA */
7170 || ch == 0x00B9 /* SUPERSCRIPT ONE */
7171 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
7172 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
7173 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
7174 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
7175 || ch == 0x00D7 /* MULTIPLICATION SIGN */
7176 || ch == 0x00F7 /* DIVISION SIGN */
7177 || ch == 0x02C7 /* CARON */
7178 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
7179 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
7180 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
7181 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
7182 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
7183 || ch == 0x02D8 /* BREVE */
7184 || ch == 0x02D9 /* DOT ABOVE */
7185 || ch == 0x02DA /* RING ABOVE */
7186 || ch == 0x02DB /* OGONEK */
7187 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
7188 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
7189 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
7190 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7191 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
7192 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
7193 || ch == 0x2616 /* WHITE SHOGI PIECE */
7194 || ch == 0x2617 /* BLACK SHOGI PIECE */
7195 || ch == 0x1F10B /* DINGBAT CIRCLED SANS-SERIF DIGIT ZERO */
7196 || ch == 0x1F10C /* DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO */)
7197 attr |= (int64_t) 1 << LBP_AI;
7198 else
7199 attr |= (int64_t) 1 << LBP_AL;
7200 attr &= ~((int64_t) 1 << LBP_CM);
7203 else
7205 /* Unassigned character. */
7206 if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
7207 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
7208 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
7209 || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
7210 || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
7211 Supplementary Ideographic Plane (Plane 2) outside of blocks */
7212 || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
7213 Supplementary Ideographic Plane (Plane 2) outside of blocks */
7214 || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
7215 attr |= (int64_t) 1 << LBP_ID;
7218 if (attr == 0)
7219 /* unknown */
7220 attr |= (int64_t) 1 << LBP_XX;
7222 return attr;
7225 /* Output the line breaking properties in a human readable format. */
7226 static void
7227 debug_output_lbp (FILE *stream)
7229 unsigned int i;
7231 for (i = 0; i < 0x110000; i++)
7233 int64_t attr = get_lbp (i);
7234 if (attr != (int64_t) 1 << LBP_XX)
7236 fprintf (stream, "0x%04X", i);
7237 #define PRINT_BIT(attr,bit) \
7238 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
7239 PRINT_BIT(attr,LBP_BK);
7240 PRINT_BIT(attr,LBP_CM);
7241 PRINT_BIT(attr,LBP_WJ);
7242 PRINT_BIT(attr,LBP_ZW);
7243 PRINT_BIT(attr,LBP_GL);
7244 PRINT_BIT(attr,LBP_SP);
7245 PRINT_BIT(attr,LBP_B2);
7246 PRINT_BIT(attr,LBP_BA);
7247 PRINT_BIT(attr,LBP_BB);
7248 PRINT_BIT(attr,LBP_HY);
7249 PRINT_BIT(attr,LBP_CB);
7250 PRINT_BIT(attr,LBP_CL);
7251 PRINT_BIT(attr,LBP_CP);
7252 PRINT_BIT(attr,LBP_EX);
7253 PRINT_BIT(attr,LBP_IN);
7254 PRINT_BIT(attr,LBP_NS);
7255 PRINT_BIT(attr,LBP_OP);
7256 PRINT_BIT(attr,LBP_QU);
7257 PRINT_BIT(attr,LBP_IS);
7258 PRINT_BIT(attr,LBP_NU);
7259 PRINT_BIT(attr,LBP_PO);
7260 PRINT_BIT(attr,LBP_PR);
7261 PRINT_BIT(attr,LBP_SY);
7262 PRINT_BIT(attr,LBP_AI);
7263 PRINT_BIT(attr,LBP_AL);
7264 PRINT_BIT(attr,LBP_H2);
7265 PRINT_BIT(attr,LBP_H3);
7266 PRINT_BIT(attr,LBP_HL);
7267 PRINT_BIT(attr,LBP_ID);
7268 PRINT_BIT(attr,LBP_JL);
7269 PRINT_BIT(attr,LBP_JV);
7270 PRINT_BIT(attr,LBP_JT);
7271 PRINT_BIT(attr,LBP_RI);
7272 PRINT_BIT(attr,LBP_SA);
7273 PRINT_BIT(attr,LBP_XX);
7274 #undef PRINT_BIT
7275 fprintf (stream, "\n");
/* Create FILENAME and dump the computed line breaking properties into it
   through debug_output_lbp.  The program exits with status 1 when the file
   cannot be opened, or when writing to it or closing it fails.  */
static void
debug_output_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (out);

  /* When the stream already carries an error, fclose is not attempted.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7301 /* The line breaking property from the LineBreak.txt file. */
7302 int unicode_org_lbp[0x110000];
7304 /* Stores in unicode_org_lbp[] the line breaking property from the
7305 LineBreak.txt file. */
7306 static void
7307 fill_org_lbp (const char *linebreak_filename)
7309 unsigned int i, j;
7310 FILE *stream;
7311 char field0[FIELDLEN];
7312 char field1[FIELDLEN];
7313 char field2[FIELDLEN];
7314 int lineno = 0;
7316 for (i = 0; i < 0x110000; i++)
7317 unicode_org_lbp[i] = LBP_XX;
7319 stream = fopen (linebreak_filename, "r");
7320 if (stream == NULL)
7322 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
7323 exit (1);
7326 for (;;)
7328 int n;
7329 int c;
7330 int value;
7332 lineno++;
7333 c = getc (stream);
7334 if (c == EOF)
7335 break;
7336 if (c == '#')
7338 do c = getc (stream); while (c != EOF && c != '\n');
7339 continue;
7341 ungetc (c, stream);
7342 n = getfield (stream, field0, ';');
7343 n += getfield (stream, field1, ' ');
7344 n += getfield (stream, field2, '\n');
7345 if (n == 0)
7346 break;
7347 if (n != 3)
7349 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
7350 lineno);
7351 exit (1);
7353 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
7354 if (false) {}
7355 TRY(LBP_BK)
7356 TRY(LBP_CM)
7357 TRY(LBP_WJ)
7358 TRY(LBP_ZW)
7359 TRY(LBP_GL)
7360 TRY(LBP_SP)
7361 TRY(LBP_B2)
7362 TRY(LBP_BA)
7363 TRY(LBP_BB)
7364 TRY(LBP_HY)
7365 TRY(LBP_CB)
7366 TRY(LBP_CL)
7367 TRY(LBP_CP)
7368 TRY(LBP_EX)
7369 TRY(LBP_IN)
7370 TRY(LBP_NS)
7371 TRY(LBP_OP)
7372 TRY(LBP_QU)
7373 TRY(LBP_IS)
7374 TRY(LBP_NU)
7375 TRY(LBP_PO)
7376 TRY(LBP_PR)
7377 TRY(LBP_SY)
7378 TRY(LBP_AI)
7379 TRY(LBP_AL)
7380 TRY(LBP_H2)
7381 TRY(LBP_H3)
7382 TRY(LBP_HL)
7383 TRY(LBP_ID)
7384 TRY(LBP_JL)
7385 TRY(LBP_JV)
7386 TRY(LBP_JT)
7387 TRY(LBP_RI)
7388 TRY(LBP_SA)
7389 TRY(LBP_XX)
7390 #undef TRY
7391 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
7392 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
7393 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
7394 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
7395 else if (strcmp (field1, "CJ") == 0) value = LBP_NS;
7396 else
7398 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
7399 field1, linebreak_filename, lineno);
7400 exit (1);
7402 i = strtoul (field0, NULL, 16);
7403 if (strstr (field0, "..") != NULL)
7405 /* Deal with a range. */
7406 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
7407 for (; i <= j; i++)
7408 unicode_org_lbp[i] = value;
7410 else
7412 /* Single character line. */
7413 unicode_org_lbp[i] = value;
7417 if (ferror (stream) || fclose (stream))
7419 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
7420 exit (1);
7424 /* Output the line breaking properties in a human readable format. */
7425 static void
7426 debug_output_org_lbp (FILE *stream)
7428 unsigned int i;
7430 for (i = 0; i < 0x110000; i++)
7432 int attr = unicode_org_lbp[i];
7433 if (attr != LBP_XX)
7435 fprintf (stream, "0x%04X", i);
7436 #define PRINT_BIT(attr,bit) \
7437 if (attr == bit) fprintf (stream, " " #bit);
7438 PRINT_BIT(attr,LBP_BK);
7439 PRINT_BIT(attr,LBP_CM);
7440 PRINT_BIT(attr,LBP_WJ);
7441 PRINT_BIT(attr,LBP_ZW);
7442 PRINT_BIT(attr,LBP_GL);
7443 PRINT_BIT(attr,LBP_SP);
7444 PRINT_BIT(attr,LBP_B2);
7445 PRINT_BIT(attr,LBP_BA);
7446 PRINT_BIT(attr,LBP_BB);
7447 PRINT_BIT(attr,LBP_HY);
7448 PRINT_BIT(attr,LBP_CB);
7449 PRINT_BIT(attr,LBP_CL);
7450 PRINT_BIT(attr,LBP_CP);
7451 PRINT_BIT(attr,LBP_EX);
7452 PRINT_BIT(attr,LBP_IN);
7453 PRINT_BIT(attr,LBP_NS);
7454 PRINT_BIT(attr,LBP_OP);
7455 PRINT_BIT(attr,LBP_QU);
7456 PRINT_BIT(attr,LBP_IS);
7457 PRINT_BIT(attr,LBP_NU);
7458 PRINT_BIT(attr,LBP_PO);
7459 PRINT_BIT(attr,LBP_PR);
7460 PRINT_BIT(attr,LBP_SY);
7461 PRINT_BIT(attr,LBP_AI);
7462 PRINT_BIT(attr,LBP_AL);
7463 PRINT_BIT(attr,LBP_H2);
7464 PRINT_BIT(attr,LBP_H3);
7465 PRINT_BIT(attr,LBP_HL);
7466 PRINT_BIT(attr,LBP_ID);
7467 PRINT_BIT(attr,LBP_JL);
7468 PRINT_BIT(attr,LBP_JV);
7469 PRINT_BIT(attr,LBP_JT);
7470 PRINT_BIT(attr,LBP_RI);
7471 PRINT_BIT(attr,LBP_SA);
7472 PRINT_BIT(attr,LBP_XX);
7473 #undef PRINT_BIT
7474 fprintf (stream, "\n");
/* Create FILENAME and dump the contents of unicode_org_lbp[] into it through
   debug_output_org_lbp.  The program exits with status 1 when the file
   cannot be opened, or when writing to it or closing it fails.  */
static void
debug_output_org_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (out);

  /* When the stream already carries an error, fclose is not attempted.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Construction of sparse 3-level tables.
   The macros below parameterize "3level.h" before it is included.  The code
   in this file subsequently uses 'struct lbp_table' together with
   lbp_table_init, lbp_table_add and lbp_table_finalize, and stores
   'unsigned char' property values in it.
   NOTE(review): DEFAULT is presumably the value reported for code points
   that were never added, and xmalloc/xrealloc are presumably the allocators
   used inside "3level.h"; they are mapped to plain malloc/realloc here --
   confirm against 3level.h.  */
#define TABLE lbp_table
#define ELEMENT unsigned char
#define DEFAULT LBP_XX
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
7508 static void
7509 output_lbp (FILE *stream1, FILE *stream2)
7511 unsigned int i;
7512 struct lbp_table t;
7513 unsigned int level1_offset, level2_offset, level3_offset;
7515 t.p = 7;
7516 t.q = 9;
7517 lbp_table_init (&t);
7519 for (i = 0; i < 0x110000; i++)
7521 int64_t attr = get_lbp (i);
7523 /* Now attr should contain exactly one bit. */
7524 assert (attr != 0 && (attr & (attr - 1)) == 0);
7526 if (attr != (int64_t) 1 << LBP_XX)
7528 unsigned int log2_attr;
7529 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
7531 lbp_table_add (&t, i, log2_attr);
7535 lbp_table_finalize (&t);
7537 level1_offset =
7538 5 * sizeof (uint32_t);
7539 level2_offset =
7540 5 * sizeof (uint32_t)
7541 + t.level1_size * sizeof (uint32_t);
7542 level3_offset =
7543 5 * sizeof (uint32_t)
7544 + t.level1_size * sizeof (uint32_t)
7545 + (t.level2_size << t.q) * sizeof (uint32_t);
7547 for (i = 0; i < 5; i++)
7548 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
7549 ((uint32_t *) t.result)[i]);
7550 fprintf (stream1, "\n");
7551 fprintf (stream1, "typedef struct\n");
7552 fprintf (stream1, " {\n");
7553 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
7554 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
7555 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
7556 fprintf (stream1, " }\n");
7557 fprintf (stream1, "lbrkprop_t;\n");
7558 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
7560 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
7561 fprintf (stream2, "{\n");
7562 fprintf (stream2, " {");
7563 if (t.level1_size > 8)
7564 fprintf (stream2, "\n ");
7565 for (i = 0; i < t.level1_size; i++)
7567 uint32_t offset;
7568 if (i > 0 && (i % 8) == 0)
7569 fprintf (stream2, "\n ");
7570 offset = ((uint32_t *) (t.result + level1_offset))[i];
7571 if (offset == 0)
7572 fprintf (stream2, " %5d", -1);
7573 else
7574 fprintf (stream2, " %5zu",
7575 (offset - level2_offset) / sizeof (uint32_t));
7576 if (i+1 < t.level1_size)
7577 fprintf (stream2, ",");
7579 if (t.level1_size > 8)
7580 fprintf (stream2, "\n ");
7581 fprintf (stream2, " },\n");
7582 fprintf (stream2, " {");
7583 if (t.level2_size << t.q > 8)
7584 fprintf (stream2, "\n ");
7585 for (i = 0; i < t.level2_size << t.q; i++)
7587 uint32_t offset;
7588 if (i > 0 && (i % 8) == 0)
7589 fprintf (stream2, "\n ");
7590 offset = ((uint32_t *) (t.result + level2_offset))[i];
7591 if (offset == 0)
7592 fprintf (stream2, " %5d", -1);
7593 else
7594 fprintf (stream2, " %5zu",
7595 (offset - level3_offset) / sizeof (unsigned char));
7596 if (i+1 < t.level2_size << t.q)
7597 fprintf (stream2, ",");
7599 if (t.level2_size << t.q > 8)
7600 fprintf (stream2, "\n ");
7601 fprintf (stream2, " },\n");
7602 fprintf (stream2, " {");
7603 if (t.level3_size << t.p > 8)
7604 fprintf (stream2, "\n ");
7605 for (i = 0; i < t.level3_size << t.p; i++)
7607 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
7608 const char *value_string;
7609 switch (value)
7611 #define CASE(x) case x: value_string = #x; break;
7612 CASE(LBP_BK);
7613 CASE(LBP_CM);
7614 CASE(LBP_WJ);
7615 CASE(LBP_ZW);
7616 CASE(LBP_GL);
7617 CASE(LBP_SP);
7618 CASE(LBP_B2);
7619 CASE(LBP_BA);
7620 CASE(LBP_BB);
7621 CASE(LBP_HY);
7622 CASE(LBP_CB);
7623 CASE(LBP_CL);
7624 CASE(LBP_CP);
7625 CASE(LBP_EX);
7626 CASE(LBP_IN);
7627 CASE(LBP_NS);
7628 CASE(LBP_OP);
7629 CASE(LBP_QU);
7630 CASE(LBP_IS);
7631 CASE(LBP_NU);
7632 CASE(LBP_PO);
7633 CASE(LBP_PR);
7634 CASE(LBP_SY);
7635 CASE(LBP_AI);
7636 CASE(LBP_AL);
7637 CASE(LBP_H2);
7638 CASE(LBP_H3);
7639 CASE(LBP_HL);
7640 CASE(LBP_ID);
7641 CASE(LBP_JL);
7642 CASE(LBP_JV);
7643 CASE(LBP_JT);
7644 CASE(LBP_RI);
7645 CASE(LBP_SA);
7646 CASE(LBP_XX);
7647 #undef CASE
7648 default:
7649 abort ();
7651 if (i > 0 && (i % 8) == 0)
7652 fprintf (stream2, "\n ");
7653 fprintf (stream2, " %s%s", value_string,
7654 (i+1 < t.level3_size << t.p ? "," : ""));
7656 if (t.level3_size << t.p > 8)
7657 fprintf (stream2, "\n ");
7658 fprintf (stream2, " }\n");
7659 fprintf (stream2, "};\n");
/* Generate the two line breaking table files for Unicode VERSION.  Both
   FILENAME1 and FILENAME2 are created and receive the same banner and
   license header; output_lbp then writes the declarations to the first file
   and the table definition to the second.  The program exits with status 1
   if a file cannot be opened, written or closed.  */
static void
output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
{
  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  static const char *const license_lines[] =
    {
      "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n",
      "\n",
      " This program is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU General Public License as published by\n",
      " the Free Software Foundation; either version 3 of the License, or\n",
      " (at your option) any later version.\n",
      "\n",
      " This program is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU General Public License\n",
      " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n",
      "\n"
    };
  const size_t n_license_lines = sizeof license_lines / sizeof license_lines[0];
  const char *filenames[2];
  FILE *streams[2];
  size_t k;

  filenames[0] = filename1;
  filenames[1] = filename2;

  /* Open both files before anything is written.  */
  for (k = 0; k < 2; k++)
    {
      streams[k] = fopen (filenames[k], "w");
      if (streams[k] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[k]);
          exit (1);
        }
    }

  /* Identical preamble in both files.  */
  for (k = 0; k < 2; k++)
    {
      FILE *out = streams[k];
      size_t line;

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
      fputs ("/* Line breaking properties of Unicode characters. */\n", out);
      fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fputs ("\n", out);

      for (line = 0; line < n_license_lines; line++)
        fputs (license_lines[line], out);
    }

  output_lbp (streams[0], streams[1]);

  for (k = 0; k < 2; k++)
    if (ferror (streams[k]) != 0 || fclose (streams[k]) != 0)
      {
        fprintf (stderr, "error writing to '%s'\n", filenames[k]);
        exit (1);
      }
}
7724 /* ========================================================================= */
7726 /* Word break property.
7727 Updated for Unicode TR #29 revision 17. */
/* Possible values of the Word_Break property, listed by numeric value.
   get_wbp uses each value as a bit number in its result mask, and the
   values are also what gets stored in the generated table.  */
enum
{
  WBP_OTHER        = 0,   /* no specific property */
  WBP_KATAKANA     = 1,
  WBP_ALETTER      = 2,
  WBP_MIDNUMLET    = 3,
  WBP_MIDLETTER    = 4,
  WBP_MIDNUM       = 5,
  WBP_NUMERIC      = 6,
  WBP_EXTENDNUMLET = 7,
  WBP_EXTEND       = 8,
  WBP_FORMAT       = 9,
  WBP_NEWLINE      = 10,
  WBP_CR           = 11,
  WBP_LF           = 12,
  WBP_RI           = 13,  /* Regional_Indicator */
  WBP_DQ           = 14,  /* Double_Quote */
  WBP_SQ           = 15,  /* Single_Quote */
  WBP_HL           = 16   /* Hebrew_Letter */
};
7751 /* Returns the word breaking property for ch, as a bit mask. */
7752 static int
7753 get_wbp (unsigned int ch)
7755 int attr = 0;
7757 if (unicode_attributes[ch].name != NULL)
7759 if (ch == 0x000D)
7760 attr |= 1 << WBP_CR;
7762 if (ch == 0x000A)
7763 attr |= 1 << WBP_LF;
7765 if (ch == 0x000B || ch == 0x000C
7766 || ch == 0x0085
7767 || ch == 0x2028 || ch == 0x2029)
7768 attr |= 1 << WBP_NEWLINE;
7770 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
7771 || (unicode_attributes[ch].category != NULL
7772 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
7773 attr |= 1 << WBP_EXTEND;
7775 if (unicode_attributes[ch].category != NULL
7776 && strcmp (unicode_attributes[ch].category, "Cf") == 0
7777 && ch != 0x200B && ch != 0x200C && ch != 0x200D)
7778 attr |= 1 << WBP_FORMAT;
7780 if ((unicode_scripts[ch] < numscripts
7781 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
7782 || (ch >= 0x3031 && ch <= 0x3035)
7783 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
7784 || ch == 0xFF70)
7785 attr |= 1 << WBP_KATAKANA;
7787 if ((unicode_scripts[ch] < numscripts
7788 && strcmp (scripts[unicode_scripts[ch]], "Hebrew") == 0)
7789 && strcmp (unicode_attributes[ch].category, "Lo") == 0)
7790 attr |= 1 << WBP_HL;
7792 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
7793 || ch == 0x05F3)
7794 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
7795 && (attr & (1 << WBP_KATAKANA)) == 0
7796 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
7797 && !(unicode_scripts[ch] < numscripts
7798 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
7799 && (attr & (1 << WBP_EXTEND)) == 0
7800 && (attr & (1 << WBP_HL)) == 0)
7801 attr |= 1 << WBP_ALETTER;
7803 if (is_WBP_MIDNUMLET (ch))
7804 attr |= 1 << WBP_MIDNUMLET;
7806 if (is_WBP_MIDLETTER (ch))
7807 attr |= 1 << WBP_MIDLETTER;
7809 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
7810 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
7811 || ch == 0xFF1B)
7812 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
7813 attr |= 1 << WBP_MIDNUM;
7815 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
7816 && ch != 0x066C)
7817 attr |= 1 << WBP_NUMERIC;
7819 if (unicode_attributes[ch].category != NULL
7820 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
7821 attr |= 1 << WBP_EXTENDNUMLET;
7823 if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
7824 attr |= 1 << WBP_RI;
7826 if (ch == 0x0022)
7827 attr |= 1 << WBP_DQ;
7829 if (ch == 0x0027)
7830 attr |= 1 << WBP_SQ;
7833 if (attr == 0)
7834 /* other */
7835 attr |= 1 << WBP_OTHER;
7837 return attr;
7840 /* Output the word break property in a human readable format. */
7841 static void
7842 debug_output_wbp (FILE *stream)
7844 unsigned int i;
7846 for (i = 0; i < 0x110000; i++)
7848 int attr = get_wbp (i);
7849 if (attr != 1 << WBP_OTHER)
7851 fprintf (stream, "0x%04X", i);
7852 if (attr & (1 << WBP_CR))
7853 fprintf (stream, " CR");
7854 if (attr & (1 << WBP_LF))
7855 fprintf (stream, " LF");
7856 if (attr & (1 << WBP_NEWLINE))
7857 fprintf (stream, " Newline");
7858 if (attr & (1 << WBP_EXTEND))
7859 fprintf (stream, " Extend");
7860 if (attr & (1 << WBP_FORMAT))
7861 fprintf (stream, " Format");
7862 if (attr & (1 << WBP_KATAKANA))
7863 fprintf (stream, " Katakana");
7864 if (attr & (1 << WBP_ALETTER))
7865 fprintf (stream, " ALetter");
7866 if (attr & (1 << WBP_MIDNUMLET))
7867 fprintf (stream, " MidNumLet");
7868 if (attr & (1 << WBP_MIDLETTER))
7869 fprintf (stream, " MidLetter");
7870 if (attr & (1 << WBP_MIDNUM))
7871 fprintf (stream, " MidNum");
7872 if (attr & (1 << WBP_NUMERIC))
7873 fprintf (stream, " Numeric");
7874 if (attr & (1 << WBP_EXTENDNUMLET))
7875 fprintf (stream, " ExtendNumLet");
7876 if (attr & (1 << WBP_RI))
7877 fprintf (stream, " Regional_Indicator");
7878 if (attr & (1 << WBP_DQ))
7879 fprintf (stream, " Double_Quote");
7880 if (attr & (1 << WBP_SQ))
7881 fprintf (stream, " Single_Quote");
7882 if (attr & (1 << WBP_HL))
7883 fprintf (stream, " Hebrew_Letter");
7884 fprintf (stream, "\n");
/* Create FILENAME and dump the computed word break property into it through
   debug_output_wbp.  The program exits with status 1 when the file cannot
   be opened, or when writing to it or closing it fails.  */
static void
debug_output_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_wbp (out);

  /* When the stream already carries an error, fclose is not attempted.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7910 /* The word break property from the WordBreakProperty.txt file. */
7911 int unicode_org_wbp[0x110000];
7913 /* Stores in unicode_org_wbp[] the word break property from the
7914 WordBreakProperty.txt file. */
7915 static void
7916 fill_org_wbp (const char *wordbreakproperty_filename)
7918 unsigned int i;
7919 FILE *stream;
7921 for (i = 0; i < 0x110000; i++)
7922 unicode_org_wbp[i] = WBP_OTHER;
7924 stream = fopen (wordbreakproperty_filename, "r");
7925 if (stream == NULL)
7927 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
7928 exit (1);
7931 for (;;)
7933 char buf[200+1];
7934 unsigned int i1, i2;
7935 char padding[200+1];
7936 char propname[200+1];
7937 int propvalue;
7939 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7940 break;
7942 if (buf[0] == '\0' || buf[0] == '#')
7943 continue;
7945 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
7947 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
7949 fprintf (stderr, "parse error in '%s'\n",
7950 wordbreakproperty_filename);
7951 exit (1);
7953 i2 = i1;
7955 #define PROP(name,value) \
7956 if (strcmp (propname, name) == 0) propvalue = value; else
7957 PROP ("CR", WBP_CR)
7958 PROP ("LF", WBP_LF)
7959 PROP ("Newline", WBP_NEWLINE)
7960 PROP ("Extend", WBP_EXTEND)
7961 PROP ("Format", WBP_FORMAT)
7962 PROP ("Katakana", WBP_KATAKANA)
7963 PROP ("ALetter", WBP_ALETTER)
7964 PROP ("MidNumLet", WBP_MIDNUMLET)
7965 PROP ("MidLetter", WBP_MIDLETTER)
7966 PROP ("MidNum", WBP_MIDNUM)
7967 PROP ("Numeric", WBP_NUMERIC)
7968 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
7969 PROP ("Regional_Indicator", WBP_RI)
7970 PROP ("Double_Quote", WBP_DQ)
7971 PROP ("Single_Quote", WBP_SQ)
7972 PROP ("Hebrew_Letter", WBP_HL)
7973 #undef PROP
7975 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
7976 wordbreakproperty_filename);
7977 exit (1);
7979 assert (i1 <= i2 && i2 < 0x110000);
7981 for (i = i1; i <= i2; i++)
7982 unicode_org_wbp[i] = propvalue;
7985 if (ferror (stream) || fclose (stream))
7987 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
7988 exit (1);
7992 /* Output the word break property in a human readable format. */
7993 static void
7994 debug_output_org_wbp (FILE *stream)
7996 unsigned int i;
7998 for (i = 0; i < 0x110000; i++)
8000 int propvalue = unicode_org_wbp[i];
8001 if (propvalue != WBP_OTHER)
8003 fprintf (stream, "0x%04X", i);
8004 #define PROP(name,value) \
8005 if (propvalue == value) fprintf (stream, " " name); else
8006 PROP ("CR", WBP_CR)
8007 PROP ("LF", WBP_LF)
8008 PROP ("Newline", WBP_NEWLINE)
8009 PROP ("Extend", WBP_EXTEND)
8010 PROP ("Format", WBP_FORMAT)
8011 PROP ("Katakana", WBP_KATAKANA)
8012 PROP ("ALetter", WBP_ALETTER)
8013 PROP ("MidNumLet", WBP_MIDNUMLET)
8014 PROP ("MidLetter", WBP_MIDLETTER)
8015 PROP ("MidNum", WBP_MIDNUM)
8016 PROP ("Numeric", WBP_NUMERIC)
8017 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
8018 PROP ("Regional_Indicator", WBP_RI)
8019 PROP ("Double_Quote", WBP_DQ)
8020 PROP ("Single_Quote", WBP_SQ)
8021 PROP ("Hebrew_Letter", WBP_HL)
8022 #undef PROP
8023 fprintf (stream, " ??");
8024 fprintf (stream, "\n");
/* Create FILENAME and dump the contents of unicode_org_wbp[] into it through
   debug_output_org_wbp.  The program exits with status 1 when the file
   cannot be opened, or when writing to it or closing it fails.  */
static void
debug_output_org_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_wbp (out);

  /* When the stream already carries an error, fclose is not attempted.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Construction of sparse 3-level tables.
   The macros below parameterize "3level.h" before it is included.  The code
   in this file subsequently uses 'struct wbp_table' together with
   wbp_table_init, wbp_table_add and wbp_table_finalize, and stores
   'unsigned char' property values in it.
   NOTE(review): DEFAULT is presumably the value reported for code points
   that were never added, and xmalloc/xrealloc are presumably the allocators
   used inside "3level.h"; they are mapped to plain malloc/realloc here --
   confirm against 3level.h.  */
#define TABLE wbp_table
#define ELEMENT unsigned char
#define DEFAULT WBP_OTHER
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
8058 static void
8059 output_wbp (FILE *stream)
8061 unsigned int i;
8062 struct wbp_table t;
8063 unsigned int level1_offset, level2_offset, level3_offset;
8065 t.p = 7;
8066 t.q = 9;
8067 wbp_table_init (&t);
8069 for (i = 0; i < 0x110000; i++)
8071 int attr = get_wbp (i);
8073 /* Now attr should contain exactly one bit. */
8074 assert (attr != 0 && (attr & (attr - 1)) == 0);
8076 if (attr != 1 << WBP_OTHER)
8078 unsigned int log2_attr;
8079 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
8081 wbp_table_add (&t, i, log2_attr);
8085 wbp_table_finalize (&t);
8087 level1_offset =
8088 5 * sizeof (uint32_t);
8089 level2_offset =
8090 5 * sizeof (uint32_t)
8091 + t.level1_size * sizeof (uint32_t);
8092 level3_offset =
8093 5 * sizeof (uint32_t)
8094 + t.level1_size * sizeof (uint32_t)
8095 + (t.level2_size << t.q) * sizeof (uint32_t);
8097 for (i = 0; i < 5; i++)
8098 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
8099 ((uint32_t *) t.result)[i]);
8100 fprintf (stream, "\n");
8101 fprintf (stream, "typedef struct\n");
8102 fprintf (stream, " {\n");
8103 fprintf (stream, " int level1[%zu];\n", t.level1_size);
8104 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
8105 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
8106 fprintf (stream, " }\n");
8107 fprintf (stream, "wbrkprop_t;\n");
8108 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
8109 fprintf (stream, "{\n");
8110 fprintf (stream, " {");
8111 if (t.level1_size > 8)
8112 fprintf (stream, "\n ");
8113 for (i = 0; i < t.level1_size; i++)
8115 uint32_t offset;
8116 if (i > 0 && (i % 8) == 0)
8117 fprintf (stream, "\n ");
8118 offset = ((uint32_t *) (t.result + level1_offset))[i];
8119 if (offset == 0)
8120 fprintf (stream, " %5d", -1);
8121 else
8122 fprintf (stream, " %5zu",
8123 (offset - level2_offset) / sizeof (uint32_t));
8124 if (i+1 < t.level1_size)
8125 fprintf (stream, ",");
8127 if (t.level1_size > 8)
8128 fprintf (stream, "\n ");
8129 fprintf (stream, " },\n");
8130 fprintf (stream, " {");
8131 if (t.level2_size << t.q > 8)
8132 fprintf (stream, "\n ");
8133 for (i = 0; i < t.level2_size << t.q; i++)
8135 uint32_t offset;
8136 if (i > 0 && (i % 8) == 0)
8137 fprintf (stream, "\n ");
8138 offset = ((uint32_t *) (t.result + level2_offset))[i];
8139 if (offset == 0)
8140 fprintf (stream, " %5d", -1);
8141 else
8142 fprintf (stream, " %5zu",
8143 (offset - level3_offset) / sizeof (unsigned char));
8144 if (i+1 < t.level2_size << t.q)
8145 fprintf (stream, ",");
8147 if (t.level2_size << t.q > 8)
8148 fprintf (stream, "\n ");
8149 fprintf (stream, " },\n");
8150 fprintf (stream, " {");
8151 if (t.level3_size << t.p > 4)
8152 fprintf (stream, "\n ");
8153 for (i = 0; i < t.level3_size << t.p; i++)
8155 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
8156 const char *value_string;
8157 switch (value)
8159 #define CASE(x) case x: value_string = #x; break;
8160 CASE(WBP_OTHER);
8161 CASE(WBP_CR);
8162 CASE(WBP_LF);
8163 CASE(WBP_NEWLINE);
8164 CASE(WBP_EXTEND);
8165 CASE(WBP_FORMAT);
8166 CASE(WBP_KATAKANA);
8167 CASE(WBP_ALETTER);
8168 CASE(WBP_MIDNUMLET);
8169 CASE(WBP_MIDLETTER);
8170 CASE(WBP_MIDNUM);
8171 CASE(WBP_NUMERIC);
8172 CASE(WBP_EXTENDNUMLET);
8173 CASE(WBP_RI);
8174 CASE(WBP_DQ);
8175 CASE(WBP_SQ);
8176 CASE(WBP_HL);
8177 #undef CASE
8178 default:
8179 abort ();
8181 if (i > 0 && (i % 4) == 0)
8182 fprintf (stream, "\n ");
8183 fprintf (stream, " %s%s", value_string,
8184 (i+1 < t.level3_size << t.p ? "," : ""));
8186 if (t.level3_size << t.p > 4)
8187 fprintf (stream, "\n ");
8188 fprintf (stream, " }\n");
8189 fprintf (stream, "};\n");
/* Generate the word break table file FILENAME for Unicode VERSION.
   The file receives a banner, a license header and then the table printed
   by output_wbp.  The program exits with status 1 if the file cannot be
   opened, written or closed.  */
static void
output_wbrk_tables (const char *filename, const char *version)
{
  FILE *stream;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  /* This file holds the word break table, so say so (the banner used to be
     a verbatim copy of the one for the line breaking tables).  */
  fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");

  output_wbp (stream);

  /* When the stream already carries an error, fclose is not attempted.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8238 /* ========================================================================= */
8240 /* Grapheme break property.
8241 Updated for Unicode TR #29 revision 17. */
/* Possible values of the Grapheme_Cluster_Break property.
   output_gbp_table packs two of these values into one byte, so every
   value has to fit into 4 bits.  */
enum
{
  GBP_OTHER       =  0,   /* default for code points not listed */
  GBP_CR          =  1,
  GBP_LF          =  2,
  GBP_CONTROL     =  3,
  GBP_EXTEND      =  4,
  GBP_PREPEND     =  5,
  GBP_SPACINGMARK =  6,
  GBP_L           =  7,
  GBP_V           =  8,
  GBP_T           =  9,
  GBP_LV          = 10,
  GBP_LVT         = 11,
  GBP_RI          = 12    /* "Regional_Indicator" in the data file */
};
8261 /* Construction of sparse 3-level tables. */
8262 #define TABLE gbp_table
8263 #define ELEMENT unsigned char
8264 #define DEFAULT GBP_OTHER
8265 #define xmalloc malloc
8266 #define xrealloc realloc
8267 #include "3level.h"
8269 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
8270 int unicode_org_gbp[0x110000];
8272 /* Output the unit test data for the grapheme break property. */
8273 static void
8274 output_gbp_test (const char *filename)
8276 FILE *stream;
8277 bool need_comma;
8278 unsigned int ch;
8280 stream = fopen (filename, "w");
8281 if (stream == NULL)
8283 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8284 exit (1);
8287 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8288 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
8289 fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
8290 fprintf (stream, "\n");
8291 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
8292 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
8293 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
8294 fprintf (stream, " (at your option) any later version.\n");
8295 fprintf (stream, "\n");
8296 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
8297 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
8298 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
8299 fprintf (stream, " GNU General Public License for more details.\n");
8300 fprintf (stream, "\n");
8301 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
8302 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
8303 fprintf (stream, "\n");
8305 need_comma = false;
8306 for (ch = 0; ch < 0x110000; ch++)
8308 int gbp = unicode_org_gbp[ch];
8309 const char *gbp_string;
8311 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
8312 ch++;
8314 switch (gbp)
8316 #define CASE(x) case x: gbp_string = #x; break;
8317 CASE (GBP_OTHER)
8318 CASE (GBP_CR)
8319 CASE (GBP_LF)
8320 CASE (GBP_CONTROL)
8321 CASE (GBP_EXTEND)
8322 CASE (GBP_PREPEND)
8323 CASE (GBP_SPACINGMARK)
8324 CASE (GBP_L)
8325 CASE (GBP_V)
8326 CASE (GBP_T)
8327 CASE (GBP_LV)
8328 CASE (GBP_LVT)
8329 CASE (GBP_RI)
8330 #undef CASE
8331 default:
8332 abort ();
8335 if (need_comma)
8336 fprintf (stream, ",\n");
8337 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
8339 need_comma = true;
8341 fprintf (stream, "\n");
8343 if (ferror (stream) || fclose (stream))
8345 fprintf (stderr, "error writing to '%s'\n", filename);
8346 exit (1);
8350 /* Output the per-character grapheme break property table. */
8351 static void
8352 output_gbp_table (const char *filename, const char *version)
8354 FILE *stream;
8355 unsigned int ch, i;
8356 struct gbp_table t;
8357 unsigned int level1_offset, level2_offset, level3_offset;
8359 stream = fopen (filename, "w");
8360 if (stream == NULL)
8362 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8363 exit (1);
8366 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8367 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
8368 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8369 version);
8371 t.p = 7;
8372 t.q = 9;
8373 gbp_table_init (&t);
8375 for (ch = 0; ch < 0x110000; ch++)
8376 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
8378 gbp_table_finalize (&t);
8380 /* Offsets in t.result, in memory of this process. */
8381 level1_offset =
8382 5 * sizeof (uint32_t);
8383 level2_offset =
8384 5 * sizeof (uint32_t)
8385 + t.level1_size * sizeof (uint32_t);
8386 level3_offset =
8387 5 * sizeof (uint32_t)
8388 + t.level1_size * sizeof (uint32_t)
8389 + (t.level2_size << t.q) * sizeof (uint32_t);
8391 for (i = 0; i < 5; i++)
8392 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
8393 ((uint32_t *) t.result)[i]);
8394 fprintf (stream, "static const\n");
8395 fprintf (stream, "struct\n");
8396 fprintf (stream, " {\n");
8397 fprintf (stream, " int level1[%zu];\n", t.level1_size);
8398 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
8399 fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
8400 t.level3_size, t.p);
8401 fprintf (stream, " }\n");
8402 fprintf (stream, "unigbrkprop =\n");
8403 fprintf (stream, "{\n");
8404 fprintf (stream, " {");
8405 if (t.level1_size > 8)
8406 fprintf (stream, "\n ");
8407 for (i = 0; i < t.level1_size; i++)
8409 uint32_t offset;
8410 if (i > 0 && (i % 8) == 0)
8411 fprintf (stream, "\n ");
8412 offset = ((uint32_t *) (t.result + level1_offset))[i];
8413 if (offset == 0)
8414 fprintf (stream, " %5d", -1);
8415 else
8416 fprintf (stream, " %5zu",
8417 (offset - level2_offset) / sizeof (uint32_t));
8418 if (i+1 < t.level1_size)
8419 fprintf (stream, ",");
8421 if (t.level1_size > 8)
8422 fprintf (stream, "\n ");
8423 fprintf (stream, " },\n");
8424 fprintf (stream, " {");
8425 if (t.level2_size << t.q > 8)
8426 fprintf (stream, "\n ");
8427 for (i = 0; i < t.level2_size << t.q; i++)
8429 uint32_t offset;
8430 if (i > 0 && (i % 8) == 0)
8431 fprintf (stream, "\n ");
8432 offset = ((uint32_t *) (t.result + level2_offset))[i];
8433 if (offset == 0)
8434 fprintf (stream, " %5d", -1);
8435 else
8436 fprintf (stream, " %5zu",
8437 (offset - level3_offset) / sizeof (uint8_t) / 2);
8438 if (i+1 < t.level2_size << t.q)
8439 fprintf (stream, ",");
8441 if (t.level2_size << t.q > 8)
8442 fprintf (stream, "\n ");
8443 fprintf (stream, " },\n");
8444 fprintf (stream, " {");
8445 if (t.level3_size << t.p > 8)
8446 fprintf (stream, "\n ");
8447 for (i = 0; i < (t.level3_size << t.p) / 2; i++)
8449 unsigned char *p = (unsigned char *) (t.result + level3_offset);
8450 unsigned char value0 = p[i * 2];
8451 unsigned char value1 = p[i * 2 + 1];
8452 if (i > 0 && (i % 8) == 0)
8453 fprintf (stream, "\n ");
8454 fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
8455 (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
8457 if (t.level3_size << t.p > 8)
8458 fprintf (stream, "\n ");
8459 fprintf (stream, " }\n");
8460 fprintf (stream, "};\n");
8462 if (ferror (stream) || fclose (stream))
8464 fprintf (stderr, "error writing to '%s'\n", filename);
8465 exit (1);
8469 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
8470 GraphemeBreakProperty.txt file. */
8471 static void
8472 fill_org_gbp (const char *graphemebreakproperty_filename)
8474 unsigned int i;
8475 FILE *stream;
8476 int lineno = 0;
8478 for (i = 0; i < 0x110000; i++)
8479 unicode_org_gbp[i] = GBP_OTHER;
8481 stream = fopen (graphemebreakproperty_filename, "r");
8482 if (stream == NULL)
8484 fprintf (stderr, "error during fopen of '%s'\n",
8485 graphemebreakproperty_filename);
8486 exit (1);
8489 for (;;)
8491 char buf[200+1];
8492 unsigned int i1, i2;
8493 char padding[200+1];
8494 char propname[200+1];
8495 int propvalue;
8497 lineno++;
8498 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8499 break;
8501 if (buf[0] == '\0' || buf[0] == '#')
8502 continue;
8504 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
8506 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
8508 fprintf (stderr, "parse error in '%s'\n",
8509 graphemebreakproperty_filename);
8510 exit (1);
8512 i2 = i1;
8514 #define PROP(name,value) \
8515 if (strcmp (propname, name) == 0) propvalue = value; else
8516 PROP ("CR", GBP_CR)
8517 PROP ("LF", GBP_LF)
8518 PROP ("Control", GBP_CONTROL)
8519 PROP ("Extend", GBP_EXTEND)
8520 PROP ("Prepend", GBP_PREPEND)
8521 PROP ("SpacingMark", GBP_SPACINGMARK)
8522 PROP ("L", GBP_L)
8523 PROP ("V", GBP_V)
8524 PROP ("T", GBP_T)
8525 PROP ("LV", GBP_LV)
8526 PROP ("LVT", GBP_LVT)
8527 PROP ("Regional_Indicator", GBP_RI)
8528 #undef PROP
8530 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
8531 graphemebreakproperty_filename, lineno);
8532 exit (1);
8534 assert (i1 <= i2 && i2 < 0x110000);
8536 for (i = i1; i <= i2; i++)
8537 unicode_org_gbp[i] = propvalue;
8540 if (ferror (stream) || fclose (stream))
8542 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
8543 exit (1);
8547 /* ========================================================================= */
8549 /* Composition and decomposition.
8550 Updated for Unicode TR #15 revision 33. */
/* Upper bound for the number of characters in the decomposition of a
   single Unicode character.  get_decomposition asserts that no
   decomposition is longer than this.  */
#define MAX_DECOMP_LENGTH 18

/* Decomposition types, numbered consecutively starting at 0.
   Each compatibility type corresponds to the <tag> shown in its comment.  */
enum
{
  UC_DECOMP_CANONICAL, /*            Canonical decomposition.                  */
  UC_DECOMP_FONT,      /* <font>     A font variant (e.g. a blackletter form). */
  UC_DECOMP_NOBREAK,   /* <noBreak>  A no-break version of a space or hyphen.  */
  UC_DECOMP_INITIAL,   /* <initial>  An initial presentation form (Arabic).    */
  UC_DECOMP_MEDIAL,    /* <medial>   A medial presentation form (Arabic).      */
  UC_DECOMP_FINAL,     /* <final>    A final presentation form (Arabic).       */
  UC_DECOMP_ISOLATED,  /* <isolated> An isolated presentation form (Arabic).   */
  UC_DECOMP_CIRCLE,    /* <circle>   An encircled form.                        */
  UC_DECOMP_SUPER,     /* <super>    A superscript form.                       */
  UC_DECOMP_SUB,       /* <sub>      A subscript form.                         */
  UC_DECOMP_VERTICAL,  /* <vertical> A vertical layout presentation form.      */
  UC_DECOMP_WIDE,      /* <wide>     A wide (or zenkaku) compatibility character. */
  UC_DECOMP_NARROW,    /* <narrow>   A narrow (or hankaku) compatibility character. */
  UC_DECOMP_SMALL,     /* <small>    A small variant form (CNS compatibility). */
  UC_DECOMP_SQUARE,    /* <square>   A CJK squared font variant.               */
  UC_DECOMP_FRACTION,  /* <fraction> A vulgar fraction form.                   */
  UC_DECOMP_COMPAT     /* <compat>   Otherwise unspecified compatibility character. */
};
8577 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
8578 decompositions). Return the type, or -1 for none. */
8579 static int
8580 get_decomposition (unsigned int ch,
8581 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
8583 const char *decomposition = unicode_attributes[ch].decomposition;
8585 if (decomposition != NULL && decomposition[0] != '\0')
8587 int type = UC_DECOMP_CANONICAL;
8588 unsigned int length;
8589 char *endptr;
8591 if (decomposition[0] == '<')
8593 const char *rangle;
8594 size_t typelen;
8596 rangle = strchr (decomposition + 1, '>');
8597 assert (rangle != NULL);
8598 typelen = rangle + 1 - decomposition;
8599 #define TYPE(t1,t2) \
8600 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
8601 type = t2; \
8602 else
8603 TYPE ("<font>", UC_DECOMP_FONT)
8604 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
8605 TYPE ("<initial>", UC_DECOMP_INITIAL)
8606 TYPE ("<medial>", UC_DECOMP_MEDIAL)
8607 TYPE ("<final>", UC_DECOMP_FINAL)
8608 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
8609 TYPE ("<circle>", UC_DECOMP_CIRCLE)
8610 TYPE ("<super>", UC_DECOMP_SUPER)
8611 TYPE ("<sub>", UC_DECOMP_SUB)
8612 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
8613 TYPE ("<wide>", UC_DECOMP_WIDE)
8614 TYPE ("<narrow>", UC_DECOMP_NARROW)
8615 TYPE ("<small>", UC_DECOMP_SMALL)
8616 TYPE ("<square>", UC_DECOMP_SQUARE)
8617 TYPE ("<fraction>", UC_DECOMP_FRACTION)
8618 TYPE ("<compat>", UC_DECOMP_COMPAT)
8620 fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
8621 exit (1);
8623 #undef TYPE
8624 decomposition = rangle + 1;
8625 if (decomposition[0] == ' ')
8626 decomposition++;
8628 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
8630 decomposed[length] = strtoul (decomposition, &endptr, 16);
8631 if (endptr == decomposition)
8632 break;
8633 decomposition = endptr;
8634 if (decomposition[0] == ' ')
8635 decomposition++;
8637 /* Make sure that *DECOMPOSITION is not NULL-terminated.
8638 Otherwise MAX_DECOMP_LENGTH is too small. */
8639 assert (*decomposition == '\0');
8641 *lengthp = length;
8642 return type;
8644 else
8645 return -1;
8648 /* Construction of sparse 3-level tables. */
8649 #define TABLE decomp_table
8650 #define ELEMENT uint16_t
8651 #define DEFAULT (uint16_t)(-1)
8652 #define xmalloc malloc
8653 #define xrealloc realloc
8654 #include "3level.h"
8656 static void
8657 output_decomposition (FILE *stream1, FILE *stream2)
8659 struct decomp_table t;
8660 unsigned int level1_offset, level2_offset, level3_offset;
8661 unsigned int offset;
8662 unsigned int ch;
8663 unsigned int i;
8665 t.p = 5;
8666 t.q = 5;
8667 decomp_table_init (&t);
8669 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
8670 fprintf (stream1, "\n");
8671 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
8672 offset = 0;
8674 for (ch = 0; ch < 0x110000; ch++)
8676 unsigned int length;
8677 unsigned int decomposed[MAX_DECOMP_LENGTH];
8678 int type = get_decomposition (ch, &length, decomposed);
8680 if (type >= 0)
8682 assert (offset < (1 << 15));
8683 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
8685 /* Produce length 3-bytes entries. */
8686 /* We would need a special representation of zero-length entries. */
8687 assert (length != 0);
8688 for (i = 0; i < length; i++)
8690 if (offset > 0)
8691 fprintf (stream2, ",");
8692 if ((offset % 4) == 0)
8693 fprintf (stream2, "\n ");
8694 assert (decomposed[i] < (1 << 18));
8695 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
8696 (((i+1 < length ? (1 << 23) : 0)
8697 | (i == 0 ? (type << 18) : 0)
8698 | decomposed[i]) >> 16) & 0xff,
8699 (decomposed[i] >> 8) & 0xff,
8700 decomposed[i] & 0xff);
8701 offset++;
8706 fprintf (stream2, "\n};\n");
8707 fprintf (stream2, "\n");
8709 decomp_table_finalize (&t);
8711 level1_offset =
8712 5 * sizeof (uint32_t);
8713 level2_offset =
8714 5 * sizeof (uint32_t)
8715 + t.level1_size * sizeof (uint32_t);
8716 level3_offset =
8717 5 * sizeof (uint32_t)
8718 + t.level1_size * sizeof (uint32_t)
8719 + (t.level2_size << t.q) * sizeof (uint32_t);
8721 for (i = 0; i < 5; i++)
8722 fprintf (stream1, "#define decomp_header_%d %d\n", i,
8723 ((uint32_t *) t.result)[i]);
8724 fprintf (stream1, "\n");
8725 fprintf (stream1, "typedef struct\n");
8726 fprintf (stream1, " {\n");
8727 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
8728 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
8729 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
8730 fprintf (stream1, " }\n");
8731 fprintf (stream1, "decomp_index_table_t;\n");
8732 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
8733 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
8734 fprintf (stream2, "{\n");
8735 fprintf (stream2, " {");
8736 if (t.level1_size > 8)
8737 fprintf (stream2, "\n ");
8738 for (i = 0; i < t.level1_size; i++)
8740 uint32_t offset;
8741 if (i > 0 && (i % 8) == 0)
8742 fprintf (stream2, "\n ");
8743 offset = ((uint32_t *) (t.result + level1_offset))[i];
8744 if (offset == 0)
8745 fprintf (stream2, " %5d", -1);
8746 else
8747 fprintf (stream2, " %5zu",
8748 (offset - level2_offset) / sizeof (uint32_t));
8749 if (i+1 < t.level1_size)
8750 fprintf (stream2, ",");
8752 if (t.level1_size > 8)
8753 fprintf (stream2, "\n ");
8754 fprintf (stream2, " },\n");
8755 fprintf (stream2, " {");
8756 if (t.level2_size << t.q > 8)
8757 fprintf (stream2, "\n ");
8758 for (i = 0; i < t.level2_size << t.q; i++)
8760 uint32_t offset;
8761 if (i > 0 && (i % 8) == 0)
8762 fprintf (stream2, "\n ");
8763 offset = ((uint32_t *) (t.result + level2_offset))[i];
8764 if (offset == 0)
8765 fprintf (stream2, " %5d", -1);
8766 else
8767 fprintf (stream2, " %5zu",
8768 (offset - level3_offset) / sizeof (uint16_t));
8769 if (i+1 < t.level2_size << t.q)
8770 fprintf (stream2, ",");
8772 if (t.level2_size << t.q > 8)
8773 fprintf (stream2, "\n ");
8774 fprintf (stream2, " },\n");
8775 fprintf (stream2, " {");
8776 if (t.level3_size << t.p > 8)
8777 fprintf (stream2, "\n ");
8778 for (i = 0; i < t.level3_size << t.p; i++)
8780 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
8781 if (i > 0 && (i % 8) == 0)
8782 fprintf (stream2, "\n ");
8783 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
8784 if (i+1 < t.level3_size << t.p)
8785 fprintf (stream2, ",");
8787 if (t.level3_size << t.p > 8)
8788 fprintf (stream2, "\n ");
8789 fprintf (stream2, " }\n");
8790 fprintf (stream2, "};\n");
/* Write the decomposition tables to two files: the declarations to
   FILENAME1 and the definitions to FILENAME2 (see output_decomposition).
   VERSION is the Unicode version named in both header comments.
   Exits with status 1 on I/O failure.  */
static void
output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *filenames[2] = { filename1, filename2 };
  FILE *streams[2];
  size_t k;

  /* Open both files before anything is written to either of them.  */
  for (k = 0; k < 2; k++)
    {
      streams[k] = fopen (filenames[k], "w");
      if (streams[k] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[k]);
          exit (1);
        }
    }

  for (k = 0; k < 2; k++)
    fprintf (streams[k],
             "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"
             "/* Decomposition of Unicode characters. */\n"
             "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n"
             "\n",
             version);

  output_decomposition (streams[0], streams[1]);

  for (k = 0; k < 2; k++)
    if (ferror (streams[k]) || fclose (streams[k]))
      {
        fprintf (stderr, "error writing to '%s'\n", filenames[k]);
        exit (1);
      }
}
/* The "excluded from composition" property from the CompositionExclusions.txt file. */
char unicode_composition_exclusions[0x110000];

/* Stores in unicode_composition_exclusions[] the code points listed in
   the file COMPOSITIONEXCLUSIONS_FILENAME: 1 for every listed code point,
   0 for all others.
   Each data line starts with a hexadecimal code point.  Empty lines,
   whitespace-only lines and lines starting with '#' are ignored.
   Exits with status 1 if the file cannot be read or a line cannot be
   parsed.  */
static void
fill_composition_exclusions (const char *compositionexclusions_filename)
{
  FILE *stream;
  char buf[200+1];

  stream = fopen (compositionexclusions_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
      exit (1);
    }

  memset (unicode_composition_exclusions, 0,
          sizeof (unicode_composition_exclusions));

  /* Read line by line with fgets.  A scanf format of the form
     "%200[^\n]\n" fails on a file that begins with an empty line, which
     would silently end the parsing before the first entry.  */
  while (fgets (buf, sizeof buf, stream) != NULL)
    {
      /* Skip leading whitespace, including the newline of a blank line.  */
      const char *line = buf + strspn (buf, " \t\r\n\v\f");
      unsigned int code;

      if (line[0] == '\0' || line[0] == '#')
        continue;

      if (sscanf (line, "%X", &code) != 1)
        {
          fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
          exit (1);
        }
      assert (code < 0x110000);

      unicode_composition_exclusions[code] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
      exit (1);
    }
}
8883 static void
8884 debug_output_composition_tables (const char *filename)
8886 FILE *stream;
8887 unsigned int ch;
8889 stream = fopen (filename, "w");
8890 if (stream == NULL)
8892 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8893 exit (1);
8896 for (ch = 0; ch < 0x110000; ch++)
8898 unsigned int length;
8899 unsigned int decomposed[MAX_DECOMP_LENGTH];
8900 int type = get_decomposition (ch, &length, decomposed);
8902 if (type == UC_DECOMP_CANONICAL
8903 /* Consider only binary decompositions.
8904 Exclude singleton decompositions. */
8905 && length == 2)
8907 unsigned int code1 = decomposed[0];
8908 unsigned int code2 = decomposed[1];
8909 unsigned int combined = ch;
8911 /* Exclude decompositions where the first part is not a starter,
8912 i.e. is not of canonical combining class 0. */
8913 if (strcmp (unicode_attributes[code1].combining, "0") == 0
8914 /* Exclude characters listed in CompositionExclusions.txt. */
8915 && !unicode_composition_exclusions[combined])
8917 /* The combined character must now also be a starter.
8918 Verify this. */
8919 assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
8921 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
8922 code1,
8923 code2,
8924 combined,
8925 unicode_attributes[code2].combining);
8930 if (ferror (stream) || fclose (stream))
8932 fprintf (stderr, "error writing to '%s'\n", filename);
8933 exit (1);
8937 static void
8938 output_composition_tables (const char *filename, const char *version)
8940 FILE *stream;
8941 unsigned int ch;
8943 stream = fopen (filename, "w");
8944 if (stream == NULL)
8946 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8947 exit (1);
8950 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8951 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
8952 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8953 version);
8954 fprintf (stream, "\n");
8956 /* Put a GPL header on it. The gnulib module is under LGPL (although it
8957 still carries the GPL header), and it's gnulib-tool which replaces the
8958 GPL header with an LGPL header. */
8959 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
8960 fprintf (stream, "\n");
8961 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
8962 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
8963 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
8964 fprintf (stream, " (at your option) any later version.\n");
8965 fprintf (stream, "\n");
8966 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
8967 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
8968 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
8969 fprintf (stream, " GNU General Public License for more details.\n");
8970 fprintf (stream, "\n");
8971 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
8972 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
8973 fprintf (stream, "\n");
8975 /* The composition table is a set of mappings (code1, code2) -> combined,
8976 with 928 entries,
8977 367 values for code1 (from 0x003C to 0x30FD),
8978 54 values for code2 (from 0x0300 to 0x309A).
8979 For a fixed code1, there are from 1 to 19 possible values for code2.
8980 For a fixed code2, there are from 1 to 117 possible values for code1.
8981 This is a very sparse matrix.
8983 We want an O(1) hash lookup.
8985 We could implement the hash lookup by mapping (code1, code2) to a linear
8986 combination mul1*code1 + mul2*code2, which is then used as an index into
8987 a 3-level table. But this leads to a table of size 37 KB.
8989 We use gperf to implement the hash lookup, giving it the 928 sets of
8990 4 bytes (code1, code2) as input. gperf generates a hash table of size
8991 1527, which is quite good (60% filled). It requires an auxiliary table
8992 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
8994 fprintf (stream, "struct composition_rule { char codes[6]; };\n");
8995 fprintf (stream, "%%struct-type\n");
8996 fprintf (stream, "%%language=ANSI-C\n");
8997 fprintf (stream, "%%define slot-name codes\n");
8998 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
8999 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
9000 fprintf (stream, "%%compare-lengths\n");
9001 fprintf (stream, "%%compare-strncmp\n");
9002 fprintf (stream, "%%readonly-tables\n");
9003 fprintf (stream, "%%omit-struct-type\n");
9004 fprintf (stream, "%%%%\n");
9006 for (ch = 0; ch < 0x110000; ch++)
9008 unsigned int length;
9009 unsigned int decomposed[MAX_DECOMP_LENGTH];
9010 int type = get_decomposition (ch, &length, decomposed);
9012 if (type == UC_DECOMP_CANONICAL
9013 /* Consider only binary decompositions.
9014 Exclude singleton decompositions. */
9015 && length == 2)
9017 unsigned int code1 = decomposed[0];
9018 unsigned int code2 = decomposed[1];
9019 unsigned int combined = ch;
9021 /* Exclude decompositions where the first part is not a starter,
9022 i.e. is not of canonical combining class 0. */
9023 if (strcmp (unicode_attributes[code1].combining, "0") == 0
9024 /* Exclude characters listed in CompositionExclusions.txt. */
9025 && !unicode_composition_exclusions[combined])
9027 /* The combined character must now also be a starter.
9028 Verify this. */
9029 assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
9031 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
9032 (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
9033 (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
9034 combined);
9039 if (ferror (stream) || fclose (stream))
9041 fprintf (stderr, "error writing to '%s'\n", filename);
9042 exit (1);
9046 /* ========================================================================= */
/* Output the test for a simple character mapping table to the given file.
   FILENAME is the file to create.  FUNC is the mapping; one entry
   "{ ch, FUNC (ch) }" is written for every code point that FUNC does not
   map to itself.  FUNCTION_NAME is the name of the function that the
   generated test invokes through its MAP macro.  VERSION is the Unicode
   version named in the header comment.
   Exits with status 1 on I/O failure.  */

static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  FILE *out = fopen (filename, "w");
  bool wrote_entry = false;
  unsigned int ch;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"
         "/* Test the Unicode character mapping functions.\n"
         " Copyright (C) 2009 Free Software Foundation, Inc.\n"
         "\n"
         " This program is free software: you can redistribute it and/or modify\n"
         " it under the terms of the GNU General Public License as published by\n"
         " the Free Software Foundation; either version 3 of the License, or\n"
         " (at your option) any later version.\n"
         "\n"
         " This program is distributed in the hope that it will be useful,\n"
         " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
         " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
         " GNU General Public License for more details.\n"
         "\n"
         " You should have received a copy of the GNU General Public License\n"
         " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n"
         "\n",
         out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fputs ("\n"
         "#include \"test-mapping-part1.h\"\n"
         "\n",
         out);

  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int mapped = func (ch);

      if (mapped == ch)
        continue;

      /* Entries are separated, not terminated, by ",\n".  */
      if (wrote_entry)
        fputs (",\n", out);
      fprintf (out, " { 0x%04X, 0x%04X }", ch, mapped);
      wrote_entry = true;
    }
  if (wrote_entry)
    fputs ("\n", out);

  fputs ("\n", out);
  fprintf (out, "#define MAP(c) %s (c)\n", function_name);
  fputs ("#include \"test-mapping-part2.h\"\n", out);

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
9117 /* Construction of sparse 3-level tables. */
9118 #define TABLE mapping_table
9119 #define ELEMENT int32_t
9120 #define DEFAULT 0
9121 #define xmalloc malloc
9122 #define xrealloc realloc
9123 #include "3level.h"
9125 /* Output a simple character mapping table to the given file. */
9127 static void
9128 output_simple_mapping (const char *filename,
9129 unsigned int (*func) (unsigned int),
9130 const char *version)
9132 FILE *stream;
9133 unsigned int ch, i;
9134 struct mapping_table t;
9135 unsigned int level1_offset, level2_offset, level3_offset;
9137 stream = fopen (filename, "w");
9138 if (stream == NULL)
9140 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9141 exit (1);
9144 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9145 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
9146 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9147 version);
9149 t.p = 7;
9150 t.q = 9;
9151 mapping_table_init (&t);
9153 for (ch = 0; ch < 0x110000; ch++)
9155 int value = (int) func (ch) - (int) ch;
9157 mapping_table_add (&t, ch, value);
9160 mapping_table_finalize (&t);
9162 /* Offsets in t.result, in memory of this process. */
9163 level1_offset =
9164 5 * sizeof (uint32_t);
9165 level2_offset =
9166 5 * sizeof (uint32_t)
9167 + t.level1_size * sizeof (uint32_t);
9168 level3_offset =
9169 5 * sizeof (uint32_t)
9170 + t.level1_size * sizeof (uint32_t)
9171 + (t.level2_size << t.q) * sizeof (uint32_t);
9173 for (i = 0; i < 5; i++)
9174 fprintf (stream, "#define mapping_header_%d %d\n", i,
9175 ((uint32_t *) t.result)[i]);
9176 fprintf (stream, "static const\n");
9177 fprintf (stream, "struct\n");
9178 fprintf (stream, " {\n");
9179 fprintf (stream, " int level1[%zu];\n", t.level1_size);
9180 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
9181 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
9182 fprintf (stream, " }\n");
9183 fprintf (stream, "u_mapping =\n");
9184 fprintf (stream, "{\n");
9185 fprintf (stream, " {");
9186 if (t.level1_size > 8)
9187 fprintf (stream, "\n ");
9188 for (i = 0; i < t.level1_size; i++)
9190 uint32_t offset;
9191 if (i > 0 && (i % 8) == 0)
9192 fprintf (stream, "\n ");
9193 offset = ((uint32_t *) (t.result + level1_offset))[i];
9194 if (offset == 0)
9195 fprintf (stream, " %5d", -1);
9196 else
9197 fprintf (stream, " %5zu",
9198 (offset - level2_offset) / sizeof (uint32_t));
9199 if (i+1 < t.level1_size)
9200 fprintf (stream, ",");
9202 if (t.level1_size > 8)
9203 fprintf (stream, "\n ");
9204 fprintf (stream, " },\n");
9205 fprintf (stream, " {");
9206 if (t.level2_size << t.q > 8)
9207 fprintf (stream, "\n ");
9208 for (i = 0; i < t.level2_size << t.q; i++)
9210 uint32_t offset;
9211 if (i > 0 && (i % 8) == 0)
9212 fprintf (stream, "\n ");
9213 offset = ((uint32_t *) (t.result + level2_offset))[i];
9214 if (offset == 0)
9215 fprintf (stream, " %5d", -1);
9216 else
9217 fprintf (stream, " %5zu",
9218 (offset - level3_offset) / sizeof (int32_t));
9219 if (i+1 < t.level2_size << t.q)
9220 fprintf (stream, ",");
9222 if (t.level2_size << t.q > 8)
9223 fprintf (stream, "\n ");
9224 fprintf (stream, " },\n");
9225 fprintf (stream, " {");
9226 if (t.level3_size << t.p > 8)
9227 fprintf (stream, "\n ");
9228 for (i = 0; i < t.level3_size << t.p; i++)
9230 if (i > 0 && (i % 8) == 0)
9231 fprintf (stream, "\n ");
9232 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
9233 if (i+1 < t.level3_size << t.p)
9234 fprintf (stream, ",");
9236 if (t.level3_size << t.p > 8)
9237 fprintf (stream, "\n ");
9238 fprintf (stream, " }\n");
9239 fprintf (stream, "};\n");
9241 if (ferror (stream) || fclose (stream))
9243 fprintf (stderr, "error writing to '%s'\n", filename);
9244 exit (1);
/* ========================================================================= */
/* A special casing context.
   The constants are consecutive non-negative integers starting at 0.
   A context is negated through x -> -x; since SCC_ALWAYS is 0, negating it
   yields SCC_ALWAYS again.  */
enum
{
  SCC_ALWAYS = 0,
  SCC_FINAL_SIGMA,
  SCC_AFTER_SOFT_DOTTED,
  SCC_MORE_ABOVE,
  SCC_BEFORE_DOT,
  SCC_AFTER_I
};
/* A special casing rule.  */
struct special_casing_rule
{
  /* The code point to which the rule applies.  */
  unsigned int code;
  /* The lower, title, upper case and casefold mappings of CODE.  Each
     consists of up to 3 code points; unused trailing elements are 0.  */
  unsigned int lower_mapping[3];
  unsigned int title_mapping[3];
  unsigned int upper_mapping[3];
  unsigned int casefold_mapping[3];
  /* NULL if the rule applies to all languages, otherwise a two-letter
     language code.  */
  const char *language;
  /* One of the SCC_* values, or the negation of one of them.  */
  int context;
};
/* The special casing rules.
   casing_rules is a growable array of pointers; the first num_casing_rules
   elements are in use, and there is room for allocated_casing_rules
   elements.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Appends NEW_RULE to casing_rules.
   The pointer itself is stored; the rule is not copied.
   When the array is full, its capacity is doubled (with a minimum of 16).
   Terminates the program if the array cannot be grown.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;
      /* Keep the old pointer until realloc is known to have succeeded.  */
      new_rules =
        (struct special_casing_rule **)
        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
      if (new_rules == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
9294 /* Stores in casing_rules the special casing rules found in
9295 specialcasing_filename. */
9296 static void
9297 fill_casing_rules (const char *specialcasing_filename)
9299 FILE *stream;
9301 stream = fopen (specialcasing_filename, "r");
9302 if (stream == NULL)
9304 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
9305 exit (1);
9308 casing_rules = NULL;
9309 num_casing_rules = 0;
9310 allocated_casing_rules = 0;
9312 for (;;)
9314 char buf[200+1];
9315 char *scanptr;
9316 char *endptr;
9317 int i;
9319 unsigned int code;
9320 unsigned int lower_mapping[3];
9321 unsigned int title_mapping[3];
9322 unsigned int upper_mapping[3];
9323 char *language;
9324 int context;
9326 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9327 break;
9329 if (buf[0] == '\0' || buf[0] == '#')
9330 continue;
9332 /* Scan code. */
9333 scanptr = buf;
9334 code = strtoul (scanptr, &endptr, 16);
9335 if (endptr == scanptr)
9337 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9338 exit (1);
9340 scanptr = endptr;
9341 if (*scanptr != ';')
9343 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9344 exit (1);
9346 scanptr++;
9348 /* Scan lower mapping. */
9349 for (i = 0; i < 3; i++)
9350 lower_mapping[i] = 0;
9351 for (i = 0; i < 3; i++)
9353 while (*scanptr == ' ')
9354 scanptr++;
9355 if (*scanptr == ';')
9356 break;
9357 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
9358 if (endptr == scanptr)
9360 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9361 exit (1);
9363 scanptr = endptr;
9365 if (*scanptr != ';')
9367 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9368 exit (1);
9370 scanptr++;
9372 /* Scan title mapping. */
9373 for (i = 0; i < 3; i++)
9374 title_mapping[i] = 0;
9375 for (i = 0; i < 3; i++)
9377 while (*scanptr == ' ')
9378 scanptr++;
9379 if (*scanptr == ';')
9380 break;
9381 title_mapping[i] = strtoul (scanptr, &endptr, 16);
9382 if (endptr == scanptr)
9384 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9385 exit (1);
9387 scanptr = endptr;
9389 if (*scanptr != ';')
9391 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9392 exit (1);
9394 scanptr++;
9396 /* Scan upper mapping. */
9397 for (i = 0; i < 3; i++)
9398 upper_mapping[i] = 0;
9399 for (i = 0; i < 3; i++)
9401 while (*scanptr == ' ')
9402 scanptr++;
9403 if (*scanptr == ';')
9404 break;
9405 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
9406 if (endptr == scanptr)
9408 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9409 exit (1);
9411 scanptr = endptr;
9413 if (*scanptr != ';')
9415 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9416 exit (1);
9418 scanptr++;
9420 /* Scan language and context. */
9421 language = NULL;
9422 context = SCC_ALWAYS;
9423 while (*scanptr == ' ')
9424 scanptr++;
9425 if (*scanptr != '\0' && *scanptr != '#')
9427 const char *word_begin = scanptr;
9428 const char *word_end;
9430 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
9431 scanptr++;
9432 word_end = scanptr;
9434 while (*scanptr == ' ')
9435 scanptr++;
9437 if (word_end - word_begin == 2)
9439 language = (char *) malloc ((word_end - word_begin) + 1);
9440 memcpy (language, word_begin, 2);
9441 language[word_end - word_begin] = '\0';
9442 word_begin = word_end = NULL;
9444 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
9446 word_begin = scanptr;
9447 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
9448 scanptr++;
9449 word_end = scanptr;
9453 if (word_end > word_begin)
9455 bool negate = false;
9457 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
9459 word_begin += 4;
9460 negate = true;
9462 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
9463 context = SCC_FINAL_SIGMA;
9464 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
9465 context = SCC_AFTER_SOFT_DOTTED;
9466 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
9467 context = SCC_MORE_ABOVE;
9468 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
9469 context = SCC_BEFORE_DOT;
9470 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
9471 context = SCC_AFTER_I;
9472 else
9474 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
9475 exit (1);
9477 if (negate)
9478 context = - context;
9481 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
9483 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9484 exit (1);
9488 /* Store the rule. */
9490 struct special_casing_rule *new_rule =
9491 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
9492 new_rule->code = code;
9493 new_rule->language = language;
9494 new_rule->context = context;
9495 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
9496 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
9497 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
9499 add_casing_rule (new_rule);
9503 if (ferror (stream) || fclose (stream))
9505 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
9506 exit (1);
/* A casefolding rule.  */
struct casefold_rule
{
  /* The code point to which the rule applies.  */
  unsigned int code;
  /* The casefolded form of CODE: up to 3 code points; unused trailing
     elements are 0.  */
  unsigned int mapping[3];
  /* NULL if the rule applies to all languages, otherwise a language code.  */
  const char *language;
};
/* The casefolding rules.
   casefolding_rules is a growable array of pointers; the first
   num_casefolding_rules elements are in use, and there is room for
   allocated_casefolding_rules elements.  */
struct casefold_rule **casefolding_rules;
unsigned int num_casefolding_rules;
unsigned int allocated_casefolding_rules;
9523 /* Stores in casefolding_rules the case folding rules found in
9524 casefolding_filename. */
9525 static void
9526 fill_casefolding_rules (const char *casefolding_filename)
9528 FILE *stream;
9530 stream = fopen (casefolding_filename, "r");
9531 if (stream == NULL)
9533 fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
9534 exit (1);
9537 casefolding_rules = NULL;
9538 num_casefolding_rules = 0;
9539 allocated_casefolding_rules = 0;
9541 for (;;)
9543 char buf[200+1];
9544 char *scanptr;
9545 char *endptr;
9546 int i;
9548 unsigned int code;
9549 char type;
9550 unsigned int mapping[3];
9552 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9553 break;
9555 if (buf[0] == '\0' || buf[0] == '#')
9556 continue;
9558 /* Scan code. */
9559 scanptr = buf;
9560 code = strtoul (scanptr, &endptr, 16);
9561 if (endptr == scanptr)
9563 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9564 exit (1);
9566 scanptr = endptr;
9567 if (*scanptr != ';')
9569 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9570 exit (1);
9572 scanptr++;
9574 /* Scan type. */
9575 while (*scanptr == ' ')
9576 scanptr++;
9578 switch (*scanptr)
9580 case 'C': case 'F': case 'S': case 'T':
9581 type = *scanptr;
9582 break;
9583 default:
9584 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9585 exit (1);
9587 scanptr++;
9588 if (*scanptr != ';')
9590 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9591 exit (1);
9593 scanptr++;
9595 /* Scan casefold mapping. */
9596 for (i = 0; i < 3; i++)
9597 mapping[i] = 0;
9598 for (i = 0; i < 3; i++)
9600 while (*scanptr == ' ')
9601 scanptr++;
9602 if (*scanptr == ';')
9603 break;
9604 mapping[i] = strtoul (scanptr, &endptr, 16);
9605 if (endptr == scanptr)
9607 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9608 exit (1);
9610 scanptr = endptr;
9612 if (*scanptr != ';')
9614 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9615 exit (1);
9617 scanptr++;
9619 /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
9620 if (type != 'S')
9622 const char * const *languages;
9623 unsigned int languages_count;
9625 /* Type 'T' indicates that the rule is applicable to Turkish
9626 languages only. */
9627 if (type == 'T')
9629 static const char * const turkish_languages[] = { "tr", "az" };
9630 languages = turkish_languages;
9631 languages_count = 2;
9633 else
9635 static const char * const all_languages[] = { NULL };
9636 languages = all_languages;
9637 languages_count = 1;
9640 for (i = 0; i < languages_count; i++)
9642 /* Store a new rule. */
9643 struct casefold_rule *new_rule =
9644 (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
9645 new_rule->code = code;
9646 memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
9647 new_rule->language = languages[i];
9649 if (num_casefolding_rules == allocated_casefolding_rules)
9651 allocated_casefolding_rules = 2 * allocated_casefolding_rules;
9652 if (allocated_casefolding_rules < 16)
9653 allocated_casefolding_rules = 16;
9654 casefolding_rules =
9655 (struct casefold_rule **)
9656 realloc (casefolding_rules,
9657 allocated_casefolding_rules * sizeof (struct casefold_rule *));
9659 casefolding_rules[num_casefolding_rules++] = new_rule;
9664 if (ferror (stream) || fclose (stream))
9666 fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
9667 exit (1);
/* Casefold mapping, when it maps to a single character.
   Indexed by code point.  */
unsigned int unicode_casefold[0x110000];

/* Returns the entry of unicode_casefold for the code point CH.
   CH must be less than 0x110000.  */
static unsigned int
to_casefold (unsigned int ch)
{
  assert (ch < 0x110000);
  return unicode_casefold[ch];
}
9680 /* Redistribute the casefolding_rules:
9681 - Rules that map to a single character, language independently, are stored
9682 in unicode_casefold.
9683 - Other rules are merged into casing_rules. */
9684 static void
9685 redistribute_casefolding_rules (void)
9687 unsigned int ch, i, j;
9689 /* Fill unicode_casefold[]. */
9690 for (ch = 0; ch < 0x110000; ch++)
9691 unicode_casefold[ch] = ch;
9692 for (i = 0; i < num_casefolding_rules; i++)
9694 struct casefold_rule *cfrule = casefolding_rules[i];
9696 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
9698 ch = cfrule->code;
9699 assert (ch < 0x110000);
9700 unicode_casefold[ch] = cfrule->mapping[0];
9704 /* Extend the special casing rules by filling in their casefold_mapping[]
9705 field. */
9706 for (j = 0; j < num_casing_rules; j++)
9708 struct special_casing_rule *rule = casing_rules[j];
9709 unsigned int k;
9711 rule->casefold_mapping[0] = to_casefold (rule->code);
9712 for (k = 1; k < 3; k++)
9713 rule->casefold_mapping[k] = 0;
9716 /* Now merge the other casefolding rules into casing_rules. */
9717 for (i = 0; i < num_casefolding_rules; i++)
9719 struct casefold_rule *cfrule = casefolding_rules[i];
9721 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
9723 /* Find a rule that applies to the same code, same language, and it
9724 has context SCC_ALWAYS. At the same time, update all rules that
9725 have the same code and same or more specific language. */
9726 struct special_casing_rule *found_rule = NULL;
9728 for (j = 0; j < num_casing_rules; j++)
9730 struct special_casing_rule *rule = casing_rules[j];
9732 if (rule->code == cfrule->code
9733 && (cfrule->language == NULL
9734 || (rule->language != NULL
9735 && strcmp (rule->language, cfrule->language) == 0)))
9737 memcpy (rule->casefold_mapping, cfrule->mapping,
9738 sizeof (rule->casefold_mapping));
9740 if ((cfrule->language == NULL
9741 ? rule->language == NULL
9742 : rule->language != NULL
9743 && strcmp (rule->language, cfrule->language) == 0)
9744 && rule->context == SCC_ALWAYS)
9746 /* Found it. */
9747 found_rule = rule;
9752 if (found_rule == NULL)
9754 /* Create a new rule. */
9755 struct special_casing_rule *new_rule =
9756 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
9758 /* Try to find a rule that applies to the same code, no language
9759 restriction, and with context SCC_ALWAYS. */
9760 for (j = 0; j < num_casing_rules; j++)
9762 struct special_casing_rule *rule = casing_rules[j];
9764 if (rule->code == cfrule->code
9765 && rule->context == SCC_ALWAYS
9766 && rule->language == NULL)
9768 /* Found it. */
9769 found_rule = rule;
9770 break;
9774 new_rule->code = cfrule->code;
9775 new_rule->language = cfrule->language;
9776 new_rule->context = SCC_ALWAYS;
9777 if (found_rule != NULL)
9779 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
9780 sizeof (new_rule->lower_mapping));
9781 memcpy (new_rule->title_mapping, found_rule->title_mapping,
9782 sizeof (new_rule->title_mapping));
9783 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
9784 sizeof (new_rule->upper_mapping));
9786 else
9788 unsigned int k;
9790 new_rule->lower_mapping[0] = to_lower (cfrule->code);
9791 for (k = 1; k < 3; k++)
9792 new_rule->lower_mapping[k] = 0;
9793 new_rule->title_mapping[0] = to_title (cfrule->code);
9794 for (k = 1; k < 3; k++)
9795 new_rule->title_mapping[k] = 0;
9796 new_rule->upper_mapping[0] = to_upper (cfrule->code);
9797 for (k = 1; k < 3; k++)
9798 new_rule->upper_mapping[k] = 0;
9800 memcpy (new_rule->casefold_mapping, cfrule->mapping,
9801 sizeof (new_rule->casefold_mapping));
9803 add_casing_rule (new_rule);
9809 static int
9810 compare_casing_rules (const void *a, const void *b)
9812 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
9813 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
9814 unsigned int a_code = a_rule->code;
9815 unsigned int b_code = b_rule->code;
9817 if (a_code < b_code)
9818 return -1;
9819 if (a_code > b_code)
9820 return 1;
9822 /* Sort the more specific rules before the more general ones. */
9823 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
9824 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
9827 static void
9828 sort_casing_rules (void)
9830 /* Sort the rules 1. by code, 2. by specificity. */
9831 if (num_casing_rules > 1)
9832 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
9833 compare_casing_rules);
9836 /* Output the special casing rules. */
9837 static void
9838 output_casing_rules (const char *filename, const char *version)
9840 FILE *stream;
9841 unsigned int i, j;
9842 unsigned int minor;
9844 stream = fopen (filename, "w");
9845 if (stream == NULL)
9847 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9848 exit (1);
9851 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9852 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
9853 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9854 version);
9855 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
9856 fprintf (stream, "%%struct-type\n");
9857 fprintf (stream, "%%language=ANSI-C\n");
9858 fprintf (stream, "%%define slot-name code\n");
9859 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
9860 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
9861 fprintf (stream, "%%compare-lengths\n");
9862 fprintf (stream, "%%compare-strncmp\n");
9863 fprintf (stream, "%%readonly-tables\n");
9864 fprintf (stream, "%%omit-struct-type\n");
9865 fprintf (stream, "%%%%\n");
9867 minor = 0;
9868 for (i = 0; i < num_casing_rules; i++)
9870 struct special_casing_rule *rule = casing_rules[i];
9871 int context;
9873 if (i > 0 && rule->code == casing_rules[i - 1]->code)
9874 minor += 1;
9875 else
9876 minor = 0;
9878 if (!(rule->code < 0x10000))
9880 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
9881 exit (1);
9884 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
9885 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
9887 fprintf (stream, "%d, ",
9888 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
9890 context = rule->context;
9891 if (context < 0)
9893 fprintf (stream, "-");
9894 context = - context;
9896 else
9897 fprintf (stream, " ");
9898 switch (context)
9900 case SCC_ALWAYS:
9901 fprintf (stream, "SCC_ALWAYS ");
9902 break;
9903 case SCC_FINAL_SIGMA:
9904 fprintf (stream, "SCC_FINAL_SIGMA ");
9905 break;
9906 case SCC_AFTER_SOFT_DOTTED:
9907 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
9908 break;
9909 case SCC_MORE_ABOVE:
9910 fprintf (stream, "SCC_MORE_ABOVE ");
9911 break;
9912 case SCC_BEFORE_DOT:
9913 fprintf (stream, "SCC_BEFORE_DOT ");
9914 break;
9915 case SCC_AFTER_I:
9916 fprintf (stream, "SCC_AFTER_I ");
9917 break;
9918 default:
9919 abort ();
9921 fprintf (stream, ", ");
9923 if (rule->language != NULL)
9925 assert (strlen (rule->language) == 2);
9926 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
9928 else
9929 fprintf (stream, "{ '\\0', '\\0' }, ");
9931 fprintf (stream, "{ ");
9932 for (j = 0; j < 3; j++)
9934 if (j > 0)
9935 fprintf (stream, ", ");
9936 if (!(rule->upper_mapping[j] < 0x10000))
9938 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
9939 exit (1);
9941 if (rule->upper_mapping[j] != 0)
9942 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
9943 else
9944 fprintf (stream, " 0");
9946 fprintf (stream, " }, { ");
9947 for (j = 0; j < 3; j++)
9949 if (j > 0)
9950 fprintf (stream, ", ");
9951 if (!(rule->lower_mapping[j] < 0x10000))
9953 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
9954 exit (1);
9956 if (rule->lower_mapping[j] != 0)
9957 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
9958 else
9959 fprintf (stream, " 0");
9961 fprintf (stream, " }, { ");
9962 for (j = 0; j < 3; j++)
9964 if (j > 0)
9965 fprintf (stream, ", ");
9966 if (!(rule->title_mapping[j] < 0x10000))
9968 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
9969 exit (1);
9971 if (rule->title_mapping[j] != 0)
9972 fprintf (stream, "0x%04X", rule->title_mapping[j]);
9973 else
9974 fprintf (stream, " 0");
9976 fprintf (stream, " }, { ");
9977 for (j = 0; j < 3; j++)
9979 if (j > 0)
9980 fprintf (stream, ", ");
9981 if (!(rule->casefold_mapping[j] < 0x10000))
9983 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
9984 exit (1);
9986 if (rule->casefold_mapping[j] != 0)
9987 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
9988 else
9989 fprintf (stream, " 0");
9991 fprintf (stream, " }\n");
9994 if (ferror (stream) || fclose (stream))
9996 fprintf (stderr, "error writing to '%s'\n", filename);
9997 exit (1);
/* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.  */
static bool
is_cased (unsigned int ch)
{
  return (is_property_lowercase (ch)
          || is_property_uppercase (ch)
          || is_category_Lt (ch));
}
10015 /* Quoting the Unicode standard:
10016 Definition: A character is defined to be "case-ignorable" if it has the
10017 value MidLetter {or the value MidNumLet} for the Word_Break property or
10018 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
10019 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10020 The text marked in braces was added in Unicode 5.1.0, see
10021 <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
10022 Definition of case-ignorable". */
10023 /* Since this predicate is only used for the "Before C" and "After C"
10024 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
10025 This simplifies the evaluation of the regular expressions
10026 \p{cased} (\p{case-ignorable})* C
10028 C (\p{case-ignorable})* \p{cased}
10030 static bool
10031 is_case_ignorable (unsigned int ch)
10033 return (unicode_org_wbp[ch] == WBP_MIDLETTER
10034 || unicode_org_wbp[ch] == WBP_MIDNUMLET
10035 || is_category_Mn (ch)
10036 || is_category_Me (ch)
10037 || is_category_Cf (ch)
10038 || is_category_Lm (ch)
10039 || is_category_Sk (ch))
10040 && !is_cased (ch);
/* ------------------------------------------------------------------------- */
10045 /* Output all case related properties. */
10046 static void
10047 output_casing_properties (const char *version)
10049 #define PROPERTY(FN,P) \
10050 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
10051 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
10052 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
10053 PROPERTY(cased, cased)
10054 PROPERTY(ignorable, case_ignorable)
10055 #undef PROPERTY
/* ========================================================================= */
10061 main (int argc, char * argv[])
10063 const char *unicodedata_filename;
10064 const char *proplist_filename;
10065 const char *derivedproplist_filename;
10066 const char *arabicshaping_filename;
10067 const char *scripts_filename;
10068 const char *blocks_filename;
10069 const char *proplist30_filename;
10070 const char *eastasianwidth_filename;
10071 const char *linebreak_filename;
10072 const char *wordbreakproperty_filename;
10073 const char *graphemebreakproperty_filename;
10074 const char *compositionexclusions_filename;
10075 const char *specialcasing_filename;
10076 const char *casefolding_filename;
10077 const char *version;
10079 if (argc != 16)
10081 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
10082 argv[0]);
10083 exit (1);
10086 unicodedata_filename = argv[1];
10087 proplist_filename = argv[2];
10088 derivedproplist_filename = argv[3];
10089 arabicshaping_filename = argv[4];
10090 scripts_filename = argv[5];
10091 blocks_filename = argv[6];
10092 proplist30_filename = argv[7];
10093 eastasianwidth_filename = argv[8];
10094 linebreak_filename = argv[9];
10095 wordbreakproperty_filename = argv[10];
10096 graphemebreakproperty_filename = argv[11];
10097 compositionexclusions_filename = argv[12];
10098 specialcasing_filename = argv[13];
10099 casefolding_filename = argv[14];
10100 version = argv[15];
10102 fill_attributes (unicodedata_filename);
10103 clear_properties ();
10104 fill_properties (proplist_filename);
10105 fill_properties (derivedproplist_filename);
10106 fill_properties30 (proplist30_filename);
10107 fill_arabicshaping (arabicshaping_filename);
10108 fill_scripts (scripts_filename);
10109 fill_blocks (blocks_filename);
10110 fill_width (eastasianwidth_filename);
10111 fill_org_lbp (linebreak_filename);
10112 fill_org_wbp (wordbreakproperty_filename);
10113 fill_org_gbp (graphemebreakproperty_filename);
10114 fill_composition_exclusions (compositionexclusions_filename);
10115 fill_casing_rules (specialcasing_filename);
10116 fill_casefolding_rules (casefolding_filename);
10117 redistribute_casefolding_rules ();
10118 sort_casing_rules ();
10120 output_categories (version);
10121 output_category ("unictype/categ_of.h", version);
10122 output_combclass ("unictype/combiningclass.h", version);
10123 output_bidi_category ("unictype/bidi_of.h", version);
10124 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
10125 output_decimal_digit ("unictype/decdigit.h", version);
10126 output_digit_test ("../tests/unictype/test-digit.h", version);
10127 output_digit ("unictype/digit.h", version);
10128 output_numeric_test ("../tests/unictype/test-numeric.h", version);
10129 output_numeric ("unictype/numeric.h", version);
10130 output_mirror ("unictype/mirror.h", version);
10131 output_properties (version);
10132 output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version);
10133 output_joining_type ("unictype/joiningtype_of.h", version);
10134 output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version);
10135 output_joining_group ("unictype/joininggroup_of.h", version);
10137 output_scripts (version);
10138 output_scripts_byname (version);
10139 output_blocks (version);
10140 output_ident_properties (version);
10141 output_nonspacing_property ("uniwidth/width.c.part");
10142 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
10143 output_old_ctype (version);
10145 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
10146 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
10147 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
10149 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
10150 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
10151 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
10153 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
10154 output_gbp_table ("unigbrk/gbrkprop.h", version);
10156 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
10157 debug_output_composition_tables ("uninorm/composition.txt");
10158 output_composition_tables ("uninorm/composition-table.gperf", version);
10160 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
10161 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
10162 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
10163 output_simple_mapping ("unicase/toupper.h", to_upper, version);
10164 output_simple_mapping ("unicase/tolower.h", to_lower, version);
10165 output_simple_mapping ("unicase/totitle.h", to_title, version);
10166 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
10167 output_casing_rules ("unicase/special-casing-table.gperf", version);
10168 output_casing_properties (version);
10170 return 0;
10174 * Local Variables:
10175 * coding: utf-8
10176 * compile-command: "\
10177 * gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \\
10178 * ./gen-uni-tables \\
10179 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \\
10180 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \\
10181 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \\
10182 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/ArabicShaping.txt \\
10183 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \\
10184 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \\
10185 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \\
10186 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \\
10187 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \\
10188 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \\
10189 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \\
10190 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \\
10191 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \\
10192 * /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \\
10193 * 6.0.0 \\
10194 * && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \\
10195 * && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt"
10196 * End: