1 /* Generate Unicode conforming character classification tables and
2 line break properties tables and word break property tables and
3 decomposition/composition and case mapping tables from a UnicodeData file.
4 Copyright (C) 2000-2002, 2004, 2007-2020 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Usage example:
     $ gen-uni-tables /usr/local/share/www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/PropList.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/DerivedCoreProperties.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/ArabicShaping.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/Scripts.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/Blocks.txt \
                      /usr/local/share/www.unicode.org/Public/3.0-Update1/PropList-3.0.1.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/EastAsianWidth.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/LineBreak.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakProperty.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/CompositionExclusions.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/SpecialCasing.txt \
                      /usr/local/share/www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt \
                      9.0.0
 */
/* Number of elements of the array A.  A must be a real array, not a
   pointer; the argument is parenthesized so that any array-valued
   expression may be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
48 /* ========================================================================= */
50 /* Reading UnicodeData.txt. */
/* This structure represents one line in the UnicodeData.txt file.
   The string members hold the textual field as read from the file; the
   three case mappings hold parsed code points.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
/* Missing fields are represented with "" for strings, and NONE for
   unsigned ints.  NONE is the all-ones unsigned int value.  */
#define NONE (~(unsigned int)0)
76 /* The entire contents of the UnicodeData.txt file. */
77 struct unicode_attribute unicode_attributes
[0x110000];
79 /* Stores in unicode_attributes[i] the values from the given fields. */
81 fill_attribute (unsigned int i
,
82 const char *field1
, const char *field2
,
83 const char *field3
, const char *field4
,
84 const char *field5
, const char *field6
,
85 const char *field7
, const char *field8
,
86 const char *field9
, const char *field10
,
87 const char *field11
, const char *field12
,
88 const char *field13
, const char *field14
)
90 struct unicode_attribute
* uni
;
94 fprintf (stderr
, "index too large\n");
97 if (strcmp (field2
, "Cs") == 0)
98 /* Surrogates are UTF-16 artifacts, not real characters. Ignore them. */
100 uni
= &unicode_attributes
[i
];
101 /* Copy the strings. */
102 uni
->name
= strdup (field1
);
103 uni
->category
= (field2
[0] == '\0' ? "" : strdup (field2
));
104 uni
->combining
= (field3
[0] == '\0' ? "" : strdup (field3
));
105 uni
->bidi
= (field4
[0] == '\0' ? "" : strdup (field4
));
106 uni
->decomposition
= (field5
[0] == '\0' ? "" : strdup (field5
));
107 uni
->decdigit
= (field6
[0] == '\0' ? "" : strdup (field6
));
108 uni
->digit
= (field7
[0] == '\0' ? "" : strdup (field7
));
109 uni
->numeric
= (field8
[0] == '\0' ? "" : strdup (field8
));
110 uni
->mirrored
= (field9
[0] == 'Y');
111 uni
->oldname
= (field10
[0] == '\0' ? "" : strdup (field10
));
112 uni
->comment
= (field11
[0] == '\0' ? "" : strdup (field11
));
113 uni
->upper
= (field12
[0] =='\0' ? NONE
: strtoul (field12
, NULL
, 16));
114 uni
->lower
= (field13
[0] =='\0' ? NONE
: strtoul (field13
, NULL
, 16));
115 uni
->title
= (field14
[0] =='\0' ? NONE
: strtoul (field14
, NULL
, 16));
/* Maximum length of a field in the UnicodeData.txt file.
   NOTE(review): the defining line was absent from the text under review;
   120 is the value used by upstream gnulib -- confirm.  */
#ifndef FIELDLEN
# define FIELDLEN 120
#endif

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM.
   Returns 1 when a field was successfully read, otherwise 0.
   A field that ends at end-of-file instead of at DELIM counts as not read.
   Exits with an error message if the field does not fit into BUFFER.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  for (; (c = getc (stream)), (c != EOF && c != delim); )
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = c;
    }

  if (c == EOF)
    return 0;

  *buffer = '\0';
  return 1;
}
153 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
156 fill_attributes (const char *unicodedata_filename
)
160 char field0
[FIELDLEN
];
161 char field1
[FIELDLEN
];
162 char field2
[FIELDLEN
];
163 char field3
[FIELDLEN
];
164 char field4
[FIELDLEN
];
165 char field5
[FIELDLEN
];
166 char field6
[FIELDLEN
];
167 char field7
[FIELDLEN
];
168 char field8
[FIELDLEN
];
169 char field9
[FIELDLEN
];
170 char field10
[FIELDLEN
];
171 char field11
[FIELDLEN
];
172 char field12
[FIELDLEN
];
173 char field13
[FIELDLEN
];
174 char field14
[FIELDLEN
];
177 for (i
= 0; i
< 0x110000; i
++)
178 unicode_attributes
[i
].name
= NULL
;
180 stream
= fopen (unicodedata_filename
, "r");
183 fprintf (stderr
, "error during fopen of '%s'\n", unicodedata_filename
);
192 n
= getfield (stream
, field0
, ';');
193 n
+= getfield (stream
, field1
, ';');
194 n
+= getfield (stream
, field2
, ';');
195 n
+= getfield (stream
, field3
, ';');
196 n
+= getfield (stream
, field4
, ';');
197 n
+= getfield (stream
, field5
, ';');
198 n
+= getfield (stream
, field6
, ';');
199 n
+= getfield (stream
, field7
, ';');
200 n
+= getfield (stream
, field8
, ';');
201 n
+= getfield (stream
, field9
, ';');
202 n
+= getfield (stream
, field10
, ';');
203 n
+= getfield (stream
, field11
, ';');
204 n
+= getfield (stream
, field12
, ';');
205 n
+= getfield (stream
, field13
, ';');
206 n
+= getfield (stream
, field14
, '\n');
211 fprintf (stderr
, "short line in '%s':%d\n",
212 unicodedata_filename
, lineno
);
215 i
= strtoul (field0
, NULL
, 16);
217 && strlen (field1
) >= 9
218 && strcmp (field1
+ strlen (field1
) - 8, ", First>") == 0)
220 /* Deal with a range. */
222 n
= getfield (stream
, field0
, ';');
223 n
+= getfield (stream
, field1
, ';');
224 n
+= getfield (stream
, field2
, ';');
225 n
+= getfield (stream
, field3
, ';');
226 n
+= getfield (stream
, field4
, ';');
227 n
+= getfield (stream
, field5
, ';');
228 n
+= getfield (stream
, field6
, ';');
229 n
+= getfield (stream
, field7
, ';');
230 n
+= getfield (stream
, field8
, ';');
231 n
+= getfield (stream
, field9
, ';');
232 n
+= getfield (stream
, field10
, ';');
233 n
+= getfield (stream
, field11
, ';');
234 n
+= getfield (stream
, field12
, ';');
235 n
+= getfield (stream
, field13
, ';');
236 n
+= getfield (stream
, field14
, '\n');
239 fprintf (stderr
, "missing end range in '%s':%d\n",
240 unicodedata_filename
, lineno
);
243 if (!(field1
[0] == '<'
244 && strlen (field1
) >= 8
245 && strcmp (field1
+ strlen (field1
) - 7, ", Last>") == 0))
247 fprintf (stderr
, "missing end range in '%s':%d\n",
248 unicodedata_filename
, lineno
);
251 field1
[strlen (field1
) - 7] = '\0';
252 j
= strtoul (field0
, NULL
, 16);
254 fill_attribute (i
, field1
+1, field2
, field3
, field4
, field5
,
255 field6
, field7
, field8
, field9
, field10
,
256 field11
, field12
, field13
, field14
);
260 /* Single character line */
261 fill_attribute (i
, field1
, field2
, field3
, field4
, field5
,
262 field6
, field7
, field8
, field9
, field10
,
263 field11
, field12
, field13
, field14
);
267 if (ferror (stream
) || fclose (stream
))
269 fprintf (stderr
, "error reading from '%s'\n", unicodedata_filename
);
274 /* ========================================================================= */
276 /* General category. */
/* See Unicode 3.0 book, section 4.5,
       UCD.html.  */
281 is_category_L (unsigned int ch
)
283 return (unicode_attributes
[ch
].name
!= NULL
284 && unicode_attributes
[ch
].category
[0] == 'L');
288 is_category_LC (unsigned int ch
)
290 /* See PropertyValueAliases.txt. */
291 return (unicode_attributes
[ch
].name
!= NULL
292 && unicode_attributes
[ch
].category
[0] == 'L'
293 && (unicode_attributes
[ch
].category
[1] == 'u'
294 || unicode_attributes
[ch
].category
[1] == 'l'
295 || unicode_attributes
[ch
].category
[1] == 't'));
299 is_category_Lu (unsigned int ch
)
301 return (unicode_attributes
[ch
].name
!= NULL
302 && unicode_attributes
[ch
].category
[0] == 'L'
303 && unicode_attributes
[ch
].category
[1] == 'u');
307 is_category_Ll (unsigned int ch
)
309 return (unicode_attributes
[ch
].name
!= NULL
310 && unicode_attributes
[ch
].category
[0] == 'L'
311 && unicode_attributes
[ch
].category
[1] == 'l');
315 is_category_Lt (unsigned int ch
)
317 return (unicode_attributes
[ch
].name
!= NULL
318 && unicode_attributes
[ch
].category
[0] == 'L'
319 && unicode_attributes
[ch
].category
[1] == 't');
323 is_category_Lm (unsigned int ch
)
325 return (unicode_attributes
[ch
].name
!= NULL
326 && unicode_attributes
[ch
].category
[0] == 'L'
327 && unicode_attributes
[ch
].category
[1] == 'm');
331 is_category_Lo (unsigned int ch
)
333 return (unicode_attributes
[ch
].name
!= NULL
334 && unicode_attributes
[ch
].category
[0] == 'L'
335 && unicode_attributes
[ch
].category
[1] == 'o');
339 is_category_M (unsigned int ch
)
341 return (unicode_attributes
[ch
].name
!= NULL
342 && unicode_attributes
[ch
].category
[0] == 'M');
346 is_category_Mn (unsigned int ch
)
348 return (unicode_attributes
[ch
].name
!= NULL
349 && unicode_attributes
[ch
].category
[0] == 'M'
350 && unicode_attributes
[ch
].category
[1] == 'n');
354 is_category_Mc (unsigned int ch
)
356 return (unicode_attributes
[ch
].name
!= NULL
357 && unicode_attributes
[ch
].category
[0] == 'M'
358 && unicode_attributes
[ch
].category
[1] == 'c');
362 is_category_Me (unsigned int ch
)
364 return (unicode_attributes
[ch
].name
!= NULL
365 && unicode_attributes
[ch
].category
[0] == 'M'
366 && unicode_attributes
[ch
].category
[1] == 'e');
370 is_category_N (unsigned int ch
)
372 return (unicode_attributes
[ch
].name
!= NULL
373 && unicode_attributes
[ch
].category
[0] == 'N');
377 is_category_Nd (unsigned int ch
)
379 return (unicode_attributes
[ch
].name
!= NULL
380 && unicode_attributes
[ch
].category
[0] == 'N'
381 && unicode_attributes
[ch
].category
[1] == 'd');
385 is_category_Nl (unsigned int ch
)
387 return (unicode_attributes
[ch
].name
!= NULL
388 && unicode_attributes
[ch
].category
[0] == 'N'
389 && unicode_attributes
[ch
].category
[1] == 'l');
393 is_category_No (unsigned int ch
)
395 return (unicode_attributes
[ch
].name
!= NULL
396 && unicode_attributes
[ch
].category
[0] == 'N'
397 && unicode_attributes
[ch
].category
[1] == 'o');
401 is_category_P (unsigned int ch
)
403 return (unicode_attributes
[ch
].name
!= NULL
404 && unicode_attributes
[ch
].category
[0] == 'P');
408 is_category_Pc (unsigned int ch
)
410 return (unicode_attributes
[ch
].name
!= NULL
411 && unicode_attributes
[ch
].category
[0] == 'P'
412 && unicode_attributes
[ch
].category
[1] == 'c');
416 is_category_Pd (unsigned int ch
)
418 return (unicode_attributes
[ch
].name
!= NULL
419 && unicode_attributes
[ch
].category
[0] == 'P'
420 && unicode_attributes
[ch
].category
[1] == 'd');
424 is_category_Ps (unsigned int ch
)
426 return (unicode_attributes
[ch
].name
!= NULL
427 && unicode_attributes
[ch
].category
[0] == 'P'
428 && unicode_attributes
[ch
].category
[1] == 's');
432 is_category_Pe (unsigned int ch
)
434 return (unicode_attributes
[ch
].name
!= NULL
435 && unicode_attributes
[ch
].category
[0] == 'P'
436 && unicode_attributes
[ch
].category
[1] == 'e');
440 is_category_Pi (unsigned int ch
)
442 return (unicode_attributes
[ch
].name
!= NULL
443 && unicode_attributes
[ch
].category
[0] == 'P'
444 && unicode_attributes
[ch
].category
[1] == 'i');
448 is_category_Pf (unsigned int ch
)
450 return (unicode_attributes
[ch
].name
!= NULL
451 && unicode_attributes
[ch
].category
[0] == 'P'
452 && unicode_attributes
[ch
].category
[1] == 'f');
456 is_category_Po (unsigned int ch
)
458 return (unicode_attributes
[ch
].name
!= NULL
459 && unicode_attributes
[ch
].category
[0] == 'P'
460 && unicode_attributes
[ch
].category
[1] == 'o');
464 is_category_S (unsigned int ch
)
466 return (unicode_attributes
[ch
].name
!= NULL
467 && unicode_attributes
[ch
].category
[0] == 'S');
471 is_category_Sm (unsigned int ch
)
473 return (unicode_attributes
[ch
].name
!= NULL
474 && unicode_attributes
[ch
].category
[0] == 'S'
475 && unicode_attributes
[ch
].category
[1] == 'm');
479 is_category_Sc (unsigned int ch
)
481 return (unicode_attributes
[ch
].name
!= NULL
482 && unicode_attributes
[ch
].category
[0] == 'S'
483 && unicode_attributes
[ch
].category
[1] == 'c');
487 is_category_Sk (unsigned int ch
)
489 return (unicode_attributes
[ch
].name
!= NULL
490 && unicode_attributes
[ch
].category
[0] == 'S'
491 && unicode_attributes
[ch
].category
[1] == 'k');
495 is_category_So (unsigned int ch
)
497 return (unicode_attributes
[ch
].name
!= NULL
498 && unicode_attributes
[ch
].category
[0] == 'S'
499 && unicode_attributes
[ch
].category
[1] == 'o');
503 is_category_Z (unsigned int ch
)
505 return (unicode_attributes
[ch
].name
!= NULL
506 && unicode_attributes
[ch
].category
[0] == 'Z');
510 is_category_Zs (unsigned int ch
)
512 return (unicode_attributes
[ch
].name
!= NULL
513 && unicode_attributes
[ch
].category
[0] == 'Z'
514 && unicode_attributes
[ch
].category
[1] == 's');
518 is_category_Zl (unsigned int ch
)
520 return (unicode_attributes
[ch
].name
!= NULL
521 && unicode_attributes
[ch
].category
[0] == 'Z'
522 && unicode_attributes
[ch
].category
[1] == 'l');
526 is_category_Zp (unsigned int ch
)
528 return (unicode_attributes
[ch
].name
!= NULL
529 && unicode_attributes
[ch
].category
[0] == 'Z'
530 && unicode_attributes
[ch
].category
[1] == 'p');
534 is_category_C (unsigned int ch
)
536 return (unicode_attributes
[ch
].name
== NULL
537 || unicode_attributes
[ch
].category
[0] == 'C');
541 is_category_Cc (unsigned int ch
)
543 return (unicode_attributes
[ch
].name
!= NULL
544 && unicode_attributes
[ch
].category
[0] == 'C'
545 && unicode_attributes
[ch
].category
[1] == 'c');
549 is_category_Cf (unsigned int ch
)
551 return (unicode_attributes
[ch
].name
!= NULL
552 && unicode_attributes
[ch
].category
[0] == 'C'
553 && unicode_attributes
[ch
].category
[1] == 'f');
/* Returns true if CH is a surrogate code point, i.e. lies in the range
   0xD800..0xDFFF.  This is decided from the code point value alone,
   without consulting unicode_attributes[].  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
563 is_category_Co (unsigned int ch
)
565 return (unicode_attributes
[ch
].name
!= NULL
566 && unicode_attributes
[ch
].category
[0] == 'C'
567 && unicode_attributes
[ch
].category
[1] == 'o');
571 is_category_Cn (unsigned int ch
)
573 return (unicode_attributes
[ch
].name
== NULL
574 && !(ch
>= 0xd800 && ch
< 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points
   for which PREDICATE returns true: "0xFIRST..0xLAST" for a run of more
   than one code point, "0xCH" for an isolated one.  (Listing every code
   point on its own line would yield huge text output.)
   Exits with an error message if the file cannot be opened or written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate holds.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a C source fragment consisting of a fixed header,
   one "{ first, last }" initializer per maximal run of code points for
   which PREDICATE is true, and a PREDICATE(c) macro defined as EXPRESSION.
   Exits with an error message if the file cannot be opened or written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  /* Fixed text emitted before the list of ranges.  */
  static const char *const prologue[] =
    {
      "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n",
      "/* Test the Unicode character type functions.\n",
      " Copyright (C) 2007 Free Software Foundation, Inc.\n",
      "\n",
      " This program is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU General Public License as published by\n",
      " the Free Software Foundation; either version 3 of the License, or\n",
      " (at your option) any later version.\n",
      "\n",
      " This program is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU General Public License\n",
      " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n",
      "\n",
      "#include \"test-predicate-part1.h\"\n",
      "\n"
    };
  FILE *stream;
  bool need_comma;
  unsigned int ch;
  size_t k;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (k = 0; k < sizeof (prologue) / sizeof (prologue[0]); k++)
    fputs (prologue[k], stream);

  /* Ranges are separated by ",\n"; the last one is followed by "\n" only.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
685 /* Construction of sparse 3-level tables. */
686 #define TABLE predicate_table
687 #define xmalloc malloc
688 #define xrealloc realloc
689 #include "3levelbit.h"
691 /* Output a boolean property in a three-level bitmap. */
693 output_predicate (const char *filename
, bool (*predicate
) (unsigned int), const char *name
, const char *comment
, const char *version
)
697 struct predicate_table t
;
698 unsigned int level1_offset
, level2_offset
, level3_offset
;
700 stream
= fopen (filename
, "w");
703 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
707 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
708 fprintf (stream
, "/* %s of Unicode characters. */\n", comment
);
709 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
714 predicate_table_init (&t
);
716 for (ch
= 0; ch
< 0x110000; ch
++)
718 predicate_table_add (&t
, ch
);
720 predicate_table_finalize (&t
);
722 /* Offsets in t.result, in memory of this process. */
724 5 * sizeof (uint32_t);
726 5 * sizeof (uint32_t)
727 + t
.level1_size
* sizeof (uint32_t);
729 5 * sizeof (uint32_t)
730 + t
.level1_size
* sizeof (uint32_t)
731 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
733 for (i
= 0; i
< 5; i
++)
735 fprintf (stream
, "#define header_%d %d\n", i
,
736 ((uint32_t *) t
.result
)[i
]);
738 fprintf (stream
, "static const\n");
739 fprintf (stream
, "struct\n");
740 fprintf (stream
, " {\n");
741 fprintf (stream
, " int header[1];\n");
742 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
743 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
744 fprintf (stream
, " unsigned int level3[%zu << %d];\n", t
.level3_size
, t
.p
);
745 fprintf (stream
, " }\n");
746 fprintf (stream
, "%s =\n", name
);
747 fprintf (stream
, "{\n");
748 fprintf (stream
, " { %d },\n", ((uint32_t *) t
.result
)[1]);
749 fprintf (stream
, " {");
750 if (t
.level1_size
> 1)
751 fprintf (stream
, "\n ");
752 for (i
= 0; i
< t
.level1_size
; i
++)
755 if (i
> 0 && (i
% 1) == 0)
756 fprintf (stream
, "\n ");
757 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
759 fprintf (stream
, " %5d", -1);
761 fprintf (stream
, " %5zu * sizeof (int) / sizeof (short) + %5zu",
762 1 + t
.level1_size
, (offset
- level2_offset
) / sizeof (uint32_t));
763 if (i
+1 < t
.level1_size
)
764 fprintf (stream
, ",");
766 if (t
.level1_size
> 1)
767 fprintf (stream
, "\n ");
768 fprintf (stream
, " },\n");
769 fprintf (stream
, " {");
770 if (t
.level2_size
<< t
.q
> 1)
771 fprintf (stream
, "\n ");
772 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
775 if (i
> 0 && (i
% 1) == 0)
776 fprintf (stream
, "\n ");
777 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
779 fprintf (stream
, " %5d", -1);
781 fprintf (stream
, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
782 1 + t
.level1_size
, t
.level2_size
<< t
.q
, (offset
- level3_offset
) / sizeof (uint32_t));
783 if (i
+1 < t
.level2_size
<< t
.q
)
784 fprintf (stream
, ",");
786 if (t
.level2_size
<< t
.q
> 1)
787 fprintf (stream
, "\n ");
788 fprintf (stream
, " },\n");
789 fprintf (stream
, " {");
790 if (t
.level3_size
<< t
.p
> 4)
791 fprintf (stream
, "\n ");
792 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
794 if (i
> 0 && (i
% 4) == 0)
795 fprintf (stream
, "\n ");
796 fprintf (stream
, " 0x%08XU",
797 ((uint32_t *) (t
.result
+ level3_offset
))[i
]);
798 if (i
+1 < t
.level3_size
<< t
.p
)
799 fprintf (stream
, ",");
801 if (t
.level3_size
<< t
.p
> 4)
802 fprintf (stream
, "\n ");
803 fprintf (stream
, " }\n");
804 fprintf (stream
, "};\n");
806 if (ferror (stream
) || fclose (stream
))
808 fprintf (stderr
, "error writing to '%s'\n", filename
);
813 /* Output all categories. */
815 output_categories (const char *version
)
817 #define CATEGORY(C) \
818 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
819 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
820 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks of the general categories.  Every two-letter category owns
   exactly one bit; a one-letter category (and LC) is the union of the
   bits of the two-letter categories it comprises.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_LC = 0x00000007,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};
905 general_category_byname (const char *category_name
)
907 if (category_name
[0] != '\0'
908 && (category_name
[1] == '\0' || category_name
[2] == '\0'))
909 switch (category_name
[0])
912 switch (category_name
[1])
914 case '\0': return UC_CATEGORY_MASK_L
;
915 case 'C': return UC_CATEGORY_MASK_LC
;
916 case 'u': return UC_CATEGORY_MASK_Lu
;
917 case 'l': return UC_CATEGORY_MASK_Ll
;
918 case 't': return UC_CATEGORY_MASK_Lt
;
919 case 'm': return UC_CATEGORY_MASK_Lm
;
920 case 'o': return UC_CATEGORY_MASK_Lo
;
924 switch (category_name
[1])
926 case '\0': return UC_CATEGORY_MASK_M
;
927 case 'n': return UC_CATEGORY_MASK_Mn
;
928 case 'c': return UC_CATEGORY_MASK_Mc
;
929 case 'e': return UC_CATEGORY_MASK_Me
;
933 switch (category_name
[1])
935 case '\0': return UC_CATEGORY_MASK_N
;
936 case 'd': return UC_CATEGORY_MASK_Nd
;
937 case 'l': return UC_CATEGORY_MASK_Nl
;
938 case 'o': return UC_CATEGORY_MASK_No
;
942 switch (category_name
[1])
944 case '\0': return UC_CATEGORY_MASK_P
;
945 case 'c': return UC_CATEGORY_MASK_Pc
;
946 case 'd': return UC_CATEGORY_MASK_Pd
;
947 case 's': return UC_CATEGORY_MASK_Ps
;
948 case 'e': return UC_CATEGORY_MASK_Pe
;
949 case 'i': return UC_CATEGORY_MASK_Pi
;
950 case 'f': return UC_CATEGORY_MASK_Pf
;
951 case 'o': return UC_CATEGORY_MASK_Po
;
955 switch (category_name
[1])
957 case '\0': return UC_CATEGORY_MASK_S
;
958 case 'm': return UC_CATEGORY_MASK_Sm
;
959 case 'c': return UC_CATEGORY_MASK_Sc
;
960 case 'k': return UC_CATEGORY_MASK_Sk
;
961 case 'o': return UC_CATEGORY_MASK_So
;
965 switch (category_name
[1])
967 case '\0': return UC_CATEGORY_MASK_Z
;
968 case 's': return UC_CATEGORY_MASK_Zs
;
969 case 'l': return UC_CATEGORY_MASK_Zl
;
970 case 'p': return UC_CATEGORY_MASK_Zp
;
974 switch (category_name
[1])
976 case '\0': return UC_CATEGORY_MASK_C
;
977 case 'c': return UC_CATEGORY_MASK_Cc
;
978 case 'f': return UC_CATEGORY_MASK_Cf
;
979 case 's': return UC_CATEGORY_MASK_Cs
;
980 case 'o': return UC_CATEGORY_MASK_Co
;
981 case 'n': return UC_CATEGORY_MASK_Cn
;
985 /* Invalid category name. */
989 /* Construction of sparse 3-level tables. */
990 #define TABLE category_table
991 #define ELEMENT uint8_t
992 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
993 #define xmalloc malloc
994 #define xrealloc realloc
997 /* Output the per-character category table. */
999 output_category (const char *filename
, const char *version
)
1003 struct category_table t
;
1004 unsigned int level1_offset
, level2_offset
, level3_offset
;
1005 uint16_t *level3_packed
;
1007 stream
= fopen (filename
, "w");
1010 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
1014 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1015 fprintf (stream
, "/* Categories of Unicode characters. */\n");
1016 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1021 category_table_init (&t
);
1023 for (ch
= 0; ch
< 0x110000; ch
++)
1026 unsigned int log2_value
;
1028 if (is_category_Cs (ch
))
1029 value
= UC_CATEGORY_MASK_Cs
;
1030 else if (unicode_attributes
[ch
].name
!= NULL
)
1031 value
= general_category_byname (unicode_attributes
[ch
].category
);
1035 /* Now value should contain exactly one bit. */
1036 assert (value
!= 0 && (value
& (value
- 1)) == 0);
1038 for (log2_value
= 0; value
> 1; value
>>= 1, log2_value
++);
1040 assert (log2_value
<= 0x1f);
1042 category_table_add (&t
, ch
, log2_value
);
1045 category_table_finalize (&t
);
1047 /* Offsets in t.result, in memory of this process. */
1049 5 * sizeof (uint32_t);
1051 5 * sizeof (uint32_t)
1052 + t
.level1_size
* sizeof (uint32_t);
1054 5 * sizeof (uint32_t)
1055 + t
.level1_size
* sizeof (uint32_t)
1056 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
1058 for (i
= 0; i
< 5; i
++)
1059 fprintf (stream
, "#define category_header_%d %d\n", i
,
1060 ((uint32_t *) t
.result
)[i
]);
1061 fprintf (stream
, "static const\n");
1062 fprintf (stream
, "struct\n");
1063 fprintf (stream
, " {\n");
1064 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
1065 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
1066 fprintf (stream
, " unsigned short level3[%zu * %d + 1];\n", t
.level3_size
,
1067 (1 << t
.p
) * 5 / 16);
1068 fprintf (stream
, " }\n");
1069 fprintf (stream
, "u_category =\n");
1070 fprintf (stream
, "{\n");
1071 fprintf (stream
, " {");
1072 if (t
.level1_size
> 8)
1073 fprintf (stream
, "\n ");
1074 for (i
= 0; i
< t
.level1_size
; i
++)
1077 if (i
> 0 && (i
% 8) == 0)
1078 fprintf (stream
, "\n ");
1079 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
1081 fprintf (stream
, " %5d", -1);
1083 fprintf (stream
, " %5zu",
1084 (offset
- level2_offset
) / sizeof (uint32_t));
1085 if (i
+1 < t
.level1_size
)
1086 fprintf (stream
, ",");
1088 if (t
.level1_size
> 8)
1089 fprintf (stream
, "\n ");
1090 fprintf (stream
, " },\n");
1091 fprintf (stream
, " {");
1092 if (t
.level2_size
<< t
.q
> 8)
1093 fprintf (stream
, "\n ");
1094 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
1097 if (i
> 0 && (i
% 8) == 0)
1098 fprintf (stream
, "\n ");
1099 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
1101 fprintf (stream
, " %5d", -1);
1103 fprintf (stream
, " %5zu",
1104 (offset
- level3_offset
) / sizeof (uint8_t));
1105 if (i
+1 < t
.level2_size
<< t
.q
)
1106 fprintf (stream
, ",");
1108 if (t
.level2_size
<< t
.q
> 8)
1109 fprintf (stream
, "\n ");
1110 fprintf (stream
, " },\n");
1111 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1112 not 32-bit units, in order to make the lookup function easier. */
1115 calloc ((t
.level3_size
<< t
.p
) * 5 / 16 + 1, sizeof (uint16_t));
1116 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
1118 unsigned int j
= (i
* 5) / 16;
1119 unsigned int k
= (i
* 5) % 16;
1120 uint32_t value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
1121 value
= level3_packed
[j
] | (level3_packed
[j
+1] << 16) | (value
<< k
);
1122 level3_packed
[j
] = value
& 0xffff;
1123 level3_packed
[j
+1] = value
>> 16;
1125 fprintf (stream
, " {");
1126 if ((t
.level3_size
<< t
.p
) * 5 / 16 + 1 > 8)
1127 fprintf (stream
, "\n ");
1128 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 5 / 16 + 1; i
++)
1130 if (i
> 0 && (i
% 8) == 0)
1131 fprintf (stream
, "\n ");
1132 fprintf (stream
, " 0x%04x", level3_packed
[i
]);
1133 if (i
+1 < (t
.level3_size
<< t
.p
) * 5 / 16 + 1)
1134 fprintf (stream
, ",");
1136 if ((t
.level3_size
<< t
.p
) * 5 / 16 + 1 > 8)
1137 fprintf (stream
, "\n ");
1138 fprintf (stream
, " }\n");
1139 free (level3_packed
);
1140 fprintf (stream
, "};\n");
1142 if (ferror (stream
) || fclose (stream
))
1144 fprintf (stderr
, "error writing to '%s'\n", filename
);
1149 /* ========================================================================= */
1151 /* Canonical combining class. */
/* See Unicode 3.0 book, section 4.2,
       UCD.html.  */
1155 /* Construction of sparse 3-level tables. */
1156 #define TABLE combclass_table
1157 #define ELEMENT uint8_t
1159 #define xmalloc malloc
1160 #define xrealloc realloc
1163 /* Output the per-character combining class table. */
1165 output_combclass (const char *filename
, const char *version
)
1169 struct combclass_table t
;
1170 unsigned int level1_offset
, level2_offset
, level3_offset
;
1172 stream
= fopen (filename
, "w");
1175 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
1179 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1180 fprintf (stream
, "/* Combining class of Unicode characters. */\n");
1181 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1186 combclass_table_init (&t
);
1188 for (ch
= 0; ch
< 0x110000; ch
++)
1189 if (unicode_attributes
[ch
].name
!= NULL
)
1191 int value
= atoi (unicode_attributes
[ch
].combining
);
1192 assert (value
>= 0 && value
<= 255);
1193 combclass_table_add (&t
, ch
, value
);
1196 combclass_table_finalize (&t
);
1198 /* Offsets in t.result, in memory of this process. */
1200 5 * sizeof (uint32_t);
1202 5 * sizeof (uint32_t)
1203 + t
.level1_size
* sizeof (uint32_t);
1205 5 * sizeof (uint32_t)
1206 + t
.level1_size
* sizeof (uint32_t)
1207 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
1209 for (i
= 0; i
< 5; i
++)
1210 fprintf (stream
, "#define combclass_header_%d %d\n", i
,
1211 ((uint32_t *) t
.result
)[i
]);
1212 fprintf (stream
, "static const\n");
1213 fprintf (stream
, "struct\n");
1214 fprintf (stream
, " {\n");
1215 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
1216 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
1217 fprintf (stream
, " unsigned char level3[%zu << %d];\n", t
.level3_size
, t
.p
);
1218 fprintf (stream
, " }\n");
1219 fprintf (stream
, "u_combclass =\n");
1220 fprintf (stream
, "{\n");
1221 fprintf (stream
, " {");
1222 if (t
.level1_size
> 8)
1223 fprintf (stream
, "\n ");
1224 for (i
= 0; i
< t
.level1_size
; i
++)
1227 if (i
> 0 && (i
% 8) == 0)
1228 fprintf (stream
, "\n ");
1229 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
1231 fprintf (stream
, " %5d", -1);
1233 fprintf (stream
, " %5zu",
1234 (offset
- level2_offset
) / sizeof (uint32_t));
1235 if (i
+1 < t
.level1_size
)
1236 fprintf (stream
, ",");
1238 if (t
.level1_size
> 8)
1239 fprintf (stream
, "\n ");
1240 fprintf (stream
, " },\n");
1241 fprintf (stream
, " {");
1242 if (t
.level2_size
<< t
.q
> 8)
1243 fprintf (stream
, "\n ");
1244 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
1247 if (i
> 0 && (i
% 8) == 0)
1248 fprintf (stream
, "\n ");
1249 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
1251 fprintf (stream
, " %5d", -1);
1253 fprintf (stream
, " %5zu",
1254 (offset
- level3_offset
) / sizeof (uint8_t));
1255 if (i
+1 < t
.level2_size
<< t
.q
)
1256 fprintf (stream
, ",");
1258 if (t
.level2_size
<< t
.q
> 8)
1259 fprintf (stream
, "\n ");
1260 fprintf (stream
, " },\n");
1261 fprintf (stream
, " {");
1262 if (t
.level3_size
<< t
.p
> 8)
1263 fprintf (stream
, "\n ");
1264 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
1266 if (i
> 0 && (i
% 8) == 0)
1267 fprintf (stream
, "\n ");
1268 fprintf (stream
, " %3d", ((uint8_t *) (t
.result
+ level3_offset
))[i
]);
1269 if (i
+1 < t
.level3_size
<< t
.p
)
1270 fprintf (stream
, ",");
1272 if (t
.level3_size
<< t
.p
> 8)
1273 fprintf (stream
, "\n ");
1274 fprintf (stream
, " }\n");
1275 fprintf (stream
, "};\n");
1277 if (ferror (stream
) || fclose (stream
))
1279 fprintf (stderr
, "error writing to '%s'\n", filename
);
1284 /* ========================================================================= */
1286 /* Bidirectional category. */
1287 /* See Unicode 3.0 book, section 4.3,
enum
{
  UC_BIDI_L,    /* Left-to-Right */
  UC_BIDI_LRE,  /* Left-to-Right Embedding */
  UC_BIDI_LRO,  /* Left-to-Right Override */
  UC_BIDI_R,    /* Right-to-Left */
  UC_BIDI_AL,   /* Right-to-Left Arabic */
  UC_BIDI_RLE,  /* Right-to-Left Embedding */
  UC_BIDI_RLO,  /* Right-to-Left Override */
  UC_BIDI_PDF,  /* Pop Directional Format */
  UC_BIDI_EN,   /* European Number */
  UC_BIDI_ES,   /* European Number Separator */
  UC_BIDI_ET,   /* European Number Terminator */
  UC_BIDI_AN,   /* Arabic Number */
  UC_BIDI_CS,   /* Common Number Separator */
  UC_BIDI_NSM,  /* Non-Spacing Mark */
  UC_BIDI_BN,   /* Boundary Neutral */
  UC_BIDI_B,    /* Paragraph Separator */
  UC_BIDI_S,    /* Segment Separator */
  UC_BIDI_WS,   /* Whitespace */
  UC_BIDI_ON,   /* Other Neutral */
  UC_BIDI_LRI,  /* Left-to-Right Isolate */
  UC_BIDI_RLI,  /* Right-to-Left Isolate */
  UC_BIDI_FSI,  /* First Strong Isolate */
  UC_BIDI_PDI   /* Pop Directional Isolate */
};

/* Translate the abbreviated bidi category name CATEGORY_NAME (such as "L",
   "AL" or "NSM") into the corresponding UC_BIDI_* value.
   The match is exact and case-sensitive.  Aborts the program if
   CATEGORY_NAME is not one of the known abbreviations.  */
static int
bidi_category_byname (const char *category_name)
{
  /* Abbreviations, indexed by the UC_BIDI_* value they denote.  */
  static const char * const abbreviations[] =
    {
      [UC_BIDI_L]   = "L",
      [UC_BIDI_LRE] = "LRE",
      [UC_BIDI_LRO] = "LRO",
      [UC_BIDI_R]   = "R",
      [UC_BIDI_AL]  = "AL",
      [UC_BIDI_RLE] = "RLE",
      [UC_BIDI_RLO] = "RLO",
      [UC_BIDI_PDF] = "PDF",
      [UC_BIDI_EN]  = "EN",
      [UC_BIDI_ES]  = "ES",
      [UC_BIDI_ET]  = "ET",
      [UC_BIDI_AN]  = "AN",
      [UC_BIDI_CS]  = "CS",
      [UC_BIDI_NSM] = "NSM",
      [UC_BIDI_BN]  = "BN",
      [UC_BIDI_B]   = "B",
      [UC_BIDI_S]   = "S",
      [UC_BIDI_WS]  = "WS",
      [UC_BIDI_ON]  = "ON",
      [UC_BIDI_LRI] = "LRI",
      [UC_BIDI_RLI] = "RLI",
      [UC_BIDI_FSI] = "FSI",
      [UC_BIDI_PDI] = "PDI"
    };
  const unsigned int count = sizeof (abbreviations) / sizeof (abbreviations[0]);
  unsigned int idx;

  for (idx = 0; idx < count; idx++)
    if (strcmp (category_name, abbreviations[idx]) == 0)
      return (int) idx;

  /* Invalid bidi category name.  */
  abort ();
}
1493 get_bidi_category (unsigned int ch
)
1495 if (unicode_attributes
[ch
].name
!= NULL
)
1496 return bidi_category_byname (unicode_attributes
[ch
].bidi
);
1499 /* The bidi category of unassigned characters depends on the range.
1500 See UTR #9 and DerivedBidiClass.txt. */
1501 if ((ch
>= 0x0590 && ch
<= 0x05FF)
1502 || (ch
>= 0x07FB && ch
<= 0x08FF)
1503 || (ch
>= 0xFB37 && ch
<= 0xFB45)
1504 || (ch
>= 0x10800 && ch
<= 0x10FFF))
1506 else if ((ch
>= 0x0600 && ch
<= 0x07BF)
1507 || (ch
>= 0x2064 && ch
<= 0x2069)
1508 || (ch
>= 0xFBB2 && ch
<= 0xFDCF)
1509 || (ch
>= 0xFDFE && ch
<= 0xFEFE))
1511 else if ((ch
>= 0xFDD0 && ch
<= 0xFDEF)
1512 || (ch
>= 0xFFF0 && ch
<= 0xFFFF)
1513 || (ch
& 0xFFFF) == 0xFFFE
1514 || (ch
& 0xFFFF) == 0xFFFF
1515 || (ch
>= 0xE0000 && ch
<= 0xE0FFF))
1522 /* Construction of sparse 3-level tables. */
1523 #define TABLE bidi_category_table
1524 #define ELEMENT uint8_t
1525 #define DEFAULT UC_BIDI_L
1526 #define xmalloc malloc
1527 #define xrealloc realloc
1530 /* Output the per-character bidi category table. */
1532 output_bidi_category (const char *filename
, const char *version
)
1536 struct bidi_category_table t
;
1537 unsigned int level1_offset
, level2_offset
, level3_offset
;
1538 uint16_t *level3_packed
;
1540 stream
= fopen (filename
, "w");
1543 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
1547 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1548 fprintf (stream
, "/* Bidi categories of Unicode characters. */\n");
1549 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1554 bidi_category_table_init (&t
);
1556 for (ch
= 0; ch
< 0x110000; ch
++)
1558 int value
= get_bidi_category (ch
);
1560 assert (value
<= 0x1f);
1562 bidi_category_table_add (&t
, ch
, value
);
1565 bidi_category_table_finalize (&t
);
1567 /* Offsets in t.result, in memory of this process. */
1569 5 * sizeof (uint32_t);
1571 5 * sizeof (uint32_t)
1572 + t
.level1_size
* sizeof (uint32_t);
1574 5 * sizeof (uint32_t)
1575 + t
.level1_size
* sizeof (uint32_t)
1576 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
1578 for (i
= 0; i
< 5; i
++)
1579 fprintf (stream
, "#define bidi_category_header_%d %d\n", i
,
1580 ((uint32_t *) t
.result
)[i
]);
1581 fprintf (stream
, "static const\n");
1582 fprintf (stream
, "struct\n");
1583 fprintf (stream
, " {\n");
1584 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
1585 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
1586 fprintf (stream
, " unsigned short level3[%zu * %d + 1];\n", t
.level3_size
,
1587 (1 << t
.p
) * 5 / 16);
1588 fprintf (stream
, " }\n");
1589 fprintf (stream
, "u_bidi_category =\n");
1590 fprintf (stream
, "{\n");
1591 fprintf (stream
, " {");
1592 if (t
.level1_size
> 8)
1593 fprintf (stream
, "\n ");
1594 for (i
= 0; i
< t
.level1_size
; i
++)
1597 if (i
> 0 && (i
% 8) == 0)
1598 fprintf (stream
, "\n ");
1599 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
1601 fprintf (stream
, " %5d", -1);
1603 fprintf (stream
, " %5zu",
1604 (offset
- level2_offset
) / sizeof (uint32_t));
1605 if (i
+1 < t
.level1_size
)
1606 fprintf (stream
, ",");
1608 if (t
.level1_size
> 8)
1609 fprintf (stream
, "\n ");
1610 fprintf (stream
, " },\n");
1611 fprintf (stream
, " {");
1612 if (t
.level2_size
<< t
.q
> 8)
1613 fprintf (stream
, "\n ");
1614 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
1617 if (i
> 0 && (i
% 8) == 0)
1618 fprintf (stream
, "\n ");
1619 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
1621 fprintf (stream
, " %5d", -1);
1623 fprintf (stream
, " %5zu",
1624 (offset
- level3_offset
) / sizeof (uint8_t));
1625 if (i
+1 < t
.level2_size
<< t
.q
)
1626 fprintf (stream
, ",");
1628 if (t
.level2_size
<< t
.q
> 8)
1629 fprintf (stream
, "\n ");
1630 fprintf (stream
, " },\n");
1631 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1632 not 32-bit units, in order to make the lookup function easier. */
1635 calloc ((t
.level3_size
<< t
.p
) * 5 / 16 + 1, sizeof (uint16_t));
1636 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
1638 unsigned int j
= (i
* 5) / 16;
1639 unsigned int k
= (i
* 5) % 16;
1640 uint32_t value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
1641 value
= level3_packed
[j
] | (level3_packed
[j
+1] << 16) | (value
<< k
);
1642 level3_packed
[j
] = value
& 0xffff;
1643 level3_packed
[j
+1] = value
>> 16;
1645 fprintf (stream
, " {");
1646 if ((t
.level3_size
<< t
.p
) * 5 / 16 + 1 > 8)
1647 fprintf (stream
, "\n ");
1648 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 5 / 16 + 1; i
++)
1650 if (i
> 0 && (i
% 8) == 0)
1651 fprintf (stream
, "\n ");
1652 fprintf (stream
, " 0x%04x", level3_packed
[i
]);
1653 if (i
+1 < (t
.level3_size
<< t
.p
) * 5 / 16 + 1)
1654 fprintf (stream
, ",");
1656 if ((t
.level3_size
<< t
.p
) * 5 / 16 + 1 > 8)
1657 fprintf (stream
, "\n ");
1658 fprintf (stream
, " }\n");
1659 free (level3_packed
);
1660 fprintf (stream
, "};\n");
1662 if (ferror (stream
) || fclose (stream
))
1664 fprintf (stderr
, "error writing to '%s'\n", filename
);
1669 /* ========================================================================= */
1671 /* Decimal digit value. */
1672 /* See Unicode 3.0 book, section 4.6. */
1675 get_decdigit_value (unsigned int ch
)
1677 if (unicode_attributes
[ch
].name
!= NULL
1678 && unicode_attributes
[ch
].decdigit
[0] != '\0')
1679 return atoi (unicode_attributes
[ch
].decdigit
);
1683 /* Construction of sparse 3-level tables. */
1684 #define TABLE decdigit_table
1685 #define ELEMENT uint8_t
1687 #define xmalloc malloc
1688 #define xrealloc realloc
1691 /* Output the unit test for the per-character decimal digit value table. */
1693 output_decimal_digit_test (const char *filename
, const char *version
)
1699 stream
= fopen (filename
, "w");
1702 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
1706 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1707 fprintf (stream
, "/* Decimal digit values of Unicode characters. */\n");
1708 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1712 for (ch
= 0; ch
< 0x110000; ch
++)
1714 int value
= get_decdigit_value (ch
);
1716 assert (value
>= -1 && value
< 10);
1721 fprintf (stream
, ",\n");
1722 fprintf (stream
, " { 0x%04X, %d }", ch
, value
);
1727 fprintf (stream
, "\n");
1729 if (ferror (stream
) || fclose (stream
))
1731 fprintf (stderr
, "error writing to '%s'\n", filename
);
1736 /* Output the per-character decimal digit value table. */
1738 output_decimal_digit (const char *filename
, const char *version
)
1742 struct decdigit_table t
;
1743 unsigned int level1_offset
, level2_offset
, level3_offset
;
1745 stream
= fopen (filename
, "w");
1748 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
1752 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1753 fprintf (stream
, "/* Decimal digit values of Unicode characters. */\n");
1754 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1759 decdigit_table_init (&t
);
1761 for (ch
= 0; ch
< 0x110000; ch
++)
1763 int value
= 1 + get_decdigit_value (ch
);
1765 assert (value
>= 0 && value
<= 10);
1767 decdigit_table_add (&t
, ch
, value
);
1770 decdigit_table_finalize (&t
);
1772 /* Offsets in t.result, in memory of this process. */
1774 5 * sizeof (uint32_t);
1776 5 * sizeof (uint32_t)
1777 + t
.level1_size
* sizeof (uint32_t);
1779 5 * sizeof (uint32_t)
1780 + t
.level1_size
* sizeof (uint32_t)
1781 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
1783 for (i
= 0; i
< 5; i
++)
1784 fprintf (stream
, "#define decdigit_header_%d %d\n", i
,
1785 ((uint32_t *) t
.result
)[i
]);
1786 fprintf (stream
, "static const\n");
1787 fprintf (stream
, "struct\n");
1788 fprintf (stream
, " {\n");
1789 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
1790 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
1791 fprintf (stream
, " unsigned char level3[%zu << %d];\n", t
.level3_size
,
1793 fprintf (stream
, " }\n");
1794 fprintf (stream
, "u_decdigit =\n");
1795 fprintf (stream
, "{\n");
1796 fprintf (stream
, " {");
1797 if (t
.level1_size
> 8)
1798 fprintf (stream
, "\n ");
1799 for (i
= 0; i
< t
.level1_size
; i
++)
1802 if (i
> 0 && (i
% 8) == 0)
1803 fprintf (stream
, "\n ");
1804 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
1806 fprintf (stream
, " %5d", -1);
1808 fprintf (stream
, " %5zu",
1809 (offset
- level2_offset
) / sizeof (uint32_t));
1810 if (i
+1 < t
.level1_size
)
1811 fprintf (stream
, ",");
1813 if (t
.level1_size
> 8)
1814 fprintf (stream
, "\n ");
1815 fprintf (stream
, " },\n");
1816 fprintf (stream
, " {");
1817 if (t
.level2_size
<< t
.q
> 8)
1818 fprintf (stream
, "\n ");
1819 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
1822 if (i
> 0 && (i
% 8) == 0)
1823 fprintf (stream
, "\n ");
1824 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
1826 fprintf (stream
, " %5d", -1);
1828 fprintf (stream
, " %5zu",
1829 (offset
- level3_offset
) / sizeof (uint8_t));
1830 if (i
+1 < t
.level2_size
<< t
.q
)
1831 fprintf (stream
, ",");
1833 if (t
.level2_size
<< t
.q
> 8)
1834 fprintf (stream
, "\n ");
1835 fprintf (stream
, " },\n");
1836 /* Pack the level3 array. Each entry needs 4 bits only. */
1837 fprintf (stream
, " {");
1838 if (t
.level3_size
<< (t
.p
- 1) > 8)
1839 fprintf (stream
, "\n ");
1840 for (i
= 0; i
< t
.level3_size
<< (t
.p
- 1); i
++)
1842 if (i
> 0 && (i
% 8) == 0)
1843 fprintf (stream
, "\n ");
1844 fprintf (stream
, " 0x%02x",
1845 ((uint8_t *) (t
.result
+ level3_offset
))[2*i
]
1846 + (((uint8_t *) (t
.result
+ level3_offset
))[2*i
+1] << 4));
1847 if (i
+1 < t
.level3_size
<< (t
.p
- 1))
1848 fprintf (stream
, ",");
1850 if (t
.level3_size
<< (t
.p
- 1) > 8)
1851 fprintf (stream
, "\n ");
1852 fprintf (stream
, " }\n");
1853 fprintf (stream
, "};\n");
1855 if (ferror (stream
) || fclose (stream
))
1857 fprintf (stderr
, "error writing to '%s'\n", filename
);
1862 /* ========================================================================= */
1865 /* See Unicode 3.0 book, section 4.6. */
1868 get_digit_value (unsigned int ch
)
1870 if (unicode_attributes
[ch
].name
!= NULL
1871 && unicode_attributes
[ch
].digit
[0] != '\0')
1872 return atoi (unicode_attributes
[ch
].digit
);
1876 /* Output the unit test for the per-character digit value table. */
1878 output_digit_test (const char *filename
, const char *version
)
1884 stream
= fopen (filename
, "w");
1887 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
1891 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1892 fprintf (stream
, "/* Digit values of Unicode characters. */\n");
1893 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1897 for (ch
= 0; ch
< 0x110000; ch
++)
1899 int value
= get_digit_value (ch
);
1901 assert (value
>= -1 && value
< 10);
1906 fprintf (stream
, ",\n");
1907 fprintf (stream
, " { 0x%04X, %d }", ch
, value
);
1912 fprintf (stream
, "\n");
1914 if (ferror (stream
) || fclose (stream
))
1916 fprintf (stderr
, "error writing to '%s'\n", filename
);
1921 /* Output the per-character digit value table. */
1923 output_digit (const char *filename
, const char *version
)
1927 struct decdigit_table t
;
1928 unsigned int level1_offset
, level2_offset
, level3_offset
;
1930 stream
= fopen (filename
, "w");
1933 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
1937 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1938 fprintf (stream
, "/* Digit values of Unicode characters. */\n");
1939 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1944 decdigit_table_init (&t
);
1946 for (ch
= 0; ch
< 0x110000; ch
++)
1948 int value
= 1 + get_digit_value (ch
);
1950 assert (value
>= 0 && value
<= 10);
1952 decdigit_table_add (&t
, ch
, value
);
1955 decdigit_table_finalize (&t
);
1957 /* Offsets in t.result, in memory of this process. */
1959 5 * sizeof (uint32_t);
1961 5 * sizeof (uint32_t)
1962 + t
.level1_size
* sizeof (uint32_t);
1964 5 * sizeof (uint32_t)
1965 + t
.level1_size
* sizeof (uint32_t)
1966 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
1968 for (i
= 0; i
< 5; i
++)
1969 fprintf (stream
, "#define digit_header_%d %d\n", i
,
1970 ((uint32_t *) t
.result
)[i
]);
1971 fprintf (stream
, "static const\n");
1972 fprintf (stream
, "struct\n");
1973 fprintf (stream
, " {\n");
1974 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
1975 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
1976 fprintf (stream
, " unsigned char level3[%zu << %d];\n", t
.level3_size
,
1978 fprintf (stream
, " }\n");
1979 fprintf (stream
, "u_digit =\n");
1980 fprintf (stream
, "{\n");
1981 fprintf (stream
, " {");
1982 if (t
.level1_size
> 8)
1983 fprintf (stream
, "\n ");
1984 for (i
= 0; i
< t
.level1_size
; i
++)
1987 if (i
> 0 && (i
% 8) == 0)
1988 fprintf (stream
, "\n ");
1989 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
1991 fprintf (stream
, " %5d", -1);
1993 fprintf (stream
, " %5zu",
1994 (offset
- level2_offset
) / sizeof (uint32_t));
1995 if (i
+1 < t
.level1_size
)
1996 fprintf (stream
, ",");
1998 if (t
.level1_size
> 8)
1999 fprintf (stream
, "\n ");
2000 fprintf (stream
, " },\n");
2001 fprintf (stream
, " {");
2002 if (t
.level2_size
<< t
.q
> 8)
2003 fprintf (stream
, "\n ");
2004 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
2007 if (i
> 0 && (i
% 8) == 0)
2008 fprintf (stream
, "\n ");
2009 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
2011 fprintf (stream
, " %5d", -1);
2013 fprintf (stream
, " %5zu",
2014 (offset
- level3_offset
) / sizeof (uint8_t));
2015 if (i
+1 < t
.level2_size
<< t
.q
)
2016 fprintf (stream
, ",");
2018 if (t
.level2_size
<< t
.q
> 8)
2019 fprintf (stream
, "\n ");
2020 fprintf (stream
, " },\n");
2021 /* Pack the level3 array. Each entry needs 4 bits only. */
2022 fprintf (stream
, " {");
2023 if (t
.level3_size
<< (t
.p
- 1) > 8)
2024 fprintf (stream
, "\n ");
2025 for (i
= 0; i
< t
.level3_size
<< (t
.p
- 1); i
++)
2027 if (i
> 0 && (i
% 8) == 0)
2028 fprintf (stream
, "\n ");
2029 fprintf (stream
, " 0x%02x",
2030 ((uint8_t *) (t
.result
+ level3_offset
))[2*i
]
2031 + (((uint8_t *) (t
.result
+ level3_offset
))[2*i
+1] << 4));
2032 if (i
+1 < t
.level3_size
<< (t
.p
- 1))
2033 fprintf (stream
, ",");
2035 if (t
.level3_size
<< (t
.p
- 1) > 8)
2036 fprintf (stream
, "\n ");
2037 fprintf (stream
, " }\n");
2038 fprintf (stream
, "};\n");
2040 if (ferror (stream
) || fclose (stream
))
2042 fprintf (stderr
, "error writing to '%s'\n", filename
);
2047 /* ========================================================================= */
2049 /* Numeric value. */
2050 /* See Unicode 3.0 book, section 4.6. */
2052 typedef struct { int numerator
; int denominator
; } uc_fraction_t
;
2054 static uc_fraction_t
2055 get_numeric_value (unsigned int ch
)
2057 uc_fraction_t value
;
2059 if (unicode_attributes
[ch
].name
!= NULL
2060 && unicode_attributes
[ch
].numeric
[0] != '\0')
2062 const char *str
= unicode_attributes
[ch
].numeric
;
2063 /* str is of the form "integer" or "integer/posinteger". */
2064 value
.numerator
= atoi (str
);
2065 if (strchr (str
, '/') != NULL
)
2066 value
.denominator
= atoi (strchr (str
, '/') + 1);
2068 value
.denominator
= 1;
2072 value
.numerator
= 0;
2073 value
.denominator
= 0;
2078 /* Output the unit test for the per-character numeric value table. */
2080 output_numeric_test (const char *filename
, const char *version
)
2086 stream
= fopen (filename
, "w");
2089 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
2093 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2094 fprintf (stream
, "/* Numeric values of Unicode characters. */\n");
2095 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2099 for (ch
= 0; ch
< 0x110000; ch
++)
2101 uc_fraction_t value
= get_numeric_value (ch
);
2103 if (value
.numerator
!= 0 || value
.denominator
!= 0)
2106 fprintf (stream
, ",\n");
2107 fprintf (stream
, " { 0x%04X, %d, %d }",
2108 ch
, value
.numerator
, value
.denominator
);
2113 fprintf (stream
, "\n");
2115 if (ferror (stream
) || fclose (stream
))
2117 fprintf (stderr
, "error writing to '%s'\n", filename
);
2122 /* Construction of sparse 3-level tables. */
2123 #define TABLE numeric_table
2124 #define ELEMENT uint8_t
2126 #define xmalloc malloc
2127 #define xrealloc realloc
2130 /* Output the per-character numeric value table. */
2132 output_numeric (const char *filename
, const char *version
)
2135 uc_fraction_t fractions
[160];
2136 unsigned int nfractions
;
2137 unsigned int ch
, i
, j
;
2138 struct numeric_table t
;
2139 unsigned int level1_offset
, level2_offset
, level3_offset
;
2140 uint16_t *level3_packed
;
2142 stream
= fopen (filename
, "w");
2145 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
2149 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2150 fprintf (stream
, "/* Numeric values of Unicode characters. */\n");
2151 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2154 /* Create table of occurring fractions. */
2156 for (ch
= 0; ch
< 0x110000; ch
++)
2158 uc_fraction_t value
= get_numeric_value (ch
);
2160 for (i
= 0; i
< nfractions
; i
++)
2161 if (value
.numerator
== fractions
[i
].numerator
2162 && value
.denominator
== fractions
[i
].denominator
)
2164 if (i
== nfractions
)
2166 assert (nfractions
!= SIZEOF (fractions
));
2167 for (i
= 0; i
< nfractions
; i
++)
2168 if (value
.denominator
< fractions
[i
].denominator
2169 || (value
.denominator
== fractions
[i
].denominator
2170 && value
.numerator
< fractions
[i
].numerator
))
2172 for (j
= nfractions
; j
> i
; j
--)
2173 fractions
[j
] = fractions
[j
- 1];
2174 fractions
[i
] = value
;
2179 fprintf (stream
, "static const uc_fraction_t u_numeric_values[%d] =\n",
2181 fprintf (stream
, "{\n");
2182 for (i
= 0; i
< nfractions
; i
++)
2184 fprintf (stream
, " { %d, %d }", fractions
[i
].numerator
,
2185 fractions
[i
].denominator
);
2186 if (i
+1 < nfractions
)
2187 fprintf (stream
, ",");
2188 fprintf (stream
, "\n");
2190 fprintf (stream
, "};\n");
2194 numeric_table_init (&t
);
2196 for (ch
= 0; ch
< 0x110000; ch
++)
2198 uc_fraction_t value
= get_numeric_value (ch
);
2200 for (i
= 0; i
< nfractions
; i
++)
2201 if (value
.numerator
== fractions
[i
].numerator
2202 && value
.denominator
== fractions
[i
].denominator
)
2204 assert (i
!= nfractions
);
2206 numeric_table_add (&t
, ch
, i
);
2209 numeric_table_finalize (&t
);
2211 /* Offsets in t.result, in memory of this process. */
2213 5 * sizeof (uint32_t);
2215 5 * sizeof (uint32_t)
2216 + t
.level1_size
* sizeof (uint32_t);
2218 5 * sizeof (uint32_t)
2219 + t
.level1_size
* sizeof (uint32_t)
2220 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
2222 for (i
= 0; i
< 5; i
++)
2223 fprintf (stream
, "#define numeric_header_%d %d\n", i
,
2224 ((uint32_t *) t
.result
)[i
]);
2225 fprintf (stream
, "static const\n");
2226 fprintf (stream
, "struct\n");
2227 fprintf (stream
, " {\n");
2228 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
2229 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
2230 fprintf (stream
, " unsigned short level3[%zu * %d + 1];\n", t
.level3_size
,
2231 (1 << t
.p
) * 8 / 16);
2232 fprintf (stream
, " }\n");
2233 fprintf (stream
, "u_numeric =\n");
2234 fprintf (stream
, "{\n");
2235 fprintf (stream
, " {");
2236 if (t
.level1_size
> 8)
2237 fprintf (stream
, "\n ");
2238 for (i
= 0; i
< t
.level1_size
; i
++)
2241 if (i
> 0 && (i
% 8) == 0)
2242 fprintf (stream
, "\n ");
2243 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
2245 fprintf (stream
, " %5d", -1);
2247 fprintf (stream
, " %5zu",
2248 (offset
- level2_offset
) / sizeof (uint32_t));
2249 if (i
+1 < t
.level1_size
)
2250 fprintf (stream
, ",");
2252 if (t
.level1_size
> 8)
2253 fprintf (stream
, "\n ");
2254 fprintf (stream
, " },\n");
2255 fprintf (stream
, " {");
2256 if (t
.level2_size
<< t
.q
> 8)
2257 fprintf (stream
, "\n ");
2258 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
2261 if (i
> 0 && (i
% 8) == 0)
2262 fprintf (stream
, "\n ");
2263 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
2265 fprintf (stream
, " %5d", -1);
2267 fprintf (stream
, " %5zu",
2268 (offset
- level3_offset
) / sizeof (uint8_t));
2269 if (i
+1 < t
.level2_size
<< t
.q
)
2270 fprintf (stream
, ",");
2272 if (t
.level2_size
<< t
.q
> 8)
2273 fprintf (stream
, "\n ");
2274 fprintf (stream
, " },\n");
2275 /* Pack the level3 array. Each entry needs 8 bits only. Use 16-bit units,
2276 not 32-bit units, in order to make the lookup function easier. */
2279 calloc ((t
.level3_size
<< t
.p
) * 8 / 16 + 1, sizeof (uint16_t));
2280 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
2282 unsigned int j
= (i
* 8) / 16;
2283 unsigned int k
= (i
* 8) % 16;
2284 uint32_t value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
2285 value
= level3_packed
[j
] | (level3_packed
[j
+1] << 16) | (value
<< k
);
2286 level3_packed
[j
] = value
& 0xffff;
2287 level3_packed
[j
+1] = value
>> 16;
2289 fprintf (stream
, " {");
2290 if ((t
.level3_size
<< t
.p
) * 8 / 16 + 1 > 8)
2291 fprintf (stream
, "\n ");
2292 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 8 / 16 + 1; i
++)
2294 if (i
> 0 && (i
% 8) == 0)
2295 fprintf (stream
, "\n ");
2296 fprintf (stream
, " 0x%04x", level3_packed
[i
]);
2297 if (i
+1 < (t
.level3_size
<< t
.p
) * 8 / 16 + 1)
2298 fprintf (stream
, ",");
2300 if ((t
.level3_size
<< t
.p
) * 8 / 16 + 1 > 8)
2301 fprintf (stream
, "\n ");
2302 fprintf (stream
, " }\n");
2303 free (level3_packed
);
2304 fprintf (stream
, "};\n");
2306 if (ferror (stream
) || fclose (stream
))
2308 fprintf (stderr
, "error writing to '%s'\n", filename
);
2313 /* ========================================================================= */
2316 /* See Unicode 3.0 book, section 4.7,
2319 /* List of mirrored character pairs. This is a subset of the characters
2320 having the BidiMirrored property. */
2321 static unsigned int mirror_pairs
[][2] =
2378 get_mirror_value (unsigned int ch
)
2381 unsigned int mirror_char
;
2384 mirrored
= (unicode_attributes
[ch
].name
!= NULL
2385 && unicode_attributes
[ch
].mirrored
);
2386 mirror_char
= 0xfffd;
2387 for (i
= 0; i
< sizeof (mirror_pairs
) / sizeof (mirror_pairs
[0]); i
++)
2388 if (ch
== mirror_pairs
[i
][0])
2390 mirror_char
= mirror_pairs
[i
][1];
2393 else if (ch
== mirror_pairs
[i
][1])
2395 mirror_char
= mirror_pairs
[i
][0];
2399 return (int) mirror_char
- (int) ch
;
2402 assert (mirror_char
== 0xfffd);
2407 /* Construction of sparse 3-level tables. */
2408 #define TABLE mirror_table
2409 #define ELEMENT int32_t
2411 #define xmalloc malloc
2412 #define xrealloc realloc
2415 /* Output the per-character mirror table. */
2417 output_mirror (const char *filename
, const char *version
)
2421 struct mirror_table t
;
2422 unsigned int level1_offset
, level2_offset
, level3_offset
;
2424 stream
= fopen (filename
, "w");
2427 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
2431 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2432 fprintf (stream
, "/* Mirrored Unicode characters. */\n");
2433 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2438 mirror_table_init (&t
);
2440 for (ch
= 0; ch
< 0x110000; ch
++)
2442 int value
= get_mirror_value (ch
);
2444 mirror_table_add (&t
, ch
, value
);
2447 mirror_table_finalize (&t
);
2449 /* Offsets in t.result, in memory of this process. */
2451 5 * sizeof (uint32_t);
2453 5 * sizeof (uint32_t)
2454 + t
.level1_size
* sizeof (uint32_t);
2456 5 * sizeof (uint32_t)
2457 + t
.level1_size
* sizeof (uint32_t)
2458 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
2460 for (i
= 0; i
< 5; i
++)
2461 fprintf (stream
, "#define mirror_header_%d %d\n", i
,
2462 ((uint32_t *) t
.result
)[i
]);
2463 fprintf (stream
, "static const\n");
2464 fprintf (stream
, "struct\n");
2465 fprintf (stream
, " {\n");
2466 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
2467 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
2468 fprintf (stream
, " int level3[%zu << %d];\n", t
.level3_size
, t
.p
);
2469 fprintf (stream
, " }\n");
2470 fprintf (stream
, "u_mirror =\n");
2471 fprintf (stream
, "{\n");
2472 fprintf (stream
, " {");
2473 if (t
.level1_size
> 8)
2474 fprintf (stream
, "\n ");
2475 for (i
= 0; i
< t
.level1_size
; i
++)
2478 if (i
> 0 && (i
% 8) == 0)
2479 fprintf (stream
, "\n ");
2480 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
2482 fprintf (stream
, " %5d", -1);
2484 fprintf (stream
, " %5zu",
2485 (offset
- level2_offset
) / sizeof (uint32_t));
2486 if (i
+1 < t
.level1_size
)
2487 fprintf (stream
, ",");
2489 if (t
.level1_size
> 8)
2490 fprintf (stream
, "\n ");
2491 fprintf (stream
, " },\n");
2492 fprintf (stream
, " {");
2493 if (t
.level2_size
<< t
.q
> 8)
2494 fprintf (stream
, "\n ");
2495 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
2498 if (i
> 0 && (i
% 8) == 0)
2499 fprintf (stream
, "\n ");
2500 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
2502 fprintf (stream
, " %5d", -1);
2504 fprintf (stream
, " %5zu",
2505 (offset
- level3_offset
) / sizeof (int32_t));
2506 if (i
+1 < t
.level2_size
<< t
.q
)
2507 fprintf (stream
, ",");
2509 if (t
.level2_size
<< t
.q
> 8)
2510 fprintf (stream
, "\n ");
2511 fprintf (stream
, " },\n");
2512 fprintf (stream
, " {");
2513 if (t
.level3_size
<< t
.p
> 8)
2514 fprintf (stream
, "\n ");
2515 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
2517 if (i
> 0 && (i
% 8) == 0)
2518 fprintf (stream
, "\n ");
2519 fprintf (stream
, " %5d", ((int32_t *) (t
.result
+ level3_offset
))[i
]);
2520 if (i
+1 < t
.level3_size
<< t
.p
)
2521 fprintf (stream
, ",");
2523 if (t
.level3_size
<< t
.p
> 8)
2524 fprintf (stream
, "\n ");
2525 fprintf (stream
, " }\n");
2526 fprintf (stream
, "};\n");
2528 if (ferror (stream
) || fclose (stream
))
2530 fprintf (stderr
, "error writing to '%s'\n", filename
);
2535 /* ========================================================================= */
2537 /* Particular values of the word break property. */
/* Return true if CH is one of the code points this file treats as having
   the word break property value MidNumLet.  */
static bool
is_WBP_MIDNUMLET (unsigned int ch)
{
  switch (ch)
    {
    case 0x002E:
    case 0x2018:
    case 0x2019:
    case 0x2024:
    case 0xFE52:
    case 0xFF07:
    case 0xFF0E:
      return true;
    default:
      return false;
    }
}
2547 is_WBP_MIDLETTER (unsigned int ch
)
2549 return (ch
== 0x00B7 || ch
== 0x05F4 || ch
== 0x2027 || ch
== 0x003A
2550 || ch
== 0x0387 || ch
== 0xFE13 || ch
== 0xFE55 || ch
== 0xFF1A
2554 /* ========================================================================= */
2558 /* Reading PropList.txt and DerivedCoreProperties.txt. */
2567 PROP_QUOTATION_MARK
,
2568 PROP_TERMINAL_PUNCTUATION
,
2571 PROP_ASCII_HEX_DIGIT
,
2572 PROP_OTHER_ALPHABETIC
,
2576 PROP_OTHER_LOWERCASE
,
2577 PROP_OTHER_UPPERCASE
,
2578 PROP_NONCHARACTER_CODE_POINT
,
2579 PROP_OTHER_GRAPHEME_EXTEND
,
2580 PROP_IDS_BINARY_OPERATOR
,
2581 PROP_IDS_TRINARY_OPERATOR
,
2583 PROP_UNIFIED_IDEOGRAPH
,
2584 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT
,
2587 PROP_LOGICAL_ORDER_EXCEPTION
,
2588 PROP_OTHER_ID_START
,
2589 PROP_OTHER_ID_CONTINUE
,
2591 PROP_VARIATION_SELECTOR
,
2592 PROP_PATTERN_WHITE_SPACE
,
2593 PROP_PATTERN_SYNTAX
,
2594 PROP_PREPENDED_CONCATENATION_MARK
,
2595 /* DerivedCoreProperties.txt */
2601 PROP_CASE_IGNORABLE
,
2602 PROP_CHANGES_WHEN_LOWERCASED
,
2603 PROP_CHANGES_WHEN_UPPERCASED
,
2604 PROP_CHANGES_WHEN_TITLECASED
,
2605 PROP_CHANGES_WHEN_CASEFOLDED
,
2606 PROP_CHANGES_WHEN_CASEMAPPED
,
2611 PROP_DEFAULT_IGNORABLE_CODE_POINT
,
2612 PROP_GRAPHEME_EXTEND
,
/* For each code point, a bit mask of its properties: bit number PROP_xxx is
   set by fill_properties () when the corresponding property applies.  */
unsigned long long unicode_properties[0x110000];
2619 clear_properties (void)
2623 for (i
= 0; i
< 0x110000; i
++)
2624 unicode_properties
[i
] = 0;
2627 /* Stores in unicode_properties[] the properties from the
2628 PropList.txt or DerivedCoreProperties.txt file. */
2630 fill_properties (const char *proplist_filename
)
2635 stream
= fopen (proplist_filename
, "r");
2638 fprintf (stderr
, "error during fopen of '%s'\n", proplist_filename
);
2645 unsigned int i1
, i2
;
2646 char padding
[200+1];
2647 char propname
[200+1];
2648 unsigned int propvalue
;
2650 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
2653 if (buf
[0] == '\0' || buf
[0] == '#')
2656 if (sscanf (buf
, "%X..%X%[ ;]%[^ ]", &i1
, &i2
, padding
, propname
) != 4)
2658 if (sscanf (buf
, "%X%[ ;]%[^ ]", &i1
, padding
, propname
) != 3)
2660 fprintf (stderr
, "parse error in '%s'\n", proplist_filename
);
2665 #define PROP(name,value) \
2666 if (strcmp (propname, name) == 0) propvalue = value; else
2668 PROP ("White_Space", PROP_WHITE_SPACE
)
2669 PROP ("Bidi_Control", PROP_BIDI_CONTROL
)
2670 PROP ("Join_Control", PROP_JOIN_CONTROL
)
2671 PROP ("Dash", PROP_DASH
)
2672 PROP ("Hyphen", PROP_HYPHEN
)
2673 PROP ("Quotation_Mark", PROP_QUOTATION_MARK
)
2674 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION
)
2675 PROP ("Other_Math", PROP_OTHER_MATH
)
2676 PROP ("Hex_Digit", PROP_HEX_DIGIT
)
2677 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT
)
2678 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC
)
2679 PROP ("Ideographic", PROP_IDEOGRAPHIC
)
2680 PROP ("Diacritic", PROP_DIACRITIC
)
2681 PROP ("Extender", PROP_EXTENDER
)
2682 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE
)
2683 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE
)
2684 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT
)
2685 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND
)
2686 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR
)
2687 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR
)
2688 PROP ("Radical", PROP_RADICAL
)
2689 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH
)
2690 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT
)
2691 PROP ("Deprecated", PROP_DEPRECATED
)
2692 PROP ("Soft_Dotted", PROP_SOFT_DOTTED
)
2693 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION
)
2694 PROP ("Other_ID_Start", PROP_OTHER_ID_START
)
2695 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE
)
2696 PROP ("Sentence_Terminal", PROP_STERM
)
2697 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR
)
2698 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE
)
2699 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX
)
2700 PROP ("Prepended_Concatenation_Mark", PROP_PREPENDED_CONCATENATION_MARK
)
2701 /* DerivedCoreProperties.txt */
2702 PROP ("Math", PROP_MATH
)
2703 PROP ("Alphabetic", PROP_ALPHABETIC
)
2704 PROP ("Lowercase", PROP_LOWERCASE
)
2705 PROP ("Uppercase", PROP_UPPERCASE
)
2706 PROP ("Cased", PROP_CASED
)
2707 PROP ("Case_Ignorable", PROP_CASE_IGNORABLE
)
2708 PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED
)
2709 PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED
)
2710 PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED
)
2711 PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED
)
2712 PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED
)
2713 PROP ("ID_Start", PROP_ID_START
)
2714 PROP ("ID_Continue", PROP_ID_CONTINUE
)
2715 PROP ("XID_Start", PROP_XID_START
)
2716 PROP ("XID_Continue", PROP_XID_CONTINUE
)
2717 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT
)
2718 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND
)
2719 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE
)
2720 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK
)
2723 fprintf (stderr
, "unknown property named '%s' in '%s'\n", propname
,
2727 assert (i1
<= i2
&& i2
< 0x110000);
2729 for (i
= i1
; i
<= i2
; i
++)
2730 unicode_properties
[i
] |= 1ULL << propvalue
;
2733 if (ferror (stream
) || fclose (stream
))
2735 fprintf (stderr
, "error reading from '%s'\n", proplist_filename
);
2740 /* Stores in array the given property from the Unicode 3.0 PropList.txt
2743 fill_property30 (char array
[0x110000], const char *proplist_filename
, const char *property_name
)
2749 for (i
= 0; i
< 0x110000; i
++)
2752 stream
= fopen (proplist_filename
, "r");
2755 fprintf (stderr
, "error during fopen of '%s'\n", proplist_filename
);
2759 /* Search for the "Property dump for: ..." line. */
2762 if (fscanf (stream
, "%100[^\n]\n", buf
) < 1)
2764 fprintf (stderr
, "no property found in '%s'\n", proplist_filename
);
2768 while (strstr (buf
, property_name
) == NULL
);
2772 unsigned int i1
, i2
;
2774 if (fscanf (stream
, "%100[^\n]\n", buf
) < 1)
2778 if (strlen (buf
) >= 10 && buf
[4] == '.' && buf
[5] == '.')
2780 if (sscanf (buf
, "%4X..%4X", &i1
, &i2
) < 2)
2782 fprintf (stderr
, "parse error in property in '%s'\n",
2787 else if (strlen (buf
) >= 4)
2789 if (sscanf (buf
, "%4X", &i1
) < 1)
2791 fprintf (stderr
, "parse error in property in '%s'\n",
2799 fprintf (stderr
, "parse error in property in '%s'\n",
2803 assert (i1
<= i2
&& i2
< 0x110000);
2804 for (i
= i1
; i
<= i2
; i
++)
2808 if (ferror (stream
) || fclose (stream
))
2810 fprintf (stderr
, "error reading from '%s'\n", proplist_filename
);
2815 /* Properties from Unicode 3.0 PropList.txt file. */
/* The paired punctuation property from the PropList.txt file,
   one entry per code point.  */
char unicode_pairedpunctuation[0x110000];
/* The left of pair property from the PropList.txt file,
   one entry per code point.  */
char unicode_leftofpair[0x110000];
2824 fill_properties30 (const char *proplist30_filename
)
2826 fill_property30 (unicode_pairedpunctuation
, proplist30_filename
, "(Paired Punctuation)");
2827 fill_property30 (unicode_leftofpair
, proplist30_filename
, "(Left of Pair)");
2830 /* ------------------------------------------------------------------------- */
2832 /* See PropList.txt, UCD.html. */
2834 is_property_white_space (unsigned int ch
)
2836 return ((unicode_properties
[ch
] & (1ULL << PROP_WHITE_SPACE
)) != 0);
2839 /* See Unicode 3.0 book, section 4.10,
2840 PropList.txt, UCD.html,
2841 DerivedCoreProperties.txt, UCD.html. */
2843 is_property_alphabetic (unsigned int ch
)
2847 || ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_ALPHABETIC
)) != 0)
2848 /* For some reason, the following are listed as having property
2849 Alphabetic but not as having property Other_Alphabetic. */
2850 || (ch
>= 0x16EE && ch
<= 0x16F0) /* RUNIC SYMBOLS */
2851 || (ch
>= 0x2160 && ch
<= 0x2182) /* ROMAN NUMERALS */
2852 || (ch
>= 0x2185 && ch
<= 0x2188) /* ROMAN NUMERALS */
2853 || (ch
>= 0x24D0 && ch
<= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2854 || (ch
== 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2855 || (ch
>= 0x3021 && ch
<= 0x3029) /* HANGZHOU NUMERAL */
2856 || (ch
>= 0x3038 && ch
<= 0x303A) /* HANGZHOU NUMERAL */
2857 || (ch
>= 0xA6E6 && ch
<= 0xA6EF) /* BAMUM LETTERS */
2858 || (ch
>= 0x10140 && ch
<= 0x10174) /* GREEK ACROPHONICS */
2859 || (ch
== 0x10341) /* GOTHIC LETTER NINETY */
2860 || (ch
== 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2861 || (ch
>= 0x103D1 && ch
<= 0x103D5) /* OLD PERSIAN NUMBERS */
2862 || (ch
>= 0x12400 && ch
<= 0x1246E); /* CUNEIFORM NUMERIC SIGNS */
2864 ((unicode_properties
[ch
] & (1ULL << PROP_ALPHABETIC
)) != 0);
2866 assert (result1
== result2
);
2870 /* See PropList.txt, UCD.html. */
2872 is_property_other_alphabetic (unsigned int ch
)
2874 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_ALPHABETIC
)) != 0);
2877 /* See PropList.txt, UCD.html. */
2879 is_property_not_a_character (unsigned int ch
)
2881 return ((unicode_properties
[ch
] & (1ULL << PROP_NONCHARACTER_CODE_POINT
)) != 0);
2884 /* See PropList.txt, UCD.html,
2885 DerivedCoreProperties.txt, UCD.html. */
2887 is_property_default_ignorable_code_point (unsigned int ch
)
2890 (is_category_Cf (ch
)
2891 && !(ch
>= 0xFFF9 && ch
<= 0xFFFB) /* Annotations */
2892 && !((ch
>= 0x0600 && ch
<= 0x0605) || ch
== 0x06DD || ch
== 0x070F)
2893 /* For some reason, the following are not listed as having property
2894 Default_Ignorable_Code_Point. */
2897 || ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT
)) != 0)
2898 || ((unicode_properties
[ch
] & (1ULL << PROP_VARIATION_SELECTOR
)) != 0);
2900 ((unicode_properties
[ch
] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT
)) != 0);
2902 assert (result1
== result2
);
2906 /* See PropList.txt, UCD.html. */
2908 is_property_other_default_ignorable_code_point (unsigned int ch
)
2910 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT
)) != 0);
2913 /* See PropList.txt, UCD.html. */
2915 is_property_deprecated (unsigned int ch
)
2917 return ((unicode_properties
[ch
] & (1ULL << PROP_DEPRECATED
)) != 0);
2920 /* See PropList.txt, UCD.html. */
2922 is_property_logical_order_exception (unsigned int ch
)
2924 return ((unicode_properties
[ch
] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION
)) != 0);
2927 /* See PropList.txt, UCD.html. */
2929 is_property_variation_selector (unsigned int ch
)
2931 return ((unicode_properties
[ch
] & (1ULL << PROP_VARIATION_SELECTOR
)) != 0);
/* See PropList-3.0.1.txt.
   Returns true if CH lies in one of the three private use ranges.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  return (ch >= 0xE000 && ch <= 0xF8FF)
         || (ch >= 0xF0000 && ch <= 0xFFFFD)
         || (ch >= 0x100000 && ch <= 0x10FFFD);
}
/* See PropList-3.0.1.txt.
   Returns true if CH is in general category Cn but is not flagged as a
   noncharacter.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  return (is_category_Cn (ch) && !is_property_not_a_character (ch));
}
2951 /* See PropList.txt, UCD.html,
2952 DerivedCoreProperties.txt, UCD.html. */
2954 is_property_uppercase (unsigned int ch
)
2958 || ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_UPPERCASE
)) != 0);
2960 ((unicode_properties
[ch
] & (1ULL << PROP_UPPERCASE
)) != 0);
2962 assert (result1
== result2
);
2966 /* See PropList.txt, UCD.html. */
2968 is_property_other_uppercase (unsigned int ch
)
2970 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_UPPERCASE
)) != 0);
2973 /* See PropList.txt, UCD.html,
2974 DerivedCoreProperties.txt, UCD.html. */
2976 is_property_lowercase (unsigned int ch
)
2980 || ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_LOWERCASE
)) != 0);
2982 ((unicode_properties
[ch
] & (1ULL << PROP_LOWERCASE
)) != 0);
2984 assert (result1
== result2
);
2988 /* See PropList.txt, UCD.html. */
2990 is_property_other_lowercase (unsigned int ch
)
2992 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_LOWERCASE
)) != 0);
/* See PropList-3.0.1.txt.
   Returns true if CH is in general category Lt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  return is_category_Lt (ch);
}
3002 /* See DerivedCoreProperties.txt. */
3004 is_property_cased (unsigned int ch
)
3006 bool result1
= (is_property_lowercase (ch
)
3007 || is_property_uppercase (ch
)
3008 || is_category_Lt (ch
));
3009 bool result2
= ((unicode_properties
[ch
] & (1ULL << PROP_CASED
)) != 0);
3011 assert (result1
== result2
);
3015 /* See DerivedCoreProperties.txt. */
3017 is_property_case_ignorable (unsigned int ch
)
3019 bool result1
= (is_WBP_MIDLETTER (ch
) || is_WBP_MIDNUMLET (ch
)
3021 || is_category_Mn (ch
)
3022 || is_category_Me (ch
)
3023 || is_category_Cf (ch
)
3024 || is_category_Lm (ch
)
3025 || is_category_Sk (ch
));
3026 bool result2
= ((unicode_properties
[ch
] & (1ULL << PROP_CASE_IGNORABLE
)) != 0);
3028 assert (result1
== result2
);
3032 /* See DerivedCoreProperties.txt. */
3034 is_property_changes_when_lowercased (unsigned int ch
)
3036 bool result1
= ((unicode_properties
[ch
] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED
)) != 0);
3037 bool result2
= (unicode_attributes
[ch
].name
!= NULL
3038 && unicode_attributes
[ch
].lower
!= NONE
3039 && unicode_attributes
[ch
].lower
!= ch
);
3041 assert (result1
== result2
);
3045 /* See DerivedCoreProperties.txt. */
3047 is_property_changes_when_uppercased (unsigned int ch
)
3049 return ((unicode_properties
[ch
] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED
)) != 0);
3052 /* See DerivedCoreProperties.txt. */
3054 is_property_changes_when_titlecased (unsigned int ch
)
3056 return ((unicode_properties
[ch
] & (1ULL << PROP_CHANGES_WHEN_TITLECASED
)) != 0);
3059 /* See DerivedCoreProperties.txt. */
3061 is_property_changes_when_casefolded (unsigned int ch
)
3063 return ((unicode_properties
[ch
] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED
)) != 0);
3066 /* See DerivedCoreProperties.txt. */
3068 is_property_changes_when_casemapped (unsigned int ch
)
3070 return ((unicode_properties
[ch
] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED
)) != 0);
3073 /* See PropList.txt, UCD.html. */
3075 is_property_soft_dotted (unsigned int ch
)
3077 return ((unicode_properties
[ch
] & (1ULL << PROP_SOFT_DOTTED
)) != 0);
3080 /* See DerivedCoreProperties.txt, UCD.html. */
3082 is_property_id_start (unsigned int ch
)
3084 return ((unicode_properties
[ch
] & (1ULL << PROP_ID_START
)) != 0);
3087 /* See PropList.txt, UCD.html. */
3089 is_property_other_id_start (unsigned int ch
)
3091 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_ID_START
)) != 0);
3094 /* See DerivedCoreProperties.txt, UCD.html. */
3096 is_property_id_continue (unsigned int ch
)
3098 return ((unicode_properties
[ch
] & (1ULL << PROP_ID_CONTINUE
)) != 0);
3101 /* See PropList.txt, UCD.html. */
3103 is_property_other_id_continue (unsigned int ch
)
3105 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_ID_CONTINUE
)) != 0);
3108 /* See DerivedCoreProperties.txt, UCD.html. */
3110 is_property_xid_start (unsigned int ch
)
3112 return ((unicode_properties
[ch
] & (1ULL << PROP_XID_START
)) != 0);
3115 /* See DerivedCoreProperties.txt, UCD.html. */
3117 is_property_xid_continue (unsigned int ch
)
3119 return ((unicode_properties
[ch
] & (1ULL << PROP_XID_CONTINUE
)) != 0);
3122 /* See PropList.txt, UCD.html. */
3124 is_property_pattern_white_space (unsigned int ch
)
3126 return ((unicode_properties
[ch
] & (1ULL << PROP_PATTERN_WHITE_SPACE
)) != 0);
3129 /* See PropList.txt, UCD.html. */
3131 is_property_pattern_syntax (unsigned int ch
)
3133 return ((unicode_properties
[ch
] & (1ULL << PROP_PATTERN_SYNTAX
)) != 0);
3136 /* See PropList.txt, UCD.html. */
3138 is_property_join_control (unsigned int ch
)
3140 return ((unicode_properties
[ch
] & (1ULL << PROP_JOIN_CONTROL
)) != 0);
3143 /* See DerivedCoreProperties.txt, UCD.html. */
3145 is_property_grapheme_base (unsigned int ch
)
3147 return ((unicode_properties
[ch
] & (1ULL << PROP_GRAPHEME_BASE
)) != 0);
3150 /* See DerivedCoreProperties.txt, UCD.html. */
3152 is_property_grapheme_extend (unsigned int ch
)
3154 return ((unicode_properties
[ch
] & (1ULL << PROP_GRAPHEME_EXTEND
)) != 0);
3157 /* See PropList.txt, UCD.html. */
3159 is_property_other_grapheme_extend (unsigned int ch
)
3161 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND
)) != 0);
3164 /* See DerivedCoreProperties.txt, UCD.html. */
3166 is_property_grapheme_link (unsigned int ch
)
3168 return ((unicode_properties
[ch
] & (1ULL << PROP_GRAPHEME_LINK
)) != 0);
3171 /* See PropList.txt, UCD.html. */
3173 is_property_bidi_control (unsigned int ch
)
3175 return ((unicode_properties
[ch
] & (1ULL << PROP_BIDI_CONTROL
)) != 0);
3178 /* See PropList-3.0.1.txt. */
3180 is_property_bidi_left_to_right (unsigned int ch
)
3182 return (get_bidi_category (ch
) == UC_BIDI_L
);
3185 /* See PropList-3.0.1.txt. */
3187 is_property_bidi_hebrew_right_to_left (unsigned int ch
)
3189 return (get_bidi_category (ch
) == UC_BIDI_R
);
3192 /* See PropList-3.0.1.txt. */
3194 is_property_bidi_arabic_right_to_left (unsigned int ch
)
3196 return (get_bidi_category (ch
) == UC_BIDI_AL
);
3199 /* See PropList-3.0.1.txt. */
3201 is_property_bidi_european_digit (unsigned int ch
)
3203 return (get_bidi_category (ch
) == UC_BIDI_EN
);
3206 /* See PropList-3.0.1.txt. */
3208 is_property_bidi_eur_num_separator (unsigned int ch
)
3210 return (get_bidi_category (ch
) == UC_BIDI_ES
);
3213 /* See PropList-3.0.1.txt. */
3215 is_property_bidi_eur_num_terminator (unsigned int ch
)
3217 return (get_bidi_category (ch
) == UC_BIDI_ET
);
3220 /* See PropList-3.0.1.txt. */
3222 is_property_bidi_arabic_digit (unsigned int ch
)
3224 return (get_bidi_category (ch
) == UC_BIDI_AN
);
3227 /* See PropList-3.0.1.txt. */
3229 is_property_bidi_common_separator (unsigned int ch
)
3231 return (get_bidi_category (ch
) == UC_BIDI_CS
);
3234 /* See PropList-3.0.1.txt. */
3236 is_property_bidi_block_separator (unsigned int ch
)
3238 return (get_bidi_category (ch
) == UC_BIDI_B
);
3241 /* See PropList-3.0.1.txt. */
3243 is_property_bidi_segment_separator (unsigned int ch
)
3245 return (get_bidi_category (ch
) == UC_BIDI_S
);
3248 /* See PropList-3.0.1.txt. */
3250 is_property_bidi_whitespace (unsigned int ch
)
3252 return (get_bidi_category (ch
) == UC_BIDI_WS
);
3255 /* See PropList-3.0.1.txt. */
3257 is_property_bidi_non_spacing_mark (unsigned int ch
)
3259 return (get_bidi_category (ch
) == UC_BIDI_NSM
);
3262 /* See PropList-3.0.1.txt. */
3264 is_property_bidi_boundary_neutral (unsigned int ch
)
3266 return (get_bidi_category (ch
) == UC_BIDI_BN
);
3269 /* See PropList-3.0.1.txt. */
3271 is_property_bidi_pdf (unsigned int ch
)
3273 return (get_bidi_category (ch
) == UC_BIDI_PDF
);
3276 /* See PropList-3.0.1.txt. */
3278 is_property_bidi_embedding_or_override (unsigned int ch
)
3280 int category
= get_bidi_category (ch
);
3281 return (category
== UC_BIDI_LRE
|| category
== UC_BIDI_LRO
3282 || category
== UC_BIDI_RLE
|| category
== UC_BIDI_RLO
);
3285 /* See PropList-3.0.1.txt. */
3287 is_property_bidi_other_neutral (unsigned int ch
)
3289 return (get_bidi_category (ch
) == UC_BIDI_ON
);
3292 /* See PropList.txt, UCD.html. */
3294 is_property_hex_digit (unsigned int ch
)
3296 return ((unicode_properties
[ch
] & (1ULL << PROP_HEX_DIGIT
)) != 0);
3299 /* See PropList.txt, UCD.html. */
3301 is_property_ascii_hex_digit (unsigned int ch
)
3303 return ((unicode_properties
[ch
] & (1ULL << PROP_ASCII_HEX_DIGIT
)) != 0);
3306 /* See Unicode 3.0 book, section 4.10,
3307 PropList.txt, UCD.html. */
3309 is_property_ideographic (unsigned int ch
)
3311 return ((unicode_properties
[ch
] & (1ULL << PROP_IDEOGRAPHIC
)) != 0);
3314 /* See PropList.txt, UCD.html. */
3316 is_property_unified_ideograph (unsigned int ch
)
3318 return ((unicode_properties
[ch
] & (1ULL << PROP_UNIFIED_IDEOGRAPH
)) != 0);
3321 /* See PropList.txt, UCD.html. */
3323 is_property_radical (unsigned int ch
)
3325 return ((unicode_properties
[ch
] & (1ULL << PROP_RADICAL
)) != 0);
3328 /* See PropList.txt, UCD.html. */
3330 is_property_ids_binary_operator (unsigned int ch
)
3332 return ((unicode_properties
[ch
] & (1ULL << PROP_IDS_BINARY_OPERATOR
)) != 0);
3335 /* See PropList.txt, UCD.html. */
3337 is_property_ids_trinary_operator (unsigned int ch
)
3339 return ((unicode_properties
[ch
] & (1ULL << PROP_IDS_TRINARY_OPERATOR
)) != 0);
3342 /* See PropList-3.0.1.txt. */
3344 is_property_zero_width (unsigned int ch
)
3346 return is_category_Cf (ch
)
3347 || (unicode_attributes
[ch
].name
!= NULL
3348 && strstr (unicode_attributes
[ch
].name
, "ZERO WIDTH") != NULL
);
/* See PropList-3.0.1.txt.
   Returns true if CH is in general category Zs.  */
static bool
is_property_space (unsigned int ch)
{
  return is_category_Zs (ch);
}
/* See PropList-3.0.1.txt.
   Returns true if CH is one of a fixed list of non-breaking characters.  */
static bool
is_property_non_break (unsigned int ch)
{
  /* This is a hard-coded set of characters selected by their line breaking
     behaviour.  NOTE(review): the original comment is truncated in this
     view; presumably it names the line breaking class GL - confirm.  */
  return (ch == 0x00A0 /* NO-BREAK SPACE */
          || ch == 0x034F /* COMBINING GRAPHEME JOINER */
          || ch == 0x035C /* COMBINING DOUBLE BREVE BELOW */
          || ch == 0x035D /* COMBINING DOUBLE BREVE */
          || ch == 0x035E /* COMBINING DOUBLE MACRON */
          || ch == 0x035F /* COMBINING DOUBLE MACRON BELOW */
          || ch == 0x0360 /* COMBINING DOUBLE TILDE */
          || ch == 0x0361 /* COMBINING DOUBLE INVERTED BREVE */
          || ch == 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
          || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
          || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
          || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
          || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
          || ch == 0x2007 /* FIGURE SPACE */
          || ch == 0x2011 /* NON-BREAKING HYPHEN */
          || ch == 0x202F /* NARROW NO-BREAK SPACE */);
}
3382 /* See PropList-3.0.1.txt. */
3384 is_property_iso_control (unsigned int ch
)
3387 (unicode_attributes
[ch
].name
!= NULL
3388 && strcmp (unicode_attributes
[ch
].name
, "<control>") == 0);
3390 is_category_Cc (ch
);
3392 assert (result1
== result2
);
3396 /* See PropList-3.0.1.txt. */
3398 is_property_format_control (unsigned int ch
)
3400 return (is_category_Cf (ch
)
3401 && get_bidi_category (ch
) == UC_BIDI_BN
3402 && !is_property_join_control (ch
)
3406 /* See PropList.txt, UCD.html. */
3408 is_property_dash (unsigned int ch
)
3410 return ((unicode_properties
[ch
] & (1ULL << PROP_DASH
)) != 0);
3413 /* See PropList.txt, UCD.html. */
3415 is_property_hyphen (unsigned int ch
)
3417 return ((unicode_properties
[ch
] & (1ULL << PROP_HYPHEN
)) != 0);
/* See PropList-3.0.1.txt.
   Returns true if CH is in general category P.  */
static bool
is_property_punctuation (unsigned int ch)
{
  return is_category_P (ch);
}
/* See PropList-3.0.1.txt.
   Returns true if CH is in general category Zl.  */
static bool
is_property_line_separator (unsigned int ch)
{
  return is_category_Zl (ch);
}
/* See PropList-3.0.1.txt.
   Returns true if CH is in general category Zp.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  return is_category_Zp (ch);
}
3441 /* See PropList.txt, UCD.html. */
3443 is_property_quotation_mark (unsigned int ch
)
3445 return ((unicode_properties
[ch
] & (1ULL << PROP_QUOTATION_MARK
)) != 0);
3448 /* See PropList.txt, UCD.html. */
3450 is_property_sentence_terminal (unsigned int ch
)
3452 return ((unicode_properties
[ch
] & (1ULL << PROP_STERM
)) != 0);
3455 /* See PropList.txt, UCD.html. */
3457 is_property_terminal_punctuation (unsigned int ch
)
3459 return ((unicode_properties
[ch
] & (1ULL << PROP_TERMINAL_PUNCTUATION
)) != 0);
/* See PropList-3.0.1.txt.
   Returns true if CH is in general category Sc.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  return is_category_Sc (ch);
}
3469 /* See Unicode 3.0 book, section 4.9,
3470 PropList.txt, UCD.html,
3471 DerivedCoreProperties.txt, UCD.html. */
3473 is_property_math (unsigned int ch
)
3477 || ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_MATH
)) != 0);
3479 ((unicode_properties
[ch
] & (1ULL << PROP_MATH
)) != 0);
3481 assert (result1
== result2
);
3485 /* See PropList.txt, UCD.html. */
3487 is_property_other_math (unsigned int ch
)
3489 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_MATH
)) != 0);
3492 /* See PropList-3.0.1.txt. */
3494 is_property_paired_punctuation (unsigned int ch
)
3496 return unicode_pairedpunctuation
[ch
];
3499 /* See PropList-3.0.1.txt. */
3501 is_property_left_of_pair (unsigned int ch
)
3503 return unicode_leftofpair
[ch
];
3506 /* See PropList-3.0.1.txt. */
3508 is_property_combining (unsigned int ch
)
3510 return (unicode_attributes
[ch
].name
!= NULL
3511 && (strcmp (unicode_attributes
[ch
].combining
, "0") != 0
3512 || is_category_Mc (ch
)
3513 || is_category_Me (ch
)
3514 || is_category_Mn (ch
)));
3517 #if 0 /* same as is_property_bidi_non_spacing_mark */
3518 /* See PropList-3.0.1.txt. */
3520 is_property_non_spacing (unsigned int ch
)
3522 return (unicode_attributes
[ch
].name
!= NULL
3523 && get_bidi_category (ch
) == UC_BIDI_NSM
);
3527 /* See PropList-3.0.1.txt. */
3529 is_property_composite (unsigned int ch
)
3531 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3532 logical in some sense. */
3533 if (ch
>= 0xAC00 && ch
<= 0xD7A4) /* Hangul Syllables */
3535 if (unicode_attributes
[ch
].name
!= NULL
3536 && unicode_attributes
[ch
].decomposition
!= NULL
)
3538 /* Test whether the decomposition contains more than one character,
3539 and the first is not a space. */
3540 const char *decomp
= unicode_attributes
[ch
].decomposition
;
3541 if (decomp
[0] == '<')
3543 decomp
= strchr (decomp
, '>') + 1;
3544 if (decomp
[0] == ' ')
3547 return strchr (decomp
, ' ') != NULL
&& strncmp (decomp
, "0020 ", 5) != 0;
/* See PropList-3.0.1.txt.
   Returns true if CH is in general category Nd.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  return is_category_Nd (ch);
}
3559 /* See PropList-3.0.1.txt. */
3561 is_property_numeric (unsigned int ch
)
3563 return ((get_numeric_value (ch
)).denominator
> 0)
3564 || (ch
== 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3565 || (ch
== 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3568 /* See PropList.txt, UCD.html. */
3570 is_property_diacritic (unsigned int ch
)
3572 return ((unicode_properties
[ch
] & (1ULL << PROP_DIACRITIC
)) != 0);
3575 /* See PropList.txt, UCD.html. */
3577 is_property_extender (unsigned int ch
)
3579 return ((unicode_properties
[ch
] & (1ULL << PROP_EXTENDER
)) != 0);
3582 /* See PropList-3.0.1.txt. */
3584 is_property_ignorable_control (unsigned int ch
)
3586 return ((is_category_Cc (ch
) && get_bidi_category (ch
) == UC_BIDI_BN
)
3587 || is_category_Cf (ch
))
3591 /* ------------------------------------------------------------------------- */
3593 /* Output all properties. */
3595 output_properties (const char *version
)
3597 #define PROPERTY(P) \
3598 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3599 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3600 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3601 PROPERTY(white_space
)
3602 PROPERTY(alphabetic
)
3603 PROPERTY(other_alphabetic
)
3604 PROPERTY(not_a_character
)
3605 PROPERTY(default_ignorable_code_point
)
3606 PROPERTY(other_default_ignorable_code_point
)
3607 PROPERTY(deprecated
)
3608 PROPERTY(logical_order_exception
)
3609 PROPERTY(variation_selector
)
3610 PROPERTY(private_use
)
3611 PROPERTY(unassigned_code_value
)
3613 PROPERTY(other_uppercase
)
3615 PROPERTY(other_lowercase
)
3618 PROPERTY(case_ignorable
)
3619 PROPERTY(changes_when_lowercased
)
3620 PROPERTY(changes_when_uppercased
)
3621 PROPERTY(changes_when_titlecased
)
3622 PROPERTY(changes_when_casefolded
)
3623 PROPERTY(changes_when_casemapped
)
3624 PROPERTY(soft_dotted
)
3626 PROPERTY(other_id_start
)
3627 PROPERTY(id_continue
)
3628 PROPERTY(other_id_continue
)
3630 PROPERTY(xid_continue
)
3631 PROPERTY(pattern_white_space
)
3632 PROPERTY(pattern_syntax
)
3633 PROPERTY(join_control
)
3634 PROPERTY(grapheme_base
)
3635 PROPERTY(grapheme_extend
)
3636 PROPERTY(other_grapheme_extend
)
3637 PROPERTY(grapheme_link
)
3638 PROPERTY(bidi_control
)
3639 PROPERTY(bidi_left_to_right
)
3640 PROPERTY(bidi_hebrew_right_to_left
)
3641 PROPERTY(bidi_arabic_right_to_left
)
3642 PROPERTY(bidi_european_digit
)
3643 PROPERTY(bidi_eur_num_separator
)
3644 PROPERTY(bidi_eur_num_terminator
)
3645 PROPERTY(bidi_arabic_digit
)
3646 PROPERTY(bidi_common_separator
)
3647 PROPERTY(bidi_block_separator
)
3648 PROPERTY(bidi_segment_separator
)
3649 PROPERTY(bidi_whitespace
)
3650 PROPERTY(bidi_non_spacing_mark
)
3651 PROPERTY(bidi_boundary_neutral
)
3653 PROPERTY(bidi_embedding_or_override
)
3654 PROPERTY(bidi_other_neutral
)
3656 PROPERTY(ascii_hex_digit
)
3657 PROPERTY(ideographic
)
3658 PROPERTY(unified_ideograph
)
3660 PROPERTY(ids_binary_operator
)
3661 PROPERTY(ids_trinary_operator
)
3662 PROPERTY(zero_width
)
3665 PROPERTY(iso_control
)
3666 PROPERTY(format_control
)
3669 PROPERTY(punctuation
)
3670 PROPERTY(line_separator
)
3671 PROPERTY(paragraph_separator
)
3672 PROPERTY(quotation_mark
)
3673 PROPERTY(sentence_terminal
)
3674 PROPERTY(terminal_punctuation
)
3675 PROPERTY(currency_symbol
)
3677 PROPERTY(other_math
)
3678 PROPERTY(paired_punctuation
)
3679 PROPERTY(left_of_pair
)
3682 PROPERTY(decimal_digit
)
3686 PROPERTY(ignorable_control
)
3690 /* ========================================================================= */
3692 /* Arabic Shaping. */
3696 UC_JOINING_TYPE_U
, /* Non_Joining */
3697 UC_JOINING_TYPE_T
, /* Transparent */
3698 UC_JOINING_TYPE_C
, /* Join_Causing */
3699 UC_JOINING_TYPE_L
, /* Left_Joining */
3700 UC_JOINING_TYPE_R
, /* Right_Joining */
3701 UC_JOINING_TYPE_D
/* Dual_Joining */
/* The joining type of each code point, as one of the UC_JOINING_TYPE_*
   values; fill_arabicshaping () initializes every entry to
   (uint8_t)~(uint8_t)0 before reading the file.  */
static uint8_t unicode_joining_type[0x110000];
3708 UC_JOINING_GROUP_NONE
, /* No_Joining_Group */
3709 UC_JOINING_GROUP_AIN
, /* Ain */
3710 UC_JOINING_GROUP_ALAPH
, /* Alaph */
3711 UC_JOINING_GROUP_ALEF
, /* Alef */
3712 UC_JOINING_GROUP_BEH
, /* Beh */
3713 UC_JOINING_GROUP_BETH
, /* Beth */
3714 UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE
, /* Burushaski_Yeh_Barree */
3715 UC_JOINING_GROUP_DAL
, /* Dal */
3716 UC_JOINING_GROUP_DALATH_RISH
, /* Dalath_Rish */
3717 UC_JOINING_GROUP_E
, /* E */
3718 UC_JOINING_GROUP_FARSI_YEH
, /* Farsi_Yeh */
3719 UC_JOINING_GROUP_FE
, /* Fe */
3720 UC_JOINING_GROUP_FEH
, /* Feh */
3721 UC_JOINING_GROUP_FINAL_SEMKATH
, /* Final_Semkath */
3722 UC_JOINING_GROUP_GAF
, /* Gaf */
3723 UC_JOINING_GROUP_GAMAL
, /* Gamal */
3724 UC_JOINING_GROUP_HAH
, /* Hah */
3725 UC_JOINING_GROUP_HE
, /* He */
3726 UC_JOINING_GROUP_HEH
, /* Heh */
3727 UC_JOINING_GROUP_HEH_GOAL
, /* Heh_Goal */
3728 UC_JOINING_GROUP_HETH
, /* Heth */
3729 UC_JOINING_GROUP_KAF
, /* Kaf */
3730 UC_JOINING_GROUP_KAPH
, /* Kaph */
3731 UC_JOINING_GROUP_KHAPH
, /* Khaph */
3732 UC_JOINING_GROUP_KNOTTED_HEH
, /* Knotted_Heh */
3733 UC_JOINING_GROUP_LAM
, /* Lam */
3734 UC_JOINING_GROUP_LAMADH
, /* Lamadh */
3735 UC_JOINING_GROUP_MEEM
, /* Meem */
3736 UC_JOINING_GROUP_MIM
, /* Mim */
3737 UC_JOINING_GROUP_NOON
, /* Noon */
3738 UC_JOINING_GROUP_NUN
, /* Nun */
3739 UC_JOINING_GROUP_NYA
, /* Nya */
3740 UC_JOINING_GROUP_PE
, /* Pe */
3741 UC_JOINING_GROUP_QAF
, /* Qaf */
3742 UC_JOINING_GROUP_QAPH
, /* Qaph */
3743 UC_JOINING_GROUP_REH
, /* Reh */
3744 UC_JOINING_GROUP_REVERSED_PE
, /* Reversed_Pe */
3745 UC_JOINING_GROUP_SAD
, /* Sad */
3746 UC_JOINING_GROUP_SADHE
, /* Sadhe */
3747 UC_JOINING_GROUP_SEEN
, /* Seen */
3748 UC_JOINING_GROUP_SEMKATH
, /* Semkath */
3749 UC_JOINING_GROUP_SHIN
, /* Shin */
3750 UC_JOINING_GROUP_SWASH_KAF
, /* Swash_Kaf */
3751 UC_JOINING_GROUP_SYRIAC_WAW
, /* Syriac_Waw */
3752 UC_JOINING_GROUP_TAH
, /* Tah */
3753 UC_JOINING_GROUP_TAW
, /* Taw */
3754 UC_JOINING_GROUP_TEH_MARBUTA
, /* Teh_Marbuta */
3755 UC_JOINING_GROUP_TEH_MARBUTA_GOAL
, /* Teh_Marbuta_Goal */
3756 UC_JOINING_GROUP_TETH
, /* Teth */
3757 UC_JOINING_GROUP_WAW
, /* Waw */
3758 UC_JOINING_GROUP_YEH
, /* Yeh */
3759 UC_JOINING_GROUP_YEH_BARREE
, /* Yeh_Barree */
3760 UC_JOINING_GROUP_YEH_WITH_TAIL
, /* Yeh_With_Tail */
3761 UC_JOINING_GROUP_YUDH
, /* Yudh */
3762 UC_JOINING_GROUP_YUDH_HE
, /* Yudh_He */
3763 UC_JOINING_GROUP_ZAIN
, /* Zain */
3764 UC_JOINING_GROUP_ZHAIN
, /* Zhain */
3765 UC_JOINING_GROUP_ROHINGYA_YEH
, /* Rohingya_Yeh */
3766 UC_JOINING_GROUP_STRAIGHT_WAW
, /* Straight_Waw */
3767 UC_JOINING_GROUP_MANICHAEAN_ALEPH
, /* Manichaean_Aleph */
3768 UC_JOINING_GROUP_MANICHAEAN_BETH
, /* Manichaean_Beth */
3769 UC_JOINING_GROUP_MANICHAEAN_GIMEL
, /* Manichaean_Gimel */
3770 UC_JOINING_GROUP_MANICHAEAN_DALETH
, /* Manichaean_Daleth */
3771 UC_JOINING_GROUP_MANICHAEAN_WAW
, /* Manichaean_Waw */
3772 UC_JOINING_GROUP_MANICHAEAN_ZAYIN
, /* Manichaean_Zayin */
3773 UC_JOINING_GROUP_MANICHAEAN_HETH
, /* Manichaean_Heth */
3774 UC_JOINING_GROUP_MANICHAEAN_TETH
, /* Manichaean_Teth */
3775 UC_JOINING_GROUP_MANICHAEAN_YODH
, /* Manichaean_Yodh */
3776 UC_JOINING_GROUP_MANICHAEAN_KAPH
, /* Manichaean_Kaph */
3777 UC_JOINING_GROUP_MANICHAEAN_LAMEDH
, /* Manichaean_Lamedh */
3778 UC_JOINING_GROUP_MANICHAEAN_DHAMEDH
, /* Manichaean_Dhamedh */
3779 UC_JOINING_GROUP_MANICHAEAN_THAMEDH
, /* Manichaean_Thamedh */
3780 UC_JOINING_GROUP_MANICHAEAN_MEM
, /* Manichaean_Mem */
3781 UC_JOINING_GROUP_MANICHAEAN_NUN
, /* Manichaean_Nun */
3782 UC_JOINING_GROUP_MANICHAEAN_SAMEKH
, /* Manichaean_Samekh */
3783 UC_JOINING_GROUP_MANICHAEAN_AYIN
, /* Manichaean_Ayin */
3784 UC_JOINING_GROUP_MANICHAEAN_PE
, /* Manichaean_Pe */
3785 UC_JOINING_GROUP_MANICHAEAN_SADHE
, /* Manichaean_Sadhe */
3786 UC_JOINING_GROUP_MANICHAEAN_QOPH
, /* Manichaean_Qoph */
3787 UC_JOINING_GROUP_MANICHAEAN_RESH
, /* Manichaean_Resh */
3788 UC_JOINING_GROUP_MANICHAEAN_TAW
, /* Manichaean_Taw */
3789 UC_JOINING_GROUP_MANICHAEAN_ONE
, /* Manichaean_One */
3790 UC_JOINING_GROUP_MANICHAEAN_FIVE
, /* Manichaean_Five */
3791 UC_JOINING_GROUP_MANICHAEAN_TEN
, /* Manichaean_Ten */
3792 UC_JOINING_GROUP_MANICHAEAN_TWENTY
, /* Manichaean_Twenty */
3793 UC_JOINING_GROUP_MANICHAEAN_HUNDRED
, /* Manichaean_Hundred */
3794 UC_JOINING_GROUP_AFRICAN_FEH
, /* African_Feh */
3795 UC_JOINING_GROUP_AFRICAN_QAF
, /* African_Qaf */
3796 UC_JOINING_GROUP_AFRICAN_NOON
/* African_Noon */
3799 static uint8_t unicode_joining_group
[0x110000];
3802 fill_arabicshaping (const char *arabicshaping_filename
)
3808 stream
= fopen (arabicshaping_filename
, "r");
3811 fprintf (stderr
, "error during fopen of '%s'\n", arabicshaping_filename
);
3815 for (i
= 0; i
< 0x110000; i
++)
3817 unicode_joining_type
[i
] = (uint8_t)~(uint8_t)0;
3818 unicode_joining_group
[i
] = UC_JOINING_GROUP_NONE
;
3825 char separator1
[200+1];
3826 char schematic_name
[200+1];
3827 char separator2
[200+1];
3828 char joining_type_name
[200+1];
3829 char separator3
[200+1];
3830 char joining_group_name
[200+1];
3835 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
3838 if (buf
[0] == '\0' || buf
[0] == '#')
3841 if (sscanf (buf
, "%X%[; ]%[^;]%[; ]%[^;]%[; ]%100[^\n]",
3842 &i
, separator1
, schematic_name
, separator2
, joining_type_name
,
3843 separator3
, joining_group_name
) != 7)
3845 fprintf (stderr
, "parse error in '%s':%d\n",
3846 arabicshaping_filename
, lineno
);
3849 assert (i
< 0x110000);
3851 #define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
3853 TRY(UC_JOINING_TYPE_U
)
3854 TRY(UC_JOINING_TYPE_T
)
3855 TRY(UC_JOINING_TYPE_C
)
3856 TRY(UC_JOINING_TYPE_L
)
3857 TRY(UC_JOINING_TYPE_R
)
3858 TRY(UC_JOINING_TYPE_D
)
3862 fprintf (stderr
, "unknown joining type value \"%s\" in '%s':%d\n",
3863 joining_type_name
, arabicshaping_filename
, lineno
);
3867 /* Remove trailing spaces. */
3868 while (joining_group_name
[0] != '\0'
3869 && joining_group_name
[strlen (joining_group_name
) - 1] == ' ')
3870 joining_group_name
[strlen (joining_group_name
) - 1] = '\0';
3872 #define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
3874 TRY(UC_JOINING_GROUP_NONE
, "No_Joining_Group")
3875 TRY(UC_JOINING_GROUP_AIN
, "AIN")
3876 TRY(UC_JOINING_GROUP_ALAPH
, "ALAPH")
3877 TRY(UC_JOINING_GROUP_ALEF
, "ALEF")
3878 TRY(UC_JOINING_GROUP_BEH
, "BEH")
3879 TRY(UC_JOINING_GROUP_BETH
, "BETH")
3880 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE
, "BURUSHASKI YEH BARREE")
3881 TRY(UC_JOINING_GROUP_DAL
, "DAL")
3882 TRY(UC_JOINING_GROUP_DALATH_RISH
, "DALATH RISH")
3883 TRY(UC_JOINING_GROUP_E
, "E")
3884 TRY(UC_JOINING_GROUP_FARSI_YEH
, "FARSI YEH")
3885 TRY(UC_JOINING_GROUP_FE
, "FE")
3886 TRY(UC_JOINING_GROUP_FEH
, "FEH")
3887 TRY(UC_JOINING_GROUP_FINAL_SEMKATH
, "FINAL SEMKATH")
3888 TRY(UC_JOINING_GROUP_GAF
, "GAF")
3889 TRY(UC_JOINING_GROUP_GAMAL
, "GAMAL")
3890 TRY(UC_JOINING_GROUP_HAH
, "HAH")
3891 TRY(UC_JOINING_GROUP_HE
, "HE")
3892 TRY(UC_JOINING_GROUP_HEH
, "HEH")
3893 TRY(UC_JOINING_GROUP_HEH_GOAL
, "HEH GOAL")
3894 TRY(UC_JOINING_GROUP_HETH
, "HETH")
3895 TRY(UC_JOINING_GROUP_KAF
, "KAF")
3896 TRY(UC_JOINING_GROUP_KAPH
, "KAPH")
3897 TRY(UC_JOINING_GROUP_KHAPH
, "KHAPH")
3898 TRY(UC_JOINING_GROUP_KNOTTED_HEH
, "KNOTTED HEH")
3899 TRY(UC_JOINING_GROUP_LAM
, "LAM")
3900 TRY(UC_JOINING_GROUP_LAMADH
, "LAMADH")
3901 TRY(UC_JOINING_GROUP_MEEM
, "MEEM")
3902 TRY(UC_JOINING_GROUP_MIM
, "MIM")
3903 TRY(UC_JOINING_GROUP_NOON
, "NOON")
3904 TRY(UC_JOINING_GROUP_NUN
, "NUN")
3905 TRY(UC_JOINING_GROUP_NYA
, "NYA")
3906 TRY(UC_JOINING_GROUP_PE
, "PE")
3907 TRY(UC_JOINING_GROUP_QAF
, "QAF")
3908 TRY(UC_JOINING_GROUP_QAPH
, "QAPH")
3909 TRY(UC_JOINING_GROUP_REH
, "REH")
3910 TRY(UC_JOINING_GROUP_REVERSED_PE
, "REVERSED PE")
3911 TRY(UC_JOINING_GROUP_SAD
, "SAD")
3912 TRY(UC_JOINING_GROUP_SADHE
, "SADHE")
3913 TRY(UC_JOINING_GROUP_SEEN
, "SEEN")
3914 TRY(UC_JOINING_GROUP_SEMKATH
, "SEMKATH")
3915 TRY(UC_JOINING_GROUP_SHIN
, "SHIN")
3916 TRY(UC_JOINING_GROUP_SWASH_KAF
, "SWASH KAF")
3917 TRY(UC_JOINING_GROUP_SYRIAC_WAW
, "SYRIAC WAW")
3918 TRY(UC_JOINING_GROUP_TAH
, "TAH")
3919 TRY(UC_JOINING_GROUP_TAW
, "TAW")
3920 TRY(UC_JOINING_GROUP_TEH_MARBUTA
, "TEH MARBUTA")
3921 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL
, "TEH MARBUTA GOAL")
3922 TRY(UC_JOINING_GROUP_TETH
, "TETH")
3923 TRY(UC_JOINING_GROUP_WAW
, "WAW")
3924 TRY(UC_JOINING_GROUP_YEH
, "YEH")
3925 TRY(UC_JOINING_GROUP_YEH_BARREE
, "YEH BARREE")
3926 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL
, "YEH WITH TAIL")
3927 TRY(UC_JOINING_GROUP_YUDH
, "YUDH")
3928 TRY(UC_JOINING_GROUP_YUDH_HE
, "YUDH HE")
3929 TRY(UC_JOINING_GROUP_ZAIN
, "ZAIN")
3930 TRY(UC_JOINING_GROUP_ZHAIN
, "ZHAIN")
3931 TRY(UC_JOINING_GROUP_ROHINGYA_YEH
, "ROHINGYA YEH")
3932 TRY(UC_JOINING_GROUP_STRAIGHT_WAW
, "STRAIGHT WAW")
3933 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH
, "MANICHAEAN ALEPH")
3934 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH
, "MANICHAEAN BETH")
3935 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL
, "MANICHAEAN GIMEL")
3936 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH
, "MANICHAEAN DALETH")
3937 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW
, "MANICHAEAN WAW")
3938 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN
, "MANICHAEAN ZAYIN")
3939 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH
, "MANICHAEAN HETH")
3940 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH
, "MANICHAEAN TETH")
3941 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH
, "MANICHAEAN YODH")
3942 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH
, "MANICHAEAN KAPH")
3943 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH
, "MANICHAEAN LAMEDH")
3944 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH
, "MANICHAEAN DHAMEDH")
3945 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH
, "MANICHAEAN THAMEDH")
3946 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM
, "MANICHAEAN MEM")
3947 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN
, "MANICHAEAN NUN")
3948 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH
, "MANICHAEAN SAMEKH")
3949 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN
, "MANICHAEAN AYIN")
3950 TRY(UC_JOINING_GROUP_MANICHAEAN_PE
, "MANICHAEAN PE")
3951 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE
, "MANICHAEAN SADHE")
3952 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH
, "MANICHAEAN QOPH")
3953 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH
, "MANICHAEAN RESH")
3954 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW
, "MANICHAEAN TAW")
3955 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE
, "MANICHAEAN ONE")
3956 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE
, "MANICHAEAN FIVE")
3957 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN
, "MANICHAEAN TEN")
3958 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY
, "MANICHAEAN TWENTY")
3959 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED
, "MANICHAEAN HUNDRED")
3960 TRY(UC_JOINING_GROUP_AFRICAN_FEH
, "AFRICAN FEH")
3961 TRY(UC_JOINING_GROUP_AFRICAN_QAF
, "AFRICAN QAF")
3962 TRY(UC_JOINING_GROUP_AFRICAN_NOON
, "AFRICAN NOON")
3966 fprintf (stderr
, "unknown joining group value \"%s\" in '%s':%d\n",
3967 joining_group_name
, arabicshaping_filename
, lineno
);
3971 unicode_joining_type
[i
] = joining_type
;
3972 unicode_joining_group
[i
] = joining_group
;
3975 if (ferror (stream
) || fclose (stream
))
3977 fprintf (stderr
, "error reading from '%s'\n", arabicshaping_filename
);
3982 /* Convert a Joining_Type value to a C identifier. */
3984 joining_type_as_c_identifier (int joining_type
)
3986 #define TRY(value) if (joining_type == value) return #value;
3987 TRY(UC_JOINING_TYPE_U
)
3988 TRY(UC_JOINING_TYPE_T
)
3989 TRY(UC_JOINING_TYPE_C
)
3990 TRY(UC_JOINING_TYPE_L
)
3991 TRY(UC_JOINING_TYPE_R
)
3992 TRY(UC_JOINING_TYPE_D
)
3998 output_joining_type_test (const char *filename
, const char *version
)
4004 stream
= fopen (filename
, "w");
4007 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
4011 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4012 fprintf (stream
, "/* Arabic joining type of Unicode characters. */\n");
4013 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4017 for (ch
= 0; ch
< 0x110000; ch
++)
4019 int value
= unicode_joining_type
[ch
];
4021 if (value
!= (uint8_t)~(uint8_t)0)
4024 fprintf (stream
, ",\n");
4025 fprintf (stream
, " { 0x%04X, %s }", ch
, joining_type_as_c_identifier (value
));
4030 fprintf (stream
, "\n");
4032 if (ferror (stream
) || fclose (stream
))
4034 fprintf (stderr
, "error writing to '%s'\n", filename
);
4039 /* Construction of sparse 3-level tables. */
4040 #define TABLE joining_type_table
4041 #define ELEMENT uint8_t
4042 #define DEFAULT (uint8_t)~(uint8_t)0
4043 #define xmalloc malloc
4044 #define xrealloc realloc
4048 output_joining_type (const char *filename
, const char *version
)
4052 struct joining_type_table t
;
4053 unsigned int level1_offset
, level2_offset
, level3_offset
;
4054 uint8_t *level3_packed
;
4056 stream
= fopen (filename
, "w");
4059 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
4063 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4064 fprintf (stream
, "/* Arabic joining type of Unicode characters. */\n");
4065 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4070 joining_type_table_init (&t
);
4072 for (ch
= 0; ch
< 0x110000; ch
++)
4074 uint8_t value
= unicode_joining_type
[ch
];
4076 assert (value
== (uint8_t)~(uint8_t)0 || value
<= 0x0f);
4078 joining_type_table_add (&t
, ch
, value
);
4081 joining_type_table_finalize (&t
);
4083 /* Offsets in t.result, in memory of this process. */
4085 5 * sizeof (uint32_t);
4087 5 * sizeof (uint32_t)
4088 + t
.level1_size
* sizeof (uint32_t);
4090 5 * sizeof (uint32_t)
4091 + t
.level1_size
* sizeof (uint32_t)
4092 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
4094 for (i
= 0; i
< 5; i
++)
4095 fprintf (stream
, "#define joining_type_header_%d %d\n", i
,
4096 ((uint32_t *) t
.result
)[i
]);
4097 fprintf (stream
, "static const\n");
4098 fprintf (stream
, "struct\n");
4099 fprintf (stream
, " {\n");
4100 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
4101 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
4102 fprintf (stream
, " unsigned char level3[%zu * %d];\n", t
.level3_size
,
4103 (1 << t
.p
) * 4 / 8);
4104 fprintf (stream
, " }\n");
4105 fprintf (stream
, "u_joining_type =\n");
4106 fprintf (stream
, "{\n");
4107 fprintf (stream
, " {");
4108 if (t
.level1_size
> 8)
4109 fprintf (stream
, "\n ");
4110 for (i
= 0; i
< t
.level1_size
; i
++)
4113 if (i
> 0 && (i
% 8) == 0)
4114 fprintf (stream
, "\n ");
4115 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
4117 fprintf (stream
, " %5d", -1);
4119 fprintf (stream
, " %5zu",
4120 (offset
- level2_offset
) / sizeof (uint32_t));
4121 if (i
+1 < t
.level1_size
)
4122 fprintf (stream
, ",");
4124 if (t
.level1_size
> 8)
4125 fprintf (stream
, "\n ");
4126 fprintf (stream
, " },\n");
4127 fprintf (stream
, " {");
4128 if (t
.level2_size
<< t
.q
> 8)
4129 fprintf (stream
, "\n ");
4130 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
4133 if (i
> 0 && (i
% 8) == 0)
4134 fprintf (stream
, "\n ");
4135 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
4137 fprintf (stream
, " %5d", -1);
4139 fprintf (stream
, " %5zu",
4140 (offset
- level3_offset
) / sizeof (uint8_t));
4141 if (i
+1 < t
.level2_size
<< t
.q
)
4142 fprintf (stream
, ",");
4144 if (t
.level2_size
<< t
.q
> 8)
4145 fprintf (stream
, "\n ");
4146 fprintf (stream
, " },\n");
4147 /* Pack the level3 array. Each entry needs 4 bits only. */
4149 (uint8_t *) calloc ((t
.level3_size
<< t
.p
) * 4 / 8, sizeof (uint8_t));
4150 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
4152 unsigned int j
= (i
* 4) / 8;
4153 unsigned int k
= (i
* 4) % 8;
4154 uint32_t value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
] & 0x0f;
4155 level3_packed
[j
] |= (value
<< k
);
4157 fprintf (stream
, " {");
4158 if ((t
.level3_size
<< t
.p
) * 4 / 8 > 8)
4159 fprintf (stream
, "\n ");
4160 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 4 / 8; i
++)
4162 if (i
> 0 && (i
% 8) == 0)
4163 fprintf (stream
, "\n ");
4164 fprintf (stream
, " 0x%02x", level3_packed
[i
]);
4165 if (i
+1 < (t
.level3_size
<< t
.p
) * 4 / 8)
4166 fprintf (stream
, ",");
4168 if ((t
.level3_size
<< t
.p
) * 4 / 8 > 8)
4169 fprintf (stream
, "\n ");
4170 fprintf (stream
, " }\n");
4171 free (level3_packed
);
4172 fprintf (stream
, "};\n");
4174 if (ferror (stream
) || fclose (stream
))
4176 fprintf (stderr
, "error writing to '%s'\n", filename
);
4181 /* Convert a Joining_Group value to a C identifier. */
4183 joining_group_as_c_identifier (int joining_group
)
4185 #define TRY(value) if (joining_group == value) return #value;
4186 TRY(UC_JOINING_GROUP_NONE
)
4187 TRY(UC_JOINING_GROUP_AIN
)
4188 TRY(UC_JOINING_GROUP_ALAPH
)
4189 TRY(UC_JOINING_GROUP_ALEF
)
4190 TRY(UC_JOINING_GROUP_BEH
)
4191 TRY(UC_JOINING_GROUP_BETH
)
4192 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE
)
4193 TRY(UC_JOINING_GROUP_DAL
)
4194 TRY(UC_JOINING_GROUP_DALATH_RISH
)
4195 TRY(UC_JOINING_GROUP_E
)
4196 TRY(UC_JOINING_GROUP_FARSI_YEH
)
4197 TRY(UC_JOINING_GROUP_FE
)
4198 TRY(UC_JOINING_GROUP_FEH
)
4199 TRY(UC_JOINING_GROUP_FINAL_SEMKATH
)
4200 TRY(UC_JOINING_GROUP_GAF
)
4201 TRY(UC_JOINING_GROUP_GAMAL
)
4202 TRY(UC_JOINING_GROUP_HAH
)
4203 TRY(UC_JOINING_GROUP_HE
)
4204 TRY(UC_JOINING_GROUP_HEH
)
4205 TRY(UC_JOINING_GROUP_HEH_GOAL
)
4206 TRY(UC_JOINING_GROUP_HETH
)
4207 TRY(UC_JOINING_GROUP_KAF
)
4208 TRY(UC_JOINING_GROUP_KAPH
)
4209 TRY(UC_JOINING_GROUP_KHAPH
)
4210 TRY(UC_JOINING_GROUP_KNOTTED_HEH
)
4211 TRY(UC_JOINING_GROUP_LAM
)
4212 TRY(UC_JOINING_GROUP_LAMADH
)
4213 TRY(UC_JOINING_GROUP_MEEM
)
4214 TRY(UC_JOINING_GROUP_MIM
)
4215 TRY(UC_JOINING_GROUP_NOON
)
4216 TRY(UC_JOINING_GROUP_NUN
)
4217 TRY(UC_JOINING_GROUP_NYA
)
4218 TRY(UC_JOINING_GROUP_PE
)
4219 TRY(UC_JOINING_GROUP_QAF
)
4220 TRY(UC_JOINING_GROUP_QAPH
)
4221 TRY(UC_JOINING_GROUP_REH
)
4222 TRY(UC_JOINING_GROUP_REVERSED_PE
)
4223 TRY(UC_JOINING_GROUP_SAD
)
4224 TRY(UC_JOINING_GROUP_SADHE
)
4225 TRY(UC_JOINING_GROUP_SEEN
)
4226 TRY(UC_JOINING_GROUP_SEMKATH
)
4227 TRY(UC_JOINING_GROUP_SHIN
)
4228 TRY(UC_JOINING_GROUP_SWASH_KAF
)
4229 TRY(UC_JOINING_GROUP_SYRIAC_WAW
)
4230 TRY(UC_JOINING_GROUP_TAH
)
4231 TRY(UC_JOINING_GROUP_TAW
)
4232 TRY(UC_JOINING_GROUP_TEH_MARBUTA
)
4233 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL
)
4234 TRY(UC_JOINING_GROUP_TETH
)
4235 TRY(UC_JOINING_GROUP_WAW
)
4236 TRY(UC_JOINING_GROUP_YEH
)
4237 TRY(UC_JOINING_GROUP_YEH_BARREE
)
4238 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL
)
4239 TRY(UC_JOINING_GROUP_YUDH
)
4240 TRY(UC_JOINING_GROUP_YUDH_HE
)
4241 TRY(UC_JOINING_GROUP_ZAIN
)
4242 TRY(UC_JOINING_GROUP_ZHAIN
)
4243 TRY(UC_JOINING_GROUP_ROHINGYA_YEH
)
4244 TRY(UC_JOINING_GROUP_STRAIGHT_WAW
)
4245 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH
)
4246 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH
)
4247 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL
)
4248 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH
)
4249 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW
)
4250 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN
)
4251 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH
)
4252 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH
)
4253 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH
)
4254 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH
)
4255 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH
)
4256 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH
)
4257 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH
)
4258 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM
)
4259 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN
)
4260 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH
)
4261 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN
)
4262 TRY(UC_JOINING_GROUP_MANICHAEAN_PE
)
4263 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE
)
4264 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH
)
4265 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH
)
4266 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW
)
4267 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE
)
4268 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE
)
4269 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN
)
4270 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY
)
4271 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED
)
4272 TRY(UC_JOINING_GROUP_AFRICAN_FEH
)
4273 TRY(UC_JOINING_GROUP_AFRICAN_QAF
)
4274 TRY(UC_JOINING_GROUP_AFRICAN_NOON
)
4280 output_joining_group_test (const char *filename
, const char *version
)
4286 stream
= fopen (filename
, "w");
4289 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
4293 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4294 fprintf (stream
, "/* Arabic joining group of Unicode characters. */\n");
4295 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4299 for (ch
= 0; ch
< 0x110000; ch
++)
4301 int value
= unicode_joining_group
[ch
];
4303 if (value
!= UC_JOINING_GROUP_NONE
)
4306 fprintf (stream
, ",\n");
4307 fprintf (stream
, " { 0x%04X, %s }", ch
, joining_group_as_c_identifier (value
));
4312 fprintf (stream
, "\n");
4314 if (ferror (stream
) || fclose (stream
))
4316 fprintf (stderr
, "error writing to '%s'\n", filename
);
4321 /* Construction of sparse 3-level tables. */
4322 #define TABLE joining_group_table
4323 #define ELEMENT uint8_t
4324 #define DEFAULT UC_JOINING_GROUP_NONE
4325 #define xmalloc malloc
4326 #define xrealloc realloc
4330 output_joining_group (const char *filename
, const char *version
)
4334 struct joining_group_table t
;
4335 unsigned int level1_offset
, level2_offset
, level3_offset
;
4336 uint16_t *level3_packed
;
4338 stream
= fopen (filename
, "w");
4341 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
4345 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4346 fprintf (stream
, "/* Arabic joining group of Unicode characters. */\n");
4347 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4352 joining_group_table_init (&t
);
4354 for (ch
= 0; ch
< 0x110000; ch
++)
4356 uint8_t value
= unicode_joining_group
[ch
];
4358 assert (value
<= 0x7f);
4360 joining_group_table_add (&t
, ch
, value
);
4363 joining_group_table_finalize (&t
);
4365 /* Offsets in t.result, in memory of this process. */
4367 5 * sizeof (uint32_t);
4369 5 * sizeof (uint32_t)
4370 + t
.level1_size
* sizeof (uint32_t);
4372 5 * sizeof (uint32_t)
4373 + t
.level1_size
* sizeof (uint32_t)
4374 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
4376 for (i
= 0; i
< 5; i
++)
4377 fprintf (stream
, "#define joining_group_header_%d %d\n", i
,
4378 ((uint32_t *) t
.result
)[i
]);
4379 fprintf (stream
, "static const\n");
4380 fprintf (stream
, "struct\n");
4381 fprintf (stream
, " {\n");
4382 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
4383 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
4384 fprintf (stream
, " unsigned short level3[%zu * %d + 1];\n", t
.level3_size
,
4385 (1 << t
.p
) * 7 / 16);
4386 fprintf (stream
, " }\n");
4387 fprintf (stream
, "u_joining_group =\n");
4388 fprintf (stream
, "{\n");
4389 fprintf (stream
, " {");
4390 if (t
.level1_size
> 8)
4391 fprintf (stream
, "\n ");
4392 for (i
= 0; i
< t
.level1_size
; i
++)
4395 if (i
> 0 && (i
% 8) == 0)
4396 fprintf (stream
, "\n ");
4397 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
4399 fprintf (stream
, " %5d", -1);
4401 fprintf (stream
, " %5zu",
4402 (offset
- level2_offset
) / sizeof (uint32_t));
4403 if (i
+1 < t
.level1_size
)
4404 fprintf (stream
, ",");
4406 if (t
.level1_size
> 8)
4407 fprintf (stream
, "\n ");
4408 fprintf (stream
, " },\n");
4409 fprintf (stream
, " {");
4410 if (t
.level2_size
<< t
.q
> 8)
4411 fprintf (stream
, "\n ");
4412 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
4415 if (i
> 0 && (i
% 8) == 0)
4416 fprintf (stream
, "\n ");
4417 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
4419 fprintf (stream
, " %5d", -1);
4421 fprintf (stream
, " %5zu",
4422 (offset
- level3_offset
) / sizeof (uint8_t));
4423 if (i
+1 < t
.level2_size
<< t
.q
)
4424 fprintf (stream
, ",");
4426 if (t
.level2_size
<< t
.q
> 8)
4427 fprintf (stream
, "\n ");
4428 fprintf (stream
, " },\n");
4429 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
4430 not 32-bit units, in order to make the lookup function easier. */
4433 calloc ((t
.level3_size
<< t
.p
) * 7 / 16 + 1, sizeof (uint16_t));
4434 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
4436 unsigned int j
= (i
* 7) / 16;
4437 unsigned int k
= (i
* 7) % 16;
4438 uint32_t value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
4439 value
= level3_packed
[j
] | (level3_packed
[j
+1] << 16) | (value
<< k
);
4440 level3_packed
[j
] = value
& 0xffff;
4441 level3_packed
[j
+1] = value
>> 16;
4443 fprintf (stream
, " {");
4444 if ((t
.level3_size
<< t
.p
) * 7 / 16 + 1 > 8)
4445 fprintf (stream
, "\n ");
4446 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 7 / 16 + 1; i
++)
4448 if (i
> 0 && (i
% 8) == 0)
4449 fprintf (stream
, "\n ");
4450 fprintf (stream
, " 0x%04x", level3_packed
[i
]);
4451 if (i
+1 < (t
.level3_size
<< t
.p
) * 7 / 16 + 1)
4452 fprintf (stream
, ",");
4454 if ((t
.level3_size
<< t
.p
) * 7 / 16 + 1 > 8)
4455 fprintf (stream
, "\n ");
4456 fprintf (stream
, " }\n");
4457 free (level3_packed
);
4458 fprintf (stream
, "};\n");
4460 if (ferror (stream
) || fclose (stream
))
4462 fprintf (stderr
, "error writing to '%s'\n", filename
);
4467 /* ========================================================================= */
4471 static const char *scripts
[256];
4472 static unsigned int numscripts
;
4474 static uint8_t unicode_scripts
[0x110000];
4477 fill_scripts (const char *scripts_filename
)
4482 stream
= fopen (scripts_filename
, "r");
4485 fprintf (stderr
, "error during fopen of '%s'\n", scripts_filename
);
4491 for (i
= 0; i
< 0x110000; i
++)
4492 unicode_scripts
[i
] = (uint8_t)~(uint8_t)0;
4497 unsigned int i1
, i2
;
4498 char padding
[200+1];
4499 char scriptname
[200+1];
4502 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
4505 if (buf
[0] == '\0' || buf
[0] == '#')
4508 if (sscanf (buf
, "%X..%X%[ ;]%[^ ]", &i1
, &i2
, padding
, scriptname
) != 4)
4510 if (sscanf (buf
, "%X%[ ;]%[^ ]", &i1
, padding
, scriptname
) != 3)
4512 fprintf (stderr
, "parse error in '%s'\n", scripts_filename
);
4518 assert (i2
< 0x110000);
4520 for (script
= numscripts
- 1; script
>= 0; script
--)
4521 if (strcmp (scripts
[script
], scriptname
) == 0)
4525 scripts
[numscripts
] = strdup (scriptname
);
4526 script
= numscripts
;
4528 assert (numscripts
!= 256);
4531 for (i
= i1
; i
<= i2
; i
++)
4533 if (unicode_scripts
[i
] != (uint8_t)~(uint8_t)0)
4534 fprintf (stderr
, "0x%04X belongs to multiple scripts\n", i
);
4535 unicode_scripts
[i
] = script
;
4539 if (ferror (stream
) || fclose (stream
))
4541 fprintf (stderr
, "error reading from '%s'\n", scripts_filename
);
4546 /* Construction of sparse 3-level tables. */
4547 #define TABLE script_table
4548 #define ELEMENT uint8_t
4549 #define DEFAULT (uint8_t)~(uint8_t)0
4550 #define xmalloc malloc
4551 #define xrealloc realloc
4555 output_scripts (const char *version
)
4557 const char *filename
= "unictype/scripts.h";
4559 unsigned int ch
, s
, i
;
4560 struct script_table t
;
4561 unsigned int level1_offset
, level2_offset
, level3_offset
;
4565 const char *lowercase_name
;
4568 scriptinfo_t scriptinfo
[256];
4570 stream
= fopen (filename
, "w");
4573 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
4577 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4578 fprintf (stream
, "/* Unicode scripts. */\n");
4579 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4582 for (s
= 0; s
< numscripts
; s
++)
4584 char *lcp
= strdup (scripts
[s
]);
4587 for (cp
= lcp
; *cp
!= '\0'; cp
++)
4588 if (*cp
>= 'A' && *cp
<= 'Z')
4591 scriptinfo
[s
].lowercase_name
= lcp
;
4594 for (s
= 0; s
< numscripts
; s
++)
4596 fprintf (stream
, "static const uc_interval_t script_%s_intervals[] =\n",
4597 scriptinfo
[s
].lowercase_name
);
4598 fprintf (stream
, "{\n");
4600 for (ch
= 0; ch
< 0x110000; ch
++)
4601 if (unicode_scripts
[ch
] == s
)
4607 while (ch
+ 1 < 0x110000 && unicode_scripts
[ch
+ 1] == s
)
4612 fprintf (stream
, ",\n");
4614 fprintf (stream
, " { 0x%04X, 1, 1 }", start
);
4616 fprintf (stream
, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
4620 fprintf (stream
, "\n");
4621 fprintf (stream
, "};\n");
4624 fprintf (stream
, "static const uc_script_t scripts[%d] =\n", numscripts
);
4625 fprintf (stream
, "{\n");
4626 for (s
= 0; s
< numscripts
; s
++)
4628 fprintf (stream
, " {\n");
4629 fprintf (stream
, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
4630 scriptinfo
[s
].lowercase_name
);
4631 fprintf (stream
, " script_%s_intervals,\n",
4632 scriptinfo
[s
].lowercase_name
);
4633 fprintf (stream
, " \"%s\"\n", scripts
[s
]);
4634 fprintf (stream
, " }");
4635 if (s
+1 < numscripts
)
4636 fprintf (stream
, ",");
4637 fprintf (stream
, "\n");
4639 fprintf (stream
, "};\n");
4643 script_table_init (&t
);
4645 for (ch
= 0; ch
< 0x110000; ch
++)
4647 unsigned int s
= unicode_scripts
[ch
];
4648 if (s
!= (uint8_t)~(uint8_t)0)
4649 script_table_add (&t
, ch
, s
);
4652 script_table_finalize (&t
);
4654 /* Offsets in t.result, in memory of this process. */
4656 5 * sizeof (uint32_t);
4658 5 * sizeof (uint32_t)
4659 + t
.level1_size
* sizeof (uint32_t);
4661 5 * sizeof (uint32_t)
4662 + t
.level1_size
* sizeof (uint32_t)
4663 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
4665 for (i
= 0; i
< 5; i
++)
4666 fprintf (stream
, "#define script_header_%d %d\n", i
,
4667 ((uint32_t *) t
.result
)[i
]);
4668 fprintf (stream
, "static const\n");
4669 fprintf (stream
, "struct\n");
4670 fprintf (stream
, " {\n");
4671 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
4672 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
4673 fprintf (stream
, " unsigned char level3[%zu << %d];\n", t
.level3_size
, t
.p
);
4674 fprintf (stream
, " }\n");
4675 fprintf (stream
, "u_script =\n");
4676 fprintf (stream
, "{\n");
4677 fprintf (stream
, " {");
4678 if (t
.level1_size
> 8)
4679 fprintf (stream
, "\n ");
4680 for (i
= 0; i
< t
.level1_size
; i
++)
4683 if (i
> 0 && (i
% 8) == 0)
4684 fprintf (stream
, "\n ");
4685 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
4687 fprintf (stream
, " %5d", -1);
4689 fprintf (stream
, " %5zu",
4690 (offset
- level2_offset
) / sizeof (uint32_t));
4691 if (i
+1 < t
.level1_size
)
4692 fprintf (stream
, ",");
4694 if (t
.level1_size
> 8)
4695 fprintf (stream
, "\n ");
4696 fprintf (stream
, " },\n");
4697 fprintf (stream
, " {");
4698 if (t
.level2_size
<< t
.q
> 8)
4699 fprintf (stream
, "\n ");
4700 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
4703 if (i
> 0 && (i
% 8) == 0)
4704 fprintf (stream
, "\n ");
4705 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
4707 fprintf (stream
, " %5d", -1);
4709 fprintf (stream
, " %5zu",
4710 (offset
- level3_offset
) / sizeof (uint8_t));
4711 if (i
+1 < t
.level2_size
<< t
.q
)
4712 fprintf (stream
, ",");
4714 if (t
.level2_size
<< t
.q
> 8)
4715 fprintf (stream
, "\n ");
4716 fprintf (stream
, " },\n");
4717 fprintf (stream
, " {");
4718 if (t
.level3_size
<< t
.p
> 8)
4719 fprintf (stream
, "\n ");
4720 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
4722 if (i
> 0 && (i
% 8) == 0)
4723 fprintf (stream
, "\n ");
4724 fprintf (stream
, " %3d", ((uint8_t *) (t
.result
+ level3_offset
))[i
]);
4725 if (i
+1 < t
.level3_size
<< t
.p
)
4726 fprintf (stream
, ",");
4728 if (t
.level3_size
<< t
.p
> 8)
4729 fprintf (stream
, "\n ");
4730 fprintf (stream
, " }\n");
4731 fprintf (stream
, "};\n");
4733 if (ferror (stream
) || fclose (stream
))
4735 fprintf (stderr
, "error writing to '%s'\n", filename
);
4741 output_scripts_byname (const char *version
)
4743 const char *filename
= "unictype/scripts_byname.gperf";
4747 stream
= fopen (filename
, "w");
4750 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
4754 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4755 fprintf (stream
, "/* Unicode scripts. */\n");
4756 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4758 fprintf (stream
, "struct named_script { int name; unsigned int index; };\n");
4759 fprintf (stream
, "%%struct-type\n");
4760 fprintf (stream
, "%%language=ANSI-C\n");
4761 fprintf (stream
, "%%define hash-function-name scripts_hash\n");
4762 fprintf (stream
, "%%define lookup-function-name uc_script_lookup\n");
4763 fprintf (stream
, "%%readonly-tables\n");
4764 fprintf (stream
, "%%global-table\n");
4765 fprintf (stream
, "%%define word-array-name script_names\n");
4766 fprintf (stream
, "%%pic\n");
4767 fprintf (stream
, "%%define string-pool-name script_stringpool\n");
4768 fprintf (stream
, "%%%%\n");
4769 for (s
= 0; s
< numscripts
; s
++)
4770 fprintf (stream
, "%s, %u\n", scripts
[s
], s
);
4772 if (ferror (stream
) || fclose (stream
))
4774 fprintf (stderr
, "error writing to '%s'\n", filename
);
4779 /* ========================================================================= */
4783 typedef struct { unsigned int start
; unsigned int end
; const char *name
; }
4785 static block_t blocks
[384];
4786 static unsigned int numblocks
;
4789 fill_blocks (const char *blocks_filename
)
4793 stream
= fopen (blocks_filename
, "r");
4796 fprintf (stderr
, "error during fopen of '%s'\n", blocks_filename
);
4803 unsigned int i1
, i2
;
4804 char padding
[200+1];
4805 char blockname
[200+1];
4807 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
4810 if (buf
[0] == '\0' || buf
[0] == '#')
4813 if (sscanf (buf
, "%X..%X%[ ;]%[^\r]", &i1
, &i2
, padding
, blockname
) != 4)
4815 fprintf (stderr
, "parse error in '%s'\n", blocks_filename
);
4818 blocks
[numblocks
].start
= i1
;
4819 blocks
[numblocks
].end
= i2
;
4820 blocks
[numblocks
].name
= strdup (blockname
);
4821 /* It must be sorted. */
4822 assert (numblocks
== 0 || blocks
[numblocks
-1].end
< blocks
[numblocks
].start
);
4824 assert (numblocks
!= SIZEOF (blocks
));
4827 if (ferror (stream
) || fclose (stream
))
4829 fprintf (stderr
, "error reading from '%s'\n", blocks_filename
);
4834 /* Return the smallest block index among the blocks for characters >= ch. */
4836 block_first_index (unsigned int ch
)
4838 /* Binary search. */
4839 unsigned int lo
= 0;
4840 unsigned int hi
= numblocks
;
4842 All blocks[i], i < lo, have blocks[i].end < ch,
4843 all blocks[i], i >= hi, have blocks[i].end >= ch. */
4846 unsigned int mid
= (lo
+ hi
) / 2; /* >= lo, < hi */
4847 if (blocks
[mid
].end
< ch
)
4855 /* Return the largest block index among the blocks for characters <= ch,
4858 block_last_index (unsigned int ch
)
4860 /* Binary search. */
4861 unsigned int lo
= 0;
4862 unsigned int hi
= numblocks
;
4864 All blocks[i], i < lo, have blocks[i].start <= ch,
4865 all blocks[i], i >= hi, have blocks[i].start > ch. */
4868 unsigned int mid
= (lo
+ hi
) / 2; /* >= lo, < hi */
4869 if (blocks
[mid
].start
<= ch
)
4878 output_blocks (const char *version
)
4880 const char *filename
= "unictype/blocks.h";
4881 const unsigned int shift
= 8; /* bits to shift away for array access */
4882 const unsigned int threshold
= 0x28000; /* cut-off table here to save space */
4887 stream
= fopen (filename
, "w");
4890 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
4894 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4895 fprintf (stream
, "/* Unicode blocks. */\n");
4896 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4899 fprintf (stream
, "static const uc_block_t blocks[] =\n");
4900 fprintf (stream
, "{\n");
4901 for (i
= 0; i
< numblocks
; i
++)
4903 fprintf (stream
, " { 0x%04X, 0x%04X, \"%s\" }", blocks
[i
].start
,
4904 blocks
[i
].end
, blocks
[i
].name
);
4905 if (i
+1 < numblocks
)
4906 fprintf (stream
, ",");
4907 fprintf (stream
, "\n");
4909 fprintf (stream
, "};\n");
4910 fprintf (stream
, "#define blocks_level1_shift %d\n", shift
);
4911 fprintf (stream
, "#define blocks_level1_threshold 0x%04X\n", threshold
);
4912 fprintf (stream
, "static const uint16_t blocks_level1[%d * 2] =\n",
4913 threshold
>> shift
);
4914 fprintf (stream
, "{\n");
4915 for (i1
= 0; i1
< (threshold
>> shift
); i1
++)
4917 unsigned int first_index
= block_first_index (i1
<< shift
);
4918 unsigned int last_index
= block_last_index (((i1
+ 1) << shift
) - 1);
4919 fprintf (stream
, " %3d, %3d", first_index
, last_index
);
4920 if (i1
+1 < (threshold
>> shift
))
4921 fprintf (stream
, ",");
4922 fprintf (stream
, "\n");
4924 fprintf (stream
, "};\n");
4925 fprintf (stream
, "#define blocks_upper_first_index %d\n",
4926 block_first_index (threshold
));
4927 fprintf (stream
, "#define blocks_upper_last_index %d\n",
4928 block_last_index (0x10FFFF));
4930 if (ferror (stream
) || fclose (stream
))
4932 fprintf (stderr
, "error writing to '%s'\n", filename
);
4937 /* ========================================================================= */
4939 /* C and Java syntax. */
/* Identifier syntax category of a character.  The values 0..3 are what
   output_ident_category packs into two bits per character.  */
enum
{
  UC_IDENTIFIER_START,    /* valid as first or subsequent character */
  UC_IDENTIFIER_VALID,    /* valid as subsequent character only */
  UC_IDENTIFIER_INVALID,  /* not valid */
  UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
};
/* ISO C 99 section 6.4.(3).
   Return true if CH is one of the white-space characters of C source text.  */
static bool
is_c_whitespace (unsigned int ch)
{
  return (ch == ' ' /* space */
          || ch == '\t' /* horizontal tab */
          || ch == '\n' || ch == '\r' /* new-line */
          || ch == '\v' /* vertical tab */
          || ch == '\f'); /* form-feed */
}
4960 /* ISO C 99 section 6.4.2.1 and appendix D. */
4962 c_ident_category (unsigned int ch
)
4964 /* Section 6.4.2.1. */
4965 if (ch
>= '0' && ch
<= '9')
4966 return UC_IDENTIFIER_VALID
;
4967 if ((ch
>= 'A' && ch
<= 'Z') || (ch
>= 'a' && ch
<= 'z') || ch
== '_')
4968 return UC_IDENTIFIER_START
;
4974 || (ch
>= 0x00C0 && ch
<= 0x00D6)
4975 || (ch
>= 0x00D8 && ch
<= 0x00F6)
4976 || (ch
>= 0x00F8 && ch
<= 0x01F5)
4977 || (ch
>= 0x01FA && ch
<= 0x0217)
4978 || (ch
>= 0x0250 && ch
<= 0x02A8)
4979 || (ch
>= 0x1E00 && ch
<= 0x1E9B)
4980 || (ch
>= 0x1EA0 && ch
<= 0x1EF9)
4984 || (ch
>= 0x0388 && ch
<= 0x038A)
4986 || (ch
>= 0x038E && ch
<= 0x03A1)
4987 || (ch
>= 0x03A3 && ch
<= 0x03CE)
4988 || (ch
>= 0x03D0 && ch
<= 0x03D6)
4993 || (ch
>= 0x03E2 && ch
<= 0x03F3)
4994 || (ch
>= 0x1F00 && ch
<= 0x1F15)
4995 || (ch
>= 0x1F18 && ch
<= 0x1F1D)
4996 || (ch
>= 0x1F20 && ch
<= 0x1F45)
4997 || (ch
>= 0x1F48 && ch
<= 0x1F4D)
4998 || (ch
>= 0x1F50 && ch
<= 0x1F57)
5002 || (ch
>= 0x1F5F && ch
<= 0x1F7D)
5003 || (ch
>= 0x1F80 && ch
<= 0x1FB4)
5004 || (ch
>= 0x1FB6 && ch
<= 0x1FBC)
5005 || (ch
>= 0x1FC2 && ch
<= 0x1FC4)
5006 || (ch
>= 0x1FC6 && ch
<= 0x1FCC)
5007 || (ch
>= 0x1FD0 && ch
<= 0x1FD3)
5008 || (ch
>= 0x1FD6 && ch
<= 0x1FDB)
5009 || (ch
>= 0x1FE0 && ch
<= 0x1FEC)
5010 || (ch
>= 0x1FF2 && ch
<= 0x1FF4)
5011 || (ch
>= 0x1FF6 && ch
<= 0x1FFC)
5013 || (ch
>= 0x0401 && ch
<= 0x040C)
5014 || (ch
>= 0x040E && ch
<= 0x044F)
5015 || (ch
>= 0x0451 && ch
<= 0x045C)
5016 || (ch
>= 0x045E && ch
<= 0x0481)
5017 || (ch
>= 0x0490 && ch
<= 0x04C4)
5018 || (ch
>= 0x04C7 && ch
<= 0x04C8)
5019 || (ch
>= 0x04CB && ch
<= 0x04CC)
5020 || (ch
>= 0x04D0 && ch
<= 0x04EB)
5021 || (ch
>= 0x04EE && ch
<= 0x04F5)
5022 || (ch
>= 0x04F8 && ch
<= 0x04F9)
5024 || (ch
>= 0x0531 && ch
<= 0x0556)
5025 || (ch
>= 0x0561 && ch
<= 0x0587)
5027 || (ch
>= 0x05B0 && ch
<= 0x05B9)
5028 || (ch
>= 0x05BB && ch
<= 0x05BD)
5030 || (ch
>= 0x05C1 && ch
<= 0x05C2)
5031 || (ch
>= 0x05D0 && ch
<= 0x05EA)
5032 || (ch
>= 0x05F0 && ch
<= 0x05F2)
5034 || (ch
>= 0x0621 && ch
<= 0x063A)
5035 || (ch
>= 0x0640 && ch
<= 0x0652)
5036 || (ch
>= 0x0670 && ch
<= 0x06B7)
5037 || (ch
>= 0x06BA && ch
<= 0x06BE)
5038 || (ch
>= 0x06C0 && ch
<= 0x06CE)
5039 || (ch
>= 0x06D0 && ch
<= 0x06DC)
5040 || (ch
>= 0x06E5 && ch
<= 0x06E8)
5041 || (ch
>= 0x06EA && ch
<= 0x06ED)
5043 || (ch
>= 0x0901 && ch
<= 0x0903)
5044 || (ch
>= 0x0905 && ch
<= 0x0939)
5045 || (ch
>= 0x093E && ch
<= 0x094D)
5046 || (ch
>= 0x0950 && ch
<= 0x0952)
5047 || (ch
>= 0x0958 && ch
<= 0x0963)
5049 || (ch
>= 0x0981 && ch
<= 0x0983)
5050 || (ch
>= 0x0985 && ch
<= 0x098C)
5051 || (ch
>= 0x098F && ch
<= 0x0990)
5052 || (ch
>= 0x0993 && ch
<= 0x09A8)
5053 || (ch
>= 0x09AA && ch
<= 0x09B0)
5055 || (ch
>= 0x09B6 && ch
<= 0x09B9)
5056 || (ch
>= 0x09BE && ch
<= 0x09C4)
5057 || (ch
>= 0x09C7 && ch
<= 0x09C8)
5058 || (ch
>= 0x09CB && ch
<= 0x09CD)
5059 || (ch
>= 0x09DC && ch
<= 0x09DD)
5060 || (ch
>= 0x09DF && ch
<= 0x09E3)
5061 || (ch
>= 0x09F0 && ch
<= 0x09F1)
5064 || (ch
>= 0x0A05 && ch
<= 0x0A0A)
5065 || (ch
>= 0x0A0F && ch
<= 0x0A10)
5066 || (ch
>= 0x0A13 && ch
<= 0x0A28)
5067 || (ch
>= 0x0A2A && ch
<= 0x0A30)
5068 || (ch
>= 0x0A32 && ch
<= 0x0A33)
5069 || (ch
>= 0x0A35 && ch
<= 0x0A36)
5070 || (ch
>= 0x0A38 && ch
<= 0x0A39)
5071 || (ch
>= 0x0A3E && ch
<= 0x0A42)
5072 || (ch
>= 0x0A47 && ch
<= 0x0A48)
5073 || (ch
>= 0x0A4B && ch
<= 0x0A4D)
5074 || (ch
>= 0x0A59 && ch
<= 0x0A5C)
5078 || (ch
>= 0x0A81 && ch
<= 0x0A83)
5079 || (ch
>= 0x0A85 && ch
<= 0x0A8B)
5081 || (ch
>= 0x0A8F && ch
<= 0x0A91)
5082 || (ch
>= 0x0A93 && ch
<= 0x0AA8)
5083 || (ch
>= 0x0AAA && ch
<= 0x0AB0)
5084 || (ch
>= 0x0AB2 && ch
<= 0x0AB3)
5085 || (ch
>= 0x0AB5 && ch
<= 0x0AB9)
5086 || (ch
>= 0x0ABD && ch
<= 0x0AC5)
5087 || (ch
>= 0x0AC7 && ch
<= 0x0AC9)
5088 || (ch
>= 0x0ACB && ch
<= 0x0ACD)
5092 || (ch
>= 0x0B01 && ch
<= 0x0B03)
5093 || (ch
>= 0x0B05 && ch
<= 0x0B0C)
5094 || (ch
>= 0x0B0F && ch
<= 0x0B10)
5095 || (ch
>= 0x0B13 && ch
<= 0x0B28)
5096 || (ch
>= 0x0B2A && ch
<= 0x0B30)
5097 || (ch
>= 0x0B32 && ch
<= 0x0B33)
5098 || (ch
>= 0x0B36 && ch
<= 0x0B39)
5099 || (ch
>= 0x0B3E && ch
<= 0x0B43)
5100 || (ch
>= 0x0B47 && ch
<= 0x0B48)
5101 || (ch
>= 0x0B4B && ch
<= 0x0B4D)
5102 || (ch
>= 0x0B5C && ch
<= 0x0B5D)
5103 || (ch
>= 0x0B5F && ch
<= 0x0B61)
5105 || (ch
>= 0x0B82 && ch
<= 0x0B83)
5106 || (ch
>= 0x0B85 && ch
<= 0x0B8A)
5107 || (ch
>= 0x0B8E && ch
<= 0x0B90)
5108 || (ch
>= 0x0B92 && ch
<= 0x0B95)
5109 || (ch
>= 0x0B99 && ch
<= 0x0B9A)
5111 || (ch
>= 0x0B9E && ch
<= 0x0B9F)
5112 || (ch
>= 0x0BA3 && ch
<= 0x0BA4)
5113 || (ch
>= 0x0BA8 && ch
<= 0x0BAA)
5114 || (ch
>= 0x0BAE && ch
<= 0x0BB5)
5115 || (ch
>= 0x0BB7 && ch
<= 0x0BB9)
5116 || (ch
>= 0x0BBE && ch
<= 0x0BC2)
5117 || (ch
>= 0x0BC6 && ch
<= 0x0BC8)
5118 || (ch
>= 0x0BCA && ch
<= 0x0BCD)
5120 || (ch
>= 0x0C01 && ch
<= 0x0C03)
5121 || (ch
>= 0x0C05 && ch
<= 0x0C0C)
5122 || (ch
>= 0x0C0E && ch
<= 0x0C10)
5123 || (ch
>= 0x0C12 && ch
<= 0x0C28)
5124 || (ch
>= 0x0C2A && ch
<= 0x0C33)
5125 || (ch
>= 0x0C35 && ch
<= 0x0C39)
5126 || (ch
>= 0x0C3E && ch
<= 0x0C44)
5127 || (ch
>= 0x0C46 && ch
<= 0x0C48)
5128 || (ch
>= 0x0C4A && ch
<= 0x0C4D)
5129 || (ch
>= 0x0C60 && ch
<= 0x0C61)
5131 || (ch
>= 0x0C82 && ch
<= 0x0C83)
5132 || (ch
>= 0x0C85 && ch
<= 0x0C8C)
5133 || (ch
>= 0x0C8E && ch
<= 0x0C90)
5134 || (ch
>= 0x0C92 && ch
<= 0x0CA8)
5135 || (ch
>= 0x0CAA && ch
<= 0x0CB3)
5136 || (ch
>= 0x0CB5 && ch
<= 0x0CB9)
5137 || (ch
>= 0x0CBE && ch
<= 0x0CC4)
5138 || (ch
>= 0x0CC6 && ch
<= 0x0CC8)
5139 || (ch
>= 0x0CCA && ch
<= 0x0CCD)
5141 || (ch
>= 0x0CE0 && ch
<= 0x0CE1)
5143 || (ch
>= 0x0D02 && ch
<= 0x0D03)
5144 || (ch
>= 0x0D05 && ch
<= 0x0D0C)
5145 || (ch
>= 0x0D0E && ch
<= 0x0D10)
5146 || (ch
>= 0x0D12 && ch
<= 0x0D28)
5147 || (ch
>= 0x0D2A && ch
<= 0x0D39)
5148 || (ch
>= 0x0D3E && ch
<= 0x0D43)
5149 || (ch
>= 0x0D46 && ch
<= 0x0D48)
5150 || (ch
>= 0x0D4A && ch
<= 0x0D4D)
5151 || (ch
>= 0x0D60 && ch
<= 0x0D61)
5153 || (ch
>= 0x0E01 && ch
<= 0x0E3A)
5154 || (ch
>= 0x0E40 && ch
<= 0x0E5B)
5156 || (ch
>= 0x0E81 && ch
<= 0x0E82)
5158 || (ch
>= 0x0E87 && ch
<= 0x0E88)
5161 || (ch
>= 0x0E94 && ch
<= 0x0E97)
5162 || (ch
>= 0x0E99 && ch
<= 0x0E9F)
5163 || (ch
>= 0x0EA1 && ch
<= 0x0EA3)
5166 || (ch
>= 0x0EAA && ch
<= 0x0EAB)
5167 || (ch
>= 0x0EAD && ch
<= 0x0EAE)
5168 || (ch
>= 0x0EB0 && ch
<= 0x0EB9)
5169 || (ch
>= 0x0EBB && ch
<= 0x0EBD)
5170 || (ch
>= 0x0EC0 && ch
<= 0x0EC4)
5172 || (ch
>= 0x0EC8 && ch
<= 0x0ECD)
5173 || (ch
>= 0x0EDC && ch
<= 0x0EDD)
5176 || (ch
>= 0x0F18 && ch
<= 0x0F19)
5180 || (ch
>= 0x0F3E && ch
<= 0x0F47)
5181 || (ch
>= 0x0F49 && ch
<= 0x0F69)
5182 || (ch
>= 0x0F71 && ch
<= 0x0F84)
5183 || (ch
>= 0x0F86 && ch
<= 0x0F8B)
5184 || (ch
>= 0x0F90 && ch
<= 0x0F95)
5186 || (ch
>= 0x0F99 && ch
<= 0x0FAD)
5187 || (ch
>= 0x0FB1 && ch
<= 0x0FB7)
5190 || (ch
>= 0x10A0 && ch
<= 0x10C5)
5191 || (ch
>= 0x10D0 && ch
<= 0x10F6)
5193 || (ch
>= 0x3041 && ch
<= 0x3093)
5194 || (ch
>= 0x309B && ch
<= 0x309C)
5196 || (ch
>= 0x30A1 && ch
<= 0x30F6)
5197 || (ch
>= 0x30FB && ch
<= 0x30FC)
5199 || (ch
>= 0x3105 && ch
<= 0x312C)
5200 /* CJK Unified Ideographs */
5201 || (ch
>= 0x4E00 && ch
<= 0x9FA5)
5203 || (ch
>= 0xAC00 && ch
<= 0xD7A3)
5205 || (ch
>= 0x0660 && ch
<= 0x0669)
5206 || (ch
>= 0x06F0 && ch
<= 0x06F9)
5207 || (ch
>= 0x0966 && ch
<= 0x096F)
5208 || (ch
>= 0x09E6 && ch
<= 0x09EF)
5209 || (ch
>= 0x0A66 && ch
<= 0x0A6F)
5210 || (ch
>= 0x0AE6 && ch
<= 0x0AEF)
5211 || (ch
>= 0x0B66 && ch
<= 0x0B6F)
5212 || (ch
>= 0x0BE7 && ch
<= 0x0BEF)
5213 || (ch
>= 0x0C66 && ch
<= 0x0C6F)
5214 || (ch
>= 0x0CE6 && ch
<= 0x0CEF)
5215 || (ch
>= 0x0D66 && ch
<= 0x0D6F)
5216 || (ch
>= 0x0E50 && ch
<= 0x0E59)
5217 || (ch
>= 0x0ED0 && ch
<= 0x0ED9)
5218 || (ch
>= 0x0F20 && ch
<= 0x0F33)
5219 /* Special characters */
5222 || (ch
>= 0x02B0 && ch
<= 0x02B8)
5224 || (ch
>= 0x02BD && ch
<= 0x02C1)
5225 || (ch
>= 0x02D0 && ch
<= 0x02D1)
5226 || (ch
>= 0x02E0 && ch
<= 0x02E4)
5232 || (ch
>= 0x203F && ch
<= 0x2040)
5235 || (ch
>= 0x210A && ch
<= 0x2113)
5237 || (ch
>= 0x2118 && ch
<= 0x211D)
5241 || (ch
>= 0x212A && ch
<= 0x2131)
5242 || (ch
>= 0x2133 && ch
<= 0x2138)
5243 || (ch
>= 0x2160 && ch
<= 0x2182)
5244 || (ch
>= 0x3005 && ch
<= 0x3007)
5245 || (ch
>= 0x3021 && ch
<= 0x3029)
5247 return UC_IDENTIFIER_START
;
5248 return UC_IDENTIFIER_INVALID
;
/* The Java Language Specification, 3rd edition, §3.6.
   https://docs.oracle.com/javase/specs/jls/se6/html/lexical.html#3.6
   Return true if CH is a white-space character of Java source text.  */
static bool
is_java_whitespace (unsigned int ch)
{
  return (ch == ' ' || ch == '\t' || ch == '\f'
          || ch == '\n' || ch == '\r');
}
5260 /* The Java Language Specification, 3rd edition, §3.8.
5261 https://docs.oracle.com/javase/specs/jls/se6/html/lexical.html#3.8
5262 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
5264 java_ident_category (unsigned int ch
)
5266 /* FIXME: Check this against Sun's JDK implementation. */
5267 if (is_category_L (ch
) /* = Character.isLetter(ch) */
5268 || is_category_Nl (ch
) /* = Character.getType(ch)==LETTER_NUMBER */
5269 || is_category_Sc (ch
) /* currency symbol */
5270 || is_category_Pc (ch
) /* connector punctuation */
5272 return UC_IDENTIFIER_START
;
5273 if (is_category_Nd (ch
) /* digit */
5274 || is_category_Mc (ch
) /* combining mark */
5275 || is_category_Mn (ch
) /* non-spacing mark */
5277 return UC_IDENTIFIER_VALID
;
5278 if ((ch
>= 0x0000 && ch
<= 0x0008)
5279 || (ch
>= 0x000E && ch
<= 0x001B)
5280 || (ch
>= 0x007F && ch
<= 0x009F)
5281 || is_category_Cf (ch
) /* = Character.getType(ch)==FORMAT */
5283 return UC_IDENTIFIER_IGNORABLE
;
5284 return UC_IDENTIFIER_INVALID
;
/* Parameters for the table used by output_ident_category: TABLE gives the
   name (struct identsyntax_table and the identsyntax_table_init/_add/
   _finalize functions are used there), ELEMENT the per-character value type
   and DEFAULT the value of characters that are never added.
   NOTE(review): presumably these are consumed by a table template that is
   included right after them; that include is not visible here -- confirm.  */
5287 /* Construction of sparse 3-level tables. */
5288 #define TABLE identsyntax_table
5289 #define ELEMENT uint8_t
5290 #define DEFAULT UC_IDENTIFIER_INVALID
/* Plain malloc/realloc stand in for the checking allocators here.  */
5291 #define xmalloc malloc
5292 #define xrealloc realloc
/* Writes FILENAME, a generated header holding the identifier syntax
   category of every Unicode character as a three-level table named NAME.
   PREDICATE maps a code point to one of the UC_IDENTIFIER_* values (at most
   0x03); VERSION is the Unicode version string put into the banner.
   The function opens FILENAME for writing, fills an identsyntax_table with
   all code points whose category is not UC_IDENTIFIER_INVALID, and prints
   five header macros followed by the level1, level2 and level3 arrays.
   NOTE(review): several lines of this function are not present in this
   excerpt (declarations, the open-failure check, assignments to the
   *_offset variables, table parameters); the comments below describe only
   what is visible.  */
5295 /* Output an identifier syntax categorization in a three-level bitmap. */
5297 output_ident_category (const char *filename
, int (*predicate
) (unsigned int), const char *name
, const char *version
)
5301 struct identsyntax_table t
;
5302 unsigned int level1_offset
, level2_offset
, level3_offset
;
5304 stream
= fopen (filename
, "w");
5307 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
5311 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5312 fprintf (stream
, "/* Language syntax properties of Unicode characters. */\n");
5313 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5318 identsyntax_table_init (&t
);
/* Classify every code point; only categories other than
   UC_IDENTIFIER_INVALID are stored in the table.  */
5320 for (ch
= 0; ch
< 0x110000; ch
++)
5322 int syntaxcode
= predicate (ch
);
/* The level3 packing below gives each entry two bits.  */
5324 assert (syntaxcode
<= 0x03);
5326 if (syntaxcode
!= UC_IDENTIFIER_INVALID
)
5327 identsyntax_table_add (&t
, ch
, syntaxcode
);
5330 identsyntax_table_finalize (&t
);
5332 /* Offsets in t.result, in memory of this process. */
5334 5 * sizeof (uint32_t);
5336 5 * sizeof (uint32_t)
5337 + t
.level1_size
* sizeof (uint32_t);
5339 5 * sizeof (uint32_t)
5340 + t
.level1_size
* sizeof (uint32_t)
5341 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become header macros.
   NOTE(review): a uint32_t value is printed with %d here.  */
5343 for (i
= 0; i
< 5; i
++)
5344 fprintf (stream
, "#define identsyntax_header_%d %d\n", i
,
5345 ((uint32_t *) t
.result
)[i
]);
5346 fprintf (stream
, "static const\n");
5347 fprintf (stream
, "struct\n");
5348 fprintf (stream
, " {\n");
5349 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
5350 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
5351 fprintf (stream
, " unsigned short level3[%zu * %d];\n", t
.level3_size
,
5352 (1 << t
.p
) * 2 / 16);
5353 fprintf (stream
, " }\n");
5354 fprintf (stream
, "%s =\n", name
);
5355 fprintf (stream
, "{\n");
5356 fprintf (stream
, " {");
5357 if (t
.level1_size
> 8)
5358 fprintf (stream
, "\n ");
/* Level 1: eight entries per output line; an entry is printed either as -1
   or as an index relative to the start of level 2, in uint32_t units.  */
5359 for (i
= 0; i
< t
.level1_size
; i
++)
5362 if (i
> 0 && (i
% 8) == 0)
5363 fprintf (stream
, "\n ");
5364 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
5366 fprintf (stream
, " %5d", -1);
5368 fprintf (stream
, " %5zu",
5369 (offset
- level2_offset
) / sizeof (uint32_t));
5370 if (i
+1 < t
.level1_size
)
5371 fprintf (stream
, ",");
5373 if (t
.level1_size
> 8)
5374 fprintf (stream
, "\n ");
5375 fprintf (stream
, " },\n");
5376 fprintf (stream
, " {");
5377 if (t
.level2_size
<< t
.q
> 8)
5378 fprintf (stream
, "\n ");
/* Level 2: same layout; indices are relative to the start of level 3, in
   uint8_t units.  */
5379 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
5382 if (i
> 0 && (i
% 8) == 0)
5383 fprintf (stream
, "\n ");
5384 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
5386 fprintf (stream
, " %5d", -1);
5388 fprintf (stream
, " %5zu",
5389 (offset
- level3_offset
) / sizeof (uint8_t));
5390 if (i
+1 < t
.level2_size
<< t
.q
)
5391 fprintf (stream
, ",");
5393 if (t
.level2_size
<< t
.q
> 8)
5394 fprintf (stream
, "\n ");
5395 fprintf (stream
, " },\n");
5396 /* Pack the level3 array. Each entry needs 2 bits only. */
5397 fprintf (stream
, " {");
5398 if ((t
.level3_size
<< t
.p
) * 2 / 16 > 8)
5399 fprintf (stream
, "\n ");
/* Eight consecutive one-byte entries are combined into one 16-bit word,
   entry 8*i in the two lowest bits and entry 8*i+7 in the two highest.  */
5400 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 2 / 16; i
++)
5402 if (i
> 0 && (i
% 8) == 0)
5403 fprintf (stream
, "\n ");
5404 fprintf (stream
, " 0x%04x",
5405 (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
] << 0)
5406 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 1] << 2)
5407 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 2] << 4)
5408 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 3] << 6)
5409 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 4] << 8)
5410 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 5] << 10)
5411 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 6] << 12)
5412 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 7] << 14));
5413 if (i
+1 < (t
.level3_size
<< t
.p
) * 2 / 16)
5414 fprintf (stream
, ",");
5416 if ((t
.level3_size
<< t
.p
) * 2 / 16 > 8)
5417 fprintf (stream
, "\n ");
5418 fprintf (stream
, " }\n");
5419 fprintf (stream
, "};\n");
/* A write error may only show up when the stream is flushed by fclose.  */
5421 if (ferror (stream
) || fclose (stream
))
5423 fprintf (stderr
, "error writing to '%s'\n", filename
);
5429 output_ident_properties (const char *version
)
5431 #define PROPERTY(P) \
5432 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
5433 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5434 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
5435 PROPERTY(c_whitespace
)
5436 PROPERTY(java_whitespace
)
5439 output_ident_category ("unictype/sy_c_ident.h", c_ident_category
, "u_c_ident", version
);
5440 output_ident_category ("unictype/sy_java_ident.h", java_ident_category
, "u_java_ident", version
);
5443 /* ========================================================================= */
5445 /* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's
5446 glibc/localedata/locales/i18n file, generated by
5447 glibc/localedata/gen-unicode-ctype.c. */
5449 /* Character mappings. */
5452 to_upper (unsigned int ch
)
5454 if (unicode_attributes
[ch
].name
!= NULL
5455 && unicode_attributes
[ch
].upper
!= NONE
)
5456 return unicode_attributes
[ch
].upper
;
5462 to_lower (unsigned int ch
)
5464 if (unicode_attributes
[ch
].name
!= NULL
5465 && unicode_attributes
[ch
].lower
!= NONE
)
5466 return unicode_attributes
[ch
].lower
;
5472 to_title (unsigned int ch
)
5474 if (unicode_attributes
[ch
].name
!= NULL
5475 && unicode_attributes
[ch
].title
!= NONE
)
5476 return unicode_attributes
[ch
].title
;
5481 /* Character class properties. */
/* Return true if CH is uppercase, i.e. has a lowercase mapping that differs
   from CH.  */
static bool
is_upper (unsigned int ch)
{
  return (to_lower (ch) != ch);
}
/* Return true if CH is lowercase, i.e. has an uppercase mapping that
   differs from CH, or is U+00DF.  */
static bool
is_lower (unsigned int ch)
{
  return (to_upper (ch) != ch)
         /* <U00DF> is lowercase, but without simple to_upper mapping.  */
         || (ch == 0x00DF);
}
5498 is_alpha (unsigned int ch
)
5500 return (unicode_attributes
[ch
].name
!= NULL
5501 && ((unicode_attributes
[ch
].category
[0] == 'L'
5502 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5503 <U0E2F>, <U0E46> should belong to is_punct. */
5504 && (ch
!= 0x0E2F) && (ch
!= 0x0E46))
5505 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5506 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
5508 || (ch
>= 0x0E34 && ch
<= 0x0E3A)
5509 || (ch
>= 0x0E47 && ch
<= 0x0E4E)
5510 /* Avoid warning for <U0345>. */
5512 /* Avoid warnings for <U2160>..<U217F>. */
5513 || (unicode_attributes
[ch
].category
[0] == 'N'
5514 && unicode_attributes
[ch
].category
[1] == 'l')
5515 /* Avoid warnings for <U24B6>..<U24E9>. */
5516 || (unicode_attributes
[ch
].category
[0] == 'S'
5517 && unicode_attributes
[ch
].category
[1] == 'o'
5518 && strstr (unicode_attributes
[ch
].name
, " LETTER ")
5520 /* Consider all the non-ASCII digits as alphabetic.
5521 ISO C 99 forbids us to have them in category "digit",
5522 but we want iswalnum to return true on them. */
5523 || (unicode_attributes
[ch
].category
[0] == 'N'
5524 && unicode_attributes
[ch
].category
[1] == 'd'
5525 && !(ch
>= 0x0030 && ch
<= 0x0039))));
/* Return true if CH is one of the ten ASCII decimal digits.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  /* Disabled alternative: every character of general category Nd.  */
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     does not:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     and the decimal digits are
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return (ch >= 0x0030 && ch <= 0x0039);
#endif
}
/* Return true if CH is alphabetic or a digit.  */
static bool
is_alnum (unsigned int ch)
{
  return is_alpha (ch) || is_digit (ch);
}
5557 is_blank (unsigned int ch
)
5559 return (ch
== 0x0009 /* '\t' */
5560 /* Category Zs without mention of "<noBreak>" */
5561 || (unicode_attributes
[ch
].name
!= NULL
5562 && unicode_attributes
[ch
].category
[0] == 'Z'
5563 && unicode_attributes
[ch
].category
[1] == 's'
5564 && !strstr (unicode_attributes
[ch
].decomposition
, "<noBreak>")));
5568 is_space (unsigned int ch
)
5570 /* Don't make U+00A0 a space. Non-breaking space means that all programs
5571 should treat it like a punctuation character, not like a space. */
5572 return (ch
== 0x0020 /* ' ' */
5573 || ch
== 0x000C /* '\f' */
5574 || ch
== 0x000A /* '\n' */
5575 || ch
== 0x000D /* '\r' */
5576 || ch
== 0x0009 /* '\t' */
5577 || ch
== 0x000B /* '\v' */
5578 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
5579 || (unicode_attributes
[ch
].name
!= NULL
5580 && unicode_attributes
[ch
].category
[0] == 'Z'
5581 && (unicode_attributes
[ch
].category
[1] == 'l'
5582 || unicode_attributes
[ch
].category
[1] == 'p'
5583 || (unicode_attributes
[ch
].category
[1] == 's'
5584 && !strstr (unicode_attributes
[ch
].decomposition
,
5589 is_cntrl (unsigned int ch
)
5591 return (unicode_attributes
[ch
].name
!= NULL
5592 && (strcmp (unicode_attributes
[ch
].name
, "<control>") == 0
5593 /* Categories Zl and Zp */
5594 || (unicode_attributes
[ch
].category
[0] == 'Z'
5595 && (unicode_attributes
[ch
].category
[1] == 'l'
5596 || unicode_attributes
[ch
].category
[1] == 'p'))));
/* Return true if CH is one of the 22 ASCII hexadecimal digits.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  /* Disabled alternative: follows is_digit.  */
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     does not:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     and
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#endif
}
/* Graphic character test.  The visible part requires CH to be an assigned
   character whose name is not "<control>" (strcmp yields nonzero for
   different strings).
   NOTE(review): the return expression continues on a line that is not
   present in this excerpt, so the full condition cannot be stated here.  */
5622 is_graph (unsigned int ch
)
5624 return (unicode_attributes
[ch
].name
!= NULL
5625 && strcmp (unicode_attributes
[ch
].name
, "<control>")
5630 is_print (unsigned int ch
)
5632 return (unicode_attributes
[ch
].name
!= NULL
5633 && strcmp (unicode_attributes
[ch
].name
, "<control>")
5634 /* Categories Zl and Zp */
5635 && !(unicode_attributes
[ch
].name
!= NULL
5636 && unicode_attributes
[ch
].category
[0] == 'Z'
5637 && (unicode_attributes
[ch
].category
[1] == 'l'
5638 || unicode_attributes
[ch
].category
[1] == 'p')));
/* Return true if CH is a punctuation character in the POSIX sense.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  /* Disabled alternative: every character of a general category P*.  */
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
#endif
}
/* VERSION is the Unicode version string passed on to output_predicate.
   NOTE(review): the PROPERTY(...) invocations that follow the macro
   definition are not present in this excerpt.  */
5654 /* Output all properties. */
5656 output_old_ctype (const char *version
)
/* PROPERTY(P) emits three files for the predicate is_P: a debugging
   listing, a unit test, and the table header "unictype/ctype_P.h".  */
5658 #define PROPERTY(P) \
5659 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
5660 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5661 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
5680 is_combining (unsigned int ch
)
5682 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
5683 file. In 3.0.1 it was identical to the union of the general categories
5684 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
5685 PropList.txt file, so we take the latter definition. */
5686 return (unicode_attributes
[ch
].name
!= NULL
5687 && unicode_attributes
[ch
].category
[0] == 'M'
5688 && (unicode_attributes
[ch
].category
[1] == 'n'
5689 || unicode_attributes
[ch
].category
[1] == 'c'
5690 || unicode_attributes
[ch
].category
[1] == 'e'));
5694 is_combining_level3 (unsigned int ch
)
5696 return is_combining (ch
)
5697 && !(unicode_attributes
[ch
].combining
[0] != '\0'
5698 && unicode_attributes
[ch
].combining
[0] != '0'
5699 && strtoul (unicode_attributes
[ch
].combining
, NULL
, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with four
   hexadecimal digits below 0x10000, "<Uxxxxxxxx>" with eight otherwise.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  /* The longest result, "<U" + 8 digits + ">", has 11 characters.  */
  snprintf (buf, sizeof (buf), (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval:
   the symbol of LOW, "..", and the symbol of HIGH.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];
  int len;

  /* ucs_symbol returns the same static buffer on every call, so each
     symbol must be copied out before the next one is requested.  */
  len = snprintf (buf, sizeof (buf), "%s..", ucs_symbol (low));
  if (len > 0 && (size_t) len < sizeof (buf))
    snprintf (buf + len, sizeof (buf) - len, "%s", ucs_symbol (high));
  return buf;
}
/* Writes to STREAM one line group for the character class CLASSNAME:
   the class name followed by the characters for which FUNC returns true,
   as single symbols or as symbol ranges, separated by ';'.
   NOTE(review): several lines of this function are not present in this
   excerpt (among them the declaration and the initial and reset values of
   the column counter); the comments describe only what is visible.  */
5724 /* Output a character class (= property) table. */
5727 output_charclass (FILE *stream
, const char *classname
,
5728 bool (*func
) (unsigned int))
/* One byte per code point; this array has automatic storage.  */
5730 char table
[0x110000];
5732 bool need_semicolon
;
5733 const int max_column
= 75;
/* Evaluate the predicate once for every code point.  */
5736 for (i
= 0; i
< 0x110000; i
++)
5737 table
[i
] = (int) func (i
);
5739 fprintf (stream
, "%s ", classname
);
5740 need_semicolon
= false;
5742 for (i
= 0; i
< 0x110000; )
5748 unsigned int low
, high
;
/* Advance i to the end of the current run of member characters.  */
5754 while (i
< 0x110000 && table
[i
]);
5758 strcpy (buf
, ucs_symbol (low
));
5760 strcpy (buf
, ucs_symbol_range (low
, high
));
5764 fprintf (stream
, ";");
/* Continue on a new line, after a '/', once the line would get too long.  */
5768 if (column
+ strlen (buf
) > max_column
)
5770 fprintf (stream
, "/\n ");
5774 fprintf (stream
, "%s", buf
);
5775 column
+= strlen (buf
);
5776 need_semicolon
= true;
5779 fprintf (stream
, "\n");
/* Writes to STREAM one line group for the character mapping MAPNAME:
   the map name followed by one entry, separated by ';', for every
   character that FUNC maps to a different character.
   NOTE(review): several lines of this function are not present in this
   excerpt (among them the literal pieces of each entry and the initial and
   reset values of the column counter); the comments describe only what is
   visible.  */
5782 /* Output a character mapping table. */
5785 output_charmap (FILE *stream
, const char *mapname
,
5786 unsigned int (*func
) (unsigned int))
/* One byte per code point; this array has automatic storage.  */
5788 char table
[0x110000];
5790 bool need_semicolon
;
5791 const int max_column
= 75;
/* Mark the code points that are not mapped to themselves.  */
5794 for (i
= 0; i
< 0x110000; i
++)
5795 table
[i
] = (func (i
) != i
);
5797 fprintf (stream
, "%s ", mapname
);
5798 need_semicolon
= false;
5800 for (i
= 0; i
< 0x110000; i
++)
/* An entry contains the symbol of the character and of its image.  */
5806 strcat (buf
, ucs_symbol (i
));
5808 strcat (buf
, ucs_symbol (func (i
)));
5813 fprintf (stream
, ";");
/* Continue on a new line, after a '/', once the line would get too long.  */
5817 if (column
+ strlen (buf
) > max_column
)
5819 fprintf (stream
, "/\n ");
5823 fprintf (stream
, "%s", buf
);
5824 column
+= strlen (buf
);
5825 need_semicolon
= true;
5827 fprintf (stream
, "\n");
/* Output the width table.
   Currently a placeholder: nothing is written to STREAM.  */
static void
output_widthmap (FILE *stream)
{
  (void) stream;
}
5837 /* Output the tables to the given file. */
5840 output_tables (const char *filename
, const char *version
)
5845 stream
= fopen (filename
, "w");
5848 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
5852 fprintf (stream
, "escape_char /\n");
5853 fprintf (stream
, "comment_char %%\n");
5854 fprintf (stream
, "\n");
5855 fprintf (stream
, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
5857 fprintf (stream
, "\n");
5859 fprintf (stream
, "LC_IDENTIFICATION\n");
5860 fprintf (stream
, "title \"Unicode %s FDCC-set\"\n", version
);
5861 fprintf (stream
, "source \"UnicodeData.txt, PropList.txt\"\n");
5862 fprintf (stream
, "address \"\"\n");
5863 fprintf (stream
, "contact \"\"\n");
5864 fprintf (stream
, "email \"bug-glibc@gnu.org\"\n");
5865 fprintf (stream
, "tel \"\"\n");
5866 fprintf (stream
, "fax \"\"\n");
5867 fprintf (stream
, "language \"\"\n");
5868 fprintf (stream
, "territory \"Earth\"\n");
5869 fprintf (stream
, "revision \"%s\"\n", version
);
5874 strftime (date
, sizeof (date
), "%Y-%m-%d", gmtime (&now
));
5875 fprintf (stream
, "date \"%s\"\n", date
);
5877 fprintf (stream
, "category \"unicode:2001\";LC_CTYPE\n");
5878 fprintf (stream
, "END LC_IDENTIFICATION\n");
5879 fprintf (stream
, "\n");
5882 for (ch
= 0; ch
< 0x110000; ch
++)
5884 /* toupper restriction: "Only characters specified for the keywords
5885 lower and upper shall be specified. */
5886 if (to_upper (ch
) != ch
&& !(is_lower (ch
) || is_upper (ch
)))
5888 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
5889 ucs_symbol (ch
), ch
, to_upper (ch
));
5891 /* tolower restriction: "Only characters specified for the keywords
5892 lower and upper shall be specified. */
5893 if (to_lower (ch
) != ch
&& !(is_lower (ch
) || is_upper (ch
)))
5895 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
5896 ucs_symbol (ch
), ch
, to_lower (ch
));
5898 /* alpha restriction: "Characters classified as either upper or lower
5899 shall automatically belong to this class. */
5900 if ((is_lower (ch
) || is_upper (ch
)) && !is_alpha (ch
))
5901 fprintf (stderr
, "%s is upper|lower but not alpha\n", ucs_symbol (ch
));
5903 /* alpha restriction: "No character specified for the keywords cntrl,
5904 digit, punct or space shall be specified." */
5905 if (is_alpha (ch
) && is_cntrl (ch
))
5906 fprintf (stderr
, "%s is alpha and cntrl\n", ucs_symbol (ch
));
5907 if (is_alpha (ch
) && is_digit (ch
))
5908 fprintf (stderr
, "%s is alpha and digit\n", ucs_symbol (ch
));
5909 if (is_alpha (ch
) && is_punct (ch
))
5910 fprintf (stderr
, "%s is alpha and punct\n", ucs_symbol (ch
));
5911 if (is_alpha (ch
) && is_space (ch
))
5912 fprintf (stderr
, "%s is alpha and space\n", ucs_symbol (ch
));
5914 /* space restriction: "No character specified for the keywords upper,
5915 lower, alpha, digit, graph or xdigit shall be specified."
5916 upper, lower, alpha already checked above. */
5917 if (is_space (ch
) && is_digit (ch
))
5918 fprintf (stderr
, "%s is space and digit\n", ucs_symbol (ch
));
5919 if (is_space (ch
) && is_graph (ch
))
5920 fprintf (stderr
, "%s is space and graph\n", ucs_symbol (ch
));
5921 if (is_space (ch
) && is_xdigit (ch
))
5922 fprintf (stderr
, "%s is space and xdigit\n", ucs_symbol (ch
));
5924 /* cntrl restriction: "No character specified for the keywords upper,
5925 lower, alpha, digit, punct, graph, print or xdigit shall be
5926 specified." upper, lower, alpha already checked above. */
5927 if (is_cntrl (ch
) && is_digit (ch
))
5928 fprintf (stderr
, "%s is cntrl and digit\n", ucs_symbol (ch
));
5929 if (is_cntrl (ch
) && is_punct (ch
))
5930 fprintf (stderr
, "%s is cntrl and punct\n", ucs_symbol (ch
));
5931 if (is_cntrl (ch
) && is_graph (ch
))
5932 fprintf (stderr
, "%s is cntrl and graph\n", ucs_symbol (ch
));
5933 if (is_cntrl (ch
) && is_print (ch
))
5934 fprintf (stderr
, "%s is cntrl and print\n", ucs_symbol (ch
));
5935 if (is_cntrl (ch
) && is_xdigit (ch
))
5936 fprintf (stderr
, "%s is cntrl and xdigit\n", ucs_symbol (ch
));
5938 /* punct restriction: "No character specified for the keywords upper,
5939 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5940 be specified." upper, lower, alpha, cntrl already checked above. */
5941 if (is_punct (ch
) && is_digit (ch
))
5942 fprintf (stderr
, "%s is punct and digit\n", ucs_symbol (ch
));
5943 if (is_punct (ch
) && is_xdigit (ch
))
5944 fprintf (stderr
, "%s is punct and xdigit\n", ucs_symbol (ch
));
5945 if (is_punct (ch
) && (ch
== 0x0020))
5946 fprintf (stderr
, "%s is punct\n", ucs_symbol (ch
));
5948 /* graph restriction: "No character specified for the keyword cntrl
5949 shall be specified." Already checked above. */
5951 /* print restriction: "No character specified for the keyword cntrl
5952 shall be specified." Already checked above. */
5954 /* graph - print relation: differ only in the <space> character.
5955 How is this possible if there are more than one space character?!
5956 I think susv2/xbd/locale.html should speak of "space characters",
5957 not "space character". */
5958 if (is_print (ch
) && !(is_graph (ch
) || /* ch == 0x0020 */ is_space (ch
)))
5960 "%s is print but not graph|<space>\n", ucs_symbol (ch
));
5961 if (!is_print (ch
) && (is_graph (ch
) || ch
== 0x0020))
5963 "%s is graph|<space> but not print\n", ucs_symbol (ch
));
5966 fprintf (stream
, "LC_CTYPE\n");
5967 output_charclass (stream
, "upper", is_upper
);
5968 output_charclass (stream
, "lower", is_lower
);
5969 output_charclass (stream
, "alpha", is_alpha
);
5970 output_charclass (stream
, "digit", is_digit
);
5971 output_charclass (stream
, "outdigit", is_outdigit
);
5972 output_charclass (stream
, "blank", is_blank
);
5973 output_charclass (stream
, "space", is_space
);
5974 output_charclass (stream
, "cntrl", is_cntrl
);
5975 output_charclass (stream
, "punct", is_punct
);
5976 output_charclass (stream
, "xdigit", is_xdigit
);
5977 output_charclass (stream
, "graph", is_graph
);
5978 output_charclass (stream
, "print", is_print
);
5979 output_charclass (stream
, "class \"combining\";", is_combining
);
5980 output_charclass (stream
, "class \"combining_level3\";", is_combining_level3
);
5981 output_charmap (stream
, "toupper", to_upper
);
5982 output_charmap (stream
, "tolower", to_lower
);
5983 output_charmap (stream
, "map \"totitle\";", to_title
);
5984 output_widthmap (stream
);
5985 fprintf (stream
, "END LC_CTYPE\n");
5987 if (ferror (stream
) || fclose (stream
))
5989 fprintf (stderr
, "error writing to '%s'\n", filename
);
5996 /* ========================================================================= */
5998 /* The width property from the EastAsianWidth.txt file, one entry per
code point (0 .. 0x10FFFF).
5999 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".
The entries are stored by fill_width () and read by symbolic_width ().  */
6000 const char * unicode_width
[0x110000];
6002 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt file.  */
/* Fills the global unicode_width[] table from the file WIDTH_FILENAME.
   Every code point first receives a default: "N" if it has a name in
   unicode_attributes[], NULL otherwise.  Each data line read from the file
   then stores a strdup'ed copy of its second field into the entry named by
   its first field.  Problems are reported on stderr.
   NOTE(review): some lines of this function (local declarations, the tests
   guarding the error messages, loop headers) are not visible in this
   excerpt; comments below that depend on them are marked as assumptions.  */
6005 fill_width (const char *width_filename
)
/* Buffers that receive the three fields of a line from getfield ().  */
6009 char field0
[FIELDLEN
];
6010 char field1
[FIELDLEN
];
6011 char field2
[FIELDLEN
];
/* Default for every code point: "N" when the character has a name in
   unicode_attributes[], NULL when it does not.  */
6014 for (i
= 0; i
< 0x110000; i
++)
6015 unicode_width
[i
] = (unicode_attributes
[i
].name
!= NULL
? "N" : NULL
);
6017 stream
= fopen (width_filename
, "r");
/* Diagnostic for a failed fopen.  */
6020 fprintf (stderr
, "error during fopen of '%s'\n", width_filename
);
/* Discard the rest of the current line (presumably a '#' comment line; the
   test that selects this path is not visible here).  */
6035 do c
= getc (stream
); while (c
!= EOF
&& c
!= '\n');
/* Split the line into three fields: field0 ends at ';', field1 at ' ',
   field2 at the newline.  n accumulates the getfield () return values.  */
6039 n
= getfield (stream
, field0
, ';');
6040 n
+= getfield (stream
, field1
, ' ');
6041 n
+= getfield (stream
, field2
, '\n');
/* Diagnostic for a line with too few fields (presumably when n != 3; the
   test is not visible here).  */
6046 fprintf (stderr
, "short line in '%s':%d\n", width_filename
, lineno
);
/* field0 is parsed as a hexadecimal code point; if it contains "..", it is
   a range and the number after ".." is parsed as its end.  */
6049 i
= strtoul (field0
, NULL
, 16);
6050 if (strstr (field0
, "..") != NULL
)
6052 /* Deal with a range. */
6053 j
= strtoul (strstr (field0
, "..") + 2, NULL
, 16);
/* Presumably executed for every code point from i through j; the loop
   header is not visible here.  */
6055 unicode_width
[i
] = strdup (field1
);
6059 /* Single character line. */
6060 unicode_width
[i
] = strdup (field1
);
/* Report a read error or a failure to close the stream.  */
6064 if (ferror (stream
) || fclose (stream
))
6066 fprintf (stderr
, "error reading from '%s'\n", width_filename
);
6071 /* ========================================================================= */
6073 /* Non-spacing attribute and width. */
6075 /* The non-spacing attribute table consists of:
6076 - Non-spacing characters; generated from PropList.txt or
6077 "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
6078 - Format control characters; generated from
6079 "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
6080 - Zero width characters; generated from
6081 "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"  */
/* Tests whether CH counts as non-spacing.
   The result is true only if CH has a name in unicode_attributes[] and, in
   addition, at least one of the following holds:
     - get_bidi_category (ch) is UC_BIDI_NSM,
     - is_category_Cc (ch) or is_category_Cf (ch) is true,
     - the character name begins with "ZERO WIDTH " (first 11 characters).  */
6085 is_nonspacing (unsigned int ch
)
/* The name != NULL test comes first, so strncmp below never sees a NULL
   name.  */
6087 return (unicode_attributes
[ch
].name
!= NULL
6088 && (get_bidi_category (ch
) == UC_BIDI_NSM
6089 || is_category_Cc (ch
) || is_category_Cf (ch
)
6090 || strncmp (unicode_attributes
[ch
].name
, "ZERO WIDTH ", 11) == 0));
/* Writes to FILENAME the C source text of two tables describing the
   is_nonspacing () property:
     - nonspacing_table_data: unsigned char data, declared with 64 bytes per
       stored block; a block covers 0x200 code points, printed as 8 rows of
       8 bytes, each byte built from 8 consecutive code points;
     - nonspacing_table_ind: signed char entries taken from ind[], printed
       8 per row.
   Failure to open or write the file is reported on stderr.
   NOTE(review): some lines of this function (declarations, braces, the
   statements that set bits and compute i_max) are not visible in this
   excerpt; comments that depend on them are marked as assumptions.  */
6094 output_nonspacing_property (const char *filename
)
/* ind[i] is the position of the 0x200-code-point block i in the data table.
   A value >= 0 marks a block that is emitted (see the second pass below).  */
6097 int ind
[0x110000 / 0x200];
6102 stream
= fopen (filename
, "w");
6105 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
/* First pass: scan every block for non-spacing characters and hand out
   data-table positions.  */
6110 for (i
= 0; i
< 0x110000 / 0x200; i
++)
6112 bool nontrivial
= false;
6115 if (i
!= 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */
6116 for (ch
= i
* 0x200; ch
< (i
+ 1) * 0x200; ch
++)
6117 if (is_nonspacing (ch
))
/* Presumably reached only for nontrivial blocks, with other blocks getting
   a negative index; the surrounding test is not visible here.  */
6123 ind
[i
] = next_ind
++;
6128 fprintf (stream
, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
/* Second pass: print the data of every block that received an index.  */
6131 for (i
= 0; i
< 0x110000 / 0x200; i
++)
6133 bool nontrivial
= (ind
[i
] >= 0);
/* Heading comment with the code point range of block i.  */
6139 fprintf (stream
, " /* 0x%04x-0x%04x */\n", i
* 0x200, (i
+ 1) * 0x200 - 1);
/* j selects a row of 0x40 code points, k a byte within the row, l one of
   the 8 code points belonging to that byte.  */
6140 for (j
= 0; j
< 8; j
++)
6144 fprintf (stream
, " ");
6145 for (k
= 0; k
< 8; k
++)
6148 unsigned char bits
= 0;
6150 for (l
= 0; l
< 8; l
++)
6152 unsigned int ch
= i
* 0x200 + j
* 0x40 + k
* 8 + l
;
/* Presumably sets the bit for l in `bits'; the statement is not visible
   here.  */
6154 if (is_nonspacing (ch
))
/* The very last byte of the table (last emitted block, last row, last
   column) is followed by a space, every other byte by a comma.  */
6157 fprintf (stream
, " 0x%02x%c", bits
,
6158 ind
[i
] + 1 == next_ind
&& j
== 8 - 1 && k
== 8 - 1 ? ' ' : ',');
/* Trailing comment with the code point range of the row just printed.  */
6160 fprintf (stream
, " /* 0x%04x-0x%04x */\n",
6161 i
* 0x200 + j
* 0x40, i
* 0x200 + (j
+ 1) * 0x40 - 1);
6166 fprintf (stream
, "};\n");
/* Round i_max up to a multiple of 8 so that the index table consists of
   complete rows of 8 entries.  (The initial value of i_max is computed in
   lines not visible here.)  */
6168 i_max
= ((i_max
+ 8 - 1) / 8) * 8;
6169 fprintf (stream
, "static const signed char nonspacing_table_ind[%u] = {\n",
6174 for (j
= 0; j
< i_max
/ 8; j
++)
6178 fprintf (stream
, " ");
6179 for (k
= 0; k
< 8; k
++)
/* One index entry; the last entry of the last row is followed by a space
   instead of a comma.  */
6182 fprintf (stream
, " %2d%c", ind
[i
],
6183 j
== i_max
/ 8 - 1 && k
== 8 - 1 ? ' ' : ',');
/* Trailing comment with the code point range covered by this row of 8
   blocks.  */
6185 fprintf (stream
, " /* 0x%04x-0x%04x */\n",
6186 j
* 8 * 0x200, (j
+ 1) * 8 * 0x200 - 1);
6189 fprintf (stream
, "};\n");
/* Report a write error or a failure to close the stream.  */
6191 if (ferror (stream
) || fclose (stream
))
6193 fprintf (stderr
, "error writing to '%s'\n", filename
);
6198 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'.
The caller output_width_property_test () skips code points for which the
result is 0 and prints every other result as a character.
NOTE(review): the return statements attached to the tests below are not
visible in this excerpt.  Presumably '0' stands for non-spacing, '1' for
single width, '2' for double width and 'A' for ambiguous width -- confirm
against the complete function.  */
6200 symbolic_width (unsigned int ch
)
6202 /* Test for unassigned character. */
6203 if (is_property_unassigned_code_value (ch
))
6205 /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
6206 if (ch
>= 0xE000 && ch
<= 0xF8FF) /* Private Use */
/* Unassigned code points inside the ideographic blocks and planes listed
   here are tested separately from all other unassigned code points.  */
6208 if ((ch
>= 0x4E00 && ch
<= 0x9FFF) /* CJK Unified Ideographs block */
6209 || (ch
>= 0x3400 && ch
<= 0x4DBF) /* CJK Unified Ideographs Extension A block */
6210 || (ch
>= 0xF900 && ch
<= 0xFAFF) /* CJK Compatibility Ideographs block */
6211 || (ch
>= 0x20000 && ch
<= 0x2FFFF) /* Supplementary Ideographic Plane */
6212 || (ch
>= 0x30000 && ch
<= 0x3FFFF) /* Tertiary Ideographic Plane */)
6218 /* Test for non-spacing or control character. */
/* The Cc test is limited to code points below U+00A0.  */
6219 if (is_category_Cc (ch
) && ch
< 0x00A0)
6221 if (is_nonspacing (ch
))
6223 /* Test for double-width character. */
/* East Asian Width values "W" and "F" from unicode_width[].  */
6224 if (unicode_width
[ch
] != NULL
6225 && (strcmp (unicode_width
[ch
], "W") == 0
6226 || strcmp (unicode_width
[ch
], "F") == 0))
6228 /* Test for half-width character. */
/* East Asian Width value "H" from unicode_width[].  */
6229 if (unicode_width
[ch
] != NULL
6230 && strcmp (unicode_width
[ch
], "H") == 0)
6233 /* In ancient CJK encodings, Cyrillic and most other characters are
6234 double-width as well. */
/* This applies to U+00A1 .. U+FFFF only.  */
6235 if (ch
>= 0x00A1 && ch
< 0x10000)
/* Writes to FILENAME a listing of symbolic_width () over all code points
   0 .. 0x10FFFF, compressed into intervals.
   Each output line is either "XXXX<TAB><TAB>v" for a single code point or
   "XXXX..YYYY<TAB>v" for an interval, where v is the symbolic width
   character.  Code points whose symbolic width is 0 are skipped; they
   neither extend nor terminate the current interval.
   Failure to open or write the file is reported on stderr.
   NOTE(review): some lines of this function (declaration of stream, the
   initialization of interval_value, braces and else keywords) are not
   visible in this excerpt.  */
6241 output_width_property_test (const char *filename
)
/* The interval being accumulated: its first and last code point and the
   symbolic width shared by its members.  interval_value == 0 is tested
   below as "no interval yet".  */
6244 unsigned int interval_start
, interval_end
, ch
;
6245 char interval_value
;
6247 stream
= fopen (filename
, "w");
6250 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
6255 interval_start
= interval_end
= 0; /* avoid GCC warning */
6256 for (ch
= 0; ch
< 0x110000; ch
++)
6258 char value
= symbolic_width (ch
);
6259 if (value
!= 0) /* skip Cc control characters and unassigned characters */
6261 if (value
== interval_value
)
6262 /* Extend the interval. */
6266 /* Terminate the interval. */
6267 if (interval_value
!= 0)
/* A one-element interval is printed without the "..END" part.  */
6269 if (interval_end
== interval_start
)
6270 fprintf (stream
, "%04X\t\t%c\n", interval_start
, interval_value
);
6272 fprintf (stream
, "%04X..%04X\t%c\n", interval_start
, interval_end
, interval_value
);
6274 /* Start a new interval. */
6275 interval_start
= interval_end
= ch
;
6276 interval_value
= value
;
6280 /* Terminate the last interval. */
/* Same output format as for the intervals terminated inside the loop.  */
6281 if (interval_value
!= 0)
6283 if (interval_end
== interval_start
)
6284 fprintf (stream
, "%04X\t\t%c\n", interval_start
, interval_value
);
6286 fprintf (stream
, "%04X..%04X\t%c\n", interval_start
, interval_end
, interval_value
);
/* Report a write error or a failure to close the stream.  */
6289 if (ferror (stream
) || fclose (stream
))
6291 fprintf (stderr
, "error writing to '%s'\n", filename
);
6296 /* ========================================================================= */
6298 /* Line breaking classification.
6299 Updated for Unicode TR #14 revision 26. */
6303 /* Values >= 30 are resolved at run time. */
/* Line breaking classes.  The numeric values are used by get_lbp () as bit
   positions in an int64_t mask ((int64_t) 1 << LBP_xx), so every value must
   stay below 64; the largest one here is LBP_XX = 37.  The classes are
   listed by topic, not in numeric order.  Classes that appear only inside
   comments are deliberately not given a value.  */
6304 LBP_BK
= 30, /* mandatory break */
6305 /*LBP_CR, carriage return - not used here because it's a DOSism */
6306 /*LBP_LF, line feed - not used here because it's a DOSism */
6307 LBP_CM
= 31, /* attached characters and combining marks */
6308 /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
6309 /*LBP_SG, surrogates - not used here because they are not characters */
6310 LBP_WJ
= 0, /* word joiner */
6311 LBP_ZW
= 32, /* zero width space */
6312 LBP_GL
= 1, /* non-breaking (glue) */
6313 LBP_SP
= 33, /* space */
6314 LBP_B2
= 2, /* break opportunity before and after */
6315 LBP_BA
= 3, /* break opportunity after */
6316 LBP_BB
= 4, /* break opportunity before */
6317 LBP_HY
= 5, /* hyphen */
6318 LBP_CB
= 34, /* contingent break opportunity */
6319 LBP_CL
= 6, /* closing punctuation */
6320 LBP_CP
= 7, /* closing parenthesis */
6321 LBP_EX
= 8, /* exclamation/interrogation */
6322 LBP_IN
= 9, /* inseparable */
6323 LBP_NS
= 10, /* non starter */
6324 LBP_OP
= 11, /* opening punctuation */
6325 LBP_QU
= 12, /* ambiguous quotation */
6326 LBP_IS
= 13, /* infix separator (numeric) */
6327 LBP_NU
= 14, /* numeric */
6328 LBP_PO
= 15, /* postfix (numeric) */
6329 LBP_PR
= 16, /* prefix (numeric) */
6330 LBP_SY
= 17, /* symbols allowing breaks */
6331 LBP_AI
= 35, /* ambiguous (alphabetic or ideograph) */
6332 LBP_AL
= 18, /* ordinary alphabetic and symbol characters */
6333 /*LBP_CJ, conditional Japanese starter, resolved to NS */
6334 LBP_H2
= 19, /* Hangul LV syllable */
6335 LBP_H3
= 20, /* Hangul LVT syllable */
6336 LBP_HL
= 25, /* Hebrew letter */
6337 LBP_ID
= 21, /* ideographic */
6338 LBP_JL
= 22, /* Hangul L Jamo */
6339 LBP_JV
= 23, /* Hangul V Jamo */
6340 LBP_JT
= 24, /* Hangul T Jamo */
6341 LBP_RI
= 26, /* regional indicator */
6342 LBP_SA
= 36, /* complex context (South East Asian) */
6343 LBP_ZWJ
= 27, /* zero width joiner */
6344 LBP_EB
= 28, /* emoji base */
6345 LBP_EM
= 29, /* emoji modifier */
6346 LBP_XX
= 37 /* unknown */
6349 /* Returns the line breaking classification for ch, as a bit mask. */
6351 get_lbp (unsigned int ch
)
6355 /* U+20BC..U+20CF are reserved for prefixes. */
6356 if (unicode_attributes
[ch
].name
== NULL
&& (ch
>= 0x20BC && ch
<= 0x20CF))
6357 return (int64_t) 1 << LBP_PR
;
6359 if (unicode_attributes
[ch
].name
!= NULL
)
6361 /* mandatory break */
6362 if (ch
== 0x000A || ch
== 0x000D || ch
== 0x0085 /* newline */
6363 || ch
== 0x000C /* form feed */
6364 || ch
== 0x000B /* line tabulation */
6365 || ch
== 0x2028 /* LINE SEPARATOR */
6366 || ch
== 0x2029 /* PARAGRAPH SEPARATOR */)
6367 attr
|= (int64_t) 1 << LBP_BK
;
6369 if (ch
== 0x2060 /* WORD JOINER */
6370 || ch
== 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
6371 attr
|= (int64_t) 1 << LBP_WJ
;
6373 /* zero width space */
6374 if (ch
== 0x200B /* ZERO WIDTH SPACE */)
6375 attr
|= (int64_t) 1 << LBP_ZW
;
6377 /* zero width joiner */
6378 if (ch
== 0x200D /* ZERO WIDTH JOINER */)
6379 attr
|= (int64_t) 1 << LBP_ZWJ
;
6382 if (ch
== 0x261D /* WHITE UP POINTING INDEX */
6383 || ch
== 0x26F9 /* PERSON WITH BALL */
6384 || (ch
>= 0x270A && ch
<= 0x270D) /* RAISED FIST..WRITING HAND */
6385 || ch
== 0x1F385 /* FATHER CHRISTMAS */
6386 || (ch
>= 0x1F3C3 && ch
<= 0x1F3C4) /* RUNNER..SURFER */
6387 || (ch
>= 0x1F3CA && ch
<= 0x1F3CB) /* SWIMMER..WEIGHT LIFTER */
6388 || (ch
>= 0x1F442 && ch
<= 0x1F443) /* EAR..NOSE */
6389 || (ch
>= 0x1F446 && ch
<= 0x1F450) /* WHITE UP POINTING BACKHAND INDEX..OPEN HANDS SIGN */
6390 || (ch
>= 0x1F466 && ch
<= 0x1F469) /* BOY..WOMAN */
6391 || ch
== 0x1F46E /* POLICE OFFICER */
6392 || (ch
>= 0x1F470 && ch
<= 0x1F478) /* BRIDE WITH VEIL..PRINCESS */
6393 || ch
== 0x1F47C /* BABY ANGEL */
6394 || (ch
>= 0x1F481 && ch
<= 0x1F483) /* INFORMATION DESK PERSON..DANCER */
6395 || (ch
>= 0x1F485 && ch
<= 0x1F487) /* NAIL POLISH..HAIRCUT */
6396 || ch
== 0x1F4AA /* FLEXED BICEPS */
6397 || ch
== 0x1F575 /* SLEUTH OR SPY */
6398 || ch
== 0x1F57A /* MAN DANCING */
6399 || ch
== 0x1F590 /* RAISED HAND WITH FINGERS SPLAYED */
6400 || (ch
>= 0x1F595 && ch
<= 0x1F596) /* REVERSED HAND WITH MIDDLE FINGER EXTENDED..RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS */
6401 || (ch
>= 0x1F645 && ch
<= 0x1F647) /* FACE WITH NO GOOD GESTURE..PERSON BOWING DEEPLY */
6402 || (ch
>= 0x1F64B && ch
<= 0x1F64F) /* HAPPY PERSON RAISING ONE HAND..PERSON WITH FOLDED HANDS */
6403 || ch
== 0x1F6A3 /* ROWBOAT */
6404 || (ch
>= 0x1F6B4 && ch
<= 0x1F6B6) /* BICYCLIST..PEDESTRIAN */
6405 || ch
== 0x1F6C0 /* BATH */
6406 || (ch
>= 0x1F918 && ch
<= 0x1F91E) /* SIGN OF THE HORNS..HAND WITH INDEX AND MIDDLE FINGERS CROSSED */
6407 || ch
== 0x1F926 /* FACE PALM */
6408 || ch
== 0x1F930 /* PREGNANT WOMAN */
6409 || (ch
>= 0x1F933 && ch
<= 0x1F939) /* SELFIE..JUGGLING */
6410 || (ch
>= 0x1F93C && ch
<= 0x1F93E) /* WRESTLERS..HANDBALL */)
6411 attr
|= (int64_t) 1 << LBP_EB
;
6413 if ((ch
>= 0x1F3FB && ch
<= 0x1F3FF) /* EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 */)
6414 attr
|= (int64_t) 1 << LBP_EM
;
6416 /* non-breaking (glue) */
6417 if (ch
== 0x00A0 /* NO-BREAK SPACE */
6418 || ch
== 0x202F /* NARROW NO-BREAK SPACE */
6419 || ch
== 0x180E /* MONGOLIAN VOWEL SEPARATOR */
6420 || ch
== 0x034F /* COMBINING GRAPHEME JOINER */
6421 || ch
== 0x2007 /* FIGURE SPACE */
6422 || ch
== 0x2011 /* NON-BREAKING HYPHEN */
6423 || ch
== 0x0F08 /* TIBETAN MARK SBRUL SHAD */
6424 || ch
== 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
6425 || ch
== 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
6426 || (ch
>= 0x035C && ch
<= 0x0362) /* COMBINING DOUBLE ... */
6427 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6428 || ch
== 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
6429 || ch
== 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
6430 attr
|= (int64_t) 1 << LBP_GL
;
6433 if (ch
== 0x0020 /* SPACE */)
6434 attr
|= (int64_t) 1 << LBP_SP
;
6436 /* break opportunity before and after */
6437 if (ch
== 0x2014 /* EM DASH */
6438 || ch
== 0x2E3A /* TWO-EM DASH */
6439 || ch
== 0x2E3B /* THREE-EM DASH */)
6440 attr
|= (int64_t) 1 << LBP_B2
;
6442 /* break opportunity after */
6443 if (/* Breaking Spaces */
6444 ch
== 0x1680 /* OGHAM SPACE MARK */
6445 || ch
== 0x2000 /* EN QUAD */
6446 || ch
== 0x2001 /* EM QUAD */
6447 || ch
== 0x2002 /* EN SPACE */
6448 || ch
== 0x2003 /* EM SPACE */
6449 || ch
== 0x2004 /* THREE-PER-EM SPACE */
6450 || ch
== 0x2005 /* FOUR-PER-EM SPACE */
6451 || ch
== 0x2006 /* SIX-PER-EM SPACE */
6452 || ch
== 0x2008 /* PUNCTUATION SPACE */
6453 || ch
== 0x2009 /* THIN SPACE */
6454 || ch
== 0x200A /* HAIR SPACE */
6455 || ch
== 0x205F /* MEDIUM MATHEMATICAL SPACE */
6456 || ch
== 0x3000 /* IDEOGRAPHIC SPACE */
6458 || ch
== 0x0009 /* tab */
6459 /* Conditional Hyphens */
6460 || ch
== 0x00AD /* SOFT HYPHEN */
6461 /* Breaking Hyphens */
6462 || ch
== 0x058A /* ARMENIAN HYPHEN */
6463 || ch
== 0x1400 /* CANADIAN SYLLABICS HYPHEN */
6464 || ch
== 0x2010 /* HYPHEN */
6465 || ch
== 0x2012 /* FIGURE DASH */
6466 || ch
== 0x2013 /* EN DASH */
6467 /* Visible Word Dividers */
6468 || ch
== 0x05BE /* HEBREW PUNCTUATION MAQAF */
6469 || ch
== 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
6470 || ch
== 0x1361 /* ETHIOPIC WORDSPACE */
6471 || ch
== 0x17D8 /* KHMER SIGN BEYYAL */
6472 || ch
== 0x17DA /* KHMER SIGN KOOMUUT */
6473 || ch
== 0x2027 /* HYPHENATION POINT */
6474 || ch
== 0x007C /* VERTICAL LINE */
6475 /* Historic Word Separators */
6476 || ch
== 0x16EB /* RUNIC SINGLE PUNCTUATION */
6477 || ch
== 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
6478 || ch
== 0x16ED /* RUNIC CROSS PUNCTUATION */
6479 || ch
== 0x2056 /* THREE DOT PUNCTUATION */
6480 || ch
== 0x2058 /* FOUR DOT PUNCTUATION */
6481 || ch
== 0x2059 /* FIVE DOT PUNCTUATION */
6482 || ch
== 0x205A /* TWO DOT PUNCTUATION */
6483 || ch
== 0x205B /* FOUR DOT MARK */
6484 || ch
== 0x205D /* TRICOLON */
6485 || ch
== 0x205E /* VERTICAL FOUR DOTS */
6486 || ch
== 0x2E19 /* PALM BRANCH */
6487 || ch
== 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
6488 || ch
== 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
6489 || ch
== 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
6490 || ch
== 0x2E2D /* FIVE DOT PUNCTUATION */
6491 || ch
== 0x2E30 /* RING POINT */
6492 || ch
== 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
6493 || ch
== 0x2E33 /* RAISED DOT */
6494 || ch
== 0x2E34 /* RAISED COMMA */
6495 || ch
== 0x10100 /* AEGEAN WORD SEPARATOR LINE */
6496 || ch
== 0x10101 /* AEGEAN WORD SEPARATOR DOT */
6497 || ch
== 0x10102 /* AEGEAN CHECK MARK */
6498 || ch
== 0x1039F /* UGARITIC WORD DIVIDER */
6499 || ch
== 0x103D0 /* OLD PERSIAN WORD DIVIDER */
6500 || ch
== 0x1091F /* PHOENICIAN WORD SEPARATOR */
6501 || ch
== 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
6503 || ch
== 0x0964 /* DEVANAGARI DANDA */
6504 || ch
== 0x0965 /* DEVANAGARI DOUBLE DANDA */
6505 || ch
== 0x0E5A /* THAI CHARACTER ANGKHANKHU */
6506 || ch
== 0x0E5B /* THAI CHARACTER KHOMUT */
6507 || ch
== 0x104A /* MYANMAR SIGN LITTLE SECTION */
6508 || ch
== 0x104B /* MYANMAR SIGN SECTION */
6509 || ch
== 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
6510 || ch
== 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
6511 || ch
== 0x17D4 /* KHMER SIGN KHAN */
6512 || ch
== 0x17D5 /* KHMER SIGN BARIYOOSAN */
6513 || ch
== 0x1B5E /* BALINESE CARIK SIKI */
6514 || ch
== 0x1B5F /* BALINESE CARIK PAREREN */
6515 || ch
== 0xA8CE /* SAURASHTRA DANDA */
6516 || ch
== 0xA8CF /* SAURASHTRA DOUBLE DANDA */
6517 || ch
== 0xAA5D /* CHAM PUNCTUATION DANDA */
6518 || ch
== 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
6519 || ch
== 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
6520 || ch
== 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
6521 || ch
== 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
6523 || ch
== 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
6524 || ch
== 0x0F7F /* TIBETAN SIGN RNAM BCAD */
6525 || ch
== 0x0F85 /* TIBETAN MARK PALUTA */
6526 || ch
== 0x0FBE /* TIBETAN KU RU KHA */
6527 || ch
== 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
6528 || ch
== 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
6529 /* Other Terminating Punctuation */
6530 || ch
== 0x1804 /* MONGOLIAN COLON */
6531 || ch
== 0x1805 /* MONGOLIAN FOUR DOTS */
6532 || ch
== 0x1B5A /* BALINESE PANTI */
6533 || ch
== 0x1B5B /* BALINESE PAMADA */
6534 || ch
== 0x1B5D /* BALINESE CARIK PAMUNGKAH */
6535 || ch
== 0x1B60 /* BALINESE PAMENENG */
6536 || ch
== 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
6537 || ch
== 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
6538 || ch
== 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
6539 || ch
== 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
6540 || ch
== 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
6541 || ch
== 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
6542 || ch
== 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
6543 || ch
== 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
6544 || ch
== 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
6545 || ch
== 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
6546 || ch
== 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
6547 || (ch
>= 0x2E0E && ch
<= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
6548 || ch
== 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
6549 || ch
== 0x2E43 /* DASH WITH LEFT UPTURN */
6550 || ch
== 0x2E44 /* DOUBLE SUSPENSION MARK */
6551 || ch
== 0x2E3C /* STENOGRAPHIC FULL STOP */
6552 || ch
== 0x2E3D /* VERTICAL SIX DOTS */
6553 || ch
== 0x2E3E /* WIGGLY VERTICAL LINE */
6554 || ch
== 0x2E40 /* DOUBLE HYPHEN */
6555 || ch
== 0x2E41 /* REVERSED COMMA */
6556 || ch
== 0xA60D /* VAI COMMA */
6557 || ch
== 0xA60F /* VAI QUESTION MARK */
6558 || ch
== 0xA92E /* KAYAH LI SIGN CWI */
6559 || ch
== 0xA92F /* KAYAH LI SIGN SHYA */
6560 || ch
== 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
6561 || ch
== 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
6562 || ch
== 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
6563 || ch
== 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
6564 || ch
== 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
6565 || ch
== 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
6566 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6567 || ch
== 0x2D70 /* TIFINAGH SEPARATOR MARK */
6568 || ch
== 0xA4FE /* LISU PUNCTUATION COMMA */
6569 || ch
== 0xA4FF /* LISU PUNCTUATION FULL STOP */
6570 || ch
== 0xA6F3 /* BAMUM FULL STOP */
6571 || ch
== 0xA6F4 /* BAMUM COLON */
6572 || ch
== 0xA6F5 /* BAMUM COMMA */
6573 || ch
== 0xA6F6 /* BAMUM SEMICOLON */
6574 || ch
== 0xA6F7 /* BAMUM QUESTION MARK */
6575 || ch
== 0xA9C7 /* JAVANESE PADA PANGKAT */
6576 || ch
== 0xA9C8 /* JAVANESE PADA LINGSA */
6577 || ch
== 0xA9C9 /* JAVANESE PADA LUNGSI */
6578 || ch
== 0xAAF0 /* MEETEI MAYEK CHEIKHAN */
6579 || ch
== 0xAAF1 /* MEETEI MAYEK AHANG KHUDAM */
6580 || ch
== 0xABEB /* MEETEI MAYEK CHEIKHEI */
6581 || ch
== 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
6582 || (ch
>= 0x10AF0 && ch
<= 0x10AF5) /* MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS */
6583 || ch
== 0x10B39 /* AVESTAN ABBREVIATION MARK */
6584 || ch
== 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
6585 || ch
== 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
6586 || ch
== 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
6587 || ch
== 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
6588 || ch
== 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
6589 || ch
== 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
6590 || ch
== 0x11047 /* BRAHMI DANDA */
6591 || ch
== 0x11048 /* BRAHMI DOUBLE DANDA */
6592 || ch
== 0x110BE /* KAITHI SECTION MARK */
6593 || ch
== 0x110BF /* KAITHI DOUBLE SECTION MARK */
6594 || ch
== 0x110C0 /* KAITHI DANDA */
6595 || ch
== 0x110C1 /* KAITHI DOUBLE DANDA */
6596 || ch
== 0x11140 /* CHAKMA SECTION MARK */
6597 || ch
== 0x11141 /* CHAKMA DANDA */
6598 || ch
== 0x11142 /* CHAKMA DOUBLE DANDA */
6599 || ch
== 0x11143 /* CHAKMA QUESTION MARK */
6600 || ch
== 0x111C5 /* SHARADA DANDA */
6601 || ch
== 0x111C6 /* SHARADA DOUBLE DANDA */
6602 || ch
== 0x111C8 /* SHARADA SEPARATOR */
6603 || (ch
>= 0x111DD && ch
<= 0x111DF) /* SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 */
6604 || ch
== 0x11238 /* KHOJKI DANDA */
6605 || ch
== 0x11239 /* KHOJKI DOUBLE DANDA */
6606 || ch
== 0x1123B /* KHOJKI SECTION MARK */
6607 || ch
== 0x1123C /* KHOJKI DOUBLE SECTION MARK */
6608 || ch
== 0x112A9 /* MULTANI SECTION MARK */
6609 || (ch
>= 0x1144B && ch
<= 0x1144E) /* NEWA DANDA..NEWA GAP FILLER */
6610 || ch
== 0x1145B /* NEWA PLACEHOLDER MARK */
6611 || ch
== 0x115C2 /* SIDDHAM DANDA */
6612 || ch
== 0x115C3 /* SIDDHAM DOUBLE DANDA */
6613 || (ch
>= 0x115C9 && ch
<= 0x115D7) /* SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES */
6614 || ch
== 0x11641 /* MODI DANDA */
6615 || ch
== 0x11642 /* MODI DOUBLE DANDA */
6616 || (ch
>= 0x1173C && ch
<= 0x1173E) /* AHOM SIGN SMALL SECTION..AHOM SIGN RULAI */
6617 || (ch
>= 0x11C41 && ch
<= 0x11C45) /* BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 */
6618 || ch
== 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
6619 || ch
== 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
6620 || ch
== 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */
6621 || ch
== 0x12474 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON */
6622 || ch
== 0x16A6E /* MRO DANDA */
6623 || ch
== 0x16A6F /* MRO DOUBLE DANDA */
6624 || ch
== 0x16AF5 /* BASSA VAH FULL STOP */
6625 || ch
== 0x16B37 /* PAHAWH HMONG SIGN VOS THOM */
6626 || ch
== 0x16B38 /* PAHAWH HMONG SIGN VOS TSHAB CEEB */
6627 || ch
== 0x16B39 /* PAHAWH HMONG SIGN CIM CHEEM */
6628 || ch
== 0x16B44 /* PAHAWH HMONG SIGN XAUS */
6629 || ch
== 0x1BC9F /* DUPLOYAN PUNCTUATION CHINOOK FULL STOP */
6630 || (ch
>= 0x1DA87 && ch
<= 0x1DA8A) /* SIGNWRITING COMMA..SIGNWRITING COLON */)
6631 attr
|= (int64_t) 1 << LBP_BA
;
6633 /* break opportunity before */
6634 if (ch
== 0x00B4 /* ACUTE ACCENT */
6635 || ch
== 0x1FFD /* GREEK OXIA */
6636 || ch
== 0x02DF /* MODIFIER LETTER CROSS ACCENT */
6637 || ch
== 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
6638 || ch
== 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
6639 || ch
== 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
6640 || ch
== 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
6641 || ch
== 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
6642 || ch
== 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
6643 || ch
== 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
6644 || ch
== 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
6645 || ch
== 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
6646 || ch
== 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
6647 || ch
== 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
6648 || ch
== 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
6649 || ch
== 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
6650 || ch
== 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
6651 || ch
== 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
6652 || ch
== 0xA8FC /* DEVANAGARI SIGN SIDDHAM */
6653 || ch
== 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */
6654 || ch
== 0x11175 /* MAHAJANI SECTION MARK */
6655 || ch
== 0x111DB /* SHARADA SIGN SIDDHAM */
6656 || ch
== 0x115C1 /* SIDDHAM SIGN SIDDHAM */
6657 || (ch
>= 0x11660 && ch
<= 0x1166C) /* MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT */
6658 || ch
== 0x11C70 /* MARCHEN HEAD MARK */)
6659 attr
|= (int64_t) 1 << LBP_BB
;
6662 if (ch
== 0x002D /* HYPHEN-MINUS */)
6663 attr
|= (int64_t) 1 << LBP_HY
;
6665 /* contingent break opportunity */
6666 if (ch
== 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
6667 attr
|= (int64_t) 1 << LBP_CB
;
6669 /* closing parenthesis */
6670 if (ch
== 0x0029 /* RIGHT PARENTHESIS */
6671 || ch
== 0x005D /* RIGHT SQUARE BRACKET */)
6672 attr
|= (int64_t) 1 << LBP_CP
;
6674 /* closing punctuation */
6675 if ((unicode_attributes
[ch
].category
[0] == 'P'
6676 && unicode_attributes
[ch
].category
[1] == 'e'
6677 && !(attr
& ((int64_t) 1 << LBP_CP
)))
6678 || ch
== 0x3001 /* IDEOGRAPHIC COMMA */
6679 || ch
== 0x3002 /* IDEOGRAPHIC FULL STOP */
6680 || ch
== 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
6681 || ch
== 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
6682 || ch
== 0xFE50 /* SMALL COMMA */
6683 || ch
== 0xFE52 /* SMALL FULL STOP */
6684 || ch
== 0xFF0C /* FULLWIDTH COMMA */
6685 || ch
== 0xFF0E /* FULLWIDTH FULL STOP */
6686 || ch
== 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
6687 || ch
== 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
6688 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6689 || ch
== 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
6690 || ch
== 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
6691 || ch
== 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
6692 || ch
== 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
6693 || ch
== 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
6694 || ch
== 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
6695 || ch
== 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
6696 || ch
== 0x1337B /* EGYPTIAN HIEROGLYPH V011C */
6697 || ch
== 0x145CF /* ANATOLIAN HIEROGLYPH A410A END LOGOGRAM MARK */)
6698 attr
|= (int64_t) 1 << LBP_CL
;
6700 /* exclamation/interrogation */
6701 if (ch
== 0x0021 /* EXCLAMATION MARK */
6702 || ch
== 0x003F /* QUESTION MARK */
6703 || ch
== 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
6704 || ch
== 0x061B /* ARABIC SEMICOLON */
6705 || ch
== 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
6706 || ch
== 0x061F /* ARABIC QUESTION MARK */
6707 || ch
== 0x06D4 /* ARABIC FULL STOP */
6708 || ch
== 0x07F9 /* NKO EXCLAMATION MARK */
6709 || ch
== 0x0F0D /* TIBETAN MARK SHAD */
6710 || ch
== 0x0F0E /* TIBETAN MARK NYIS SHAD */
6711 || ch
== 0x0F0F /* TIBETAN MARK TSHEG SHAD */
6712 || ch
== 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
6713 || ch
== 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
6714 || ch
== 0x0F14 /* TIBETAN MARK GTER TSHEG */
6715 || ch
== 0x1802 /* MONGOLIAN COMMA */
6716 || ch
== 0x1803 /* MONGOLIAN FULL STOP */
6717 || ch
== 0x1808 /* MONGOLIAN MANCHU COMMA */
6718 || ch
== 0x1809 /* MONGOLIAN MANCHU FULL STOP */
6719 || ch
== 0x1944 /* LIMBU EXCLAMATION MARK */
6720 || ch
== 0x1945 /* LIMBU QUESTION MARK */
6721 || ch
== 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
6722 || ch
== 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
6723 || ch
== 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
6724 || ch
== 0x2CFE /* COPTIC FULL STOP */
6725 || ch
== 0x2E2E /* REVERSED QUESTION MARK */
6726 || ch
== 0xA60E /* VAI FULL STOP */
6727 || ch
== 0xA876 /* PHAGS-PA MARK SHAD */
6728 || ch
== 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
6729 || ch
== 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
6730 || ch
== 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
6731 || ch
== 0xFE56 /* SMALL QUESTION MARK */
6732 || ch
== 0xFE57 /* SMALL EXCLAMATION MARK */
6733 || ch
== 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
6734 || ch
== 0xFF1F /* FULLWIDTH QUESTION MARK */
6735 || ch
== 0x115C4 /* SIDDHAM SEPARATOR DOT */
6736 || ch
== 0x115C5 /* SIDDHAM SEPARATOR BAR */
6737 || ch
== 0x11C71 /* MARCHEN MARK SHAD */)
6738 attr
|= (int64_t) 1 << LBP_EX
;
6741 if (ch
== 0x2024 /* ONE DOT LEADER */
6742 || ch
== 0x2025 /* TWO DOT LEADER */
6743 || ch
== 0x2026 /* HORIZONTAL ELLIPSIS */
6744 || ch
== 0x22EF /* MIDLINE HORIZONTAL ELLIPSIS */
6745 || ch
== 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */
6746 || ch
== 0x10AF6 /* MANICHAEAN PUNCTUATION LINE FILLER */)
6747 attr
|= (int64_t) 1 << LBP_IN
;
6750 if (ch
== 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
6751 || ch
== 0x203C /* DOUBLE EXCLAMATION MARK */
6752 || ch
== 0x203D /* INTERROBANG */
6753 || ch
== 0x2047 /* DOUBLE QUESTION MARK */
6754 || ch
== 0x2048 /* QUESTION EXCLAMATION MARK */
6755 || ch
== 0x2049 /* EXCLAMATION QUESTION MARK */
6756 || ch
== 0x3005 /* IDEOGRAPHIC ITERATION MARK */
6757 || ch
== 0x301C /* WAVE DASH */
6758 || ch
== 0x303C /* MASU MARK */
6759 || ch
== 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
6760 || ch
== 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
6761 || ch
== 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
6762 || ch
== 0x309D /* HIRAGANA ITERATION MARK */
6763 || ch
== 0x309E /* HIRAGANA VOICED ITERATION MARK */
6764 || ch
== 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
6765 || ch
== 0x30FB /* KATAKANA MIDDLE DOT */
6766 || ch
== 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6767 || ch
== 0x30FD /* KATAKANA ITERATION MARK */
6768 || ch
== 0x30FE /* KATAKANA VOICED ITERATION MARK */
6769 || ch
== 0xA015 /* YI SYLLABLE WU */
6770 || ch
== 0xFE54 /* SMALL SEMICOLON */
6771 || ch
== 0xFE55 /* SMALL COLON */
6772 || ch
== 0xFF1A /* FULLWIDTH COLON */
6773 || ch
== 0xFF1B /* FULLWIDTH SEMICOLON */
6774 || ch
== 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
6775 || ch
== 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6776 || ch
== 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
6777 || ch
== 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
6778 || ch
== 0x16FE0 /* TANGUT ITERATION MARK */
6779 || ch
== 0x1F679 /* HEAVY INTERROBANG ORNAMENT */
6780 || ch
== 0x1F67A /* SANS-SERIF INTERROBANG ORNAMENT */
6781 || ch
== 0x1F67B /* HEAVY SANS-SERIF INTERROBANG ORNAMENT */
6782 || strstr (unicode_attributes
[ch
].name
, "HIRAGANA LETTER SMALL ") != NULL
6783 || strstr (unicode_attributes
[ch
].name
, "KATAKANA LETTER SMALL ") != NULL
)
6784 attr
|= (int64_t) 1 << LBP_NS
;
6786 /* opening punctuation */
6787 if ((unicode_attributes
[ch
].category
[0] == 'P'
6788 && unicode_attributes
[ch
].category
[1] == 's')
6789 || ch
== 0x00A1 /* INVERTED EXCLAMATION MARK */
6790 || ch
== 0x00BF /* INVERTED QUESTION MARK */
6791 || ch
== 0x2E18 /* INVERTED INTERROBANG */
6792 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6793 || ch
== 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
6794 || ch
== 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
6795 || ch
== 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
6796 || ch
== 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
6797 || ch
== 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
6798 || ch
== 0x13379 /* EGYPTIAN HIEROGLYPH V011A */
6799 || ch
== 0x145CE /* ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK */
6800 || (ch
>= 0x1E95E && ch
<= 0x1E95F) /* ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK */)
6801 attr
|= (int64_t) 1 << LBP_OP
;
6803 /* ambiguous quotation */
6804 if ((unicode_attributes
[ch
].category
[0] == 'P'
6805 && (unicode_attributes
[ch
].category
[1] == 'f'
6806 || unicode_attributes
[ch
].category
[1] == 'i'))
6807 || ch
== 0x0022 /* QUOTATION MARK */
6808 || ch
== 0x0027 /* APOSTROPHE */
6809 || ch
== 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
6810 || ch
== 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
6811 || ch
== 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
6812 || ch
== 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
6813 || ch
== 0x275F /* HEAVY LOW SINGLE COMMA QUOTATION MARK ORNAMENT */
6814 || ch
== 0x2760 /* HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */
6815 || ch
== 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
6816 || ch
== 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
6817 || ch
== 0x2E06 /* RAISED INTERPOLATION MARKER */
6818 || ch
== 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
6819 || ch
== 0x2E08 /* DOTTED TRANSPOSITION MARKER */
6820 || ch
== 0x2E0B /* RAISED SQUARE */
6821 || ch
== 0x1F676 /* SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
6822 || ch
== 0x1F677 /* SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
6823 || ch
== 0x1F678 /* SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */)
6824 attr
|= (int64_t) 1 << LBP_QU
;
6826 /* infix separator (numeric) */
6827 if (ch
== 0x002C /* COMMA */
6828 || ch
== 0x002E /* FULL STOP */
6829 || ch
== 0x003A /* COLON */
6830 || ch
== 0x003B /* SEMICOLON */
6831 || ch
== 0x037E /* GREEK QUESTION MARK */
6832 || ch
== 0x0589 /* ARMENIAN FULL STOP */
6833 || ch
== 0x060C /* ARABIC COMMA */
6834 || ch
== 0x060D /* ARABIC DATE SEPARATOR */
6835 || ch
== 0x07F8 /* NKO COMMA */
6836 || ch
== 0x2044 /* FRACTION SLASH */
6837 || ch
== 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
6838 || ch
== 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
6839 || ch
== 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
6840 attr
|= (int64_t) 1 << LBP_IS
;
6843 if ((unicode_attributes
[ch
].category
[0] == 'N'
6844 && unicode_attributes
[ch
].category
[1] == 'd'
6845 && strstr (unicode_attributes
[ch
].name
, "FULLWIDTH") == NULL
)
6846 || ch
== 0x066B /* ARABIC DECIMAL SEPARATOR */
6847 || ch
== 0x066C /* ARABIC THOUSANDS SEPARATOR */)
6848 attr
|= (int64_t) 1 << LBP_NU
;
6850 /* postfix (numeric) */
6851 if (ch
== 0x0025 /* PERCENT SIGN */
6852 || ch
== 0x00A2 /* CENT SIGN */
6853 || ch
== 0x00B0 /* DEGREE SIGN */
6854 || ch
== 0x060B /* AFGHANI SIGN */
6855 || ch
== 0x066A /* ARABIC PERCENT SIGN */
6856 || ch
== 0x2030 /* PER MILLE SIGN */
6857 || ch
== 0x2031 /* PER TEN THOUSAND SIGN */
6858 || ch
== 0x2032 /* PRIME */
6859 || ch
== 0x2033 /* DOUBLE PRIME */
6860 || ch
== 0x2034 /* TRIPLE PRIME */
6861 || ch
== 0x2035 /* REVERSED PRIME */
6862 || ch
== 0x2036 /* REVERSED DOUBLE PRIME */
6863 || ch
== 0x2037 /* REVERSED TRIPLE PRIME */
6864 || ch
== 0x20A7 /* PESETA SIGN */
6865 || ch
== 0x20BB /* NORDIC MARK SIGN */
6866 || ch
== 0x2103 /* DEGREE CELSIUS */
6867 || ch
== 0x2109 /* DEGREE FAHRENHEIT */
6868 || ch
== 0xFDFC /* RIAL SIGN */
6869 || ch
== 0xFE6A /* SMALL PERCENT SIGN */
6870 || ch
== 0xFF05 /* FULLWIDTH PERCENT SIGN */
6871 || ch
== 0xFFE0 /* FULLWIDTH CENT SIGN */
6872 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6873 || ch
== 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
6874 || ch
== 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
6875 || ch
== 0x09F2 /* BENGALI RUPEE MARK */
6876 || ch
== 0x09F3 /* BENGALI RUPEE SIGN */
6877 || ch
== 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
6878 || ch
== 0x0D79 /* MALAYALAM DATE MARK */
6879 || ch
== 0x20B6 /* LIVRE TOURNOIS SIGN */
6880 || ch
== 0x20BE /* LARI SIGN */
6881 || ch
== 0xA838 /* NORTH INDIC RUPEE MARK */)
6882 attr
|= (int64_t) 1 << LBP_PO
;
6884 /* prefix (numeric) */
6885 if ((unicode_attributes
[ch
].category
[0] == 'S'
6886 && unicode_attributes
[ch
].category
[1] == 'c')
6887 || ch
== 0x002B /* PLUS SIGN */
6888 || ch
== 0x005C /* REVERSE SOLIDUS */
6889 || ch
== 0x00B1 /* PLUS-MINUS SIGN */
6890 || ch
== 0x2116 /* NUMERO SIGN */
6891 || ch
== 0x2212 /* MINUS SIGN */
6892 || ch
== 0x2213 /* MINUS-OR-PLUS SIGN */)
6893 if (!(attr
& ((int64_t) 1 << LBP_PO
)))
6894 attr
|= (int64_t) 1 << LBP_PR
;
6896 /* symbols allowing breaks */
6897 if (ch
== 0x002F /* SOLIDUS */)
6898 attr
|= (int64_t) 1 << LBP_SY
;
6900 if (ch
>= 0xAC00 && ch
<= 0xD7A3 && ((ch
- 0xAC00) % 28) == 0)
6901 attr
|= (int64_t) 1 << LBP_H2
;
6903 if (ch
>= 0xAC00 && ch
<= 0xD7A3 && ((ch
- 0xAC00) % 28) != 0)
6904 attr
|= (int64_t) 1 << LBP_H3
;
6906 if ((ch
>= 0x05D0 && ch
<= 0x05F2) || ch
== 0xFB1D
6907 || (ch
>= 0xFB1F && ch
<= 0xFB28) || (ch
>= 0xFB2A && ch
<= 0xFB4F))
6908 attr
|= (int64_t) 1 << LBP_HL
;
6910 if ((ch
>= 0x1100 && ch
<= 0x115F) || (ch
>= 0xA960 && ch
<= 0xA97C))
6911 attr
|= (int64_t) 1 << LBP_JL
;
6913 if ((ch
>= 0x1160 && ch
<= 0x11A7) || (ch
>= 0xD7B0 && ch
<= 0xD7C6))
6914 attr
|= (int64_t) 1 << LBP_JV
;
6916 if ((ch
>= 0x11A8 && ch
<= 0x11FF) || (ch
>= 0xD7CB && ch
<= 0xD7FB))
6917 attr
|= (int64_t) 1 << LBP_JT
;
6919 /* regional indicator */
6920 if (ch
>= 0x1F1E6 && ch
<= 0x1F1FF)
6921 attr
|= (int64_t) 1 << LBP_RI
;
6923 /* complex context (South East Asian) */
6924 if (((unicode_attributes
[ch
].category
[0] == 'C'
6925 && unicode_attributes
[ch
].category
[1] == 'f')
6926 || (unicode_attributes
[ch
].category
[0] == 'L'
6927 && (unicode_attributes
[ch
].category
[1] == 'm'
6928 || unicode_attributes
[ch
].category
[1] == 'o'))
6929 || (unicode_attributes
[ch
].category
[0] == 'M'
6930 && (unicode_attributes
[ch
].category
[1] == 'c'
6931 || unicode_attributes
[ch
].category
[1] == 'n')
6932 && ch
!= 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
6933 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6934 || ch
== 0x109E /* MYANMAR SYMBOL SHAN ONE */
6935 || ch
== 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
6936 || ch
== 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
6937 || ch
== 0x19DE /* NEW TAI LUE SIGN LAE */
6938 || ch
== 0x19DF /* NEW TAI LUE SIGN LAEV */
6939 || (ch
>= 0x1AA0 && ch
<= 0x1AAD) /* TAI THAM SIGN */
6940 || (ch
>= 0xA9E0 && ch
<= 0xA9EF) /* Myanmar */
6941 || (ch
>= 0xA9FA && ch
<= 0xA9FE) /* Myanmar */
6942 || (ch
>= 0xAA77 && ch
<= 0xAA79) /* MYANMAR SYMBOL AITON */
6943 || (ch
>= 0xAADE && ch
<= 0xAADF) /* TAI VIET SYMBOL */
6944 || (ch
>= 0x1173A && ch
<= 0x1173B) /* Ahom */
6945 || ch
== 0x1173F /* Ahom */)
6946 && ((ch
>= 0x0E00 && ch
<= 0x0EFF) /* Thai, Lao */
6947 || (ch
>= 0x1000 && ch
<= 0x109F) /* Myanmar */
6948 || (ch
>= 0x1780 && ch
<= 0x17FF) /* Khmer */
6949 || (ch
>= 0x1950 && ch
<= 0x19DF) /* Tai Le, New Tai Lue */
6950 || (ch
>= 0x1A20 && ch
<= 0x1AAF) /* Tai Tham */
6951 || (ch
>= 0xA9E0 && ch
<= 0xA9EF) /* Myanmar */
6952 || (ch
>= 0xA9FA && ch
<= 0xA9FE) /* Myanmar */
6953 || (ch
>= 0xAA60 && ch
<= 0xAADF) /* Myanmar Extended-A, Tai Viet */
6954 || (ch
>= 0x11700 && ch
<= 0x11719) /* Ahom */
6955 || (ch
>= 0x1171D && ch
<= 0x1172B) /* Ahom */
6956 || (ch
>= 0x1173A && ch
<= 0x1173B) /* Ahom */
6957 || ch
== 0x1173F /* Ahom */))
6958 attr
|= (int64_t) 1 << LBP_SA
;
6960 /* attached characters and combining marks */
6961 if ((unicode_attributes
[ch
].category
[0] == 'M'
6962 && (unicode_attributes
[ch
].category
[1] == 'c'
6963 || unicode_attributes
[ch
].category
[1] == 'e'
6964 || unicode_attributes
[ch
].category
[1] == 'n'))
6965 || (unicode_attributes
[ch
].category
[0] == 'C'
6966 && (unicode_attributes
[ch
].category
[1] == 'c'
6967 || unicode_attributes
[ch
].category
[1] == 'f')
6968 && ch
!= 0x110BD /* KAITHI NUMBER SIGN */
6969 && ch
!= 0x08E2 /* ARABIC DISPUTED END OF AYAH */)
6970 || ch
== 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
6971 if (!(attr
& (((int64_t) 1 << LBP_BK
) | ((int64_t) 1 << LBP_BA
) | ((int64_t) 1 << LBP_GL
) | ((int64_t) 1 << LBP_SA
) | ((int64_t) 1 << LBP_WJ
) | ((int64_t) 1 << LBP_ZW
) | ((int64_t) 1 << LBP_ZWJ
))))
6972 attr
|= (int64_t) 1 << LBP_CM
;
6975 if (ch
== 0x231A /* WATCH */
6976 || ch
== 0x231B /* HOURGLASS */
6977 || ch
== 0x23F0 /* ALARM CLOCK */
6978 || ch
== 0x23F1 /* STOPWATCH */
6979 || ch
== 0x23F2 /* TIMER CLOCK */
6980 || ch
== 0x23F3 /* HOURGLASS WITH FLOWING SAND */
6981 || ch
== 0x2600 /* BLACK SUN WITH RAYS */
6982 || ch
== 0x2601 /* CLOUD */
6983 || ch
== 0x2602 /* UMBRELLA */
6984 || ch
== 0x2603 /* SNOWMAN */
6985 || ch
== 0x2614 /* UMBRELLA WITH RAIN DROPS */
6986 || ch
== 0x2615 /* HOT BEVERAGE */
6987 || ch
== 0x2618 /* SHAMROCK */
6988 || ch
== 0x261A /* BLACK LEFT POINTING INDEX */
6989 || ch
== 0x261B /* BLACK RIGHT POINTING INDEX */
6990 || ch
== 0x261C /* WHITE LEFT POINTING INDEX */
6991 || ch
== 0x261D /* WHITE UP POINTING INDEX */
6992 || ch
== 0x261E /* WHITE RIGHT POINTING INDEX */
6993 || ch
== 0x261F /* WHITE DOWN POINTING INDEX */
6994 || ch
== 0x2639 /* WHITE FROWNING FACE */
6995 || ch
== 0x263A /* WHITE SMILING FACE */
6996 || ch
== 0x263B /* BLACK SMILING FACE */
6997 || ch
== 0x2668 /* HOT SPRINGS */
6998 || ch
== 0x267F /* WHEELCHAIR SYMBOL */
6999 || ch
== 0x26BD /* SOCCER BALL */
7000 || ch
== 0x26BE /* BASEBALL */
7001 || ch
== 0x26BF /* SQUARED KEY */
7002 || ch
== 0x26C0 /* WHITE DRAUGHTS MAN */
7003 || ch
== 0x26C1 /* WHITE DRAUGHTS KING */
7004 || ch
== 0x26C2 /* BLACK DRAUGHTS MAN */
7005 || ch
== 0x26C3 /* BLACK DRAUGHTS KING */
7006 || ch
== 0x26C4 /* SNOWMAN WITHOUT SNOW */
7007 || ch
== 0x26C5 /* SUN BEHIND CLOUD */
7008 || ch
== 0x26C6 /* RAIN */
7009 || ch
== 0x26C7 /* BLACK SNOWMAN */
7010 || ch
== 0x26C8 /* THUNDER CLOUD AND RAIN */
7011 || ch
== 0x26CD /* DISABLED CAR */
7012 || ch
== 0x26CF /* PICK */
7013 || ch
== 0x26D0 /* CAR SLIDING */
7014 || ch
== 0x26D1 /* HELMET WITH WHITE CROSS */
7015 || ch
== 0x26D3 /* CHAINS */
7016 || ch
== 0x26D4 /* NO ENTRY */
7017 || ch
== 0x26D8 /* BLACK LEFT LANE MERGE */
7018 || ch
== 0x26D9 /* WHITE LEFT LANE MERGE */
7019 || ch
== 0x26DC /* LEFT CLOSED ENTRY */
7020 || ch
== 0x26DF /* BLACK TRUCK */
7021 || ch
== 0x26E0 /* RESTRICTED LEFT ENTRY-1 */
7022 || ch
== 0x26E1 /* RESTRICTED LEFT ENTRY-2 */
7023 || ch
== 0x26EA /* CHURCH */
7024 || ch
== 0x26F1 /* UMBRELLA ON GROUND */
7025 || ch
== 0x26F2 /* FOUNTAIN */
7026 || ch
== 0x26F3 /* FLAG IN HOLE */
7027 || ch
== 0x26F4 /* FERRY */
7028 || ch
== 0x26F5 /* SAILBOAT */
7029 || ch
== 0x26F7 /* SKIER */
7030 || ch
== 0x26F8 /* ICE SKATE */
7031 || ch
== 0x26F9 /* PERSON WITH BALL */
7032 || ch
== 0x26FA /* TENT */
7033 || ch
== 0x26FD /* FUEL PUMP */
7034 || ch
== 0x26FE /* CUP ON BLACK SQUARE */
7035 || ch
== 0x26FF /* WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE */
7036 || ch
== 0x2700 /* BLACK SAFETY SCISSORS */
7037 || ch
== 0x2701 /* UPPER BLADE SCISSORS */
7038 || ch
== 0x2702 /* BLACK SCISSORS */
7039 || ch
== 0x2703 /* LOWER BLADE SCISSORS */
7040 || ch
== 0x2704 /* WHITE SCISSORS */
7041 || ch
== 0x2708 /* AIRPLANE */
7042 || ch
== 0x2709 /* ENVELOPE */
7043 || ch
== 0x270A /* RAISED FIST */
7044 || ch
== 0x270B /* RAISED HAND */
7045 || ch
== 0x270C /* VICTORY HAND */
7046 || ch
== 0x270D /* WRITING HAND */
7047 || ch
== 0x2764 /* HEAVY BLACK HEART */
7048 || (ch
>= 0x2E80 && ch
<= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
7049 || (ch
>= 0x3040 && ch
<= 0x309F) /* HIRAGANA */
7050 || (ch
>= 0x30A0 && ch
<= 0x30FF) /* KATAKANA */
7051 || (ch
>= 0x3400 && ch
<= 0x4DBF) /* CJK Ideograph Extension A */
7052 || (ch
>= 0x4E00 && ch
<= 0x9FFF) /* CJK Ideograph */
7053 || (ch
>= 0xF900 && ch
<= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
7054 || (ch
>= 0xA000 && ch
<= 0xA48F) /* YI SYLLABLE */
7055 || (ch
>= 0xA490 && ch
<= 0xA4CF) /* YI RADICAL */
7056 || ch
== 0xFE62 /* SMALL PLUS SIGN */
7057 || ch
== 0xFE63 /* SMALL HYPHEN-MINUS */
7058 || ch
== 0xFE64 /* SMALL LESS-THAN SIGN */
7059 || ch
== 0xFE65 /* SMALL GREATER-THAN SIGN */
7060 || ch
== 0xFE66 /* SMALL EQUALS SIGN */
7061 || (ch
>= 0xFF10 && ch
<= 0xFF19) /* FULLWIDTH DIGIT */
7062 || (ch
>= 0x20000 && ch
<= 0x2A6D6) /* CJK Ideograph Extension B */
7063 || (ch
>= 0x2F800 && ch
<= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
7064 || strstr (unicode_attributes
[ch
].name
, "FULLWIDTH LATIN ") != NULL
7065 || (ch
>= 0x3000 && ch
<= 0x33FF
7066 && !(attr
& (((int64_t) 1 << LBP_BA
) | ((int64_t) 1 << LBP_CM
) | ((int64_t) 1 << LBP_NS
) | ((int64_t) 1 << LBP_OP
) | ((int64_t) 1 << LBP_CL
) | ((int64_t) 1 << LBP_CP
))))
7067 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7068 || ch
== 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
7069 || ch
== 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
7070 || ch
== 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
7071 || ch
== 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
7072 || ch
== 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
7073 || ch
== 0xFE45 /* SESAME DOT */
7074 || ch
== 0xFE46 /* WHITE SESAME DOT */
7075 || ch
== 0xFE49 /* DASHED OVERLINE */
7076 || ch
== 0xFE4A /* CENTRELINE OVERLINE */
7077 || ch
== 0xFE4B /* WAVY OVERLINE */
7078 || ch
== 0xFE4C /* DOUBLE WAVY OVERLINE */
7079 || ch
== 0xFE4D /* DASHED LOW LINE */
7080 || ch
== 0xFE4E /* CENTRELINE LOW LINE */
7081 || ch
== 0xFE4F /* WAVY LOW LINE */
7082 || ch
== 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
7083 || ch
== 0xFE58 /* SMALL EM DASH */
7084 || ch
== 0xFE5F /* SMALL NUMBER SIGN */
7085 || ch
== 0xFE60 /* SMALL AMPERSAND */
7086 || ch
== 0xFE61 /* SMALL ASTERISK */
7087 || ch
== 0xFE68 /* SMALL REVERSE SOLIDUS */
7088 || ch
== 0xFE6B /* SMALL COMMERCIAL AT */
7089 || ch
== 0xFF02 /* FULLWIDTH QUOTATION MARK */
7090 || ch
== 0xFF03 /* FULLWIDTH NUMBER SIGN */
7091 || ch
== 0xFF06 /* FULLWIDTH AMPERSAND */
7092 || ch
== 0xFF07 /* FULLWIDTH APOSTROPHE */
7093 || ch
== 0xFF0A /* FULLWIDTH ASTERISK */
7094 || ch
== 0xFF0B /* FULLWIDTH PLUS SIGN */
7095 || ch
== 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
7096 || ch
== 0xFF0F /* FULLWIDTH SOLIDUS */
7097 || ch
== 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
7098 || ch
== 0xFF1D /* FULLWIDTH EQUALS SIGN */
7099 || ch
== 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
7100 || ch
== 0xFF20 /* FULLWIDTH COMMERCIAL AT */
7101 || ch
== 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
7102 || ch
== 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
7103 || ch
== 0xFF3F /* FULLWIDTH LOW LINE */
7104 || ch
== 0xFF40 /* FULLWIDTH GRAVE ACCENT */
7105 || ch
== 0xFF5C /* FULLWIDTH VERTICAL LINE */
7106 || ch
== 0xFF5E /* FULLWIDTH TILDE */
7107 || ch
== 0xFFE2 /* FULLWIDTH NOT SIGN */
7108 || ch
== 0xFFE3 /* FULLWIDTH MACRON */
7109 || ch
== 0xFFE4 /* FULLWIDTH BROKEN BAR */
7110 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7111 || ch
== 0xFF66 /* Halfwidth Katakana */
7112 || (ch
>= 0xFF71 && ch
<= 0xFF9D) /* Halfwidth Katakana */
7113 || (ch
>= 0xFFA0 && ch
<= 0xFFBE) /* Halfwidth Hangul */
7114 || (ch
>= 0xFFC2 && ch
<= 0xFFC7) /* Halfwidth Hangul */
7115 || (ch
>= 0xFFCA && ch
<= 0xFFCF) /* Halfwidth Hangul */
7116 || (ch
>= 0xFFD2 && ch
<= 0xFFD7) /* Halfwidth Hangul */
7117 || (ch
>= 0xFFDA && ch
<= 0xFFDC) /* Halfwidth Hangul */
7118 || (ch
>= 0x17000 && ch
<= 0x187EC) /* Tangut Ideograph */
7119 || (ch
>= 0x18800 && ch
<= 0x18AF2) /* Tangut Ideograph */
7120 || (ch
>= 0x1B000 && ch
<= 0x1B001) /* Kana Supplement */
7121 || (ch
>= 0x1F000 && ch
<= 0x1F02B) /* Mahjong Tiles */
7122 || (ch
>= 0x1F030 && ch
<= 0x1F093) /* Domino Tiles */
7123 || (ch
>= 0x1F0A0 && ch
<= 0x1F0F5) /* Playing Cards */
7124 || (ch
>= 0x1F200 && ch
<= 0x1F248) /* Enclosed Ideographic Supplement */
7125 || (ch
>= 0x1F250 && ch
<= 0x1F251) /* Enclosed Ideographic Supplement */
7126 || (ch
>= 0x1F300 && ch
<= 0x1F5FF /* Miscellaneous Symbols and Pictographs */
7127 && ch
!= 0x1F3B5 && ch
!= 0x1F3B6 && ch
!= 0x1F3BC
7128 && ch
!= 0x1F4A0 && ch
!= 0x1F4A2 && ch
!= 0x1F4A4
7129 && ch
!= 0x1F4AF && ch
!= 0x1F4B1 && ch
!= 0x1F4B2
7130 && !(ch
>= 0x1F39C && ch
<= 0x1F39D)
7131 && !(ch
>= 0x1F3FB && ch
<= 0x1F3FF)
7132 && !(ch
>= 0x1F500 && ch
<= 0x1F506)
7133 && !(ch
>= 0x1F517 && ch
<= 0x1F524)
7134 && !(ch
>= 0x1F532 && ch
<= 0x1F549)
7135 && !(ch
>= 0x1F5D4 && ch
<= 0x1F5DB)
7136 && !(ch
>= 0x1F5F4 && ch
<= 0x1F5F9))
7137 || (ch
>= 0x1F600 && ch
<= 0x1F64F) /* Emoticons */
7138 || (ch
>= 0x1F680 && ch
<= 0x1F6DF) /* Transport and Map Symbols */
7139 || (ch
>= 0x1F6E0 && ch
<= 0x1F6EC) /* Transport and Map Symbols */
7140 || (ch
>= 0x1F6F0 && ch
<= 0x1F6F6) /* Transport and Map Symbols */
7141 || (ch
>= 0x1F900 && ch
<= 0x1F9FF) /* Supplemental Symbols and Pictographs */
7142 || (ch
>= 0x2A700 && ch
<= 0x2B734) /* CJK Ideograph Extension C */
7143 || (ch
>= 0x2B740 && ch
<= 0x2B81D) /* CJK Ideograph Extension D */
7144 || (ch
>= 0x2B820 && ch
<= 0x2CEAF) /* CJK Ideograph Extension E */)
7145 if (!(attr
& (((int64_t) 1 << LBP_NS
) | ((int64_t) 1 << LBP_CM
) | ((int64_t) 1 << LBP_EB
))))
7147 /* ambiguous (ideograph) ? */
7148 if ((unicode_width
[ch
] != NULL
7149 && unicode_width
[ch
][0] == 'A'
7158 && !(ch
>= 0x26C4 && ch
<= 0x26C8)
7172 && !(ch
>= 0x26F1 && ch
<= 0x26F5)
7173 && !(ch
>= 0x26F7 && ch
<= 0x26FA)
7174 && !(ch
>= 0x26FD && ch
<= 0x26FF))
7175 || ch
== 0x24EA /* CIRCLED DIGIT ZERO */
7176 || (ch
>= 0x2780 && ch
<= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
7177 attr
|= (int64_t) 1 << LBP_AI
;
7179 attr
|= (int64_t) 1 << LBP_ID
;
7182 /* ordinary alphabetic and symbol characters */
7183 if ((unicode_attributes
[ch
].category
[0] == 'L'
7184 && (unicode_attributes
[ch
].category
[1] == 'u'
7185 || unicode_attributes
[ch
].category
[1] == 'l'
7186 || unicode_attributes
[ch
].category
[1] == 't'
7187 || unicode_attributes
[ch
].category
[1] == 'm'
7188 || unicode_attributes
[ch
].category
[1] == 'o'))
7189 || (unicode_attributes
[ch
].category
[0] == 'S'
7190 && (unicode_attributes
[ch
].category
[1] == 'm'
7191 || unicode_attributes
[ch
].category
[1] == 'k'
7192 || unicode_attributes
[ch
].category
[1] == 'o'))
7193 || (unicode_attributes
[ch
].category
[0] == 'N'
7194 && (unicode_attributes
[ch
].category
[1] == 'l'
7195 || unicode_attributes
[ch
].category
[1] == 'o'))
7196 || (unicode_attributes
[ch
].category
[0] == 'P'
7197 && (unicode_attributes
[ch
].category
[1] == 'c'
7198 || unicode_attributes
[ch
].category
[1] == 'd'
7199 || unicode_attributes
[ch
].category
[1] == 'o'))
7200 || ch
== 0x0600 /* ARABIC NUMBER SIGN */
7201 || ch
== 0x0601 /* ARABIC SIGN SANAH */
7202 || ch
== 0x0602 /* ARABIC FOOTNOTE MARKER */
7203 || ch
== 0x0603 /* ARABIC SIGN SAFHA */
7204 || ch
== 0x0604 /* ARABIC SIGN SAMVAT */
7205 || ch
== 0x0605 /* ARABIC NUMBER MARK ABOVE */
7206 || ch
== 0x06DD /* ARABIC END OF AYAH */
7207 || ch
== 0x070F /* SYRIAC ABBREVIATION MARK */
7208 || ch
== 0x08E2 /* ARABIC DISPUTED END OF AYAH */
7209 || ch
== 0x2061 /* FUNCTION APPLICATION */
7210 || ch
== 0x2062 /* INVISIBLE TIMES */
7211 || ch
== 0x2063 /* INVISIBLE SEPARATOR */
7212 || ch
== 0x2064 /* INVISIBLE PLUS */
7213 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7214 || ch
== 0x110BD /* KAITHI NUMBER SIGN */)
7215 if (!(attr
& (((int64_t) 1 << LBP_GL
) | ((int64_t) 1 << LBP_B2
) | ((int64_t) 1 << LBP_BA
) | ((int64_t) 1 << LBP_BB
) | ((int64_t) 1 << LBP_HY
) | ((int64_t) 1 << LBP_CB
) | ((int64_t) 1 << LBP_CL
) | ((int64_t) 1 << LBP_CP
) | ((int64_t) 1 << LBP_EX
) | ((int64_t) 1 << LBP_IN
) | ((int64_t) 1 << LBP_NS
) | ((int64_t) 1 << LBP_OP
) | ((int64_t) 1 << LBP_QU
) | ((int64_t) 1 << LBP_IS
) | ((int64_t) 1 << LBP_NU
) | ((int64_t) 1 << LBP_PO
) | ((int64_t) 1 << LBP_PR
) | ((int64_t) 1 << LBP_SY
) | ((int64_t) 1 << LBP_H2
) | ((int64_t) 1 << LBP_H3
) | ((int64_t) 1 << LBP_HL
) | ((int64_t) 1 << LBP_JL
) | ((int64_t) 1 << LBP_JV
) | ((int64_t) 1 << LBP_JT
) | ((int64_t) 1 << LBP_RI
) | ((int64_t) 1 << LBP_SA
) | ((int64_t) 1 << LBP_ID
) | ((int64_t) 1 << LBP_EB
) | ((int64_t) 1 << LBP_EM
)))
7216 && ch
!= 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
7218 /* ambiguous (alphabetic) ? */
7219 if ((unicode_width
[ch
] != NULL
7220 && unicode_width
[ch
][0] == 'A'
7222 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
7223 && ch
!= 0x2022 /* BULLET */
7224 && ch
!= 0x203E /* OVERLINE */
7225 && ch
!= 0x2126 /* OHM SIGN */
7226 && ch
!= 0x2153 /* VULGAR FRACTION ONE THIRD */
7227 && ch
!= 0x215C /* VULGAR FRACTION THREE EIGHTHS */
7228 && ch
!= 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
7229 && ch
!= 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
7230 && ch
!= 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
7231 && ch
!= 0x21E7 /* UPWARDS WHITE ARROW */
7232 && ch
!= 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
7233 && ch
!= 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
7234 || ch
== 0x00A7 /* SECTION SIGN */
7235 || ch
== 0x00A8 /* DIAERESIS */
7236 || ch
== 0x00AA /* FEMININE ORDINAL INDICATOR */
7237 || ch
== 0x00B2 /* SUPERSCRIPT TWO */
7238 || ch
== 0x00B3 /* SUPERSCRIPT THREE */
7239 || ch
== 0x00B6 /* PILCROW SIGN */
7240 || ch
== 0x00B7 /* MIDDLE DOT */
7241 || ch
== 0x00B8 /* CEDILLA */
7242 || ch
== 0x00B9 /* SUPERSCRIPT ONE */
7243 || ch
== 0x00BA /* MASCULINE ORDINAL INDICATOR */
7244 || ch
== 0x00BC /* VULGAR FRACTION ONE QUARTER */
7245 || ch
== 0x00BD /* VULGAR FRACTION ONE HALF */
7246 || ch
== 0x00BE /* VULGAR FRACTION THREE QUARTERS */
7247 || ch
== 0x00D7 /* MULTIPLICATION SIGN */
7248 || ch
== 0x00F7 /* DIVISION SIGN */
7249 || ch
== 0x02C7 /* CARON */
7250 || ch
== 0x02C9 /* MODIFIER LETTER MACRON */
7251 || ch
== 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
7252 || ch
== 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
7253 || ch
== 0x02CD /* MODIFIER LETTER LOW MACRON */
7254 || ch
== 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
7255 || ch
== 0x02D8 /* BREVE */
7256 || ch
== 0x02D9 /* DOT ABOVE */
7257 || ch
== 0x02DA /* RING ABOVE */
7258 || ch
== 0x02DB /* OGONEK */
7259 || ch
== 0x02DD /* DOUBLE ACUTE ACCENT */
7260 || ch
== 0x24EA /* CIRCLED DIGIT ZERO */
7261 || (ch
>= 0x2780 && ch
<= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
7262 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7263 || ch
== 0x2155 /* VULGAR FRACTION ONE FIFTH */
7264 || ch
== 0x2574 /* BOX DRAWINGS LIGHT LEFT */
7265 || ch
== 0x2616 /* WHITE SHOGI PIECE */
7266 || ch
== 0x2617 /* BLACK SHOGI PIECE */
7267 || ch
== 0x2757 /* HEAVY EXCLAMATION MARK SYMBOL */
7268 || ch
== 0x2B55 /* HEAVY LARGE CIRCLE */
7269 || ch
== 0x1F10B /* DINGBAT CIRCLED SANS-SERIF DIGIT ZERO */
7270 || ch
== 0x1F18E /* NEGATIVE SQUARED AB */
7271 || (ch
>= 0x1F191 && ch
<= 0x1F19A) /* SQUARED CL..SQUARED VS */
7272 || ch
== 0x1F10C /* DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO */)
7273 attr
|= (int64_t) 1 << LBP_AI
;
7275 attr
|= (int64_t) 1 << LBP_AL
;
7276 attr
&= ~((int64_t) 1 << LBP_CM
);
7281 /* Unassigned character. */
7282 if ((ch
>= 0x3400 && ch
<= 0x4DBF) /* CJK Unified Ideographs Extension A */
7283 || (ch
>= 0x4E00 && ch
<= 0x9FFF) /* CJK Unified Ideographs */
7284 || (ch
>= 0xF900 && ch
<= 0xFAFF) /* CJK Compatibility Ideographs */
7285 || (ch
>= 0x1F02C && ch
<= 0x1F02F) /* reserved */
7286 || (ch
>= 0x1F094 && ch
<= 0x1F09F) /* reserved */
7287 || (ch
>= 0x1F0AF && ch
<= 0x1F0B0) /* reserved */
7288 || ch
== 0x1F0C0 /* reserved */
7289 || ch
== 0x1F0D0 /* reserved */
7290 || (ch
>= 0x1F0F6 && ch
<= 0x1F0FF) /* reserved */
7291 || (ch
>= 0x1F10D && ch
<= 0x1F10F) /* reserved */
7292 || ch
== 0x1F12F /* reserved */
7293 || (ch
>= 0x1F16C && ch
<= 0x1F16F) /* reserved */
7294 || (ch
>= 0x1F1AD && ch
<= 0x1F1E5) /* reserved */
7295 || (ch
>= 0x1F203 && ch
<= 0x1F20F) /* reserved */
7296 || (ch
>= 0x1F23C && ch
<= 0x1F23F) /* reserved */
7297 || (ch
>= 0x1F249 && ch
<= 0x1F24F) /* reserved */
7298 || (ch
>= 0x1F252 && ch
<= 0x1F2FF) /* reserved */
7299 || (ch
>= 0x1F6D3 && ch
<= 0x1F6DF) /* reserved */
7300 || (ch
>= 0x1F6ED && ch
<= 0x1F6EF) /* reserved */
7301 || (ch
>= 0x1F6F7 && ch
<= 0x1F6FF) /* reserved */
7302 || (ch
>= 0x1F774 && ch
<= 0x1F77F) /* reserved */
7303 || (ch
>= 0x1F7D5 && ch
<= 0x1F7FF) /* reserved */
7304 || (ch
>= 0x1F80C && ch
<= 0x1F80F) /* reserved */
7305 || (ch
>= 0x1F848 && ch
<= 0x1F84F) /* reserved */
7306 || (ch
>= 0x1F85A && ch
<= 0x1F85F) /* reserved */
7307 || (ch
>= 0x1F888 && ch
<= 0x1F88F) /* reserved */
7308 || (ch
>= 0x1F8AE && ch
<= 0x1F90F) /* reserved */
7309 || ch
== 0x1F91F /* reserved */
7310 || ch
== 0x1F93F /* reserved */
7311 || (ch
>= 0x1F928 && ch
<= 0x1F92F) /* reserved */
7312 || (ch
>= 0x1F931 && ch
<= 0x1F932) /* reserved */
7313 || (ch
>= 0x1F94C && ch
<= 0x1F94F) /* reserved */
7314 || (ch
>= 0x1F95F && ch
<= 0x1F97F) /* reserved */
7315 || (ch
>= 0x1F992 && ch
<= 0x1F9BF) /* reserved */
7316 || (ch
>= 0x1F9C1 && ch
<= 0x1FFFD) /* reserved */
7317 || (ch
>= 0x20000 && ch
<= 0x2A6FF) /* CJK Unified Ideographs Extension B */
7318 || (ch
>= 0x2A700 && ch
<= 0x2F7FF) /* CJK Unified Ideographs Extension C,
7319 Supplementary Ideographic Plane (Plane 2) outside of blocks */
7320 || (ch
>= 0x2F800 && ch
<= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
7321 Supplementary Ideographic Plane (Plane 2) outside of blocks */
7322 || (ch
>= 0x30000 && ch
<= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
7323 attr
|= (int64_t) 1 << LBP_ID
;
7328 attr
|= (int64_t) 1 << LBP_XX
;
7333 /* Output the line breaking properties in a human readable format. */
7335 debug_output_lbp (FILE *stream
)
7339 for (i
= 0; i
< 0x110000; i
++)
7341 int64_t attr
= get_lbp (i
);
7342 if (attr
!= (int64_t) 1 << LBP_XX
)
7344 fprintf (stream
, "0x%04X", i
);
7345 #define PRINT_BIT(attr,bit) \
7346 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
7347 PRINT_BIT(attr
,LBP_BK
);
7348 PRINT_BIT(attr
,LBP_CM
);
7349 PRINT_BIT(attr
,LBP_WJ
);
7350 PRINT_BIT(attr
,LBP_ZW
);
7351 PRINT_BIT(attr
,LBP_GL
);
7352 PRINT_BIT(attr
,LBP_SP
);
7353 PRINT_BIT(attr
,LBP_B2
);
7354 PRINT_BIT(attr
,LBP_BA
);
7355 PRINT_BIT(attr
,LBP_BB
);
7356 PRINT_BIT(attr
,LBP_HY
);
7357 PRINT_BIT(attr
,LBP_CB
);
7358 PRINT_BIT(attr
,LBP_CL
);
7359 PRINT_BIT(attr
,LBP_CP
);
7360 PRINT_BIT(attr
,LBP_EX
);
7361 PRINT_BIT(attr
,LBP_IN
);
7362 PRINT_BIT(attr
,LBP_NS
);
7363 PRINT_BIT(attr
,LBP_OP
);
7364 PRINT_BIT(attr
,LBP_QU
);
7365 PRINT_BIT(attr
,LBP_IS
);
7366 PRINT_BIT(attr
,LBP_NU
);
7367 PRINT_BIT(attr
,LBP_PO
);
7368 PRINT_BIT(attr
,LBP_PR
);
7369 PRINT_BIT(attr
,LBP_SY
);
7370 PRINT_BIT(attr
,LBP_AI
);
7371 PRINT_BIT(attr
,LBP_AL
);
7372 PRINT_BIT(attr
,LBP_H2
);
7373 PRINT_BIT(attr
,LBP_H3
);
7374 PRINT_BIT(attr
,LBP_HL
);
7375 PRINT_BIT(attr
,LBP_ID
);
7376 PRINT_BIT(attr
,LBP_JL
);
7377 PRINT_BIT(attr
,LBP_JV
);
7378 PRINT_BIT(attr
,LBP_JT
);
7379 PRINT_BIT(attr
,LBP_RI
);
7380 PRINT_BIT(attr
,LBP_SA
);
7381 PRINT_BIT(attr
,LBP_ZWJ
);
7382 PRINT_BIT(attr
,LBP_EB
);
7383 PRINT_BIT(attr
,LBP_EM
);
7384 PRINT_BIT(attr
,LBP_XX
);
7386 fprintf (stream
, "\n");
/* Writes the human readable line breaking property dump produced by
   debug_output_lbp() to the file named FILENAME.  */
7392 debug_output_lbrk_tables (const char *filename
)
/* Open FILENAME for writing ("w" mode).  */
7396 stream
= fopen (filename
, "w");
/* Diagnostic on stderr naming the file that could not be opened.  */
7399 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
/* Emit the dump itself.  */
7403 debug_output_lbp (stream
);
/* A pending error on the stream and a failing fclose() are both treated
   as a write failure.  */
7405 if (ferror (stream
) || fclose (stream
))
7407 fprintf (stderr
, "error writing to '%s'\n", filename
);
7412 /* The line breaking property from the LineBreak.txt file: one LBP_* value
   per code point, indexed by code point (0 .. 0x10FFFF).  Filled in by
   fill_org_lbp(), which first sets every entry to LBP_XX. */
7413 int unicode_org_lbp
[0x110000];
7415 /* Stores in unicode_org_lbp[] the line breaking property that the
   LineBreak.txt file assigns to each code point.
   LINEBREAK_FILENAME is the name of that file; it is opened for reading.
   Entries that are never assigned from the file keep the initial value
   LBP_XX.  */
7418 fill_org_lbp (const char *linebreak_filename
)
/* Buffers for the three fields read from one input line: field0 is later
   parsed as a code point or code point range, field1 is compared against
   the property names, field2 receives the text up to the newline.  */
7422 char field0
[FIELDLEN
];
7423 char field1
[FIELDLEN
];
7424 char field2
[FIELDLEN
];
/* Start with every code point marked LBP_XX.  */
7427 for (i
= 0; i
< 0x110000; i
++)
7428 unicode_org_lbp
[i
] = LBP_XX
;
7430 stream
= fopen (linebreak_filename
, "r");
/* Diagnostic on stderr naming the file that could not be opened.  */
7433 fprintf (stderr
, "error during fopen of '%s'\n", linebreak_filename
);
/* Consume characters up to and including the next newline, or up to EOF.  */
7449 do c
= getc (stream
); while (c
!= EOF
&& c
!= '\n');
/* Read the three fields, terminated by ';', ' ' and newline respectively.
   n accumulates the return values of getfield(); presumably each call
   returns 1 for a field that was read, so a complete line gives 3 -- TODO
   confirm against getfield().  */
7453 n
= getfield (stream
, field0
, ';');
7454 n
+= getfield (stream
, field1
, ' ');
7455 n
+= getfield (stream
, field2
, '\n');
/* Diagnostic for an incomplete line.  */
7460 fprintf (stderr
, "short line in '%s':%d\n", linebreak_filename
,
/* TRY (LBP_xx) compares field1 with the spelling of the constant after its
   first 4 characters (i.e. without the "LBP_" prefix) and, on a match,
   sets value to that constant.  */
7464 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Property values that are mapped onto the constant of another class:
   LF, CR and NL are stored as LBP_BK, SG as LBP_XX, CJ as LBP_NS.  */
7505 else if (strcmp (field1
, "LF") == 0) value
= LBP_BK
;
7506 else if (strcmp (field1
, "CR") == 0) value
= LBP_BK
;
7507 else if (strcmp (field1
, "NL") == 0) value
= LBP_BK
;
7508 else if (strcmp (field1
, "SG") == 0) value
= LBP_XX
;
7509 else if (strcmp (field1
, "CJ") == 0) value
= LBP_NS
;
/* Diagnostic for a property value that matched none of the names.  */
7512 fprintf (stderr
, "unknown property value \"%s\" in '%s':%d\n",
7513 field1
, linebreak_filename
, lineno
);
/* field0 is either a single hexadecimal code point or a range written as
   START..END; strtoul() stops at the first non-hexadecimal character, so
   this call yields the single code point or the START of the range.
   NOTE(review): no check is visible here that the parsed values are below
   0x110000 before they are used to index unicode_org_lbp[], and strtoul()
   is called without an end pointer, so malformed input goes undetected --
   confirm whether the input is trusted.  */
7516 i
= strtoul (field0
, NULL
, 16);
7517 if (strstr (field0
, "..") != NULL
)
7519 /* Deal with a range: j is the END value that follows the "..". */
7520 j
= strtoul (strstr (field0
, "..") + 2, NULL
, 16);
7522 unicode_org_lbp
[i
] = value
;
7526 /* Single character line. */
7527 unicode_org_lbp
[i
] = value
;
/* A pending error on the stream and a failing fclose() are both treated
   as a read failure.  */
7531 if (ferror (stream
) || fclose (stream
))
7533 fprintf (stderr
, "error reading from '%s'\n", linebreak_filename
);
7538 /* Output the line breaking properties taken from LineBreak.txt (as stored
   in unicode_org_lbp[]) in a human readable format.
   Each line written to STREAM consists of a code point in hexadecimal
   followed by the name of the LBP_* constant stored for that code point.  */
7540 debug_output_org_lbp (FILE *stream
)
/* Walk the whole code point range 0 .. 0x10FFFF.  */
7544 for (i
= 0; i
< 0x110000; i
++)
/* Unlike in debug_output_lbp(), attr is a plain LBP_* value here, not a
   bit mask.  */
7546 int attr
= unicode_org_lbp
[i
];
7549 fprintf (stream
, "0x%04X", i
);
/* PRINT_BIT (attr, LBP_xx) prints " LBP_xx" when attr equals LBP_xx.  */
7550 #define PRINT_BIT(attr,bit) \
7551 if (attr == bit) fprintf (stream, " " #bit);
7552 PRINT_BIT(attr
,LBP_BK
);
7553 PRINT_BIT(attr
,LBP_CM
);
7554 PRINT_BIT(attr
,LBP_WJ
);
7555 PRINT_BIT(attr
,LBP_ZW
);
7556 PRINT_BIT(attr
,LBP_GL
);
7557 PRINT_BIT(attr
,LBP_SP
);
7558 PRINT_BIT(attr
,LBP_B2
);
7559 PRINT_BIT(attr
,LBP_BA
);
7560 PRINT_BIT(attr
,LBP_BB
);
7561 PRINT_BIT(attr
,LBP_HY
);
7562 PRINT_BIT(attr
,LBP_CB
);
7563 PRINT_BIT(attr
,LBP_CL
);
7564 PRINT_BIT(attr
,LBP_CP
);
7565 PRINT_BIT(attr
,LBP_EX
);
7566 PRINT_BIT(attr
,LBP_IN
);
7567 PRINT_BIT(attr
,LBP_NS
);
7568 PRINT_BIT(attr
,LBP_OP
);
7569 PRINT_BIT(attr
,LBP_QU
);
7570 PRINT_BIT(attr
,LBP_IS
);
7571 PRINT_BIT(attr
,LBP_NU
);
7572 PRINT_BIT(attr
,LBP_PO
);
7573 PRINT_BIT(attr
,LBP_PR
);
7574 PRINT_BIT(attr
,LBP_SY
);
7575 PRINT_BIT(attr
,LBP_AI
);
7576 PRINT_BIT(attr
,LBP_AL
);
7577 PRINT_BIT(attr
,LBP_H2
);
7578 PRINT_BIT(attr
,LBP_H3
);
7579 PRINT_BIT(attr
,LBP_HL
);
7580 PRINT_BIT(attr
,LBP_ID
);
7581 PRINT_BIT(attr
,LBP_JL
);
7582 PRINT_BIT(attr
,LBP_JV
);
7583 PRINT_BIT(attr
,LBP_JT
);
7584 PRINT_BIT(attr
,LBP_RI
);
7585 PRINT_BIT(attr
,LBP_SA
);
7586 PRINT_BIT(attr
,LBP_ZWJ
);
7587 PRINT_BIT(attr
,LBP_EB
);
7588 PRINT_BIT(attr
,LBP_EM
);
7589 PRINT_BIT(attr
,LBP_XX
);
/* Terminate the line for this code point.  */
7591 fprintf (stream
, "\n");
/* Writes the human readable dump of the LineBreak.txt properties produced
   by debug_output_org_lbp() to the file named FILENAME.  */
7597 debug_output_org_lbrk_tables (const char *filename
)
/* Open FILENAME for writing ("w" mode).  */
7601 stream
= fopen (filename
, "w");
/* Diagnostic on stderr naming the file that could not be opened.  */
7604 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
/* Emit the dump itself.  */
7608 debug_output_org_lbp (stream
);
/* A pending error on the stream and a failing fclose() are both treated
   as a write failure.  */
7610 if (ferror (stream
) || fclose (stream
))
7612 fprintf (stderr
, "error writing to '%s'\n", filename
);
7617 /* Construction of sparse 3-level tables.
   TABLE gives the name lbp_table that the lbp_table_init / lbp_table_add /
   lbp_table_finalize calls in output_lbp() are built from; ELEMENT is
   unsigned char and DEFAULT is LBP_XX.  Presumably these macros configure
   a generic table template, with DEFAULT being the value of entries that
   were never added -- TODO confirm against the template. */
7618 #define TABLE lbp_table
7619 #define ELEMENT unsigned char
7620 #define DEFAULT LBP_XX
/* Uses of xmalloc/xrealloc are redirected to plain malloc/realloc.  */
7621 #define xmalloc malloc
7622 #define xrealloc realloc
/* Writes the line breaking property table as C source text.
   STREAM1 receives the lbrkprop_header_N macros, the lbrkprop_t type and
   the extern declaration of unilbrkprop; STREAM2 receives the definition
   of unilbrkprop with its three initializer lists.  */
7626 output_lbp (FILE *stream1
, FILE *stream2
)
/* Byte offsets into t.result; level1_offset and level2_offset are used
   below to locate the level-1 and level-2 arrays, level3_offset to locate
   the level-3 array.  */
7630 unsigned int level1_offset
, level2_offset
, level3_offset
;
7634 lbp_table_init (&t
);
/* Enter every code point whose property is not LBP_XX into the table.  */
7636 for (i
= 0; i
< 0x110000; i
++)
7638 int64_t attr
= get_lbp (i
);
7640 /* Now attr should contain exactly one bit. */
7641 assert (attr
!= 0 && (attr
& (attr
- 1)) == 0);
7643 if (attr
!= (int64_t) 1 << LBP_XX
)
/* Turn the single-bit mask back into its bit index by counting the right
   shifts needed to reach 1; the loop body is empty on purpose.  */
7645 unsigned int log2_attr
;
7646 for (log2_attr
= 0; attr
> 1; attr
>>= 1, log2_attr
++);
7648 lbp_table_add (&t
, i
, log2_attr
);
7652 lbp_table_finalize (&t
);
/* Three offset expressions: past a header of 5 uint32_t words; past that
   header plus level1_size words; past both plus (level2_size << q) words.
   Presumably they are assigned to level1_offset, level2_offset and
   level3_offset in this order -- TODO confirm.  */
7655 5 * sizeof (uint32_t);
7657 5 * sizeof (uint32_t)
7658 + t
.level1_size
* sizeof (uint32_t);
7660 5 * sizeof (uint32_t)
7661 + t
.level1_size
* sizeof (uint32_t)
7662 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
/* Publish the first 5 uint32_t words of t.result as lbrkprop_header_0 ..
   lbrkprop_header_4.
   NOTE(review): the uint32_t value is printed with %d; confirm the values
   always fit in an int or switch to a matching conversion.  */
7664 for (i
= 0; i
< 5; i
++)
7665 fprintf (stream1
, "#define lbrkprop_header_%d %d\n", i
,
7666 ((uint32_t *) t
.result
)[i
]);
7667 fprintf (stream1
, "\n");
/* Declare the table type; the array dimensions are taken from the sizes
   of the finalized table.  */
7668 fprintf (stream1
, "typedef struct\n");
7669 fprintf (stream1
, " {\n");
7670 fprintf (stream1
, " int level1[%zu];\n", t
.level1_size
);
7671 fprintf (stream1
, " int level2[%zu << %d];\n", t
.level2_size
, t
.q
);
7672 fprintf (stream1
, " unsigned char level3[%zu << %d];\n", t
.level3_size
, t
.p
);
7673 fprintf (stream1
, " }\n");
7674 fprintf (stream1
, "lbrkprop_t;\n");
7675 fprintf (stream1
, "extern const lbrkprop_t unilbrkprop;\n");
/* Definition of the table.  First initializer list: level 1.  */
7677 fprintf (stream2
, "const lbrkprop_t unilbrkprop =\n");
7678 fprintf (stream2
, "{\n");
7679 fprintf (stream2
, " {");
/* Lists with more than 8 entries are broken across lines.  */
7680 if (t
.level1_size
> 8)
7681 fprintf (stream2
, "\n ");
7682 for (i
= 0; i
< t
.level1_size
; i
++)
/* Start a new output line after every 8 entries.  */
7685 if (i
> 0 && (i
% 8) == 0)
7686 fprintf (stream2
, "\n ");
/* offset is the i-th uint32_t word of the level-1 array in t.result.  */
7687 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
/* An entry is printed either as -1 or as offset converted to an index in
   uint32_t units relative to level2_offset; presumably -1 stands for an
   absent block -- TODO confirm.  */
7689 fprintf (stream2
, " %5d", -1);
7691 fprintf (stream2
, " %5zu",
7692 (offset
- level2_offset
) / sizeof (uint32_t));
/* Separate entries with commas; none after the last one.  */
7693 if (i
+1 < t
.level1_size
)
7694 fprintf (stream2
, ",");
7696 if (t
.level1_size
> 8)
7697 fprintf (stream2
, "\n ");
7698 fprintf (stream2
, " },\n");
/* Second initializer list: level 2, with level2_size << q entries, laid
   out the same way as level 1.  */
7699 fprintf (stream2
, " {");
7700 if (t
.level2_size
<< t
.q
> 8)
7701 fprintf (stream2
, "\n ");
7702 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
7705 if (i
> 0 && (i
% 8) == 0)
7706 fprintf (stream2
, "\n ");
/* offset is the i-th uint32_t word of the level-2 array in t.result.  */
7707 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
/* Printed either as -1 or as an index in unsigned char units relative to
   level3_offset.  */
7709 fprintf (stream2
, " %5d", -1);
7711 fprintf (stream2
, " %5zu",
7712 (offset
- level3_offset
) / sizeof (unsigned char));
7713 if (i
+1 < t
.level2_size
<< t
.q
)
7714 fprintf (stream2
, ",");
7716 if (t
.level2_size
<< t
.q
> 8)
7717 fprintf (stream2
, "\n ");
7718 fprintf (stream2
, " },\n");
/* Third initializer list: level 3, with level3_size << p entries, each
   printed by name.  */
7719 fprintf (stream2
, " {");
7720 if (t
.level3_size
<< t
.p
> 8)
7721 fprintf (stream2
, "\n ");
7722 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
/* value is the i-th byte of the level-3 array in t.result.  */
7724 unsigned char value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
7725 const char *value_string
;
/* CASE (LBP_xx) is a switch label that sets value_string to the spelling
   "LBP_xx" of the constant.  */
7728 #define CASE(x) case x: value_string = #x; break;
7771 if (i
> 0 && (i
% 8) == 0)
7772 fprintf (stream2
, "\n ");
/* Print the name, followed by a comma unless this is the last entry.  */
7773 fprintf (stream2
, " %s%s", value_string
,
7774 (i
+1 < t
.level3_size
<< t
.p
? "," : ""));
7776 if (t
.level3_size
<< t
.p
> 8)
7777 fprintf (stream2
, "\n ");
7778 fprintf (stream2
, " }\n");
7779 fprintf (stream2
, "};\n");
/* Write the line breaking property tables for Unicode VERSION into the two
   files FILENAME1 and FILENAME2.  Each file starts with the generated-file
   banner and the GPL notice; output_lbp then fills in both streams.
   Terminates the program if a file cannot be opened or written.  */
static void
output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
{
  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  static const char *const license_lines[] =
    {
      "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n",
      "\n",
      " This program is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU General Public License as published by\n",
      " the Free Software Foundation; either version 3 of the License, or\n",
      " (at your option) any later version.\n",
      "\n",
      " This program is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU General Public License\n",
      " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n",
      "\n"
    };
  const char *const paths[2] = { filename1, filename2 };
  FILE *out[2];
  size_t k;

  /* Open both output files before writing anything.  */
  for (k = 0; k < 2; k++)
    {
      out[k] = fopen (paths[k], "w");
      if (!out[k])
        {
          fprintf (stderr, "cannot open '%s' for writing\n", paths[k]);
          exit (1);
        }
    }

  /* Both files receive the same banner and license text.  */
  for (k = 0; k < 2; k++)
    {
      size_t n;

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out[k]);
      fputs ("/* Line breaking properties of Unicode characters. */\n", out[k]);
      fprintf (out[k],
               "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fputs ("\n", out[k]);

      for (n = 0; n < sizeof license_lines / sizeof license_lines[0]; n++)
        fputs (license_lines[n], out[k]);
    }

  output_lbp (out[0], out[1]);

  /* fclose is only attempted when no earlier write error was recorded.  */
  for (k = 0; k < 2; k++)
    {
      int failed = ferror (out[k]) != 0;

      if (!failed)
        failed = fclose (out[k]) != 0;
      if (failed)
        {
          fprintf (stderr, "error writing to '%s'\n", paths[k]);
          exit (1);
        }
    }
}
7844 /* ========================================================================= */
7846 /* Word break property.
7847 Updated for Unicode TR #29 revision 17. */
7849 /* Possible values of the Word_Break property. */
7864 WBP_EXTENDNUMLET
= 7,
7876 /* Returns the word breaking property for ch, as a bit mask. */
7878 get_wbp (unsigned int ch
)
7882 if (unicode_attributes
[ch
].name
!= NULL
)
7885 attr
|= 1 << WBP_CR
;
7888 attr
|= 1 << WBP_LF
;
7890 if (ch
== 0x000B || ch
== 0x000C
7892 || ch
== 0x2028 || ch
== 0x2029)
7893 attr
|= 1 << WBP_NEWLINE
;
7895 if (((unicode_properties
[ch
] >> PROP_GRAPHEME_EXTEND
) & 1) != 0
7896 || ((unicode_properties
[ch
] >> PROP_OTHER_GRAPHEME_EXTEND
) & 1) != 0
7897 || (unicode_attributes
[ch
].category
!= NULL
7898 && strcmp (unicode_attributes
[ch
].category
, "Mc") == 0))
7899 attr
|= 1 << WBP_EXTEND
;
7901 if (unicode_attributes
[ch
].category
!= NULL
7902 && strcmp (unicode_attributes
[ch
].category
, "Cf") == 0
7903 && ch
!= 0x200B && ch
!= 0x200C && ch
!= 0x200D
7904 && !(ch
>= 0xe0020 && ch
<= 0xe007f))
7905 attr
|= 1 << WBP_FORMAT
;
7907 if ((unicode_scripts
[ch
] < numscripts
7908 && strcmp (scripts
[unicode_scripts
[ch
]], "Katakana") == 0)
7909 || (ch
>= 0x3031 && ch
<= 0x3035)
7910 || ch
== 0x309B || ch
== 0x309C || ch
== 0x30A0 || ch
== 0x30FC
7912 attr
|= 1 << WBP_KATAKANA
;
7914 if ((unicode_scripts
[ch
] < numscripts
7915 && strcmp (scripts
[unicode_scripts
[ch
]], "Hebrew") == 0)
7916 && strcmp (unicode_attributes
[ch
].category
, "Lo") == 0)
7917 attr
|= 1 << WBP_HL
;
7919 if ((((unicode_properties
[ch
] >> PROP_ALPHABETIC
) & 1) != 0
7921 && ((unicode_properties
[ch
] >> PROP_IDEOGRAPHIC
) & 1) == 0
7922 && (attr
& (1 << WBP_KATAKANA
)) == 0
7923 && ((get_lbp (ch
) >> LBP_SA
) & 1) == 0
7924 && !(unicode_scripts
[ch
] < numscripts
7925 && strcmp (scripts
[unicode_scripts
[ch
]], "Hiragana") == 0)
7926 && (attr
& (1 << WBP_EXTEND
)) == 0
7927 && (attr
& (1 << WBP_HL
)) == 0)
7928 attr
|= 1 << WBP_ALETTER
;
7930 if (is_WBP_MIDNUMLET (ch
))
7931 attr
|= 1 << WBP_MIDNUMLET
;
7933 if (is_WBP_MIDLETTER (ch
))
7934 attr
|= 1 << WBP_MIDLETTER
;
7936 if ((((get_lbp (ch
) >> LBP_IS
) & 1) != 0
7937 || ch
== 0x066C || ch
== 0xFE50 || ch
== 0xFE54 || ch
== 0xFF0C
7939 && ch
!= 0x003A && ch
!= 0xFE13 && ch
!= 0x002E)
7940 attr
|= 1 << WBP_MIDNUM
;
7942 if (((get_lbp (ch
) >> LBP_NU
) & 1) != 0
7944 attr
|= 1 << WBP_NUMERIC
;
7946 if ((unicode_attributes
[ch
].category
!= NULL
7947 && strcmp (unicode_attributes
[ch
].category
, "Pc") == 0)
7948 || ch
== 0x202F /* NARROW NO-BREAK SPACE */)
7949 attr
|= 1 << WBP_EXTENDNUMLET
;
7951 if (((get_lbp (ch
) >> LBP_RI
) & 1) != 0)
7952 attr
|= 1 << WBP_RI
;
7955 attr
|= 1 << WBP_DQ
;
7958 attr
|= 1 << WBP_SQ
;
7961 attr
|= 1 << WBP_ZWJ
;
7963 if (ch
>= 0x1F466 && ch
<= 0x1F469)
7964 attr
|= 1 << WBP_EBG
;
7965 else if (((get_lbp (ch
) >> LBP_EB
) & 1) != 0)
7966 attr
|= 1 << WBP_EB
;
7968 if (((get_lbp (ch
) >> LBP_EM
) & 1) != 0)
7969 attr
|= 1 << WBP_EM
;
7971 if (ch
== 0x2764 || ch
== 0x1F48B || ch
== 0x1F5E8)
7972 attr
|= 1 << WBP_GAZ
;
7977 attr
|= 1 << WBP_OTHER
;
7982 /* Output the word break property in a human readable format. */
7984 debug_output_wbp (FILE *stream
)
7988 for (i
= 0; i
< 0x110000; i
++)
7990 int attr
= get_wbp (i
);
7991 if (attr
!= 1 << WBP_OTHER
)
7993 fprintf (stream
, "0x%04X", i
);
7994 if (attr
& (1 << WBP_CR
))
7995 fprintf (stream
, " CR");
7996 if (attr
& (1 << WBP_LF
))
7997 fprintf (stream
, " LF");
7998 if (attr
& (1 << WBP_NEWLINE
))
7999 fprintf (stream
, " Newline");
8000 if (attr
& (1 << WBP_EXTEND
))
8001 fprintf (stream
, " Extend");
8002 if (attr
& (1 << WBP_FORMAT
))
8003 fprintf (stream
, " Format");
8004 if (attr
& (1 << WBP_KATAKANA
))
8005 fprintf (stream
, " Katakana");
8006 if (attr
& (1 << WBP_ALETTER
))
8007 fprintf (stream
, " ALetter");
8008 if (attr
& (1 << WBP_MIDNUMLET
))
8009 fprintf (stream
, " MidNumLet");
8010 if (attr
& (1 << WBP_MIDLETTER
))
8011 fprintf (stream
, " MidLetter");
8012 if (attr
& (1 << WBP_MIDNUM
))
8013 fprintf (stream
, " MidNum");
8014 if (attr
& (1 << WBP_NUMERIC
))
8015 fprintf (stream
, " Numeric");
8016 if (attr
& (1 << WBP_EXTENDNUMLET
))
8017 fprintf (stream
, " ExtendNumLet");
8018 if (attr
& (1 << WBP_RI
))
8019 fprintf (stream
, " Regional_Indicator");
8020 if (attr
& (1 << WBP_DQ
))
8021 fprintf (stream
, " Double_Quote");
8022 if (attr
& (1 << WBP_SQ
))
8023 fprintf (stream
, " Single_Quote");
8024 if (attr
& (1 << WBP_HL
))
8025 fprintf (stream
, " Hebrew_Letter");
8026 if (attr
& (1 << WBP_ZWJ
))
8027 fprintf (stream
, " ZWJ");
8028 if (attr
& (1 << WBP_EB
))
8029 fprintf (stream
, " E_Base");
8030 if (attr
& (1 << WBP_EM
))
8031 fprintf (stream
, " E_Modifier");
8032 if (attr
& (1 << WBP_GAZ
))
8033 fprintf (stream
, " Glue_After_Zwj");
8034 if (attr
& (1 << WBP_EBG
))
8035 fprintf (stream
, " E_Base_GAZ");
8036 fprintf (stream
, "\n");
/* Create FILENAME and dump into it the word break properties in the human
   readable form produced by debug_output_wbp.  Terminates the program if
   the file cannot be opened or written.  */
static void
debug_output_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");
  int failed;

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_wbp (out);

  /* fclose is only attempted when no earlier write error was recorded.  */
  failed = ferror (out) != 0;
  if (!failed)
    failed = fclose (out) != 0;
  if (failed)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8062 /* The word break property from the WordBreakProperty.txt file. */
8063 int unicode_org_wbp
[0x110000];
8065 /* Stores in unicode_org_wbp[] the word break property from the
8066 WordBreakProperty.txt file. */
8068 fill_org_wbp (const char *wordbreakproperty_filename
)
8073 for (i
= 0; i
< 0x110000; i
++)
8074 unicode_org_wbp
[i
] = WBP_OTHER
;
8076 stream
= fopen (wordbreakproperty_filename
, "r");
8079 fprintf (stderr
, "error during fopen of '%s'\n", wordbreakproperty_filename
);
8086 unsigned int i1
, i2
;
8087 char padding
[200+1];
8088 char propname
[200+1];
8091 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
8094 if (buf
[0] == '\0' || buf
[0] == '#')
8097 if (sscanf (buf
, "%X..%X%[ ;]%[^ ]", &i1
, &i2
, padding
, propname
) != 4)
8099 if (sscanf (buf
, "%X%[ ;]%[^ ]", &i1
, padding
, propname
) != 3)
8101 fprintf (stderr
, "parse error in '%s'\n",
8102 wordbreakproperty_filename
);
8107 #define PROP(name,value) \
8108 if (strcmp (propname, name) == 0) propvalue = value; else
8111 PROP ("Newline", WBP_NEWLINE
)
8112 PROP ("Extend", WBP_EXTEND
)
8113 PROP ("Format", WBP_FORMAT
)
8114 PROP ("Katakana", WBP_KATAKANA
)
8115 PROP ("ALetter", WBP_ALETTER
)
8116 PROP ("MidNumLet", WBP_MIDNUMLET
)
8117 PROP ("MidLetter", WBP_MIDLETTER
)
8118 PROP ("MidNum", WBP_MIDNUM
)
8119 PROP ("Numeric", WBP_NUMERIC
)
8120 PROP ("ExtendNumLet", WBP_EXTENDNUMLET
)
8121 PROP ("Regional_Indicator", WBP_RI
)
8122 PROP ("Double_Quote", WBP_DQ
)
8123 PROP ("Single_Quote", WBP_SQ
)
8124 PROP ("Hebrew_Letter", WBP_HL
)
8125 PROP ("ZWJ", WBP_ZWJ
)
8126 PROP ("E_Base", WBP_EB
)
8127 PROP ("E_Modifier", WBP_EM
)
8128 PROP ("Glue_After_Zwj", WBP_GAZ
)
8129 PROP ("E_Base_GAZ", WBP_EBG
)
8132 fprintf (stderr
, "unknown property value '%s' in '%s'\n", propname
,
8133 wordbreakproperty_filename
);
8136 assert (i1
<= i2
&& i2
< 0x110000);
8138 for (i
= i1
; i
<= i2
; i
++)
8139 unicode_org_wbp
[i
] = propvalue
;
8142 if (ferror (stream
) || fclose (stream
))
8144 fprintf (stderr
, "error reading from '%s'\n", wordbreakproperty_filename
);
8149 /* Output the word break property from the WordBreakProperty.txt file in a human readable format. */
8151 debug_output_org_wbp (FILE *stream
)
8155 for (i
= 0; i
< 0x110000; i
++)
8157 int propvalue
= unicode_org_wbp
[i
];
8158 if (propvalue
!= WBP_OTHER
)
8160 fprintf (stream
, "0x%04X", i
);
8161 #define PROP(name,value) \
8162 if (propvalue == value) fprintf (stream, " " name); else
8165 PROP ("Newline", WBP_NEWLINE
)
8166 PROP ("Extend", WBP_EXTEND
)
8167 PROP ("Format", WBP_FORMAT
)
8168 PROP ("Katakana", WBP_KATAKANA
)
8169 PROP ("ALetter", WBP_ALETTER
)
8170 PROP ("MidNumLet", WBP_MIDNUMLET
)
8171 PROP ("MidLetter", WBP_MIDLETTER
)
8172 PROP ("MidNum", WBP_MIDNUM
)
8173 PROP ("Numeric", WBP_NUMERIC
)
8174 PROP ("ExtendNumLet", WBP_EXTENDNUMLET
)
8175 PROP ("Regional_Indicator", WBP_RI
)
8176 PROP ("Double_Quote", WBP_DQ
)
8177 PROP ("Single_Quote", WBP_SQ
)
8178 PROP ("Hebrew_Letter", WBP_HL
)
8179 PROP ("ZWJ", WBP_ZWJ
)
8180 PROP ("E_Base", WBP_EB
)
8181 PROP ("E_Modifier", WBP_EM
)
8182 PROP ("Glue_After_Zwj", WBP_GAZ
)
8183 PROP ("E_Base_GAZ", WBP_EBG
)
8185 fprintf (stream
, " ??");
8186 fprintf (stream
, "\n");
/* Create FILENAME and dump into it the word break properties in the human
   readable form produced by debug_output_org_wbp.  Terminates the program
   if the file cannot be opened or written.  */
static void
debug_output_org_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");
  int failed;

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_wbp (out);

  /* fclose is only attempted when no earlier write error was recorded.  */
  failed = ferror (out) != 0;
  if (!failed)
    failed = fclose (out) != 0;
  if (failed)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8212 /* Construction of sparse 3-level tables. */
8213 #define TABLE wbp_table
8214 #define ELEMENT unsigned char
8215 #define DEFAULT WBP_OTHER
8216 #define xmalloc malloc
8217 #define xrealloc realloc
8221 output_wbp (FILE *stream
)
8225 unsigned int level1_offset
, level2_offset
, level3_offset
;
8229 wbp_table_init (&t
);
8231 for (i
= 0; i
< 0x110000; i
++)
8233 int attr
= get_wbp (i
);
8235 /* Now attr should contain exactly one bit. */
8236 assert (attr
!= 0 && (attr
& (attr
- 1)) == 0);
8238 if (attr
!= 1 << WBP_OTHER
)
8240 unsigned int log2_attr
;
8241 for (log2_attr
= 0; attr
> 1; attr
>>= 1, log2_attr
++);
8243 wbp_table_add (&t
, i
, log2_attr
);
8247 wbp_table_finalize (&t
);
8250 5 * sizeof (uint32_t);
8252 5 * sizeof (uint32_t)
8253 + t
.level1_size
* sizeof (uint32_t);
8255 5 * sizeof (uint32_t)
8256 + t
.level1_size
* sizeof (uint32_t)
8257 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
8259 for (i
= 0; i
< 5; i
++)
8260 fprintf (stream
, "#define wbrkprop_header_%d %d\n", i
,
8261 ((uint32_t *) t
.result
)[i
]);
8262 fprintf (stream
, "\n");
8263 fprintf (stream
, "typedef struct\n");
8264 fprintf (stream
, " {\n");
8265 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
8266 fprintf (stream
, " int level2[%zu << %d];\n", t
.level2_size
, t
.q
);
8267 fprintf (stream
, " unsigned char level3[%zu << %d];\n", t
.level3_size
, t
.p
);
8268 fprintf (stream
, " }\n");
8269 fprintf (stream
, "wbrkprop_t;\n");
8270 fprintf (stream
, "static const wbrkprop_t uniwbrkprop =\n");
8271 fprintf (stream
, "{\n");
8272 fprintf (stream
, " {");
8273 if (t
.level1_size
> 8)
8274 fprintf (stream
, "\n ");
8275 for (i
= 0; i
< t
.level1_size
; i
++)
8278 if (i
> 0 && (i
% 8) == 0)
8279 fprintf (stream
, "\n ");
8280 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
8282 fprintf (stream
, " %5d", -1);
8284 fprintf (stream
, " %5zu",
8285 (offset
- level2_offset
) / sizeof (uint32_t));
8286 if (i
+1 < t
.level1_size
)
8287 fprintf (stream
, ",");
8289 if (t
.level1_size
> 8)
8290 fprintf (stream
, "\n ");
8291 fprintf (stream
, " },\n");
8292 fprintf (stream
, " {");
8293 if (t
.level2_size
<< t
.q
> 8)
8294 fprintf (stream
, "\n ");
8295 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
8298 if (i
> 0 && (i
% 8) == 0)
8299 fprintf (stream
, "\n ");
8300 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
8302 fprintf (stream
, " %5d", -1);
8304 fprintf (stream
, " %5zu",
8305 (offset
- level3_offset
) / sizeof (unsigned char));
8306 if (i
+1 < t
.level2_size
<< t
.q
)
8307 fprintf (stream
, ",");
8309 if (t
.level2_size
<< t
.q
> 8)
8310 fprintf (stream
, "\n ");
8311 fprintf (stream
, " },\n");
8312 fprintf (stream
, " {");
8313 if (t
.level3_size
<< t
.p
> 4)
8314 fprintf (stream
, "\n ");
8315 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
8317 unsigned char value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
8318 const char *value_string
;
8321 #define CASE(x) case x: value_string = #x; break;
8330 CASE(WBP_MIDNUMLET
);
8331 CASE(WBP_MIDLETTER
);
8334 CASE(WBP_EXTENDNUMLET
);
8348 if (i
> 0 && (i
% 4) == 0)
8349 fprintf (stream
, "\n ");
8350 fprintf (stream
, " %s%s", value_string
,
8351 (i
+1 < t
.level3_size
<< t
.p
? "," : ""));
8353 if (t
.level3_size
<< t
.p
> 4)
8354 fprintf (stream
, "\n ");
8355 fprintf (stream
, " }\n");
8356 fprintf (stream
, "};\n");
/* Write the word break property table for Unicode VERSION to FILENAME.
   The file starts with the generated-file banner and the GPL notice,
   followed by the table emitted by output_wbp.
   Terminates the program if the file cannot be opened or written.  */
static void
output_wbrk_tables (const char *filename, const char *version)
{
  FILE *stream;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  /* This file holds the word break table, so the banner must say so; it
     used to carry the text copied from the line break table generator.  */
  fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");

  output_wbp (stream);

  /* fclose is skipped when a write error was already recorded.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8405 /* ========================================================================= */
8407 /* Grapheme break property.
8408 Updated for Unicode TR #29 revision 29. */
8410 /* Possible values of the Grapheme_Cluster_Break property. */
8419 GBP_SPACINGMARK
= 6,
8433 /* Construction of sparse 3-level tables. */
8434 #define TABLE gbp_table
8435 #define ELEMENT unsigned char
8436 #define DEFAULT GBP_OTHER
8437 #define xmalloc malloc
8438 #define xrealloc realloc
8441 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
8442 int unicode_org_gbp
[0x110000];
8444 /* Output the unit test data for the grapheme break property. */
8446 output_gbp_test (const char *filename
)
8452 stream
= fopen (filename
, "w");
8455 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
8459 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8460 fprintf (stream
, "/* Test the Unicode grapheme break property functions.\n");
8461 fprintf (stream
, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
8462 fprintf (stream
, "\n");
8463 fprintf (stream
, " This program is free software: you can redistribute it and/or modify\n");
8464 fprintf (stream
, " it under the terms of the GNU General Public License as published by\n");
8465 fprintf (stream
, " the Free Software Foundation; either version 3 of the License, or\n");
8466 fprintf (stream
, " (at your option) any later version.\n");
8467 fprintf (stream
, "\n");
8468 fprintf (stream
, " This program is distributed in the hope that it will be useful,\n");
8469 fprintf (stream
, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
8470 fprintf (stream
, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
8471 fprintf (stream
, " GNU General Public License for more details.\n");
8472 fprintf (stream
, "\n");
8473 fprintf (stream
, " You should have received a copy of the GNU General Public License\n");
8474 fprintf (stream
, " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n");
8475 fprintf (stream
, "\n");
8478 for (ch
= 0; ch
< 0x110000; ch
++)
8480 int gbp
= unicode_org_gbp
[ch
];
8481 const char *gbp_string
;
8483 while (ch
+ 1 < 0x110000 && unicode_org_gbp
[ch
+ 1] == gbp
)
8488 #define CASE(x) case x: gbp_string = #x; break;
8495 CASE (GBP_SPACINGMARK
)
8513 fprintf (stream
, ",\n");
8514 fprintf (stream
, "{ 0x%04X, %s }", ch
+ 1, gbp_string
);
8518 fprintf (stream
, "\n");
8520 if (ferror (stream
) || fclose (stream
))
8522 fprintf (stderr
, "error writing to '%s'\n", filename
);
8527 /* Output the per-character grapheme break property table. */
8529 output_gbp_table (const char *filename
, const char *version
)
8534 unsigned int level1_offset
, level2_offset
, level3_offset
;
8536 stream
= fopen (filename
, "w");
8539 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
8543 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8544 fprintf (stream
, "/* Grapheme break property of Unicode characters. */\n");
8545 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8550 gbp_table_init (&t
);
8552 for (ch
= 0; ch
< 0x110000; ch
++)
8553 gbp_table_add (&t
, ch
, unicode_org_gbp
[ch
]);
8555 gbp_table_finalize (&t
);
8557 /* Offsets in t.result, in memory of this process. */
8559 5 * sizeof (uint32_t);
8561 5 * sizeof (uint32_t)
8562 + t
.level1_size
* sizeof (uint32_t);
8564 5 * sizeof (uint32_t)
8565 + t
.level1_size
* sizeof (uint32_t)
8566 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
8568 for (i
= 0; i
< 5; i
++)
8569 fprintf (stream
, "#define gbrkprop_header_%d %d\n", i
,
8570 ((uint32_t *) t
.result
)[i
]);
8571 fprintf (stream
, "static const\n");
8572 fprintf (stream
, "struct\n");
8573 fprintf (stream
, " {\n");
8574 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
8575 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
8576 fprintf (stream
, " unsigned char level3[%zu << %d];\n",
8577 t
.level3_size
, t
.p
);
8578 fprintf (stream
, " }\n");
8579 fprintf (stream
, "unigbrkprop =\n");
8580 fprintf (stream
, "{\n");
8581 fprintf (stream
, " {");
8582 if (t
.level1_size
> 8)
8583 fprintf (stream
, "\n ");
8584 for (i
= 0; i
< t
.level1_size
; i
++)
8587 if (i
> 0 && (i
% 8) == 0)
8588 fprintf (stream
, "\n ");
8589 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
8591 fprintf (stream
, " %5d", -1);
8593 fprintf (stream
, " %5zu",
8594 (offset
- level2_offset
) / sizeof (uint32_t));
8595 if (i
+1 < t
.level1_size
)
8596 fprintf (stream
, ",");
8598 if (t
.level1_size
> 8)
8599 fprintf (stream
, "\n ");
8600 fprintf (stream
, " },\n");
8601 fprintf (stream
, " {");
8602 if (t
.level2_size
<< t
.q
> 8)
8603 fprintf (stream
, "\n ");
8604 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
8607 if (i
> 0 && (i
% 8) == 0)
8608 fprintf (stream
, "\n ");
8609 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
8611 fprintf (stream
, " %5d", -1);
8613 fprintf (stream
, " %5zu",
8614 (offset
- level3_offset
) / sizeof (uint8_t));
8615 if (i
+1 < t
.level2_size
<< t
.q
)
8616 fprintf (stream
, ",");
8618 if (t
.level2_size
<< t
.q
> 8)
8619 fprintf (stream
, "\n ");
8620 fprintf (stream
, " },\n");
8621 fprintf (stream
, " {");
8622 if (t
.level3_size
<< t
.p
> 4)
8623 fprintf (stream
, "\n ");
8624 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
8626 unsigned char value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
8627 const char *value_string
;
8630 #define CASE(x) case x: value_string = #x; break;
8637 CASE (GBP_SPACINGMARK
)
8653 if (i
> 0 && (i
% 4) == 0)
8654 fprintf (stream
, "\n ");
8655 fprintf (stream
, " %s%s", value_string
,
8656 (i
+1 < t
.level3_size
<< t
.p
? "," : ""));
8658 if (t
.level3_size
<< t
.p
> 4)
8659 fprintf (stream
, "\n ");
8660 fprintf (stream
, " }\n");
8661 fprintf (stream
, "};\n");
8663 if (ferror (stream
) || fclose (stream
))
8665 fprintf (stderr
, "error writing to '%s'\n", filename
);
8670 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
8671 GraphemeBreakProperty.txt file. */
8673 fill_org_gbp (const char *graphemebreakproperty_filename
)
8679 for (i
= 0; i
< 0x110000; i
++)
8680 unicode_org_gbp
[i
] = GBP_OTHER
;
8682 stream
= fopen (graphemebreakproperty_filename
, "r");
8685 fprintf (stderr
, "error during fopen of '%s'\n",
8686 graphemebreakproperty_filename
);
8693 unsigned int i1
, i2
;
8694 char padding
[200+1];
8695 char propname
[200+1];
8699 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
8702 if (buf
[0] == '\0' || buf
[0] == '#')
8705 if (sscanf (buf
, "%X..%X%[ ;]%[^ ]", &i1
, &i2
, padding
, propname
) != 4)
8707 if (sscanf (buf
, "%X%[ ;]%[^ ]", &i1
, padding
, propname
) != 3)
8709 fprintf (stderr
, "parse error in '%s'\n",
8710 graphemebreakproperty_filename
);
8715 #define PROP(name,value) \
8716 if (strcmp (propname, name) == 0) propvalue = value; else
8719 PROP ("Control", GBP_CONTROL
)
8720 PROP ("Extend", GBP_EXTEND
)
8721 PROP ("Prepend", GBP_PREPEND
)
8722 PROP ("SpacingMark", GBP_SPACINGMARK
)
8727 PROP ("LVT", GBP_LVT
)
8728 PROP ("Regional_Indicator", GBP_RI
)
8729 PROP ("ZWJ", GBP_ZWJ
)
8730 PROP ("E_Base", GBP_EB
)
8731 PROP ("E_Modifier", GBP_EM
)
8732 PROP ("Glue_After_Zwj", GBP_GAZ
)
8733 PROP ("E_Base_GAZ", GBP_EBG
)
8736 fprintf (stderr
, "unknown property value '%s' in %s:%d\n", propname
,
8737 graphemebreakproperty_filename
, lineno
);
8740 assert (i1
<= i2
&& i2
< 0x110000);
8742 for (i
= i1
; i
<= i2
; i
++)
8743 unicode_org_gbp
[i
] = propvalue
;
8746 if (ferror (stream
) || fclose (stream
))
8748 fprintf (stderr
, "error reading from '%s'\n", graphemebreakproperty_filename
);
8753 /* ========================================================================= */
8755 /* Composition and decomposition.
8756 Updated for Unicode TR #15 revision 33. */
8758 /* Maximum number of characters into which a single Unicode character can be decomposed. */
8760 #define MAX_DECOMP_LENGTH 18
8764 UC_DECOMP_CANONICAL
,/* Canonical decomposition. */
8765 UC_DECOMP_FONT
, /* <font> A font variant (e.g. a blackletter form). */
8766 UC_DECOMP_NOBREAK
, /* <noBreak> A no-break version of a space or hyphen. */
8767 UC_DECOMP_INITIAL
, /* <initial> An initial presentation form (Arabic). */
8768 UC_DECOMP_MEDIAL
, /* <medial> A medial presentation form (Arabic). */
8769 UC_DECOMP_FINAL
, /* <final> A final presentation form (Arabic). */
8770 UC_DECOMP_ISOLATED
,/* <isolated> An isolated presentation form (Arabic). */
8771 UC_DECOMP_CIRCLE
, /* <circle> An encircled form. */
8772 UC_DECOMP_SUPER
, /* <super> A superscript form. */
8773 UC_DECOMP_SUB
, /* <sub> A subscript form. */
8774 UC_DECOMP_VERTICAL
,/* <vertical> A vertical layout presentation form. */
8775 UC_DECOMP_WIDE
, /* <wide> A wide (or zenkaku) compatibility character. */
8776 UC_DECOMP_NARROW
, /* <narrow> A narrow (or hankaku) compatibility character. */
8777 UC_DECOMP_SMALL
, /* <small> A small variant form (CNS compatibility). */
8778 UC_DECOMP_SQUARE
, /* <square> A CJK squared font variant. */
8779 UC_DECOMP_FRACTION
,/* <fraction> A vulgar fraction form. */
8780 UC_DECOMP_COMPAT
/* <compat> Otherwise unspecified compatibility character. */
8783 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
8784 decompositions). Return the type, or -1 for none. */
8786 get_decomposition (unsigned int ch
,
8787 unsigned int *lengthp
, unsigned int decomposed
[MAX_DECOMP_LENGTH
])
8789 const char *decomposition
= unicode_attributes
[ch
].decomposition
;
8791 if (decomposition
!= NULL
&& decomposition
[0] != '\0')
8793 int type
= UC_DECOMP_CANONICAL
;
8794 unsigned int length
;
8797 if (decomposition
[0] == '<')
8802 rangle
= strchr (decomposition
+ 1, '>');
8803 assert (rangle
!= NULL
);
8804 typelen
= rangle
+ 1 - decomposition
;
8805 #define TYPE(t1,t2) \
8806 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
8809 TYPE ("<font>", UC_DECOMP_FONT
)
8810 TYPE ("<noBreak>", UC_DECOMP_NOBREAK
)
8811 TYPE ("<initial>", UC_DECOMP_INITIAL
)
8812 TYPE ("<medial>", UC_DECOMP_MEDIAL
)
8813 TYPE ("<final>", UC_DECOMP_FINAL
)
8814 TYPE ("<isolated>", UC_DECOMP_ISOLATED
)
8815 TYPE ("<circle>", UC_DECOMP_CIRCLE
)
8816 TYPE ("<super>", UC_DECOMP_SUPER
)
8817 TYPE ("<sub>", UC_DECOMP_SUB
)
8818 TYPE ("<vertical>", UC_DECOMP_VERTICAL
)
8819 TYPE ("<wide>", UC_DECOMP_WIDE
)
8820 TYPE ("<narrow>", UC_DECOMP_NARROW
)
8821 TYPE ("<small>", UC_DECOMP_SMALL
)
8822 TYPE ("<square>", UC_DECOMP_SQUARE
)
8823 TYPE ("<fraction>", UC_DECOMP_FRACTION
)
8824 TYPE ("<compat>", UC_DECOMP_COMPAT
)
8826 fprintf (stderr
, "unknown decomposition type %*s\n", (int)typelen
, decomposition
);
8830 decomposition
= rangle
+ 1;
8831 if (decomposition
[0] == ' ')
8834 for (length
= 0; length
< MAX_DECOMP_LENGTH
; length
++)
8836 decomposed
[length
] = strtoul (decomposition
, &endptr
, 16);
8837 if (endptr
== decomposition
)
8839 decomposition
= endptr
;
8840 if (decomposition
[0] == ' ')
8843 /* Make sure that the whole decomposition string has been consumed, i.e.
8844 that *DECOMPOSITION is the terminating NUL. Otherwise MAX_DECOMP_LENGTH is too small. */
8845 assert (*decomposition
== '\0');
8854 /* Construction of sparse 3-level tables. */
8855 #define TABLE decomp_table
8856 #define ELEMENT uint16_t
8857 #define DEFAULT (uint16_t)(-1)
8858 #define xmalloc malloc
8859 #define xrealloc realloc
8863 output_decomposition (FILE *stream1
, FILE *stream2
)
8865 struct decomp_table t
;
8866 unsigned int level1_offset
, level2_offset
, level3_offset
;
8867 unsigned int offset
;
8873 decomp_table_init (&t
);
8875 fprintf (stream1
, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
8876 fprintf (stream1
, "\n");
8877 fprintf (stream2
, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
8880 for (ch
= 0; ch
< 0x110000; ch
++)
8882 unsigned int length
;
8883 unsigned int decomposed
[MAX_DECOMP_LENGTH
];
8884 int type
= get_decomposition (ch
, &length
, decomposed
);
8888 assert (offset
< (1 << 15));
8889 decomp_table_add (&t
, ch
, ((type
== UC_DECOMP_CANONICAL
? 0 : 1) << 15) | offset
);
8891 /* Produce LENGTH entries of 3 bytes each. */
8892 /* We would need a special representation of zero-length entries. */
8893 assert (length
!= 0);
8894 for (i
= 0; i
< length
; i
++)
8897 fprintf (stream2
, ",");
8898 if ((offset
% 4) == 0)
8899 fprintf (stream2
, "\n ");
8900 assert (decomposed
[i
] < (1 << 18));
8901 fprintf (stream2
, " 0x%02X, 0x%02X, 0x%02X",
8902 (((i
+1 < length
? (1 << 23) : 0)
8903 | (i
== 0 ? (type
<< 18) : 0)
8904 | decomposed
[i
]) >> 16) & 0xff,
8905 (decomposed
[i
] >> 8) & 0xff,
8906 decomposed
[i
] & 0xff);
8912 fprintf (stream2
, "\n};\n");
8913 fprintf (stream2
, "\n");
8915 decomp_table_finalize (&t
);
8918 5 * sizeof (uint32_t);
8920 5 * sizeof (uint32_t)
8921 + t
.level1_size
* sizeof (uint32_t);
8923 5 * sizeof (uint32_t)
8924 + t
.level1_size
* sizeof (uint32_t)
8925 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
8927 for (i
= 0; i
< 5; i
++)
8928 fprintf (stream1
, "#define decomp_header_%d %d\n", i
,
8929 ((uint32_t *) t
.result
)[i
]);
8930 fprintf (stream1
, "\n");
8931 fprintf (stream1
, "typedef struct\n");
8932 fprintf (stream1
, " {\n");
8933 fprintf (stream1
, " int level1[%zu];\n", t
.level1_size
);
8934 fprintf (stream1
, " int level2[%zu << %d];\n", t
.level2_size
, t
.q
);
8935 fprintf (stream1
, " unsigned short level3[%zu << %d];\n", t
.level3_size
, t
.p
);
8936 fprintf (stream1
, " }\n");
8937 fprintf (stream1
, "decomp_index_table_t;\n");
8938 fprintf (stream1
, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
8939 fprintf (stream2
, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
8940 fprintf (stream2
, "{\n");
8941 fprintf (stream2
, " {");
8942 if (t
.level1_size
> 8)
8943 fprintf (stream2
, "\n ");
8944 for (i
= 0; i
< t
.level1_size
; i
++)
8947 if (i
> 0 && (i
% 8) == 0)
8948 fprintf (stream2
, "\n ");
8949 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
8951 fprintf (stream2
, " %5d", -1);
8953 fprintf (stream2
, " %5zu",
8954 (offset
- level2_offset
) / sizeof (uint32_t));
8955 if (i
+1 < t
.level1_size
)
8956 fprintf (stream2
, ",");
8958 if (t
.level1_size
> 8)
8959 fprintf (stream2
, "\n ");
8960 fprintf (stream2
, " },\n");
8961 fprintf (stream2
, " {");
8962 if (t
.level2_size
<< t
.q
> 8)
8963 fprintf (stream2
, "\n ");
8964 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
8967 if (i
> 0 && (i
% 8) == 0)
8968 fprintf (stream2
, "\n ");
8969 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
8971 fprintf (stream2
, " %5d", -1);
8973 fprintf (stream2
, " %5zu",
8974 (offset
- level3_offset
) / sizeof (uint16_t));
8975 if (i
+1 < t
.level2_size
<< t
.q
)
8976 fprintf (stream2
, ",");
8978 if (t
.level2_size
<< t
.q
> 8)
8979 fprintf (stream2
, "\n ");
8980 fprintf (stream2
, " },\n");
8981 fprintf (stream2
, " {");
8982 if (t
.level3_size
<< t
.p
> 8)
8983 fprintf (stream2
, "\n ");
8984 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
8986 uint16_t value
= ((uint16_t *) (t
.result
+ level3_offset
))[i
];
8987 if (i
> 0 && (i
% 8) == 0)
8988 fprintf (stream2
, "\n ");
8989 fprintf (stream2
, " %5d", value
== (uint16_t)(-1) ? -1 : value
);
8990 if (i
+1 < t
.level3_size
<< t
.p
)
8991 fprintf (stream2
, ",");
8993 if (t
.level3_size
<< t
.p
> 8)
8994 fprintf (stream2
, "\n ");
8995 fprintf (stream2
, " }\n");
8996 fprintf (stream2
, "};\n");
9000 output_decomposition_tables (const char *filename1
, const char *filename2
, const char *version
)
9002 const char *filenames
[2];
9006 filenames
[0] = filename1
;
9007 filenames
[1] = filename2
;
9009 for (i
= 0; i
< 2; i
++)
9011 streams
[i
] = fopen (filenames
[i
], "w");
9012 if (streams
[i
] == NULL
)
9014 fprintf (stderr
, "cannot open '%s' for writing\n", filenames
[i
]);
9019 for (i
= 0; i
< 2; i
++)
9021 FILE *stream
= streams
[i
];
9023 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9024 fprintf (stream
, "/* Decomposition of Unicode characters. */\n");
9025 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9027 fprintf (stream
, "\n");
9030 output_decomposition (streams
[0], streams
[1]);
9032 for (i
= 0; i
< 2; i
++)
9034 if (ferror (streams
[i
]) || fclose (streams
[i
]))
9036 fprintf (stderr
, "error writing to '%s'\n", filenames
[i
]);
9042 /* The "excluded from composition" property from the CompositionExclusions.txt file. */
9043 char unicode_composition_exclusions
[0x110000];
9046 fill_composition_exclusions (const char *compositionexclusions_filename
)
9051 stream
= fopen (compositionexclusions_filename
, "r");
9054 fprintf (stderr
, "error during fopen of '%s'\n", compositionexclusions_filename
);
9058 for (i
= 0; i
< 0x110000; i
++)
9059 unicode_composition_exclusions
[i
] = 0;
9066 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
9069 if (buf
[0] == '\0' || buf
[0] == '#')
9072 if (sscanf (buf
, "%X", &i
) != 1)
9074 fprintf (stderr
, "parse error in '%s'\n", compositionexclusions_filename
);
9077 assert (i
< 0x110000);
9079 unicode_composition_exclusions
[i
] = 1;
9082 if (ferror (stream
) || fclose (stream
))
9084 fprintf (stderr
, "error reading from '%s'\n", compositionexclusions_filename
);
9090 debug_output_composition_tables (const char *filename
)
9095 stream
= fopen (filename
, "w");
9098 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
9102 for (ch
= 0; ch
< 0x110000; ch
++)
9104 unsigned int length
;
9105 unsigned int decomposed
[MAX_DECOMP_LENGTH
];
9106 int type
= get_decomposition (ch
, &length
, decomposed
);
9108 if (type
== UC_DECOMP_CANONICAL
9109 /* Consider only binary decompositions.
9110 Exclude singleton decompositions. */
9113 unsigned int code1
= decomposed
[0];
9114 unsigned int code2
= decomposed
[1];
9115 unsigned int combined
= ch
;
9117 /* Exclude decompositions where the first part is not a starter,
9118 i.e. is not of canonical combining class 0. */
9119 if (strcmp (unicode_attributes
[code1
].combining
, "0") == 0
9120 /* Exclude characters listed in CompositionExclusions.txt. */
9121 && !unicode_composition_exclusions
[combined
])
9123 /* The combined character must now also be a starter.
9125 assert (strcmp (unicode_attributes
[combined
].combining
, "0") == 0);
9127 fprintf (stream
, "0x%04X\t0x%04X\t0x%04X\t%s\n",
9131 unicode_attributes
[code2
].combining
);
9136 if (ferror (stream
) || fclose (stream
))
9138 fprintf (stderr
, "error writing to '%s'\n", filename
);
9144 output_composition_tables (const char *filename
, const char *version
)
9149 stream
= fopen (filename
, "w");
9152 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
9156 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9157 fprintf (stream
, "/* Canonical composition of Unicode characters. */\n");
9158 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9160 fprintf (stream
, "\n");
9162 /* Put a GPL header on it. The gnulib module is under LGPL (although it
9163 still carries the GPL header), and it's gnulib-tool which replaces the
9164 GPL header with an LGPL header. */
9165 fprintf (stream
, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
9166 fprintf (stream
, "\n");
9167 fprintf (stream
, " This program is free software: you can redistribute it and/or modify\n");
9168 fprintf (stream
, " it under the terms of the GNU General Public License as published by\n");
9169 fprintf (stream
, " the Free Software Foundation; either version 3 of the License, or\n");
9170 fprintf (stream
, " (at your option) any later version.\n");
9171 fprintf (stream
, "\n");
9172 fprintf (stream
, " This program is distributed in the hope that it will be useful,\n");
9173 fprintf (stream
, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
9174 fprintf (stream
, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
9175 fprintf (stream
, " GNU General Public License for more details.\n");
9176 fprintf (stream
, "\n");
9177 fprintf (stream
, " You should have received a copy of the GNU General Public License\n");
9178 fprintf (stream
, " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n");
9179 fprintf (stream
, "\n");
9181 /* The composition table is a set of mappings (code1, code2) -> combined,
9183 367 values for code1 (from 0x003C to 0x30FD),
9184 54 values for code2 (from 0x0300 to 0x309A).
9185 For a fixed code1, there are from 1 to 19 possible values for code2.
9186 For a fixed code2, there are from 1 to 117 possible values for code1.
9187 This is a very sparse matrix.
9189 We want an O(1) hash lookup.
9191 We could implement the hash lookup by mapping (code1, code2) to a linear
9192 combination mul1*code1 + mul2*code2, which is then used as an index into
9193 a 3-level table. But this leads to a table of size 37 KB.
9195 We use gperf to implement the hash lookup, giving it the 928 sets of
9196 4 bytes (code1, code2) as input. gperf generates a hash table of size
9197 1527, which is quite good (60% filled). It requires an auxiliary table
9198 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
9200 fprintf (stream
, "struct composition_rule { char codes[6]; };\n");
9201 fprintf (stream
, "%%struct-type\n");
9202 fprintf (stream
, "%%language=ANSI-C\n");
9203 fprintf (stream
, "%%define slot-name codes\n");
9204 fprintf (stream
, "%%define hash-function-name gl_uninorm_compose_hash\n");
9205 fprintf (stream
, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
9206 fprintf (stream
, "%%compare-lengths\n");
9207 fprintf (stream
, "%%compare-strncmp\n");
9208 fprintf (stream
, "%%readonly-tables\n");
9209 fprintf (stream
, "%%omit-struct-type\n");
9210 fprintf (stream
, "%%%%\n");
9212 for (ch
= 0; ch
< 0x110000; ch
++)
9214 unsigned int length
;
9215 unsigned int decomposed
[MAX_DECOMP_LENGTH
];
9216 int type
= get_decomposition (ch
, &length
, decomposed
);
9218 if (type
== UC_DECOMP_CANONICAL
9219 /* Consider only binary decompositions.
9220 Exclude singleton decompositions. */
9223 unsigned int code1
= decomposed
[0];
9224 unsigned int code2
= decomposed
[1];
9225 unsigned int combined
= ch
;
9227 /* Exclude decompositions where the first part is not a starter,
9228 i.e. is not of canonical combining class 0. */
9229 if (strcmp (unicode_attributes
[code1
].combining
, "0") == 0
9230 /* Exclude characters listed in CompositionExclusions.txt. */
9231 && !unicode_composition_exclusions
[combined
])
9233 /* The combined character must now also be a starter.
9235 assert (strcmp (unicode_attributes
[combined
].combining
, "0") == 0);
9237 fprintf (stream
, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
9238 (code1
>> 16) & 0xff, (code1
>> 8) & 0xff, code1
& 0xff,
9239 (code2
>> 16) & 0xff, (code2
>> 8) & 0xff, code2
& 0xff,
9245 if (ferror (stream
) || fclose (stream
))
9247 fprintf (stderr
, "error writing to '%s'\n", filename
);
9252 /* ========================================================================= */
9254 /* Output the test for a simple character mapping table to the given file. */
9257 output_simple_mapping_test (const char *filename
,
9258 const char *function_name
,
9259 unsigned int (*func
) (unsigned int),
9260 const char *version
)
9266 stream
= fopen (filename
, "w");
9269 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
9273 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9274 fprintf (stream
, "/* Test the Unicode character mapping functions.\n");
9275 fprintf (stream
, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
9276 fprintf (stream
, "\n");
9277 fprintf (stream
, " This program is free software: you can redistribute it and/or modify\n");
9278 fprintf (stream
, " it under the terms of the GNU General Public License as published by\n");
9279 fprintf (stream
, " the Free Software Foundation; either version 3 of the License, or\n");
9280 fprintf (stream
, " (at your option) any later version.\n");
9281 fprintf (stream
, "\n");
9282 fprintf (stream
, " This program is distributed in the hope that it will be useful,\n");
9283 fprintf (stream
, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
9284 fprintf (stream
, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
9285 fprintf (stream
, " GNU General Public License for more details.\n");
9286 fprintf (stream
, "\n");
9287 fprintf (stream
, " You should have received a copy of the GNU General Public License\n");
9288 fprintf (stream
, " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n");
9289 fprintf (stream
, "\n");
9290 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9292 fprintf (stream
, "\n");
9293 fprintf (stream
, "#include \"test-mapping-part1.h\"\n");
9294 fprintf (stream
, "\n");
9297 for (ch
= 0; ch
< 0x110000; ch
++)
9299 unsigned int value
= func (ch
);
9304 fprintf (stream
, ",\n");
9305 fprintf (stream
, " { 0x%04X, 0x%04X }", ch
, value
);
9310 fprintf (stream
, "\n");
9312 fprintf (stream
, "\n");
9313 fprintf (stream
, "#define MAP(c) %s (c)\n", function_name
);
9314 fprintf (stream
, "#include \"test-mapping-part2.h\"\n");
9316 if (ferror (stream
) || fclose (stream
))
9318 fprintf (stderr
, "error writing to '%s'\n", filename
);
9323 /* Construction of sparse 3-level tables. */
9324 #define TABLE mapping_table
9325 #define ELEMENT int32_t
9327 #define xmalloc malloc
9328 #define xrealloc realloc
9331 /* Output a simple character mapping table to the given file. */
9334 output_simple_mapping (const char *filename
,
9335 unsigned int (*func
) (unsigned int),
9336 const char *version
)
9340 struct mapping_table t
;
9341 unsigned int level1_offset
, level2_offset
, level3_offset
;
9343 stream
= fopen (filename
, "w");
9346 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
9350 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9351 fprintf (stream
, "/* Simple character mapping of Unicode characters. */\n");
9352 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9357 mapping_table_init (&t
);
9359 for (ch
= 0; ch
< 0x110000; ch
++)
9361 int value
= (int) func (ch
) - (int) ch
;
9363 mapping_table_add (&t
, ch
, value
);
9366 mapping_table_finalize (&t
);
9368 /* Offsets in t.result, in memory of this process. */
9370 5 * sizeof (uint32_t);
9372 5 * sizeof (uint32_t)
9373 + t
.level1_size
* sizeof (uint32_t);
9375 5 * sizeof (uint32_t)
9376 + t
.level1_size
* sizeof (uint32_t)
9377 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
9379 for (i
= 0; i
< 5; i
++)
9380 fprintf (stream
, "#define mapping_header_%d %d\n", i
,
9381 ((uint32_t *) t
.result
)[i
]);
9382 fprintf (stream
, "static const\n");
9383 fprintf (stream
, "struct\n");
9384 fprintf (stream
, " {\n");
9385 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
9386 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
9387 fprintf (stream
, " int level3[%zu << %d];\n", t
.level3_size
, t
.p
);
9388 fprintf (stream
, " }\n");
9389 fprintf (stream
, "u_mapping =\n");
9390 fprintf (stream
, "{\n");
9391 fprintf (stream
, " {");
9392 if (t
.level1_size
> 8)
9393 fprintf (stream
, "\n ");
9394 for (i
= 0; i
< t
.level1_size
; i
++)
9397 if (i
> 0 && (i
% 8) == 0)
9398 fprintf (stream
, "\n ");
9399 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
9401 fprintf (stream
, " %5d", -1);
9403 fprintf (stream
, " %5zu",
9404 (offset
- level2_offset
) / sizeof (uint32_t));
9405 if (i
+1 < t
.level1_size
)
9406 fprintf (stream
, ",");
9408 if (t
.level1_size
> 8)
9409 fprintf (stream
, "\n ");
9410 fprintf (stream
, " },\n");
9411 fprintf (stream
, " {");
9412 if (t
.level2_size
<< t
.q
> 8)
9413 fprintf (stream
, "\n ");
9414 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
9417 if (i
> 0 && (i
% 8) == 0)
9418 fprintf (stream
, "\n ");
9419 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
9421 fprintf (stream
, " %5d", -1);
9423 fprintf (stream
, " %5zu",
9424 (offset
- level3_offset
) / sizeof (int32_t));
9425 if (i
+1 < t
.level2_size
<< t
.q
)
9426 fprintf (stream
, ",");
9428 if (t
.level2_size
<< t
.q
> 8)
9429 fprintf (stream
, "\n ");
9430 fprintf (stream
, " },\n");
9431 fprintf (stream
, " {");
9432 if (t
.level3_size
<< t
.p
> 8)
9433 fprintf (stream
, "\n ");
9434 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
9436 if (i
> 0 && (i
% 8) == 0)
9437 fprintf (stream
, "\n ");
9438 fprintf (stream
, " %5d", ((int32_t *) (t
.result
+ level3_offset
))[i
]);
9439 if (i
+1 < t
.level3_size
<< t
.p
)
9440 fprintf (stream
, ",");
9442 if (t
.level3_size
<< t
.p
> 8)
9443 fprintf (stream
, "\n ");
9444 fprintf (stream
, " }\n");
9445 fprintf (stream
, "};\n");
9447 if (ferror (stream
) || fclose (stream
))
9449 fprintf (stderr
, "error writing to '%s'\n", filename
);
9454 /* ========================================================================= */
9456 /* A special casing context.
9457 A context is negated through x -> -x. */
9462 SCC_AFTER_SOFT_DOTTED
,
9468 /* A special casing rule. */
9469 struct special_casing_rule
9472 unsigned int lower_mapping
[3];
9473 unsigned int title_mapping
[3];
9474 unsigned int upper_mapping
[3];
9475 unsigned int casefold_mapping
[3];
9476 const char *language
;
9480 /* The special casing rules. */
9481 struct special_casing_rule
**casing_rules
;
9482 unsigned int num_casing_rules
;
9483 unsigned int allocated_casing_rules
;
9486 add_casing_rule (struct special_casing_rule
*new_rule
)
9488 if (num_casing_rules
== allocated_casing_rules
)
9490 allocated_casing_rules
= 2 * allocated_casing_rules
;
9491 if (allocated_casing_rules
< 16)
9492 allocated_casing_rules
= 16;
9494 (struct special_casing_rule
**)
9495 realloc (casing_rules
, allocated_casing_rules
* sizeof (struct special_casing_rule
*));
9497 casing_rules
[num_casing_rules
++] = new_rule
;
9500 /* Stores in casing_rules the special casing rules found in
9501 specialcasing_filename. */
9503 fill_casing_rules (const char *specialcasing_filename
)
9507 stream
= fopen (specialcasing_filename
, "r");
9510 fprintf (stderr
, "error during fopen of '%s'\n", specialcasing_filename
);
9514 casing_rules
= NULL
;
9515 num_casing_rules
= 0;
9516 allocated_casing_rules
= 0;
9526 unsigned int lower_mapping
[3];
9527 unsigned int title_mapping
[3];
9528 unsigned int upper_mapping
[3];
9532 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
9535 if (buf
[0] == '\0' || buf
[0] == '#')
9540 code
= strtoul (scanptr
, &endptr
, 16);
9541 if (endptr
== scanptr
)
9543 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
9547 if (*scanptr
!= ';')
9549 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
9554 /* Scan lower mapping. */
9555 for (i
= 0; i
< 3; i
++)
9556 lower_mapping
[i
] = 0;
9557 for (i
= 0; i
< 3; i
++)
9559 while (*scanptr
== ' ')
9561 if (*scanptr
== ';')
9563 lower_mapping
[i
] = strtoul (scanptr
, &endptr
, 16);
9564 if (endptr
== scanptr
)
9566 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
9571 if (*scanptr
!= ';')
9573 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
9578 /* Scan title mapping. */
9579 for (i
= 0; i
< 3; i
++)
9580 title_mapping
[i
] = 0;
9581 for (i
= 0; i
< 3; i
++)
9583 while (*scanptr
== ' ')
9585 if (*scanptr
== ';')
9587 title_mapping
[i
] = strtoul (scanptr
, &endptr
, 16);
9588 if (endptr
== scanptr
)
9590 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
9595 if (*scanptr
!= ';')
9597 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
9602 /* Scan upper mapping. */
9603 for (i
= 0; i
< 3; i
++)
9604 upper_mapping
[i
] = 0;
9605 for (i
= 0; i
< 3; i
++)
9607 while (*scanptr
== ' ')
9609 if (*scanptr
== ';')
9611 upper_mapping
[i
] = strtoul (scanptr
, &endptr
, 16);
9612 if (endptr
== scanptr
)
9614 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
9619 if (*scanptr
!= ';')
9621 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
9626 /* Scan language and context. */
9628 context
= SCC_ALWAYS
;
9629 while (*scanptr
== ' ')
9631 if (*scanptr
!= '\0' && *scanptr
!= '#')
9633 const char *word_begin
= scanptr
;
9634 const char *word_end
;
9636 while (*scanptr
!= '\0' && *scanptr
!= '#' && *scanptr
!= ';' && *scanptr
!= ' ')
9640 while (*scanptr
== ' ')
9643 if (word_end
- word_begin
== 2)
9645 language
= (char *) malloc ((word_end
- word_begin
) + 1);
9646 memcpy (language
, word_begin
, 2);
9647 language
[word_end
- word_begin
] = '\0';
9648 word_begin
= word_end
= NULL
;
9650 if (*scanptr
!= '\0' && *scanptr
!= '#' && *scanptr
!= ';')
9652 word_begin
= scanptr
;
9653 while (*scanptr
!= '\0' && *scanptr
!= '#' && *scanptr
!= ';' && *scanptr
!= ' ')
9659 if (word_end
> word_begin
)
9661 bool negate
= false;
9663 if (word_end
- word_begin
>= 4 && memcmp (word_begin
, "Not_", 4) == 0)
9668 if (word_end
- word_begin
== 11 && memcmp (word_begin
, "Final_Sigma", 11) == 0)
9669 context
= SCC_FINAL_SIGMA
;
9670 else if (word_end
- word_begin
== 17 && memcmp (word_begin
, "After_Soft_Dotted", 17) == 0)
9671 context
= SCC_AFTER_SOFT_DOTTED
;
9672 else if (word_end
- word_begin
== 10 && memcmp (word_begin
, "More_Above", 10) == 0)
9673 context
= SCC_MORE_ABOVE
;
9674 else if (word_end
- word_begin
== 10 && memcmp (word_begin
, "Before_Dot", 10) == 0)
9675 context
= SCC_BEFORE_DOT
;
9676 else if (word_end
- word_begin
== 7 && memcmp (word_begin
, "After_I", 7) == 0)
9677 context
= SCC_AFTER_I
;
9680 fprintf (stderr
, "unknown context type in '%s'\n", specialcasing_filename
);
9684 context
= - context
;
9687 if (*scanptr
!= '\0' && *scanptr
!= '#' && *scanptr
!= ';')
9689 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
9694 /* Store the rule. */
9696 struct special_casing_rule
*new_rule
=
9697 (struct special_casing_rule
*) malloc (sizeof (struct special_casing_rule
));
9698 new_rule
->code
= code
;
9699 new_rule
->language
= language
;
9700 new_rule
->context
= context
;
9701 memcpy (new_rule
->lower_mapping
, lower_mapping
, sizeof (new_rule
->lower_mapping
));
9702 memcpy (new_rule
->title_mapping
, title_mapping
, sizeof (new_rule
->title_mapping
));
9703 memcpy (new_rule
->upper_mapping
, upper_mapping
, sizeof (new_rule
->upper_mapping
));
9705 add_casing_rule (new_rule
);
9709 if (ferror (stream
) || fclose (stream
))
9711 fprintf (stderr
, "error reading from '%s'\n", specialcasing_filename
);
9716 /* A casefolding rule. */
9717 struct casefold_rule
9720 unsigned int mapping
[3];
9721 const char *language
;
9724 /* The casefolding rules. */
9725 struct casefold_rule
**casefolding_rules
;
9726 unsigned int num_casefolding_rules
;
9727 unsigned int allocated_casefolding_rules
;
9729 /* Stores in casefolding_rules the case folding rules found in
9730 casefolding_filename. */
9732 fill_casefolding_rules (const char *casefolding_filename
)
9736 stream
= fopen (casefolding_filename
, "r");
9739 fprintf (stderr
, "error during fopen of '%s'\n", casefolding_filename
);
9743 casefolding_rules
= NULL
;
9744 num_casefolding_rules
= 0;
9745 allocated_casefolding_rules
= 0;
9756 unsigned int mapping
[3];
9758 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
9761 if (buf
[0] == '\0' || buf
[0] == '#')
9766 code
= strtoul (scanptr
, &endptr
, 16);
9767 if (endptr
== scanptr
)
9769 fprintf (stderr
, "parse error in '%s'\n", casefolding_filename
);
9773 if (*scanptr
!= ';')
9775 fprintf (stderr
, "parse error in '%s'\n", casefolding_filename
);
9781 while (*scanptr
== ' ')
9786 case 'C': case 'F': case 'S': case 'T':
9790 fprintf (stderr
, "parse error in '%s'\n", casefolding_filename
);
9794 if (*scanptr
!= ';')
9796 fprintf (stderr
, "parse error in '%s'\n", casefolding_filename
);
9801 /* Scan casefold mapping. */
9802 for (i
= 0; i
< 3; i
++)
9804 for (i
= 0; i
< 3; i
++)
9806 while (*scanptr
== ' ')
9808 if (*scanptr
== ';')
9810 mapping
[i
] = strtoul (scanptr
, &endptr
, 16);
9811 if (endptr
== scanptr
)
9813 fprintf (stderr
, "parse error in '%s'\n", casefolding_filename
);
9818 if (*scanptr
!= ';')
9820 fprintf (stderr
, "parse error in '%s'\n", casefolding_filename
);
9825 /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
9828 const char * const *languages
;
9829 unsigned int languages_count
;
9831 /* Type 'T' indicates that the rule is applicable to Turkish
9835 static const char * const turkish_languages
[] = { "tr", "az" };
9836 languages
= turkish_languages
;
9837 languages_count
= 2;
9841 static const char * const all_languages
[] = { NULL
};
9842 languages
= all_languages
;
9843 languages_count
= 1;
9846 for (i
= 0; i
< languages_count
; i
++)
9848 /* Store a new rule. */
9849 struct casefold_rule
*new_rule
=
9850 (struct casefold_rule
*) malloc (sizeof (struct casefold_rule
));
9851 new_rule
->code
= code
;
9852 memcpy (new_rule
->mapping
, mapping
, sizeof (new_rule
->mapping
));
9853 new_rule
->language
= languages
[i
];
9855 if (num_casefolding_rules
== allocated_casefolding_rules
)
9857 allocated_casefolding_rules
= 2 * allocated_casefolding_rules
;
9858 if (allocated_casefolding_rules
< 16)
9859 allocated_casefolding_rules
= 16;
9861 (struct casefold_rule
**)
9862 realloc (casefolding_rules
,
9863 allocated_casefolding_rules
* sizeof (struct casefold_rule
*));
9865 casefolding_rules
[num_casefolding_rules
++] = new_rule
;
9870 if (ferror (stream
) || fclose (stream
))
9872 fprintf (stderr
, "error reading from '%s'\n", casefolding_filename
);
9877 /* Casefold mapping, when it maps to a single character. */
9878 unsigned int unicode_casefold
[0x110000];
9881 to_casefold (unsigned int ch
)
9883 return unicode_casefold
[ch
];
9886 /* Redistribute the casefolding_rules:
9887 - Rules that map to a single character, language independently, are stored
9888 in unicode_casefold.
9889 - Other rules are merged into casing_rules. */
9891 redistribute_casefolding_rules (void)
9893 unsigned int ch
, i
, j
;
9895 /* Fill unicode_casefold[]. */
9896 for (ch
= 0; ch
< 0x110000; ch
++)
9897 unicode_casefold
[ch
] = ch
;
9898 for (i
= 0; i
< num_casefolding_rules
; i
++)
9900 struct casefold_rule
*cfrule
= casefolding_rules
[i
];
9902 if (cfrule
->language
== NULL
&& cfrule
->mapping
[1] == 0)
9905 assert (ch
< 0x110000);
9906 unicode_casefold
[ch
] = cfrule
->mapping
[0];
9910 /* Extend the special casing rules by filling in their casefold_mapping[]
9912 for (j
= 0; j
< num_casing_rules
; j
++)
9914 struct special_casing_rule
*rule
= casing_rules
[j
];
9917 rule
->casefold_mapping
[0] = to_casefold (rule
->code
);
9918 for (k
= 1; k
< 3; k
++)
9919 rule
->casefold_mapping
[k
] = 0;
9922 /* Now merge the other casefolding rules into casing_rules. */
9923 for (i
= 0; i
< num_casefolding_rules
; i
++)
9925 struct casefold_rule
*cfrule
= casefolding_rules
[i
];
9927 if (!(cfrule
->language
== NULL
&& cfrule
->mapping
[1] == 0))
9929 /* Find a rule that applies to the same code, same language, and it
9930 has context SCC_ALWAYS. At the same time, update all rules that
9931 have the same code and same or more specific language. */
9932 struct special_casing_rule
*found_rule
= NULL
;
9934 for (j
= 0; j
< num_casing_rules
; j
++)
9936 struct special_casing_rule
*rule
= casing_rules
[j
];
9938 if (rule
->code
== cfrule
->code
9939 && (cfrule
->language
== NULL
9940 || (rule
->language
!= NULL
9941 && strcmp (rule
->language
, cfrule
->language
) == 0)))
9943 memcpy (rule
->casefold_mapping
, cfrule
->mapping
,
9944 sizeof (rule
->casefold_mapping
));
9946 if ((cfrule
->language
== NULL
9947 ? rule
->language
== NULL
9948 : rule
->language
!= NULL
9949 && strcmp (rule
->language
, cfrule
->language
) == 0)
9950 && rule
->context
== SCC_ALWAYS
)
9958 if (found_rule
== NULL
)
9960 /* Create a new rule. */
9961 struct special_casing_rule
*new_rule
=
9962 (struct special_casing_rule
*) malloc (sizeof (struct special_casing_rule
));
9964 /* Try to find a rule that applies to the same code, no language
9965 restriction, and with context SCC_ALWAYS. */
9966 for (j
= 0; j
< num_casing_rules
; j
++)
9968 struct special_casing_rule
*rule
= casing_rules
[j
];
9970 if (rule
->code
== cfrule
->code
9971 && rule
->context
== SCC_ALWAYS
9972 && rule
->language
== NULL
)
9980 new_rule
->code
= cfrule
->code
;
9981 new_rule
->language
= cfrule
->language
;
9982 new_rule
->context
= SCC_ALWAYS
;
9983 if (found_rule
!= NULL
)
9985 memcpy (new_rule
->lower_mapping
, found_rule
->lower_mapping
,
9986 sizeof (new_rule
->lower_mapping
));
9987 memcpy (new_rule
->title_mapping
, found_rule
->title_mapping
,
9988 sizeof (new_rule
->title_mapping
));
9989 memcpy (new_rule
->upper_mapping
, found_rule
->upper_mapping
,
9990 sizeof (new_rule
->upper_mapping
));
9996 new_rule
->lower_mapping
[0] = to_lower (cfrule
->code
);
9997 for (k
= 1; k
< 3; k
++)
9998 new_rule
->lower_mapping
[k
] = 0;
9999 new_rule
->title_mapping
[0] = to_title (cfrule
->code
);
10000 for (k
= 1; k
< 3; k
++)
10001 new_rule
->title_mapping
[k
] = 0;
10002 new_rule
->upper_mapping
[0] = to_upper (cfrule
->code
);
10003 for (k
= 1; k
< 3; k
++)
10004 new_rule
->upper_mapping
[k
] = 0;
10006 memcpy (new_rule
->casefold_mapping
, cfrule
->mapping
,
10007 sizeof (new_rule
->casefold_mapping
));
10009 add_casing_rule (new_rule
);
10016 compare_casing_rules (const void *a
, const void *b
)
10018 struct special_casing_rule
*a_rule
= *(struct special_casing_rule
**) a
;
10019 struct special_casing_rule
*b_rule
= *(struct special_casing_rule
**) b
;
10020 unsigned int a_code
= a_rule
->code
;
10021 unsigned int b_code
= b_rule
->code
;
10023 if (a_code
< b_code
)
10025 if (a_code
> b_code
)
10028 /* Sort the more specific rules before the more general ones. */
10029 return (- ((a_rule
->language
!= NULL
? 1 : 0) + (a_rule
->context
!= SCC_ALWAYS
? 1 : 0))
10030 + ((b_rule
->language
!= NULL
? 1 : 0) + (b_rule
->context
!= SCC_ALWAYS
? 1 : 0)));
10034 sort_casing_rules (void)
10036 /* Sort the rules 1. by code, 2. by specificity. */
10037 if (num_casing_rules
> 1)
10038 qsort (casing_rules
, num_casing_rules
, sizeof (struct special_casing_rule
*),
10039 compare_casing_rules
);
10042 /* Output the special casing rules. */
10044 output_casing_rules (const char *filename
, const char *version
)
10048 unsigned int minor
;
10050 stream
= fopen (filename
, "w");
10051 if (stream
== NULL
)
10053 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
10057 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10058 fprintf (stream
, "/* Special casing rules of Unicode characters. */\n");
10059 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10061 fprintf (stream
, "struct special_casing_rule { char code[3]; };\n");
10062 fprintf (stream
, "%%struct-type\n");
10063 fprintf (stream
, "%%language=ANSI-C\n");
10064 fprintf (stream
, "%%define slot-name code\n");
10065 fprintf (stream
, "%%define hash-function-name gl_unicase_special_hash\n");
10066 fprintf (stream
, "%%define lookup-function-name gl_unicase_special_lookup\n");
10067 fprintf (stream
, "%%compare-lengths\n");
10068 fprintf (stream
, "%%compare-strncmp\n");
10069 fprintf (stream
, "%%readonly-tables\n");
10070 fprintf (stream
, "%%omit-struct-type\n");
10071 fprintf (stream
, "%%%%\n");
10074 for (i
= 0; i
< num_casing_rules
; i
++)
10076 struct special_casing_rule
*rule
= casing_rules
[i
];
10079 if (i
> 0 && rule
->code
== casing_rules
[i
- 1]->code
)
10084 if (!(rule
->code
< 0x10000))
10086 fprintf (stderr
, "special rule #%u: code %u out of range\n", i
, rule
->code
);
10090 fprintf (stream
, "\"\\x%02x\\x%02x\\x%02x\", ",
10091 (rule
->code
>> 8) & 0xff, rule
->code
& 0xff, minor
);
10093 fprintf (stream
, "%d, ",
10094 i
+ 1 < num_casing_rules
&& casing_rules
[i
+ 1]->code
== rule
->code
? 1 : 0);
10096 context
= rule
->context
;
10099 fprintf (stream
, "-");
10100 context
= - context
;
10103 fprintf (stream
, " ");
10107 fprintf (stream
, "SCC_ALWAYS ");
10109 case SCC_FINAL_SIGMA
:
10110 fprintf (stream
, "SCC_FINAL_SIGMA ");
10112 case SCC_AFTER_SOFT_DOTTED
:
10113 fprintf (stream
, "SCC_AFTER_SOFT_DOTTED");
10115 case SCC_MORE_ABOVE
:
10116 fprintf (stream
, "SCC_MORE_ABOVE ");
10118 case SCC_BEFORE_DOT
:
10119 fprintf (stream
, "SCC_BEFORE_DOT ");
10122 fprintf (stream
, "SCC_AFTER_I ");
10127 fprintf (stream
, ", ");
10129 if (rule
->language
!= NULL
)
10131 assert (strlen (rule
->language
) == 2);
10132 fprintf (stream
, "{ '%c', '%c' }, ", rule
->language
[0], rule
->language
[1]);
10135 fprintf (stream
, "{ '\\0', '\\0' }, ");
10137 fprintf (stream
, "{ ");
10138 for (j
= 0; j
< 3; j
++)
10141 fprintf (stream
, ", ");
10142 if (!(rule
->upper_mapping
[j
] < 0x10000))
10144 fprintf (stderr
, "special rule #%u: upper mapping of code %u out of range\n", i
, rule
->code
);
10147 if (rule
->upper_mapping
[j
] != 0)
10148 fprintf (stream
, "0x%04X", rule
->upper_mapping
[j
]);
10150 fprintf (stream
, " 0");
10152 fprintf (stream
, " }, { ");
10153 for (j
= 0; j
< 3; j
++)
10156 fprintf (stream
, ", ");
10157 if (!(rule
->lower_mapping
[j
] < 0x10000))
10159 fprintf (stderr
, "special rule #%u: lower mapping of code %u out of range\n", i
, rule
->code
);
10162 if (rule
->lower_mapping
[j
] != 0)
10163 fprintf (stream
, "0x%04X", rule
->lower_mapping
[j
]);
10165 fprintf (stream
, " 0");
10167 fprintf (stream
, " }, { ");
10168 for (j
= 0; j
< 3; j
++)
10171 fprintf (stream
, ", ");
10172 if (!(rule
->title_mapping
[j
] < 0x10000))
10174 fprintf (stderr
, "special rule #%u: title mapping of code %u out of range\n", i
, rule
->code
);
10177 if (rule
->title_mapping
[j
] != 0)
10178 fprintf (stream
, "0x%04X", rule
->title_mapping
[j
]);
10180 fprintf (stream
, " 0");
10182 fprintf (stream
, " }, { ");
10183 for (j
= 0; j
< 3; j
++)
10186 fprintf (stream
, ", ");
10187 if (!(rule
->casefold_mapping
[j
] < 0x10000))
10189 fprintf (stderr
, "special rule #%u: casefold mapping of code %u out of range\n", i
, rule
->code
);
10192 if (rule
->casefold_mapping
[j
] != 0)
10193 fprintf (stream
, "0x%04X", rule
->casefold_mapping
[j
]);
10195 fprintf (stream
, " 0");
10197 fprintf (stream
, " }\n");
10200 if (ferror (stream
) || fclose (stream
))
10202 fprintf (stderr
, "error writing to '%s'\n", filename
);
10207 /* ========================================================================= */
10209 /* Quoting the Unicode standard:
10210 Definition: A character is defined to be "cased" if it has the Lowercase
10211 or Uppercase property or has a General_Category value of
10212 Titlecase_Letter. */
10214 is_cased (unsigned int ch
)
10216 return (is_property_lowercase (ch
)
10217 || is_property_uppercase (ch
)
10218 || is_category_Lt (ch
));
10221 /* Quoting the Unicode standard:
10222 Definition: A character is defined to be "case-ignorable" if it has the
10223 value MidLetter {or the value MidNumLet} for the Word_Break property or
10224 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
10225 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10226 The text marked in braces was added in Unicode 5.1.0, see
10227 <https://www.unicode.org/versions/Unicode5.1.0/> section "Update of
10228 Definition of case-ignorable". */
10229 /* Since this predicate is only used for the "Before C" and "After C"
10230 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
10231 This simplifies the evaluation of the regular expressions
10232 \p{cased} (\p{case-ignorable})* C
10234 C (\p{case-ignorable})* \p{cased}
10237 is_case_ignorable (unsigned int ch
)
10239 return (unicode_org_wbp
[ch
] == WBP_MIDLETTER
10240 || unicode_org_wbp
[ch
] == WBP_MIDNUMLET
10241 || is_category_Mn (ch
)
10242 || is_category_Me (ch
)
10243 || is_category_Cf (ch
)
10244 || is_category_Lm (ch
)
10245 || is_category_Sk (ch
))
10249 /* ------------------------------------------------------------------------- */
10251 /* Output all case related properties. */
10253 output_casing_properties (const char *version
)
10255 #define PROPERTY(FN,P) \
10256 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
10257 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
10258 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
10259 PROPERTY(cased
, cased
)
10260 PROPERTY(ignorable
, case_ignorable
)
10264 /* ========================================================================= */
10267 main (int argc
, char * argv
[])
10269 const char *unicodedata_filename
;
10270 const char *proplist_filename
;
10271 const char *derivedproplist_filename
;
10272 const char *arabicshaping_filename
;
10273 const char *scripts_filename
;
10274 const char *blocks_filename
;
10275 const char *proplist30_filename
;
10276 const char *eastasianwidth_filename
;
10277 const char *linebreak_filename
;
10278 const char *wordbreakproperty_filename
;
10279 const char *graphemebreakproperty_filename
;
10280 const char *compositionexclusions_filename
;
10281 const char *specialcasing_filename
;
10282 const char *casefolding_filename
;
10283 const char *version
;
10287 fprintf (stderr
, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
10292 unicodedata_filename
= argv
[1];
10293 proplist_filename
= argv
[2];
10294 derivedproplist_filename
= argv
[3];
10295 arabicshaping_filename
= argv
[4];
10296 scripts_filename
= argv
[5];
10297 blocks_filename
= argv
[6];
10298 proplist30_filename
= argv
[7];
10299 eastasianwidth_filename
= argv
[8];
10300 linebreak_filename
= argv
[9];
10301 wordbreakproperty_filename
= argv
[10];
10302 graphemebreakproperty_filename
= argv
[11];
10303 compositionexclusions_filename
= argv
[12];
10304 specialcasing_filename
= argv
[13];
10305 casefolding_filename
= argv
[14];
10306 version
= argv
[15];
10308 fill_attributes (unicodedata_filename
);
10309 clear_properties ();
10310 fill_properties (proplist_filename
);
10311 fill_properties (derivedproplist_filename
);
10312 fill_properties30 (proplist30_filename
);
10313 fill_arabicshaping (arabicshaping_filename
);
10314 fill_scripts (scripts_filename
);
10315 fill_blocks (blocks_filename
);
10316 fill_width (eastasianwidth_filename
);
10317 fill_org_lbp (linebreak_filename
);
10318 fill_org_wbp (wordbreakproperty_filename
);
10319 fill_org_gbp (graphemebreakproperty_filename
);
10320 fill_composition_exclusions (compositionexclusions_filename
);
10321 fill_casing_rules (specialcasing_filename
);
10322 fill_casefolding_rules (casefolding_filename
);
10323 redistribute_casefolding_rules ();
10324 sort_casing_rules ();
10326 output_categories (version
);
10327 output_category ("unictype/categ_of.h", version
);
10328 output_combclass ("unictype/combiningclass.h", version
);
10329 output_bidi_category ("unictype/bidi_of.h", version
);
10330 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version
);
10331 output_decimal_digit ("unictype/decdigit.h", version
);
10332 output_digit_test ("../tests/unictype/test-digit.h", version
);
10333 output_digit ("unictype/digit.h", version
);
10334 output_numeric_test ("../tests/unictype/test-numeric.h", version
);
10335 output_numeric ("unictype/numeric.h", version
);
10336 output_mirror ("unictype/mirror.h", version
);
10337 output_properties (version
);
10338 output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version
);
10339 output_joining_type ("unictype/joiningtype_of.h", version
);
10340 output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version
);
10341 output_joining_group ("unictype/joininggroup_of.h", version
);
10343 output_scripts (version
);
10344 output_scripts_byname (version
);
10345 output_blocks (version
);
10346 output_ident_properties (version
);
10347 output_nonspacing_property ("uniwidth/width.c.part");
10348 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
10349 output_old_ctype (version
);
10351 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
10352 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
10353 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version
);
10355 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
10356 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
10357 output_wbrk_tables ("uniwbrk/wbrkprop.h", version
);
10359 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
10360 output_gbp_table ("unigbrk/gbrkprop.h", version
);
10362 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version
);
10363 debug_output_composition_tables ("uninorm/composition.txt");
10364 output_composition_tables ("uninorm/composition-table.gperf", version
);
10366 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper
, version
);
10367 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower
, version
);
10368 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title
, version
);
10369 output_simple_mapping ("unicase/toupper.h", to_upper
, version
);
10370 output_simple_mapping ("unicase/tolower.h", to_lower
, version
);
10371 output_simple_mapping ("unicase/totitle.h", to_title
, version
);
10372 output_simple_mapping ("unicase/tocasefold.h", to_casefold
, version
);
10373 output_casing_rules ("unicase/special-casing-table.gperf", version
);
10374 output_casing_properties (version
);
/*
 * For Emacs M-x compile
 * Local Variables:
 * compile-command: "\
 *   gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \\
 *   ./gen-uni-tables \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/UnicodeData.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/PropList.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/DerivedCoreProperties.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/ArabicShaping.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/Scripts.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/Blocks.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/EastAsianWidth.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/LineBreak.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/auxiliary/WordBreakProperty.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/CompositionExclusions.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/SpecialCasing.txt \\
 *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/CaseFolding.txt \\
 *        9.0.0 \\
 *   && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \\
 *   && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt"
 * End:
 */