1 /* Line breaking auxiliary tables.
2 Copyright (C) 2001-2003, 2006-2024 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2001.
5 This file is free software.
6 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7 You can redistribute it and/or modify it under either
8 - the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation, either version 3, or (at your
10 option) any later version, or
- the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option)
any later version, or
- the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
16 This file is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License and the GNU General Public License
for more details.
22 You should have received a copy of the GNU Lesser General Public
23 License and of the GNU General Public License along with this
24 program. If not, see <https://www.gnu.org/licenses/>. */
/* Line breaking classification.

   Each LBP_* constant names one line breaking class.  The classes are
   split into two numeric ranges:
     - values 0..39 are the classes that remain after resolution
       (presumably the ones usable as indices into the 40x40 pair table
       declared further down in this file -- confirm against its users);
     - values >= 40 are resolved at run time.
   The constants are listed in class order, not in numeric order.  */
enum
{
  /* Values >= 40 are resolved at run time. */
  LBP_BK  = 40, /* mandatory break */
  LBP_CR  = 41, /* carriage return */
  LBP_LF  = 42, /* line feed */
  LBP_CM  = 43, /* attached characters and combining marks */
/*LBP_NL,          next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG,          surrogates - not used here because they are not characters */
  LBP_WJ  =  0, /* word joiner */
  LBP_ZW  = 44, /* zero width space */
  LBP_GL  =  1, /* non-breaking (glue) */
  LBP_SP  = 45, /* space */
  LBP_B2  =  2, /* break opportunity before and after */
  LBP_BA  =  3, /* break opportunity after */
  LBP_BB  =  4, /* break opportunity before */
  LBP_HY  =  5, /* hyphen */
  LBP_CB  = 46, /* contingent break opportunity */
  LBP_CL  =  6, /* closing punctuation */
  LBP_CP1 =  7, /* closing parenthesis, non-EastAsian character */
  LBP_CP2 =  8, /* closing parenthesis, EastAsian character */
  LBP_EX  =  9, /* exclamation/interrogation */
  LBP_IN  = 10, /* inseparable */
  LBP_NS  = 11, /* non starter */
  LBP_OP1 = 12, /* opening punctuation, non-EastAsian character */
  LBP_OP2 = 13, /* opening punctuation, EastAsian character */
  LBP_QU1 = 14, /* ambiguous quotation, neither initial nor final punctuation */
  LBP_QU2 = 15, /* ambiguous quotation, initial punctuation */
  LBP_QU3 = 16, /* ambiguous quotation, final punctuation */
  LBP_IS  = 17, /* infix separator (numeric) */
  LBP_NU  = 18, /* numeric */
  LBP_PO  = 19, /* postfix (numeric) */
  LBP_PR  = 20, /* prefix (numeric) */
  LBP_SY  = 21, /* symbols allowing breaks */
  LBP_AI  = 47, /* ambiguous (alphabetic or ideograph) */
  LBP_AL  = 22, /* ordinary alphabetic and symbol characters */
/*LBP_CJ,          conditional Japanese starter, resolved to NS */
  LBP_H2  = 23, /* Hangul LV syllable */
  LBP_H3  = 24, /* Hangul LVT syllable */
  LBP_HL  = 30, /* Hebrew letter */
  LBP_ID1 = 25, /* ideographic */
  LBP_ID2 = 26, /* ideographic and potential future emoji */
  LBP_JL  = 27, /* Hangul L Jamo */
  LBP_JV  = 28, /* Hangul V Jamo */
  LBP_JT  = 29, /* Hangul T Jamo */
  LBP_AP  = 31, /* Brahmic scripts: pre-base repha */
  LBP_AK  = 32, /* Brahmic scripts: consonants */
  LBP_AS  = 33, /* Brahmic scripts: independent vowels */
  LBP_VI  = 34, /* Brahmic scripts: conjoining viramas */
  LBP_VF  = 35, /* Brahmic scripts: viramas for final consonants */
  LBP_RI  = 36, /* regional indicator */
  LBP_SA  = 48, /* complex context (South East Asian) */
  LBP_ZWJ = 37, /* zero width joiner */
  LBP_EB  = 38, /* emoji base */
  LBP_EM  = 39, /* emoji modifier */
  LBP_XX  = 49, /* unknown */
  /* Artificial values that exist only at runtime, not in the tables. */
  /* NOTE(review): no enumerator follows this heading in the text under
     review; if the upstream header defines runtime-only values here,
     restore them from upstream rather than guessing.  */
};
95 #include "lbrkprop1.h"
97 static inline unsigned char
98 unilbrkprop_lookup (ucs4_t uc
)
100 unsigned int index1
= uc
>> lbrkprop_header_0
;
101 if (index1
< lbrkprop_header_1
)
103 int lookup1
= unilbrkprop
.level1
[index1
];
106 unsigned int index2
= (uc
>> lbrkprop_header_2
) & lbrkprop_header_3
;
107 int lookup2
= unilbrkprop
.level2
[lookup1
+ index2
];
110 unsigned int index3
= uc
& lbrkprop_header_4
;
111 return unilbrkprop
.level3
[lookup2
+ index3
];
/* Table indexed by two line breaking classifications.
   Each entry is one of the three codes below; the symbols quoted in the
   comments are the ones used in table 7.3 of UTR #14.  */
#define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
#define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3  /* prohibited break, '^' in table 7.3 of UTR #14 */

/* Both dimensions have 40 entries.  The table itself is defined in
   another translation unit; only its declaration lives here.  */
extern const unsigned char unilbrk_table[40][40];
125 /* We don't support line breaking of complex-context dependent characters
126 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */