timespec_get: New module.
[gnulib.git] / lib / uniwbrk / u-wordbreaks.h
blob193ef08cdc34496061448249217ac8ab170e848d
/* Word breaks in UTF-8/UTF-16/UTF-32 strings.  -*- coding: utf-8 -*-
   Copyright (C) 2009-2021 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2009.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 void
19 FUNC (const UNIT *s, size_t n, char *p)
21 if (n > 0)
23 const UNIT *s_end = s + n;
25 /* Word break property of the last character.
26 -1 at the very beginning of the string. */
27 int last_char_prop = -1;
29 /* Format and Extend characters are ignored; this means, the mostly used
30 unit is the complex character (= character with subsequent ignored
31 characters).
32 Word break property of the last complex character.
33 -1 at the very beginning of the string. */
34 int last_compchar_prop = -1;
35 char *last_compchar_ptr = NULL;
37 /* For recognizing rules involving 3 complex characters:
38 Word break property of the second-to-last complex character.
39 -1 at the very beginning of the string. */
40 int secondlast_compchar_prop = -1;
42 size_t ri_count = 0;
44 /* Don't break inside multibyte characters. */
45 memset (p, 0, n);
47 while (s < s_end)
49 ucs4_t uc;
50 int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
51 int prop = uc_wordbreak_property (uc);
53 /* No break at the start of the string. */
54 if (last_char_prop >= 0)
56 /* No break between CR and LF (WB3). */
57 if (last_char_prop == WBP_CR && prop == WBP_LF)
58 /* *p = 0 */;
59 /* Break before and after newlines (WB3a, WB3b). */
60 else if ((last_char_prop == WBP_CR
61 || last_char_prop == WBP_LF
62 || last_char_prop == WBP_NEWLINE)
63 || (prop == WBP_CR
64 || prop == WBP_LF
65 || prop == WBP_NEWLINE))
66 *p = 1;
67 /* No break within emoji zwj sequence (WB3c). */
68 else if (last_char_prop == WBP_ZWJ &&
69 (prop == WBP_GAZ || prop == WBP_EBG))
70 /* *p = 0 */;
71 /* Ignore Format and Extend characters. */
72 else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
74 /* No break in these situations (see UAX #29):
76 secondlast last current
78 (ALetter | HL) (MidLetter | MidNumLet | SQ) × (ALetter | HL) (WB7)
79 (ALetter | HL) × (MidLetter | MidNumLet | SQ) (ALetter | HL) (WB6)
80 Numeric (MidNum | MidNumLet | SQ) × Numeric (WB11)
81 Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12)
82 HL × DQ HL (WB7b)
83 HL DQ × HL (WB7c)
84 ^ (RI RI)* RI × RI (WB15)
85 [^RI] (RI RI)* RI × RI (WB16)
87 /* No break across certain punctuation. Also, disable word
88 breaks that were recognized earlier (due to lookahead of
89 only one complex character). */
90 if (((prop == WBP_ALETTER
91 || prop == WBP_HL)
92 && (last_compchar_prop == WBP_MIDLETTER
93 || last_compchar_prop == WBP_MIDNUMLET
94 || last_compchar_prop == WBP_SQ)
95 && (secondlast_compchar_prop == WBP_ALETTER
96 || secondlast_compchar_prop == WBP_HL))
97 || (prop == WBP_NUMERIC
98 && (last_compchar_prop == WBP_MIDNUM
99 || last_compchar_prop == WBP_MIDNUMLET
100 || last_compchar_prop == WBP_SQ)
101 && secondlast_compchar_prop == WBP_NUMERIC)
102 || (prop == WBP_HL
103 && last_compchar_prop == WBP_DQ
104 && secondlast_compchar_prop == WBP_HL))
106 *last_compchar_ptr = 0;
107 /* *p = 0; */
109 /* Break before RI, if odd number of RI's are
110 preceding (WB15, WB16). */
111 else if (last_compchar_prop == WBP_RI && prop == WBP_RI)
113 if (ri_count % 2 == 0)
114 *p = 1;
115 /* else *p = 0 */
117 /* Break after Format and Extend character. */
118 else if (last_compchar_prop == WBP_EXTEND
119 || last_compchar_prop == WBP_FORMAT)
120 *p = 1;
121 else
123 int last_compchar_index =
124 uniwbrk_prop_index[last_compchar_prop];
125 int index = uniwbrk_prop_index[prop];
127 /* Break between unknown pair (WB999). */
128 if (last_compchar_index < 0 || index < 0)
129 *p = 1;
130 /* Perform a single table lookup. */
131 else if (uniwbrk_table[last_compchar_index][index])
132 *p = 1;
133 /* else *p = 0; */
138 last_char_prop = prop;
140 /* Ignore Format and Extend characters, except at the
141 start of the line. */
142 if (last_compchar_prop < 0
143 || last_compchar_prop == WBP_CR
144 || last_compchar_prop == WBP_LF
145 || last_compchar_prop == WBP_NEWLINE
146 || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
148 secondlast_compchar_prop = last_compchar_prop;
149 last_compchar_prop = prop;
150 last_compchar_ptr = p;
152 if (prop == WBP_RI)
153 ri_count++;
154 else
155 ri_count = 0;
158 s += count;
159 p += count;