usleep tests: Avoid failure due to known Cygwin 3.5.3 bug.
[gnulib.git] / lib / unilbrk / u32-possible-linebreaks.c
blobeb28891db0a107cacc91e3c464339abaf355cf81
/* Line breaking of UTF-32 strings.
   Copyright (C) 2001-2003, 2006-2024 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2001.

   This file is free software.
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   You can redistribute it and/or modify it under either
     - the terms of the GNU Lesser General Public License as published
       by the Free Software Foundation, either version 3, or (at your
       option) any later version, or
     - the terms of the GNU General Public License as published by the
       Free Software Foundation; either version 2, or (at your option)
       any later version, or
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License and the GNU General Public License
   for more details.

   You should have received a copy of the GNU Lesser General Public
   License and of the GNU General Public License along with this
   program.  If not, see <https://www.gnu.org/licenses/>.  */
26 #include <config.h>
28 /* Specification. */
29 #include "unilbrk.h"
30 #include "unilbrk/internal.h"
32 #include <stdlib.h>
34 #include "unilbrk/lbrktables.h"
35 #include "uniwidth/cjk.h"
/* This file implements
   Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>.  */
40 void
41 u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
42 int cr, char *p)
44 if (n > 0)
46 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL);
47 const uint32_t *s_end = s + n;
48 int prev_prop = LBP_BK; /* line break property of last character */
49 int last_prop = LBP_BK; /* line break property of last non-space character */
50 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
52 /* Number of consecutive regional indicator (RI) characters seen
53 immediately before the current point. */
54 size_t ri_count = 0;
58 ucs4_t uc = *s;
59 s++;
60 int prop = unilbrkprop_lookup (uc);
62 if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
64 /* (LB4,LB5,LB6) Mandatory break. */
65 *p = UC_BREAK_MANDATORY;
66 /* cr is either LBP_CR or -1. In the first case, recognize
67 a CR-LF sequence. */
68 if (prev_prop == cr && prop == LBP_LF)
69 p[-1] = UC_BREAK_CR_BEFORE_LF;
70 prev_prop = prop;
71 last_prop = LBP_BK;
72 seen_space = NULL;
74 else
76 /* Resolve property values whose behaviour is not fixed. */
77 switch (prop)
79 case LBP_AI:
80 /* Resolve ambiguous. */
81 prop = LBP_AI_REPLACEMENT;
82 break;
83 case LBP_CB:
84 /* This is arbitrary. */
85 prop = LBP_ID1;
86 break;
87 case LBP_SA:
88 /* We don't handle complex scripts yet.
89 Treat LBP_SA like LBP_XX. */
90 case LBP_XX:
91 /* This is arbitrary. */
92 prop = LBP_AL;
93 break;
94 case LBP_QU2:
95 /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous
96 character's line break property was not one of
97 BK, CR, LF, OP, QU, GL, SP, ZW. */
98 switch (prev_prop)
100 case LBP_BK:
101 case LBP_CR:
102 case LBP_LF:
103 case LBP_OP1: case LBP_OP2:
104 case LBP_QU1: case LBP_QU2: case LBP_QU3:
105 case LBP_GL:
106 case LBP_SP:
107 case LBP_ZW:
108 break;
109 default:
110 prop = LBP_QU1;
111 break;
113 break;
114 case LBP_QU3:
115 /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next
116 character's line break property is not one of
117 BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */
119 int next_prop;
120 if (s < s_end)
122 ucs4_t next_uc = *s;
123 next_prop = unilbrkprop_lookup (next_uc);
125 else
126 next_prop = LBP_BK;
127 switch (next_prop)
129 case LBP_BK:
130 case LBP_CR:
131 case LBP_LF:
132 case LBP_SP:
133 case LBP_GL:
134 case LBP_WJ:
135 case LBP_CL:
136 case LBP_QU1: case LBP_QU2: case LBP_QU3:
137 case LBP_CP1: case LBP_CP2:
138 case LBP_EX:
139 case LBP_IS:
140 case LBP_SY:
141 case LBP_ZW:
142 break;
143 default:
144 prop = LBP_QU1;
145 break;
148 break;
151 /* Deal with spaces and combining characters. */
152 if (prop == LBP_SP)
154 /* (LB7) Don't break just before a space. */
155 *p = UC_BREAK_PROHIBITED;
156 seen_space = p;
158 else if (prop == LBP_ZW)
160 /* (LB7) Don't break just before a zero-width space. */
161 *p = UC_BREAK_PROHIBITED;
162 last_prop = LBP_ZW;
163 seen_space = NULL;
165 else if (prop == LBP_CM || prop == LBP_ZWJ)
167 /* (LB9) Don't break just before a combining character or
168 zero-width joiner, except immediately after a mandatory
169 break character, space, or zero-width space. */
170 if (last_prop == LBP_BK)
172 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
173 *p = UC_BREAK_PROHIBITED;
174 /* (LB10) Treat CM or ZWJ as AL. */
175 last_prop = LBP_AL;
176 seen_space = NULL;
178 else if (last_prop == LBP_ZW || seen_space != NULL)
180 /* (LB8) Break after zero-width space. */
181 /* (LB18) Break after spaces.
182 We do *not* implement the "legacy support for space
183 character as base for combining marks" because now the
184 NBSP CM sequence is recommended instead of SP CM. */
185 *p = UC_BREAK_POSSIBLE;
186 /* (LB10) Treat CM or ZWJ as AL. */
187 last_prop = LBP_AL;
188 seen_space = NULL;
190 else
192 /* Treat X CM as if it were X. */
193 *p = UC_BREAK_PROHIBITED;
196 else
198 /* prop must be usable as an index for table 7.3 of UTR #14. */
199 if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
200 abort ();
202 if (last_prop == LBP_BK)
204 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
205 *p = UC_BREAK_PROHIBITED;
207 else if (last_prop == LBP_ZW)
209 /* (LB8) Break after zero-width space. */
210 *p = UC_BREAK_POSSIBLE;
212 else if (prev_prop == LBP_ZWJ)
214 /* (LB8a) Don't break right after a zero-width joiner. */
215 *p = UC_BREAK_PROHIBITED;
217 else if (last_prop == LBP_RI && prop == LBP_RI)
219 /* (LB30a) Break between two regional indicator symbols
220 if and only if there are an even number of regional
221 indicators preceding the position of the break. */
222 *p = (seen_space != NULL || (ri_count % 2) == 0
223 ? UC_BREAK_POSSIBLE
224 : UC_BREAK_PROHIBITED);
226 else if (prev_prop == LBP_HL_BA)
228 /* (LB21a) Don't break after Hebrew + Hyphen/Break-After. */
229 *p = UC_BREAK_PROHIBITED;
231 else
233 switch (unilbrk_table [last_prop] [prop])
235 case D:
236 *p = UC_BREAK_POSSIBLE;
237 break;
238 case I:
239 *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
240 break;
241 case P:
242 *p = UC_BREAK_PROHIBITED;
243 break;
244 default:
245 abort ();
248 last_prop = prop;
249 seen_space = NULL;
252 prev_prop = (prev_prop == LBP_HL && (prop == LBP_HY || prop == LBP_BA)
253 ? LBP_HL_BA
254 : prop);
257 if (prop == LBP_RI)
258 ri_count++;
259 else
260 ri_count = 0;
262 p++;
264 while (s < s_end);
#if defined IN_LIBUNISTRING
/* For backward compatibility with older versions of libunistring.  */

/* NOTE(review): the #undef presumably removes a macro from the public header
   that would otherwise rename this definition; confirm against unilbrk.h.  */
# undef u32_possible_linebreaks

/* Legacy entry point: same as u32_possible_linebreaks_loop with CR = -1,
   i.e. a CR-LF sequence is not marked with UC_BREAK_CR_BEFORE_LF.
   S, N, ENCODING and P are forwarded unchanged.  */
void
u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding,
                         char *p)
{
  u32_possible_linebreaks_loop (s, n, encoding, -1, p);
}

#endif
282 void
283 u32_possible_linebreaks_v2 (const uint32_t *s, size_t n, const char *encoding,
284 char *p)
286 u32_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);