usleep tests: Avoid failure due to known Cygwin 3.5.3 bug.
[gnulib.git] / lib / unilbrk / u8-possible-linebreaks.c
blob72d0749b8eb537544c2fe7272fea3bc5d7d3297c
/* Line breaking of UTF-8 strings.
   Copyright (C) 2001-2003, 2006-2024 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2001.

   This file is free software.
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   You can redistribute it and/or modify it under either
     - the terms of the GNU Lesser General Public License as published
       by the Free Software Foundation, either version 3, or (at your
       option) any later version, or
     - the terms of the GNU General Public License as published by the
       Free Software Foundation; either version 2, or (at your option)
       any later version, or
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License and the GNU General Public License
   for more details.

   You should have received a copy of the GNU Lesser General Public
   License and of the GNU General Public License along with this
   program.  If not, see <https://www.gnu.org/licenses/>.  */
26 #include <config.h>
28 /* Specification. */
29 #include "unilbrk.h"
30 #include "unilbrk/internal.h"
32 #include <stdlib.h>
33 #include <string.h>
35 #include "unilbrk/lbrktables.h"
36 #include "uniwidth/cjk.h"
37 #include "unistr.h"
/* This file implements
   Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>.  */
42 void
43 u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
44 int cr, char *p)
46 if (n > 0)
48 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL);
49 const uint8_t *s_end = s + n;
50 int prev_prop = LBP_BK; /* line break property of last character */
51 int last_prop = LBP_BK; /* line break property of last non-space character */
52 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
54 /* Don't break inside multibyte characters. */
55 memset (p, UC_BREAK_PROHIBITED, n);
57 /* Number of consecutive regional indicator (RI) characters seen
58 immediately before the current point. */
59 size_t ri_count = 0;
63 ucs4_t uc;
64 int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
65 s += count;
66 int prop = unilbrkprop_lookup (uc);
68 if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
70 /* (LB4,LB5,LB6) Mandatory break. */
71 *p = UC_BREAK_MANDATORY;
72 /* cr is either LBP_CR or -1. In the first case, recognize
73 a CR-LF sequence. */
74 if (prev_prop == cr && prop == LBP_LF)
75 p[-1] = UC_BREAK_CR_BEFORE_LF;
76 prev_prop = prop;
77 last_prop = LBP_BK;
78 seen_space = NULL;
80 else
82 /* Resolve property values whose behaviour is not fixed. */
83 switch (prop)
85 case LBP_AI:
86 /* Resolve ambiguous. */
87 prop = LBP_AI_REPLACEMENT;
88 break;
89 case LBP_CB:
90 /* This is arbitrary. */
91 prop = LBP_ID1;
92 break;
93 case LBP_SA:
94 /* We don't handle complex scripts yet.
95 Treat LBP_SA like LBP_XX. */
96 case LBP_XX:
97 /* This is arbitrary. */
98 prop = LBP_AL;
99 break;
100 case LBP_QU2:
101 /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous
102 character's line break property was not one of
103 BK, CR, LF, OP, QU, GL, SP, ZW. */
104 switch (prev_prop)
106 case LBP_BK:
107 case LBP_CR:
108 case LBP_LF:
109 case LBP_OP1: case LBP_OP2:
110 case LBP_QU1: case LBP_QU2: case LBP_QU3:
111 case LBP_GL:
112 case LBP_SP:
113 case LBP_ZW:
114 break;
115 default:
116 prop = LBP_QU1;
117 break;
119 break;
120 case LBP_QU3:
121 /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next
122 character's line break property is not one of
123 BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */
125 int next_prop;
126 if (s < s_end)
128 ucs4_t next_uc;
129 (void) u8_mbtouc_unsafe (&next_uc, s, s_end - s);
130 next_prop = unilbrkprop_lookup (next_uc);
132 else
133 next_prop = LBP_BK;
134 switch (next_prop)
136 case LBP_BK:
137 case LBP_CR:
138 case LBP_LF:
139 case LBP_SP:
140 case LBP_GL:
141 case LBP_WJ:
142 case LBP_CL:
143 case LBP_QU1: case LBP_QU2: case LBP_QU3:
144 case LBP_CP1: case LBP_CP2:
145 case LBP_EX:
146 case LBP_IS:
147 case LBP_SY:
148 case LBP_ZW:
149 break;
150 default:
151 prop = LBP_QU1;
152 break;
155 break;
158 /* Deal with spaces and combining characters. */
159 if (prop == LBP_SP)
161 /* (LB7) Don't break just before a space. */
162 *p = UC_BREAK_PROHIBITED;
163 seen_space = p;
165 else if (prop == LBP_ZW)
167 /* (LB7) Don't break just before a zero-width space. */
168 *p = UC_BREAK_PROHIBITED;
169 last_prop = LBP_ZW;
170 seen_space = NULL;
172 else if (prop == LBP_CM || prop == LBP_ZWJ)
174 /* (LB9) Don't break just before a combining character or
175 zero-width joiner, except immediately after a mandatory
176 break character, space, or zero-width space. */
177 if (last_prop == LBP_BK)
179 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
180 *p = UC_BREAK_PROHIBITED;
181 /* (LB10) Treat CM or ZWJ as AL. */
182 last_prop = LBP_AL;
183 seen_space = NULL;
185 else if (last_prop == LBP_ZW || seen_space != NULL)
187 /* (LB8) Break after zero-width space. */
188 /* (LB18) Break after spaces.
189 We do *not* implement the "legacy support for space
190 character as base for combining marks" because now the
191 NBSP CM sequence is recommended instead of SP CM. */
192 *p = UC_BREAK_POSSIBLE;
193 /* (LB10) Treat CM or ZWJ as AL. */
194 last_prop = LBP_AL;
195 seen_space = NULL;
197 else
199 /* Treat X CM as if it were X. */
200 *p = UC_BREAK_PROHIBITED;
203 else
205 /* prop must be usable as an index for table 7.3 of UTR #14. */
206 if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
207 abort ();
209 if (last_prop == LBP_BK)
211 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
212 *p = UC_BREAK_PROHIBITED;
214 else if (last_prop == LBP_ZW)
216 /* (LB8) Break after zero-width space. */
217 *p = UC_BREAK_POSSIBLE;
219 else if (prev_prop == LBP_ZWJ)
221 /* (LB8a) Don't break right after a zero-width joiner. */
222 *p = UC_BREAK_PROHIBITED;
224 else if (last_prop == LBP_RI && prop == LBP_RI)
226 /* (LB30a) Break between two regional indicator symbols
227 if and only if there are an even number of regional
228 indicators preceding the position of the break. */
229 *p = (seen_space != NULL || (ri_count % 2) == 0
230 ? UC_BREAK_POSSIBLE
231 : UC_BREAK_PROHIBITED);
233 else if (prev_prop == LBP_HL_BA)
235 /* (LB21a) Don't break after Hebrew + Hyphen/Break-After. */
236 *p = UC_BREAK_PROHIBITED;
238 else
240 switch (unilbrk_table [last_prop] [prop])
242 case D:
243 *p = UC_BREAK_POSSIBLE;
244 break;
245 case I:
246 *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
247 break;
248 case P:
249 *p = UC_BREAK_PROHIBITED;
250 break;
251 default:
252 abort ();
255 last_prop = prop;
256 seen_space = NULL;
259 prev_prop = (prev_prop == LBP_HL && (prop == LBP_HY || prop == LBP_BA)
260 ? LBP_HL_BA
261 : prop);
264 if (prop == LBP_RI)
265 ri_count++;
266 else
267 ri_count = 0;
269 p += count;
271 while (s < s_end);
#if defined IN_LIBUNISTRING
/* For backward compatibility with older versions of libunistring.  */

# undef u8_possible_linebreaks

/* Determine the line break opportunities in the UTF-8 string S of N bytes
   and store them in P[0..N-1].
   This variant calls u8_possible_linebreaks_loop () with CR = -1 rather
   than LBP_CR, i.e. without requesting the CR-LF recognition.  */
void
u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding,
                        char *p)
{
  u8_possible_linebreaks_loop (s, n, encoding, -1, p);
}

#endif
289 void
290 u8_possible_linebreaks_v2 (const uint8_t *s, size_t n, const char *encoding,
291 char *p)
293 u8_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);
297 #ifdef TEST
299 #include <stdio.h>
300 #include <string.h>
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.
   The result is allocated with realloc (); the caller owns it and should
   free () it.  On a read error or when memory is exhausted, a diagnostic is
   printed to stderr and the program exits with status 1.  */
char *
read_file (FILE *stream)
{
  enum { CHUNK_SIZE = 4096 };
  char *buf = NULL;
  size_t alloc = 0;   /* number of bytes allocated for buf */
  size_t size = 0;    /* number of bytes read so far; always <= alloc */

  while (! feof (stream))
    {
      /* Make sure there is room for one more chunk.  */
      if (alloc - size < (size_t) CHUNK_SIZE)
        {
          /* Refuse sizes for which size + CHUNK_SIZE + 1 would wrap.  */
          if (size > (size_t) -1 - (size_t) CHUNK_SIZE - 1)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }

          size_t needed = size + CHUNK_SIZE;
          /* Grow by 50%.  If that is too small, or if the addition wrapped
             around, fall back to the minimum needed.  */
          size_t new_alloc = alloc + alloc / 2;
          if (new_alloc < needed)
            new_alloc = needed;

          char *new_buf = realloc (buf, new_alloc);
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      size_t count = fread (buf + size, 1, CHUNK_SIZE, stream);
      size += count;
      /* A short read means either end of file or an error.  */
      if (count < (size_t) CHUNK_SIZE && ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
    }

  /* Trim the buffer to the data plus the terminating NUL byte.  */
  char *result = realloc (buf, size + 1);
  if (result == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
}
351 main (int argc, char * argv[])
353 if (argc == 1)
355 /* Display all the break opportunities in the input string. */
356 char *input = read_file (stdin);
357 int length = strlen (input);
358 char *breaks = malloc (length);
359 int i;
361 u8_possible_linebreaks_v2 ((uint8_t *) input, length, "UTF-8", breaks);
363 for (i = 0; i < length; i++)
365 switch (breaks[i])
367 case UC_BREAK_POSSIBLE:
368 /* U+2027 in UTF-8 encoding */
369 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
370 break;
371 case UC_BREAK_MANDATORY:
372 /* U+21B2 (or U+21B5) in UTF-8 encoding */
373 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
374 break;
375 case UC_BREAK_CR_BEFORE_LF:
376 /* U+21E4 in UTF-8 encoding */
377 putc (0xe2, stdout); putc (0x87, stdout); putc (0xa4, stdout);
378 break;
379 case UC_BREAK_PROHIBITED:
380 break;
381 default:
382 abort ();
384 putc (input[i], stdout);
387 free (breaks);
389 return 0;
391 else
392 return 1;
395 #endif /* TEST */