1 /* Line breaking of UTF-8 strings.
2 Copyright (C) 2001-2003, 2006-2024 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2001.
5 This file is free software.
6 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7 You can redistribute it and/or modify it under either
8 - the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation, either version 3, or (at your
10 option) any later version, or
11 - the terms of the GNU General Public License as published by the
12 Free Software Foundation; either version 2, or (at your option) any later version, or
14 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
16 This file is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 Lesser General Public License and the GNU General Public License for more details.
22 You should have received a copy of the GNU Lesser General Public
23 License and of the GNU General Public License along with this
24 program. If not, see <https://www.gnu.org/licenses/>. */
30 #include "unilbrk/internal.h"
35 #include "unilbrk/lbrktables.h"
36 #include "uniwidth/cjk.h"
39 /* This file implements
40 Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>. */
/* Core routine shared by the two public entry points of this file.
   For the UTF-8 string S of N bytes it stores one UC_BREAK_* value per
   byte through P, following the rules (LBnn) of UAX #14 quoted in the
   comments below.  Both callers pass five arguments
   (s, n, encoding, <-1 or LBP_CR>, p).
   NOTE(review): in this copy of the file the tail of the parameter list,
   the loop and switch headers, the braces and several statements are
   missing; the comments added here describe only what is still visible.  */
43 u8_possible_linebreaks_loop (const uint8_t *s
, size_t n
, const char *encoding
,
/* Property substituted for "ambiguous" characters (see "Resolve ambiguous"
   below): LBP_ID1 when ENCODING is a CJK encoding, LBP_AL otherwise.  */
48 int LBP_AI_REPLACEMENT
= (is_cjk_encoding (encoding
) ? LBP_ID1
: LBP_AL
);
/* One past the last byte of the input.  */
49 const uint8_t *s_end
= s
+ n
;
50 int prev_prop
= LBP_BK
; /* line break property of last character */
51 int last_prop
= LBP_BK
; /* line break property of last non-space character */
52 char *seen_space
= NULL
; /* Was a space seen after the last non-space character? */
/* Every one of the N result bytes starts out as UC_BREAK_PROHIBITED; the
   stores through P further down overwrite individual positions.  */
54 /* Don't break inside multibyte characters. */
55 memset (p
, UC_BREAK_PROHIBITED
, n
);
57 /* Number of consecutive regional indicator (RI) characters seen
58 immediately before the current point. */
/* Decode one character from the remaining S_END - S bytes into UC.
   COUNT receives the return value of u8_mbtouc_unsafe (presumably the
   number of bytes consumed -- confirm against the libunistring manual).  */
64 int count
= u8_mbtouc_unsafe (&uc
, s
, s_end
- s
);
/* Line break property of the character just decoded.  */
66 int prop
= unilbrkprop_lookup (uc
);
68 if (prop
== LBP_BK
|| prop
== LBP_LF
|| prop
== LBP_CR
)
70 /* (LB4,LB5,LB6) Mandatory break. */
71 *p
= UC_BREAK_MANDATORY
;
/* NOTE(review): the comment that follows has lost its last line and its
   closing delimiter in this copy, so it lexically runs on until the next
   comment terminator.  The statement after it rewrites the previous
   result byte to UC_BREAK_CR_BEFORE_LF when PREV_PROP equals CR and the
   current property is LBP_LF.  */
72 /* cr is either LBP_CR or -1. In the first case, recognize
74 if (prev_prop
== cr
&& prop
== LBP_LF
)
75 p
[-1] = UC_BREAK_CR_BEFORE_LF
;
82 /* Resolve property values whose behaviour is not fixed. */
86 /* Resolve ambiguous. */
87 prop
= LBP_AI_REPLACEMENT
;
90 /* This is arbitrary. */
94 /* We don't handle complex scripts yet.
95 Treat LBP_SA like LBP_XX. */
97 /* This is arbitrary. */
101 /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous
102 character's line break property was not one of
103 BK, CR, LF, OP, QU, GL, SP, ZW. */
109 case LBP_OP1
: case LBP_OP2
:
110 case LBP_QU1
: case LBP_QU2
: case LBP_QU3
:
121 /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next
122 character's line break property is not one of
123 BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */
/* Decode a character into NEXT_UC solely to look up its property; the
   return value of the decoder is deliberately discarded.  */
129 (void) u8_mbtouc_unsafe (&next_uc
, s
, s_end
- s
);
130 next_prop
= unilbrkprop_lookup (next_uc
);
143 case LBP_QU1
: case LBP_QU2
: case LBP_QU3
:
144 case LBP_CP1
: case LBP_CP2
:
158 /* Deal with spaces and combining characters. */
161 /* (LB7) Don't break just before a space. */
162 *p
= UC_BREAK_PROHIBITED
;
165 else if (prop
== LBP_ZW
)
167 /* (LB7) Don't break just before a zero-width space. */
168 *p
= UC_BREAK_PROHIBITED
;
172 else if (prop
== LBP_CM
|| prop
== LBP_ZWJ
)
174 /* (LB9) Don't break just before a combining character or
175 zero-width joiner, except immediately after a mandatory
176 break character, space, or zero-width space. */
177 if (last_prop
== LBP_BK
)
179 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
180 *p
= UC_BREAK_PROHIBITED
;
181 /* (LB10) Treat CM or ZWJ as AL. */
185 else if (last_prop
== LBP_ZW
|| seen_space
!= NULL
)
187 /* (LB8) Break after zero-width space. */
188 /* (LB18) Break after spaces.
189 We do *not* implement the "legacy support for space
190 character as base for combining marks" because now the
191 NBSP CM sequence is recommended instead of SP CM. */
192 *p
= UC_BREAK_POSSIBLE
;
193 /* (LB10) Treat CM or ZWJ as AL. */
199 /* Treat X CM as if it were X. */
200 *p
= UC_BREAK_PROHIBITED
;
/* The bound checked below is the number of rows of unilbrk_table, which
   is indexed as unilbrk_table[last_prop][prop] further down.  */
205 /* prop must be usable as an index for table 7.3 of UTR #14. */
206 if (!(prop
>= 0 && prop
< sizeof (unilbrk_table
) / sizeof (unilbrk_table
[0])))
209 if (last_prop
== LBP_BK
)
211 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
212 *p
= UC_BREAK_PROHIBITED
;
214 else if (last_prop
== LBP_ZW
)
216 /* (LB8) Break after zero-width space. */
217 *p
= UC_BREAK_POSSIBLE
;
219 else if (prev_prop
== LBP_ZWJ
)
221 /* (LB8a) Don't break right after a zero-width joiner. */
222 *p
= UC_BREAK_PROHIBITED
;
224 else if (last_prop
== LBP_RI
&& prop
== LBP_RI
)
226 /* (LB30a) Break between two regional indicator symbols
227 if and only if there are an even number of regional
228 indicators preceding the position of the break. */
/* The condition is "a space was seen, or ri_count is even".
   NOTE(review): the '?' arm of this conditional expression is missing in
   this copy; only the ':' arm (UC_BREAK_PROHIBITED) is visible.  */
229 *p
= (seen_space
!= NULL
|| (ri_count
% 2) == 0
231 : UC_BREAK_PROHIBITED
);
233 else if (prev_prop
== LBP_HL_BA
)
235 /* (LB21a) Don't break after Hebrew + Hyphen/Break-After. */
236 *p
= UC_BREAK_PROHIBITED
;
/* No special case applied: dispatch on the table entry for the pair
   (last non-space property, current property).  The case labels of this
   switch are missing in this copy; the three visible outcomes are
   "possible", "possible only if a space was seen" and "prohibited".  */
240 switch (unilbrk_table
[last_prop
] [prop
])
243 *p
= UC_BREAK_POSSIBLE
;
246 *p
= (seen_space
!= NULL
? UC_BREAK_POSSIBLE
: UC_BREAK_PROHIBITED
);
249 *p
= UC_BREAK_PROHIBITED
;
/* Update PREV_PROP, testing for LBP_HL followed by LBP_HY or LBP_BA --
   presumably to record LBP_HL_BA, the value the LB21a test above looks
   for.  NOTE(review): the rest of this expression is missing in this
   copy; confirm against the complete file.  */
259 prev_prop
= (prev_prop
== LBP_HL
&& (prop
== LBP_HY
|| prop
== LBP_BA
)
275 #if defined IN_LIBUNISTRING
276 /* For backward compatibility with older versions of libunistring. */
278 # undef u8_possible_linebreaks
/* Older entry point, kept (per the comment above the #undef) for backward
   compatibility with older versions of libunistring.  It forwards S, N,
   ENCODING and P unchanged to u8_possible_linebreaks_loop and passes -1
   as the fourth argument.  The loop compares that argument with the
   previous character's property before storing UC_BREAK_CR_BEFORE_LF, so
   with -1 that marking is presumably never produced (assumes property
   values are non-negative -- confirm).
   NOTE(review): the return type and the last parameter of the signature
   are missing in this copy.  */
281 u8_possible_linebreaks (const uint8_t *s
, size_t n
, const char *encoding
,
284 u8_possible_linebreaks_loop (s
, n
, encoding
, -1, p
);
/* Current entry point.  It forwards S, N, ENCODING and P unchanged to
   u8_possible_linebreaks_loop and passes LBP_CR as the fourth argument,
   the value the loop compares with the previous character's property
   when deciding whether to store UC_BREAK_CR_BEFORE_LF in front of an LF.
   NOTE(review): the return type and the last parameter of the signature
   are missing in this copy.  */
290 u8_possible_linebreaks_v2 (const uint8_t *s
, size_t n
, const char *encoding
,
293 u8_possible_linebreaks_loop (s
, n
, encoding
, LBP_CR
, p
);
302 /* Read the contents of an input stream, and return it, terminated with a NUL byte. */
/* Accumulates the data read from STREAM in a buffer grown with realloc,
   reading at most BUFSIZE bytes per fread call until feof (stream) is
   true.
   NOTE(review): the local declarations, the failure branches and the
   return statement are missing in this copy.  */
305 read_file (FILE *stream
)
313 while (! feof (stream
))
/* Make sure the buffer can take one more chunk of BUFSIZE bytes.  */
315 if (size
+ BUFSIZE
> alloc
)
/* Grow the capacity by one half ...  */
317 alloc
= alloc
+ alloc
/ 2;
/* ... and fall back to exactly size + BUFSIZE if that is still too
   small (for instance on the first pass, presumably with alloc == 0).  */
318 if (alloc
< size
+ BUFSIZE
)
319 alloc
= size
+ BUFSIZE
;
/* NOTE(review): the result of realloc overwrites BUF directly, so the old
   block is lost if the call fails.  The visible failure handling only
   prints a message; confirm that it also terminates the program.  */
320 buf
= realloc (buf
, alloc
);
323 fprintf (stderr
, "out of memory\n");
/* Append up to BUFSIZE bytes behind the SIZE bytes already stored.  */
327 count
= fread (buf
+ size
, 1, BUFSIZE
, stream
);
/* Resize to the data length plus one byte, presumably the room for the
   terminating NUL.  */
339 buf
= realloc (buf
, size
+ 1);
342 fprintf (stderr
, "out of memory\n");
/* Driver program: reads all of standard input with read_file, computes
   the break opportunities with u8_possible_linebreaks_v2 (encoding
   "UTF-8"), then loops over the input bytes, writing each one to standard
   output.  The visible case labels write a three-byte UTF-8 marker for
   the POSSIBLE, MANDATORY and CR_BEFORE_LF kinds.
   NOTE(review): the return type, the use of ARGC, the switch header, the
   break statements and the end of the function are missing in this copy.  */
351 main (int argc
, char * argv
[])
355 /* Display all the break opportunities in the input string. */
356 char *input
= read_file (stdin
);
/* NOTE(review): strlen returns size_t, so storing it in an int can
   truncate for very large input.  strlen also stops at the first NUL
   byte, so input after an embedded NUL is not processed.  */
357 int length
= strlen (input
);
/* One result byte per input byte.
   NOTE(review): no check of the malloc result is visible before BREAKS is
   passed on.  */
358 char *breaks
= malloc (length
);
361 u8_possible_linebreaks_v2 ((uint8_t *) input
, length
, "UTF-8", breaks
);
363 for (i
= 0; i
< length
; i
++)
367 case UC_BREAK_POSSIBLE
:
368 /* U+2027 in UTF-8 encoding */
369 putc (0xe2, stdout
); putc (0x80, stdout
); putc (0xa7, stdout
);
371 case UC_BREAK_MANDATORY
:
372 /* U+21B2 (or U+21B5) in UTF-8 encoding */
373 putc (0xe2, stdout
); putc (0x86, stdout
); putc (0xb2, stdout
);
375 case UC_BREAK_CR_BEFORE_LF
:
376 /* U+21E4 in UTF-8 encoding */
377 putc (0xe2, stdout
); putc (0x87, stdout
); putc (0xa4, stdout
);
/* No marker output is visible for the PROHIBITED case.  */
379 case UC_BREAK_PROHIBITED
:
/* The input byte itself is written after any marker.  */
384 putc (input
[i
], stdout
);