1 /* Line breaking of UTF-32 strings.
2 Copyright (C) 2001-2003, 2006-2024 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2001.
5 This file is free software.
6 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7 You can redistribute it and/or modify it under either
8 - the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation, either version 3, or (at your
10 option) any later version, or
11 - the terms of the GNU General Public License as published by the
12 Free Software Foundation; either version 2, or (at your option)
13 any later version, or
14 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
16 This file is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 Lesser General Public License and the GNU General Public License
20 for more details.
22 You should have received a copy of the GNU Lesser General Public
23 License and of the GNU General Public License along with this
24 program. If not, see <https://www.gnu.org/licenses/>. */
30 #include "unilbrk/internal.h"
34 #include "unilbrk/lbrktables.h"
35 #include "uniwidth/cjk.h"
37 /* This file implements
38 Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>. */
/* Common worker for the two public entry points of this file.
   S points to N UTF-32 units, ENCODING selects how ambiguous characters
   are resolved (see LBP_AI_REPLACEMENT below).  The body also uses two
   further names, cr and p, whose declarations are not visible in this
   view: the callers in this file pass either -1 or LBP_CR for the first
   and forward their own p for the second, and the body stores UC_BREAK_
   values through p.
   NOTE(review): this view of the function is incomplete (return type,
   braces, the decoding of uc, several case labels and the end of the
   loop are not visible) - confirm any detail against the full file.  */
41 u32_possible_linebreaks_loop (const uint32_t *s
, size_t n
, const char *encoding
,
/* Property substituted for "ambiguous" characters further down:
   LBP_ID1 when is_cjk_encoding accepts ENCODING, LBP_AL otherwise.  */
46 int LBP_AI_REPLACEMENT
= (is_cjk_encoding (encoding
) ? LBP_ID1
: LBP_AL
);
/* Address just past the last of the N input units.  */
47 const uint32_t *s_end
= s
+ n
;
48 int prev_prop
= LBP_BK
; /* line break property of last character */
49 int last_prop
= LBP_BK
; /* line break property of last non-space character */
50 char *seen_space
= NULL
; /* Was a space seen after the last non-space character? */
52 /* Number of consecutive regional indicator (RI) characters seen
53 immediately before the current point. */
/* Line break property of uc as returned by unilbrkprop_lookup; how uc
   is obtained is not visible in this view.  */
60 int prop
= unilbrkprop_lookup (uc
);
62 if (prop
== LBP_BK
|| prop
== LBP_LF
|| prop
== LBP_CR
)
64 /* (LB4,LB5,LB6) Mandatory break. */
65 *p
= UC_BREAK_MANDATORY
;
/* NOTE(review): in this view the comment that starts on the next line
   has no terminator of its own, so the test of prev_prop against cr and
   the store of UC_BREAK_CR_BEFORE_LF that follow it are lexically part
   of that comment.  Presumably the sentence ended with something like
   "a CR-LF sequence" and was closed there - confirm against the full
   file.  */
66 /* cr is either LBP_CR or -1. In the first case, recognize
68 if (prev_prop
== cr
&& prop
== LBP_LF
)
69 p
[-1] = UC_BREAK_CR_BEFORE_LF
;
76 /* Resolve property values whose behaviour is not fixed. */
80 /* Resolve ambiguous. */
81 prop
= LBP_AI_REPLACEMENT
;
84 /* This is arbitrary. */
88 /* We don't handle complex scripts yet.
89 Treat LBP_SA like LBP_XX. */
91 /* This is arbitrary. */
95 /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous
96 character's line break property was not one of
97 BK, CR, LF, OP, QU, GL, SP, ZW. */
103 case LBP_OP1
: case LBP_OP2
:
104 case LBP_QU1
: case LBP_QU2
: case LBP_QU3
:
115 /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next
116 character's line break property is not one of
117 BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */
/* Property of the following character next_uc; how next_uc is obtained
   is not visible in this view.  */
123 next_prop
= unilbrkprop_lookup (next_uc
);
136 case LBP_QU1
: case LBP_QU2
: case LBP_QU3
:
137 case LBP_CP1
: case LBP_CP2
:
151 /* Deal with spaces and combining characters. */
154 /* (LB7) Don't break just before a space. */
155 *p
= UC_BREAK_PROHIBITED
;
158 else if (prop
== LBP_ZW
)
160 /* (LB7) Don't break just before a zero-width space. */
161 *p
= UC_BREAK_PROHIBITED
;
165 else if (prop
== LBP_CM
|| prop
== LBP_ZWJ
)
167 /* (LB9) Don't break just before a combining character or
168 zero-width joiner, except immediately after a mandatory
169 break character, space, or zero-width space. */
170 if (last_prop
== LBP_BK
)
172 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
173 *p
= UC_BREAK_PROHIBITED
;
174 /* (LB10) Treat CM or ZWJ as AL. */
178 else if (last_prop
== LBP_ZW
|| seen_space
!= NULL
)
180 /* (LB8) Break after zero-width space. */
181 /* (LB18) Break after spaces.
182 We do *not* implement the "legacy support for space
183 character as base for combining marks" because now the
184 NBSP CM sequence is recommended instead of SP CM. */
185 *p
= UC_BREAK_POSSIBLE
;
186 /* (LB10) Treat CM or ZWJ as AL. */
192 /* Treat X CM as if it were X. */
193 *p
= UC_BREAK_PROHIBITED
;
198 /* prop must be usable as an index for table 7.3 of UTR #14. */
199 if (!(prop
>= 0 && prop
< sizeof (unilbrk_table
) / sizeof (unilbrk_table
[0])))
/* From here on the decision depends on last_prop and prev_prop; the
   tests are tried in the order written and the first match decides.  */
202 if (last_prop
== LBP_BK
)
204 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
205 *p
= UC_BREAK_PROHIBITED
;
207 else if (last_prop
== LBP_ZW
)
209 /* (LB8) Break after zero-width space. */
210 *p
= UC_BREAK_POSSIBLE
;
212 else if (prev_prop
== LBP_ZWJ
)
214 /* (LB8a) Don't break right after a zero-width joiner. */
215 *p
= UC_BREAK_PROHIBITED
;
217 else if (last_prop
== LBP_RI
&& prop
== LBP_RI
)
219 /* (LB30a) Break between two regional indicator symbols
220 if and only if there are an even number of regional
221 indicators preceding the position of the break. */
/* NOTE(review): the conditional below shows only its ":" arm in this
   view.  By analogy with the seen_space conditional further down, the
   missing "?" arm is presumably UC_BREAK_POSSIBLE - confirm against
   the full file.  */
222 *p
= (seen_space
!= NULL
|| (ri_count
% 2) == 0
224 : UC_BREAK_PROHIBITED
);
226 else if (prev_prop
== LBP_HL_BA
)
228 /* (LB21a) Don't break after Hebrew + Hyphen/Break-After. */
229 *p
= UC_BREAK_PROHIBITED
;
/* No special case matched: the entry of unilbrk_table selected by
   last_prop (first index) and prop (second index) decides.  The case
   labels of this switch are not visible in this view; the three
   outcomes below are "possible", "possible only if a space was seen",
   and "prohibited".  */
233 switch (unilbrk_table
[last_prop
] [prop
])
236 *p
= UC_BREAK_POSSIBLE
;
239 *p
= (seen_space
!= NULL
? UC_BREAK_POSSIBLE
: UC_BREAK_PROHIBITED
);
242 *p
= UC_BREAK_PROHIBITED
;
/* Update prev_prop for the next character.  An LBP_HL followed by
   LBP_HY or LBP_BA is singled out here; the values assigned in either
   case are not visible in this view.  Presumably this is what produces
   the LBP_HL_BA value tested for (LB21a) above - confirm.  */
252 prev_prop
= (prev_prop
== LBP_HL
&& (prop
== LBP_HY
|| prop
== LBP_BA
)
/* The definition below is only compiled when IN_LIBUNISTRING is
   defined.  NOTE(review): the matching end of this conditional is not
   visible in this view.  */
268 #if defined IN_LIBUNISTRING
269 /* For backward compatibility with older versions of libunistring. */
/* Remove any macro of the same name so that the definition below uses
   the plain identifier.  */
271 # undef u32_possible_linebreaks
/* Compatibility entry point.  Forwards S, N, ENCODING and p unchanged
   to u32_possible_linebreaks_loop and passes -1 as the fourth argument,
   i.e. the value that the loop's own comment lists as the alternative
   to LBP_CR.  NOTE(review): the return type and the declaration of p
   are not visible in this view.  */
274 u32_possible_linebreaks (const uint32_t *s
, size_t n
, const char *encoding
,
277 u32_possible_linebreaks_loop (s
, n
, encoding
, -1, p
);
/* Entry point that forwards S, N, ENCODING and p unchanged to
   u32_possible_linebreaks_loop and passes LBP_CR as the fourth
   argument.  Per the loop's own comment that is the case in which the
   loop compares the previous property against cr when it sees an LF
   (the UC_BREAK_CR_BEFORE_LF store).  NOTE(review): the return type and
   the declaration of p are not visible in this view.  */
283 u32_possible_linebreaks_v2 (const uint32_t *s
, size_t n
, const char *encoding
,
286 u32_possible_linebreaks_loop (s
, n
, encoding
, LBP_CR
, p
);