1 /* Word breaks in UTF-8/UTF-16/UTF-32 strings. -*- coding: utf-8 -*-
2 Copyright (C) 2009-2021 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
5 This program is free software: you can redistribute it and/or modify it
6 under the terms of the GNU Lesser General Public License as published
7 by the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
19 FUNC (const UNIT
*s
, size_t n
, char *p
)
23 const UNIT
*s_end
= s
+ n
;
25 /* Word break property of the last character.
26 -1 at the very beginning of the string. */
27 int last_char_prop
= -1;
29 /* Format and Extend characters are ignored; this means that the most commonly
30 used unit is the complex character (= character with subsequent ignored characters).
32 Word break property of the last complex character.
33 -1 at the very beginning of the string. */
34 int last_compchar_prop
= -1;
35 char *last_compchar_ptr
= NULL
;
37 /* For recognizing rules involving 3 complex characters:
38 Word break property of the second-to-last complex character.
39 -1 at the very beginning of the string. */
40 int secondlast_compchar_prop
= -1;
44 /* Don't break inside multibyte characters. */
50 int count
= U_MBTOUC_UNSAFE (&uc
, s
, s_end
- s
);
51 int prop
= uc_wordbreak_property (uc
);
53 /* No break at the start of the string. */
54 if (last_char_prop
>= 0)
56 /* No break between CR and LF (WB3). */
57 if (last_char_prop
== WBP_CR
&& prop
== WBP_LF
)
59 /* Break before and after newlines (WB3a, WB3b). */
60 else if ((last_char_prop
== WBP_CR
61 || last_char_prop
== WBP_LF
62 || last_char_prop
== WBP_NEWLINE
)
65 || prop
== WBP_NEWLINE
))
67 /* No break within emoji zwj sequence (WB3c). */
68 else if (last_char_prop
== WBP_ZWJ
&&
69 (prop
== WBP_GAZ
|| prop
== WBP_EBG
))
71 /* Ignore Format, Extend and ZWJ characters. */
72 else if (!(prop
== WBP_EXTEND
|| prop
== WBP_FORMAT
|| prop
== WBP_ZWJ
))
74 /* No break in these situations (see UAX #29):
76 secondlast last current
78 (ALetter | HL) (MidLetter | MidNumLet | SQ) × (ALetter | HL) (WB7)
79 (ALetter | HL) × (MidLetter | MidNumLet | SQ) (ALetter | HL) (WB6)
80 Numeric (MidNum | MidNumLet | SQ) × Numeric (WB11)
81 Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12)
84 ^ (RI RI)* RI × RI (WB15)
85 [^RI] (RI RI)* RI × RI (WB16)
87 /* No break across certain punctuation. Also, disable word
88 breaks that were recognized earlier (due to lookahead of
89 only one complex character). */
90 if (((prop
== WBP_ALETTER
92 && (last_compchar_prop
== WBP_MIDLETTER
93 || last_compchar_prop
== WBP_MIDNUMLET
94 || last_compchar_prop
== WBP_SQ
)
95 && (secondlast_compchar_prop
== WBP_ALETTER
96 || secondlast_compchar_prop
== WBP_HL
))
97 || (prop
== WBP_NUMERIC
98 && (last_compchar_prop
== WBP_MIDNUM
99 || last_compchar_prop
== WBP_MIDNUMLET
100 || last_compchar_prop
== WBP_SQ
)
101 && secondlast_compchar_prop
== WBP_NUMERIC
)
103 && last_compchar_prop
== WBP_DQ
104 && secondlast_compchar_prop
== WBP_HL
))
106 *last_compchar_ptr
= 0;
109 /* Break before RI only if an even number of RIs
110 precede it; no break after an odd number (WB15, WB16). */
111 else if (last_compchar_prop
== WBP_RI
&& prop
== WBP_RI
)
113 if (ri_count
% 2 == 0)
117 /* Break after Format and Extend characters. */
118 else if (last_compchar_prop
== WBP_EXTEND
119 || last_compchar_prop
== WBP_FORMAT
)
123 int last_compchar_index
=
124 uniwbrk_prop_index
[last_compchar_prop
];
125 int index
= uniwbrk_prop_index
[prop
];
127 /* Break between unknown pair (WB999). */
128 if (last_compchar_index
< 0 || index
< 0)
130 /* Perform a single table lookup. */
131 else if (uniwbrk_table
[last_compchar_index
][index
])
138 last_char_prop
= prop
;
140 /* Ignore Format, Extend and ZWJ characters, except at the
141 start of the line. */
142 if (last_compchar_prop
< 0
143 || last_compchar_prop
== WBP_CR
144 || last_compchar_prop
== WBP_LF
145 || last_compchar_prop
== WBP_NEWLINE
146 || !(prop
== WBP_EXTEND
|| prop
== WBP_FORMAT
|| prop
== WBP_ZWJ
))
148 secondlast_compchar_prop
= last_compchar_prop
;
149 last_compchar_prop
= prop
;
150 last_compchar_ptr
= p
;