1 /* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent).
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
5 This program is free software: you can redistribute it and/or modify it
6 under the terms of the GNU Lesser General Public License as published
7 by the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18 /* Quoting the Unicode standard, section "Default Case Algorithms":
19 Find the word boundaries in X according to Unicode Standard Annex #29,
20 “Text Boundaries.” For each word boundary, find the first cased character
21 F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
22 then map all characters C between F and the following word boundary to
23 Lowercase_Mapping(C). */
26 FUNC (const UNIT
*s
, size_t n
,
27 casing_prefix_context_t prefix_context
,
28 casing_suffix_context_t suffix_context
,
29 const char *iso639_language
,
31 UNIT
*resultbuf
, size_t *lengthp
)
33 /* The result being accumulated. */
37 /* An array containing the word break positions. */
40 /* Initialize the accumulator. */
41 if (nf
!= NULL
|| resultbuf
== NULL
)
53 /* Initialize the word breaks array. */
56 wordbreaks
= (char *) malloc (n
);
57 if (wordbreaks
== NULL
)
62 U_WORDBREAKS (s
, n
, wordbreaks
);
68 const UNIT
*s_end
= s
+ n
;
69 const char *wp
= wordbreaks
;
71 /* When considering the string as segmented by word boundaries: For each
73 - In the first part, we are searching for the first cased character.
74 In this state, in_word_first_part = true, and no conversion takes
76 - In the second part, we are converting every character: the first
77 among these characters to title case, the other ones to lower case.
78 In this state, in_word_first_part = false. */
79 bool in_word_first_part
= true;
81 /* Helper for evaluating the FINAL_SIGMA condition:
82 Last character that was not case-ignorable. */
83 ucs4_t last_char_except_ignorable
=
84 prefix_context
.last_char_except_ignorable
;
86 /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
87 Last character that was of combining class 230 ("Above") or 0. */
88 ucs4_t last_char_normal_or_above
=
89 prefix_context
.last_char_normal_or_above
;
93 /* Fetch the next character. */
95 int count
= U_MBTOUC_UNSAFE (&uc
, s
, s_end
- s
);
97 ucs4_t (*single_character_map
) (ucs4_t
);
98 size_t offset_in_rule
; /* offset in 'struct special_casing_rule' */
101 unsigned int mapped_count
;
104 /* Crossing a word boundary. */
105 in_word_first_part
= true;
107 /* Determine single_character_map, offset_in_rule.
108 There are three possibilities:
109 - uc should not be converted.
110 - uc should be titlecased.
111 - uc should be lowercased. */
112 if (in_word_first_part
)
114 if (uc_is_cased (uc
))
116 /* uc is to be titlecased. */
117 single_character_map
= uc_totitle
;
118 offset_in_rule
= offsetof (struct special_casing_rule
, title
[0]);
119 in_word_first_part
= false;
123 /* uc is not converted. */
124 single_character_map
= NULL
;
130 /* uc is to be lowercased. */
131 single_character_map
= uc_tolower
;
132 offset_in_rule
= offsetof (struct special_casing_rule
, lower
[0]);
135 /* Actually map uc. */
136 if (single_character_map
== NULL
)
145 /* Look first in the special-casing table. */
148 code
[0] = (uc
>> 8) & 0xff;
151 for (code
[2] = 0; ; code
[2]++)
153 const struct special_casing_rule
*rule
=
154 gl_unicase_special_lookup (code
, 3);
159 /* Test if the condition applies. */
160 /* Does the language apply? */
161 if (rule
->language
[0] == '\0'
162 || (iso639_language
!= NULL
163 && iso639_language
[0] == rule
->language
[0]
164 && iso639_language
[1] == rule
->language
[1]))
166 /* Does the context apply? */
167 int context
= rule
->context
;
178 case SCC_FINAL_SIGMA
:
179 /* "Before" condition: preceded by a sequence
180 consisting of a cased letter and a case-ignorable
182 "After" condition: not followed by a sequence
183 consisting of a case-ignorable sequence and then a
185 /* Test the "before" condition. */
186 applies
= uc_is_cased (last_char_except_ignorable
);
187 /* Test the "after" condition. */
190 const UNIT
*s2
= s
+ count
;
196 int count2
= U_MBTOUC_UNSAFE (&uc2
, s2
, s_end
- s2
);
197 /* Our uc_is_case_ignorable function is
198 known to return false for all cased
199 characters. So we can call
200 uc_is_case_ignorable first. */
201 if (!uc_is_case_ignorable (uc2
))
203 applies
= ! uc_is_cased (uc2
);
210 applies
= ! uc_is_cased (suffix_context
.first_char_except_ignorable
);
217 case SCC_AFTER_SOFT_DOTTED
:
218 /* "Before" condition: There is a Soft_Dotted character
219 before it, with no intervening character of
220 combining class 0 or 230 (Above). */
221 /* Test the "before" condition. */
222 applies
= uc_is_property_soft_dotted (last_char_normal_or_above
);
226 /* "After" condition: followed by a character of
227 combining class 230 (Above) with no intervening
228 character of combining class 0 or 230 (Above). */
229 /* Test the "after" condition. */
231 const UNIT
*s2
= s
+ count
;
238 int count2
= U_MBTOUC_UNSAFE (&uc2
, s2
, s_end
- s2
);
239 int ccc
= uc_combining_class (uc2
);
245 if (ccc
== UC_CCC_NR
)
251 applies
= ((suffix_context
.bits
& SCC_MORE_ABOVE_MASK
) != 0);
259 /* "After" condition: followed by COMBINING DOT ABOVE
260 (U+0307). Any sequence of characters with a
261 combining class that is neither 0 nor 230 may
262 intervene between the current character and the
263 combining dot above. */
264 /* Test the "after" condition. */
266 const UNIT
*s2
= s
+ count
;
273 int count2
= U_MBTOUC_UNSAFE (&uc2
, s2
, s_end
- s2
);
274 if (uc2
== 0x0307) /* COMBINING DOT ABOVE */
280 int ccc
= uc_combining_class (uc2
);
281 if (ccc
== UC_CCC_A
|| ccc
== UC_CCC_NR
)
288 applies
= ((suffix_context
.bits
& SCC_BEFORE_DOT_MASK
) != 0);
296 /* "Before" condition: There is an uppercase I before
297 it, and there is no intervening character of
298 combining class 0 or 230 (Above). */
299 /* Test the "before" condition. */
300 applies
= (last_char_normal_or_above
== 'I');
306 if (rule
->context
< 0)
312 Look up the mapping (0 to 3 characters). */
313 const unsigned short *mapped_in_rule
=
314 (const unsigned short *)((const char *)rule
+ offset_in_rule
);
316 if (mapped_in_rule
[0] == 0)
320 mapped_uc
[0] = mapped_in_rule
[0];
321 if (mapped_in_rule
[1] == 0)
325 mapped_uc
[1] = mapped_in_rule
[1];
326 if (mapped_in_rule
[2] == 0)
330 mapped_uc
[2] = mapped_in_rule
[2];
339 /* Optimization: Save a hash table lookup in the next round. */
345 /* No special-cased mapping. So use the locale and context independent
347 mapped_uc
[0] = single_character_map (uc
);
351 /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */
355 for (i
= 0; i
< mapped_count
; i
++)
357 ucs4_t muc
= mapped_uc
[i
];
359 /* Append muc to the result accumulator. */
360 if (length
< allocated
)
362 int ret
= U_UCTOMB (result
+ length
, muc
, allocated
- length
);
375 size_t old_allocated
= allocated
;
376 size_t new_allocated
= 2 * old_allocated
;
377 if (new_allocated
< 64)
379 if (new_allocated
< old_allocated
) /* integer overflow? */
385 larger_result
= (UNIT
*) malloc (new_allocated
* sizeof (UNIT
));
386 if (larger_result
== NULL
)
392 else if (result
== resultbuf
)
394 larger_result
= (UNIT
*) malloc (new_allocated
* sizeof (UNIT
));
395 if (larger_result
== NULL
)
400 U_CPY (larger_result
, resultbuf
, length
);
405 (UNIT
*) realloc (result
, new_allocated
* sizeof (UNIT
));
406 if (larger_result
== NULL
)
412 result
= larger_result
;
413 allocated
= new_allocated
;
415 int ret
= U_UCTOMB (result
+ length
, muc
, allocated
- length
);
432 if (!uc_is_case_ignorable (uc
))
433 last_char_except_ignorable
= uc
;
436 int ccc
= uc_combining_class (uc
);
437 if (ccc
== UC_CCC_A
|| ccc
== UC_CCC_NR
)
438 last_char_normal_or_above
= uc
;
450 /* Finally, normalize the result. */
451 UNIT
*normalized_result
;
453 normalized_result
= U_NORMALIZE (nf
, result
, length
, resultbuf
, lengthp
);
454 if (normalized_result
== NULL
)
458 return normalized_result
;
465 /* Return a non-NULL value. NULL means error. */
466 result
= (UNIT
*) malloc (1);
474 else if (result
!= resultbuf
&& length
< allocated
)
476 /* Shrink the allocated memory if possible. */
479 memory
= (UNIT
*) realloc (result
, length
* sizeof (UNIT
));
489 int saved_errno
= errno
;
494 if (result
!= resultbuf
)
496 int saved_errno
= errno
;