1 /* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
5 This program is free software: you can redistribute it and/or modify it
6 under the terms of the GNU Lesser General Public License as published
7 by the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
19 FUNC (const UNIT
*s
, size_t n
,
20 casing_prefix_context_t prefix_context
,
21 casing_suffix_context_t suffix_context
,
22 const char *iso639_language
,
23 ucs4_t (*single_character_map
) (ucs4_t
),
24 size_t offset_in_rule
, /* offset in 'struct special_casing_rule' */
26 UNIT
*resultbuf
, size_t *lengthp
)
28 /* The result being accumulated. */
33 /* Initialize the accumulator. */
34 if (nf
!= NULL
|| resultbuf
== NULL
)
47 const UNIT
*s_end
= s
+ n
;
49 /* Helper for evaluating the FINAL_SIGMA condition:
50 Last character that was not case-ignorable. */
51 ucs4_t last_char_except_ignorable
=
52 prefix_context
.last_char_except_ignorable
;
54 /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
55 Last character that was of combining class 230 ("Above") or 0. */
56 ucs4_t last_char_normal_or_above
=
57 prefix_context
.last_char_normal_or_above
;
62 int count
= U_MBTOUC_UNSAFE (&uc
, s
, s_end
- s
);
65 unsigned int mapped_count
;
69 /* Look first in the special-casing table. */
72 code
[0] = (uc
>> 8) & 0xff;
75 for (code
[2] = 0; ; code
[2]++)
77 const struct special_casing_rule
*rule
=
78 gl_unicase_special_lookup (code
, 3);
83 /* Test if the condition applies. */
84 /* Does the language apply? */
85 if (rule
->language
[0] == '\0'
86 || (iso639_language
!= NULL
87 && iso639_language
[0] == rule
->language
[0]
88 && iso639_language
[1] == rule
->language
[1]))
90 /* Does the context apply? */
91 int context
= rule
->context
;
102 case SCC_FINAL_SIGMA
:
103 /* "Before" condition: preceded by a sequence
104 consisting of a cased letter and a case-ignorable sequence.
106 "After" condition: not followed by a sequence
107 consisting of a case-ignorable sequence and then a cased letter. */
109 /* Test the "before" condition. */
110 applies
= uc_is_cased (last_char_except_ignorable
);
111 /* Test the "after" condition. */
114 const UNIT
*s2
= s
+ count
;
120 int count2
= U_MBTOUC_UNSAFE (&uc2
, s2
, s_end
- s2
);
121 /* Our uc_is_case_ignorable function is
122 known to return false for all cased
123 characters. So we can call
124 uc_is_case_ignorable first. */
125 if (!uc_is_case_ignorable (uc2
))
127 applies
= ! uc_is_cased (uc2
);
134 applies
= ! uc_is_cased (suffix_context
.first_char_except_ignorable
);
141 case SCC_AFTER_SOFT_DOTTED
:
142 /* "Before" condition: There is a Soft_Dotted character
143 before it, with no intervening character of
144 combining class 0 or 230 (Above). */
145 /* Test the "before" condition. */
146 applies
= uc_is_property_soft_dotted (last_char_normal_or_above
);
150 /* "After" condition: followed by a character of
151 combining class 230 (Above) with no intervening
152 character of combining class 0 or 230 (Above). */
153 /* Test the "after" condition. */
155 const UNIT
*s2
= s
+ count
;
162 int count2
= U_MBTOUC_UNSAFE (&uc2
, s2
, s_end
- s2
);
163 int ccc
= uc_combining_class (uc2
);
169 if (ccc
== UC_CCC_NR
)
175 applies
= ((suffix_context
.bits
& SCC_MORE_ABOVE_MASK
) != 0);
183 /* "After" condition: followed by COMBINING DOT ABOVE
184 (U+0307). Any sequence of characters with a
185 combining class that is neither 0 nor 230 may
186 intervene between the current character and the
187 combining dot above. */
188 /* Test the "after" condition. */
190 const UNIT
*s2
= s
+ count
;
197 int count2
= U_MBTOUC_UNSAFE (&uc2
, s2
, s_end
- s2
);
198 if (uc2
== 0x0307) /* COMBINING DOT ABOVE */
204 int ccc
= uc_combining_class (uc2
);
205 if (ccc
== UC_CCC_A
|| ccc
== UC_CCC_NR
)
212 applies
= ((suffix_context
.bits
& SCC_BEFORE_DOT_MASK
) != 0);
220 /* "Before" condition: There is an uppercase I before
221 it, and there is no intervening character of
222 combining class 0 or 230 (Above). */
223 /* Test the "before" condition. */
224 applies
= (last_char_normal_or_above
== 'I');
230 if (rule
->context
< 0)
236 /* Look up the mapping (0 to 3 characters). */
237 const unsigned short *mapped_in_rule
=
238 (const unsigned short *)((const char *)rule
+ offset_in_rule
);
240 if (mapped_in_rule
[0] == 0)
244 mapped_uc
[0] = mapped_in_rule
[0];
245 if (mapped_in_rule
[1] == 0)
249 mapped_uc
[1] = mapped_in_rule
[1];
250 if (mapped_in_rule
[2] == 0)
254 mapped_uc
[2] = mapped_in_rule
[2];
263 /* Optimization: Save a hash table lookup in the next round. */
269 /* No special-cased mapping. So use the locale and context independent mapping. */
271 mapped_uc
[0] = single_character_map (uc
);
275 /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */
279 for (i
= 0; i
< mapped_count
; i
++)
281 ucs4_t muc
= mapped_uc
[i
];
283 /* Append muc to the result accumulator. */
284 if (length
< allocated
)
286 int ret
= U_UCTOMB (result
+ length
, muc
, allocated
- length
);
299 size_t old_allocated
= allocated
;
300 size_t new_allocated
= 2 * old_allocated
;
301 if (new_allocated
< 64)
303 if (new_allocated
< old_allocated
) /* integer overflow? */
309 larger_result
= (UNIT
*) malloc (new_allocated
* sizeof (UNIT
));
310 if (larger_result
== NULL
)
316 else if (result
== resultbuf
)
318 larger_result
= (UNIT
*) malloc (new_allocated
* sizeof (UNIT
));
319 if (larger_result
== NULL
)
324 U_CPY (larger_result
, resultbuf
, length
);
329 (UNIT
*) realloc (result
, new_allocated
* sizeof (UNIT
));
330 if (larger_result
== NULL
)
336 result
= larger_result
;
337 allocated
= new_allocated
;
339 int ret
= U_UCTOMB (result
+ length
, muc
, allocated
- length
);
356 if (!uc_is_case_ignorable (uc
))
357 last_char_except_ignorable
= uc
;
360 int ccc
= uc_combining_class (uc
);
361 if (ccc
== UC_CCC_A
|| ccc
== UC_CCC_NR
)
362 last_char_normal_or_above
= uc
;
371 /* Finally, normalize the result. */
372 UNIT
*normalized_result
;
374 normalized_result
= U_NORMALIZE (nf
, result
, length
, resultbuf
, lengthp
);
375 if (normalized_result
== NULL
)
379 return normalized_result
;
386 /* Return a non-NULL value. NULL means error. */
387 result
= (UNIT
*) malloc (1);
395 else if (result
!= resultbuf
&& length
< allocated
)
397 /* Shrink the allocated memory if possible. */
400 memory
= (UNIT
*) realloc (result
, length
* sizeof (UNIT
));
409 if (result
!= resultbuf
)
411 int saved_errno
= errno
;