all: prefer https: URLs
[gnulib.git] / lib / unicase / u-casemap.h
blob d79654a5faf98c1b60431d3d8e2c0447568db20f
/* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
   Copyright (C) 2009-2017 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2009.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 UNIT *
19 FUNC (const UNIT *s, size_t n,
20 casing_prefix_context_t prefix_context,
21 casing_suffix_context_t suffix_context,
22 const char *iso639_language,
23 ucs4_t (*single_character_map) (ucs4_t),
24 size_t offset_in_rule, /* offset in 'struct special_casing_rule' */
25 uninorm_t nf,
26 UNIT *resultbuf, size_t *lengthp)
28 /* The result being accumulated. */
29 UNIT *result;
30 size_t length;
31 size_t allocated;
33 /* Initialize the accumulator. */
34 if (nf != NULL || resultbuf == NULL)
36 result = NULL;
37 allocated = 0;
39 else
41 result = resultbuf;
42 allocated = *lengthp;
44 length = 0;
47 const UNIT *s_end = s + n;
49 /* Helper for evaluating the FINAL_SIGMA condition:
50 Last character that was not case-ignorable. */
51 ucs4_t last_char_except_ignorable =
52 prefix_context.last_char_except_ignorable;
54 /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
55 Last character that was of combining class 230 ("Above") or 0. */
56 ucs4_t last_char_normal_or_above =
57 prefix_context.last_char_normal_or_above;
59 while (s < s_end)
61 ucs4_t uc;
62 int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
64 ucs4_t mapped_uc[3];
65 unsigned int mapped_count;
67 if (uc < 0x10000)
69 /* Look first in the special-casing table. */
70 char code[3];
72 code[0] = (uc >> 8) & 0xff;
73 code[1] = uc & 0xff;
75 for (code[2] = 0; ; code[2]++)
77 const struct special_casing_rule *rule =
78 gl_unicase_special_lookup (code, 3);
80 if (rule == NULL)
81 break;
83 /* Test if the condition applies. */
84 /* Does the language apply? */
85 if (rule->language[0] == '\0'
86 || (iso639_language != NULL
87 && iso639_language[0] == rule->language[0]
88 && iso639_language[1] == rule->language[1]))
90 /* Does the context apply? */
91 int context = rule->context;
92 bool applies;
94 if (context < 0)
95 context = - context;
96 switch (context)
98 case SCC_ALWAYS:
99 applies = true;
100 break;
102 case SCC_FINAL_SIGMA:
103 /* "Before" condition: preceded by a sequence
104 consisting of a cased letter and a case-ignorable
105 sequence.
106 "After" condition: not followed by a sequence
107 consisting of a case-ignorable sequence and then a
108 cased letter. */
109 /* Test the "before" condition. */
110 applies = uc_is_cased (last_char_except_ignorable);
111 /* Test the "after" condition. */
112 if (applies)
114 const UNIT *s2 = s + count;
115 for (;;)
117 if (s2 < s_end)
119 ucs4_t uc2;
120 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
121 /* Our uc_is_case_ignorable function is
122 known to return false for all cased
123 characters. So we can call
124 uc_is_case_ignorable first. */
125 if (!uc_is_case_ignorable (uc2))
127 applies = ! uc_is_cased (uc2);
128 break;
130 s2 += count2;
132 else
134 applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
135 break;
139 break;
141 case SCC_AFTER_SOFT_DOTTED:
142 /* "Before" condition: There is a Soft_Dotted character
143 before it, with no intervening character of
144 combining class 0 or 230 (Above). */
145 /* Test the "before" condition. */
146 applies = uc_is_property_soft_dotted (last_char_normal_or_above);
147 break;
149 case SCC_MORE_ABOVE:
150 /* "After" condition: followed by a character of
151 combining class 230 (Above) with no intervening
152 character of combining class 0 or 230 (Above). */
153 /* Test the "after" condition. */
155 const UNIT *s2 = s + count;
156 applies = false;
157 for (;;)
159 if (s2 < s_end)
161 ucs4_t uc2;
162 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
163 int ccc = uc_combining_class (uc2);
164 if (ccc == UC_CCC_A)
166 applies = true;
167 break;
169 if (ccc == UC_CCC_NR)
170 break;
171 s2 += count2;
173 else
175 applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
176 break;
180 break;
182 case SCC_BEFORE_DOT:
183 /* "After" condition: followed by COMBINING DOT ABOVE
184 (U+0307). Any sequence of characters with a
185 combining class that is neither 0 nor 230 may
186 intervene between the current character and the
187 combining dot above. */
188 /* Test the "after" condition. */
190 const UNIT *s2 = s + count;
191 applies = false;
192 for (;;)
194 if (s2 < s_end)
196 ucs4_t uc2;
197 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
198 if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
200 applies = true;
201 break;
204 int ccc = uc_combining_class (uc2);
205 if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
206 break;
208 s2 += count2;
210 else
212 applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
213 break;
217 break;
219 case SCC_AFTER_I:
220 /* "Before" condition: There is an uppercase I before
221 it, and there is no intervening character of
222 combining class 0 or 230 (Above). */
223 /* Test the "before" condition. */
224 applies = (last_char_normal_or_above == 'I');
225 break;
227 default:
228 abort ();
230 if (rule->context < 0)
231 applies = !applies;
233 if (applies)
235 /* The rule applies.
236 Look up the mapping (0 to 3 characters). */
237 const unsigned short *mapped_in_rule =
238 (const unsigned short *)((const char *)rule + offset_in_rule);
240 if (mapped_in_rule[0] == 0)
241 mapped_count = 0;
242 else
244 mapped_uc[0] = mapped_in_rule[0];
245 if (mapped_in_rule[1] == 0)
246 mapped_count = 1;
247 else
249 mapped_uc[1] = mapped_in_rule[1];
250 if (mapped_in_rule[2] == 0)
251 mapped_count = 2;
252 else
254 mapped_uc[2] = mapped_in_rule[2];
255 mapped_count = 3;
259 goto found_mapping;
263 /* Optimization: Save a hash table lookup in the next round. */
264 if (!rule->has_next)
265 break;
269 /* No special-cased mapping. So use the locale and context independent
270 mapping. */
271 mapped_uc[0] = single_character_map (uc);
272 mapped_count = 1;
274 found_mapping:
275 /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */
277 unsigned int i;
279 for (i = 0; i < mapped_count; i++)
281 ucs4_t muc = mapped_uc[i];
283 /* Append muc to the result accumulator. */
284 if (length < allocated)
286 int ret = U_UCTOMB (result + length, muc, allocated - length);
287 if (ret == -1)
289 errno = EINVAL;
290 goto fail;
292 if (ret >= 0)
294 length += ret;
295 goto done_appending;
299 size_t old_allocated = allocated;
300 size_t new_allocated = 2 * old_allocated;
301 if (new_allocated < 64)
302 new_allocated = 64;
303 if (new_allocated < old_allocated) /* integer overflow? */
304 abort ();
306 UNIT *larger_result;
307 if (result == NULL)
309 larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
310 if (larger_result == NULL)
312 errno = ENOMEM;
313 goto fail;
316 else if (result == resultbuf)
318 larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
319 if (larger_result == NULL)
321 errno = ENOMEM;
322 goto fail;
324 U_CPY (larger_result, resultbuf, length);
326 else
328 larger_result =
329 (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
330 if (larger_result == NULL)
332 errno = ENOMEM;
333 goto fail;
336 result = larger_result;
337 allocated = new_allocated;
339 int ret = U_UCTOMB (result + length, muc, allocated - length);
340 if (ret == -1)
342 errno = EINVAL;
343 goto fail;
345 if (ret < 0)
346 abort ();
347 length += ret;
348 goto done_appending;
352 done_appending: ;
356 if (!uc_is_case_ignorable (uc))
357 last_char_except_ignorable = uc;
360 int ccc = uc_combining_class (uc);
361 if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
362 last_char_normal_or_above = uc;
365 s += count;
369 if (nf != NULL)
371 /* Finally, normalize the result. */
372 UNIT *normalized_result;
374 normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
375 if (normalized_result == NULL)
376 goto fail;
378 free (result);
379 return normalized_result;
382 if (length == 0)
384 if (result == NULL)
386 /* Return a non-NULL value. NULL means error. */
387 result = (UNIT *) malloc (1);
388 if (result == NULL)
390 errno = ENOMEM;
391 goto fail;
395 else if (result != resultbuf && length < allocated)
397 /* Shrink the allocated memory if possible. */
398 UNIT *memory;
400 memory = (UNIT *) realloc (result, length * sizeof (UNIT));
401 if (memory != NULL)
402 result = memory;
405 *lengthp = length;
406 return result;
408 fail:
409 if (result != resultbuf)
411 int saved_errno = errno;
412 free (result);
413 errno = saved_errno;
415 return NULL;