*-map tests: Fix compilation error.
[gnulib.git] / lib / uninorm / u-normalize-internal.h
blobf5a55591439b55b791cb616cd5afda35e9bc55f8
1 /* Decomposition and composition of Unicode strings.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
5 This program is free software: you can redistribute it and/or modify it
6 under the terms of the GNU Lesser General Public License as published
7 by the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18 UNIT *
19 FUNC (uninorm_t nf, const UNIT *s, size_t n,
20 UNIT *resultbuf, size_t *lengthp)
22 int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23 ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
25 /* The result being accumulated. */
26 UNIT *result;
27 size_t length;
28 size_t allocated;
29 /* The buffer for sorting. */
30 #define SORTBUF_PREALLOCATED 64
31 struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32 struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33 size_t sortbuf_allocated;
34 size_t sortbuf_count;
36 /* Initialize the accumulator. */
37 if (resultbuf == NULL)
39 result = NULL;
40 allocated = 0;
42 else
44 result = resultbuf;
45 allocated = *lengthp;
47 length = 0;
49 /* Initialize the buffer for sorting. */
50 sortbuf = sortbuf_preallocated;
51 sortbuf_allocated = SORTBUF_PREALLOCATED;
52 sortbuf_count = 0;
55 const UNIT *s_end = s + n;
57 for (;;)
59 int count;
60 ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61 int decomposed_count;
62 int i;
64 if (s < s_end)
66 /* Fetch the next character. */
67 count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68 decomposed_count = 1;
70 /* Decompose it, recursively.
71 It would be possible to precompute the recursive decomposition
72 and store it in a table. But this would significantly increase
73 the size of the decomposition tables, because for example for
74 U+1FC1 the recursive canonical decomposition and the recursive
75 compatibility decomposition are different. */
77 int curr;
79 for (curr = 0; curr < decomposed_count; )
81 /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82 all elements are atomic. */
83 ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84 int curr_decomposed_count;
86 curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87 if (curr_decomposed_count >= 0)
89 /* Move curr_decomposed[0..curr_decomposed_count-1] over
90 decomposed[curr], making room. It's not worth using
91 memcpy() here, since the counts are so small. */
92 int shift = curr_decomposed_count - 1;
94 if (shift < 0)
95 abort ();
96 if (shift > 0)
98 int j;
100 decomposed_count += shift;
101 if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102 abort ();
103 for (j = decomposed_count - 1 - shift; j > curr; j--)
104 decomposed[j + shift] = decomposed[j];
106 for (; shift >= 0; shift--)
107 decomposed[curr + shift] = curr_decomposed[shift];
109 else
111 /* decomposed[curr] is atomic. */
112 curr++;
117 else
119 count = 0;
120 decomposed_count = 0;
123 i = 0;
124 for (;;)
126 ucs4_t uc;
127 int ccc;
129 if (s < s_end)
131 /* Fetch the next character from the decomposition. */
132 if (i == decomposed_count)
133 break;
134 uc = decomposed[i];
135 ccc = uc_combining_class (uc);
137 else
139 /* End of string reached. */
140 uc = 0;
141 ccc = 0;
144 if (ccc == 0)
146 size_t j;
148 /* Apply the canonical ordering algorithm to the accumulated
149 sequence of characters. */
150 if (sortbuf_count > 1)
151 gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152 sortbuf + sortbuf_count);
154 if (composer != NULL)
156 /* Attempt to combine decomposed characters, as specified
157 in the Unicode Standard Annex #15 "Unicode Normalization
158 Forms". We need to check
159 1. whether the first accumulated character is a
160 "starter" (i.e. has ccc = 0). This is usually the
161 case. But when the string starts with a
162 non-starter, the sortbuf also starts with a
163 non-starter. Btw, this check could also be
164 omitted, because the composition table has only
165 entries (code1, code2) for which code1 is a
166 starter; if the first accumulated character is not
167 a starter, no lookup will succeed.
168 2. If the sortbuf has more than one character, check
169 for each of these characters that are not "blocked"
170 from the starter (i.e. have a ccc that is higher
171 than the ccc of the previous character) whether it
172 can be combined with the first character.
173 3. If only one character is left in sortbuf, check
174 whether it can be combined with the next character
175 (also a starter). */
176 if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
178 for (j = 1; j < sortbuf_count; )
180 if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
182 ucs4_t combined =
183 composer (sortbuf[0].code, sortbuf[j].code);
184 if (combined)
186 size_t k;
188 sortbuf[0].code = combined;
189 /* sortbuf[0].ccc = 0, still valid. */
190 for (k = j + 1; k < sortbuf_count; k++)
191 sortbuf[k - 1] = sortbuf[k];
192 sortbuf_count--;
193 continue;
196 j++;
198 if (s < s_end && sortbuf_count == 1)
200 ucs4_t combined =
201 composer (sortbuf[0].code, uc);
202 if (combined)
204 uc = combined;
205 ccc = 0;
206 /* uc could be further combined with subsequent
207 characters. So don't put it into sortbuf[0] in
208 this round, only in the next round. */
209 sortbuf_count = 0;
215 for (j = 0; j < sortbuf_count; j++)
217 ucs4_t muc = sortbuf[j].code;
219 /* Append muc to the result accumulator. */
220 if (length < allocated)
222 int ret =
223 U_UCTOMB (result + length, muc, allocated - length);
224 if (ret == -1)
226 errno = EINVAL;
227 goto fail;
229 if (ret >= 0)
231 length += ret;
232 goto done_appending;
236 size_t old_allocated = allocated;
237 size_t new_allocated = 2 * old_allocated;
238 if (new_allocated < 64)
239 new_allocated = 64;
240 if (new_allocated < old_allocated) /* integer overflow? */
241 abort ();
243 UNIT *larger_result;
244 if (result == NULL)
246 larger_result =
247 (UNIT *) malloc (new_allocated * sizeof (UNIT));
248 if (larger_result == NULL)
250 errno = ENOMEM;
251 goto fail;
254 else if (result == resultbuf)
256 larger_result =
257 (UNIT *) malloc (new_allocated * sizeof (UNIT));
258 if (larger_result == NULL)
260 errno = ENOMEM;
261 goto fail;
263 U_CPY (larger_result, resultbuf, length);
265 else
267 larger_result =
268 (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269 if (larger_result == NULL)
271 errno = ENOMEM;
272 goto fail;
275 result = larger_result;
276 allocated = new_allocated;
278 int ret =
279 U_UCTOMB (result + length, muc, allocated - length);
280 if (ret == -1)
282 errno = EINVAL;
283 goto fail;
285 if (ret < 0)
286 abort ();
287 length += ret;
288 goto done_appending;
292 done_appending: ;
295 /* sortbuf is now empty. */
296 sortbuf_count = 0;
299 if (!(s < s_end))
300 /* End of string reached. */
301 break;
303 /* Append (uc, ccc) to sortbuf. */
304 if (sortbuf_count == sortbuf_allocated)
306 struct ucs4_with_ccc *new_sortbuf;
308 sortbuf_allocated = 2 * sortbuf_allocated;
309 if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310 abort ();
311 new_sortbuf =
312 (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313 if (new_sortbuf == NULL)
315 errno = ENOMEM;
316 goto fail;
318 memcpy (new_sortbuf, sortbuf,
319 sortbuf_count * sizeof (struct ucs4_with_ccc));
320 if (sortbuf != sortbuf_preallocated)
321 free (sortbuf);
322 sortbuf = new_sortbuf;
324 sortbuf[sortbuf_count].code = uc;
325 sortbuf[sortbuf_count].ccc = ccc;
326 sortbuf_count++;
328 i++;
331 if (!(s < s_end))
332 /* End of string reached. */
333 break;
335 s += count;
339 if (length == 0)
341 if (result == NULL)
343 /* Return a non-NULL value. NULL means error. */
344 result = (UNIT *) malloc (1);
345 if (result == NULL)
347 errno = ENOMEM;
348 goto fail;
352 else if (result != resultbuf && length < allocated)
354 /* Shrink the allocated memory if possible. */
355 UNIT *memory;
357 memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358 if (memory != NULL)
359 result = memory;
362 if (sortbuf_count > 0)
363 abort ();
364 if (sortbuf != sortbuf_preallocated)
365 free (sortbuf);
367 *lengthp = length;
368 return result;
370 fail:
372 int saved_errno = errno;
373 if (sortbuf != sortbuf_preallocated)
374 free (sortbuf);
375 if (result != resultbuf)
376 free (result);
377 errno = saved_errno;
379 return NULL;