Avoid build errors due to wrong references between modules.
[gnulib.git] / lib / uninorm / uninorm-filter.c
blobe1933c7ba401ee7a0f5cc265d1e0564b608f8cd0
1 /* Stream-based normalization of Unicode strings.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
5 This program is free software: you can redistribute it and/or modify it
6 under the terms of the GNU Lesser General Public License as published
7 by the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18 #include <config.h>
20 /* Specification. */
21 #include "uninorm.h"
23 #include <errno.h>
24 #include <stddef.h>
25 #include <stdlib.h>
26 #include <string.h>
28 #include "unictype.h"
29 #include "normalize-internal.h"
30 #include "uninorm/decompose-internal.h"
33 struct uninorm_filter
35 /* Characteristics of the normalization form. */
36 int (*decomposer) (ucs4_t uc, ucs4_t *decomposition);
37 ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2);
39 /* The encapsulated stream. */
40 int (*stream_func) (void *stream_data, ucs4_t uc);
41 void *stream_data;
43 /* The buffer for sorting. */
44 #define SORTBUF_PREALLOCATED 64
45 struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
46 struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
47 size_t sortbuf_allocated;
48 size_t sortbuf_count;
51 struct uninorm_filter *
52 uninorm_filter_create (uninorm_t nf,
53 int (*stream_func) (void *stream_data, ucs4_t uc),
54 void *stream_data)
56 struct uninorm_filter *filter =
57 (struct uninorm_filter *) malloc (sizeof (struct uninorm_filter));
59 if (filter == NULL)
60 /* errno is ENOMEM. */
61 return NULL;
63 filter->decomposer = nf->decomposer;
64 filter->composer = nf->composer;
65 filter->stream_func = stream_func;
66 filter->stream_data = stream_data;
67 filter->sortbuf = filter->sortbuf_preallocated;
68 filter->sortbuf_allocated = SORTBUF_PREALLOCATED;
69 filter->sortbuf_count = 0;
71 return filter;
74 int
75 uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc_arg)
77 ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
78 int decomposed_count;
80 /* Accept the next character. */
81 decomposed[0] = uc_arg;
82 decomposed_count = 1;
84 /* Decompose it, recursively.
85 It would be possible to precompute the recursive decomposition
86 and store it in a table. But this would significantly increase
87 the size of the decomposition tables, because for example for
88 U+1FC1 the recursive canonical decomposition and the recursive
89 compatibility decomposition are different. */
91 int curr;
93 for (curr = 0; curr < decomposed_count; )
95 /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
96 all elements are atomic. */
97 ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
98 int curr_decomposed_count;
100 curr_decomposed_count =
101 filter->decomposer (decomposed[curr], curr_decomposed);
102 if (curr_decomposed_count >= 0)
104 /* Move curr_decomposed[0..curr_decomposed_count-1] over
105 decomposed[curr], making room. It's not worth using
106 memcpy() here, since the counts are so small. */
107 int shift = curr_decomposed_count - 1;
109 if (shift < 0)
110 abort ();
111 if (shift > 0)
113 int j;
115 decomposed_count += shift;
116 if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
117 abort ();
118 for (j = decomposed_count - 1 - shift; j > curr; j--)
119 decomposed[j + shift] = decomposed[j];
121 for (; shift >= 0; shift--)
122 decomposed[curr + shift] = curr_decomposed[shift];
124 else
126 /* decomposed[curr] is atomic. */
127 curr++;
133 /* Cache sortbuf and sortbuf_count in local register variables. */
134 struct ucs4_with_ccc *sortbuf = filter->sortbuf;
135 size_t sortbuf_count = filter->sortbuf_count;
136 int i;
138 for (i = 0; i < decomposed_count; i++)
140 /* Fetch the next character from the decomposition. */
141 ucs4_t uc = decomposed[i];
142 int ccc = uc_combining_class (uc);
144 if (ccc == 0)
146 size_t j;
148 /* Apply the canonical ordering algorithm to the accumulated
149 sequence of characters. */
150 if (sortbuf_count > 1)
151 gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152 sortbuf + sortbuf_count);
154 if (filter->composer != NULL)
156 /* Attempt to combine decomposed characters, as specified
157 in the Unicode Standard Annex #15 "Unicode Normalization
158 Forms". We need to check
159 1. whether the first accumulated character is a
160 "starter" (i.e. has ccc = 0). This is usually the
161 case. But when the string starts with a
162 non-starter, the sortbuf also starts with a
163 non-starter. Btw, this check could also be
164 omitted, because the composition table has only
165 entries (code1, code2) for which code1 is a
166 starter; if the first accumulated character is not
167 a starter, no lookup will succeed.
168 2. If the sortbuf has more than one character, check
169 for each of these characters that are not "blocked"
170 from the starter (i.e. have a ccc that is higher
171 than the ccc of the previous character) whether it
172 can be combined with the first character.
173 3. If only one character is left in sortbuf, check
174 whether it can be combined with the next character
175 (also a starter). */
176 if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
178 for (j = 1; j < sortbuf_count; )
180 if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
182 ucs4_t combined =
183 filter->composer (sortbuf[0].code, sortbuf[j].code);
184 if (combined)
186 size_t k;
188 sortbuf[0].code = combined;
189 /* sortbuf[0].ccc = 0, still valid. */
190 for (k = j + 1; k < sortbuf_count; k++)
191 sortbuf[k - 1] = sortbuf[k];
192 sortbuf_count--;
193 continue;
196 j++;
198 if (sortbuf_count == 1)
200 ucs4_t combined =
201 filter->composer (sortbuf[0].code, uc);
202 if (combined)
204 uc = combined;
205 ccc = 0;
206 /* uc could be further combined with subsequent
207 characters. So don't put it into sortbuf[0] in
208 this round, only in the next round. */
209 sortbuf_count = 0;
215 for (j = 0; j < sortbuf_count; j++)
217 ucs4_t muc = sortbuf[j].code;
219 /* Output muc to the encapsulated stream. */
220 int ret = filter->stream_func (filter->stream_data, muc);
221 if (ret < 0)
223 /* errno is set here. */
224 filter->sortbuf_count = 0;
225 return -1;
229 /* sortbuf is now empty. */
230 sortbuf_count = 0;
233 /* Append (uc, ccc) to sortbuf. */
234 if (sortbuf_count == filter->sortbuf_allocated)
236 struct ucs4_with_ccc *new_sortbuf;
238 filter->sortbuf_allocated = 2 * filter->sortbuf_allocated;
239 if (filter->sortbuf_allocated < sortbuf_count) /* integer overflow? */
240 abort ();
241 new_sortbuf =
242 (struct ucs4_with_ccc *)
243 malloc (2 * filter->sortbuf_allocated * sizeof (struct ucs4_with_ccc));
244 if (new_sortbuf == NULL)
246 /* errno is ENOMEM. */
247 filter->sortbuf_count = sortbuf_count;
248 return -1;
250 memcpy (new_sortbuf, filter->sortbuf,
251 sortbuf_count * sizeof (struct ucs4_with_ccc));
252 if (filter->sortbuf != filter->sortbuf_preallocated)
253 free (filter->sortbuf);
254 filter->sortbuf = new_sortbuf;
255 /* Update cache of filter->sortbuf. */
256 sortbuf = filter->sortbuf;
258 sortbuf[sortbuf_count].code = uc;
259 sortbuf[sortbuf_count].ccc = ccc;
260 sortbuf_count++;
263 filter->sortbuf_count = sortbuf_count;
266 return 0;
269 /* Bring data buffered in the filter to its destination, the encapsulated
270 stream.
271 Return 0 if successful, or -1 with errno set upon failure.
272 Note! If after calling this function, additional characters are written
273 into the filter, the resulting character sequence in the encapsulated stream
274 will not necessarily be normalized. */
276 uninorm_filter_flush (struct uninorm_filter *filter)
278 /* Cache sortbuf and sortbuf_count in local register variables. */
279 struct ucs4_with_ccc * const sortbuf = filter->sortbuf;
280 size_t sortbuf_count = filter->sortbuf_count;
281 size_t j;
283 /* Apply the canonical ordering algorithm to the accumulated
284 sequence of characters. */
285 if (sortbuf_count > 1)
286 gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
287 sortbuf + sortbuf_count);
289 if (filter->composer != NULL)
291 /* Attempt to combine decomposed characters, as specified
292 in the Unicode Standard Annex #15 "Unicode Normalization
293 Forms". We need to check
294 1. whether the first accumulated character is a
295 "starter" (i.e. has ccc = 0). This is usually the
296 case. But when the string starts with a
297 non-starter, the sortbuf also starts with a
298 non-starter. Btw, this check could also be
299 omitted, because the composition table has only
300 entries (code1, code2) for which code1 is a
301 starter; if the first accumulated character is not
302 a starter, no lookup will succeed.
303 2. If the sortbuf has more than one character, check
304 for each of these characters that are not "blocked"
305 from the starter (i.e. have a ccc that is higher
306 than the ccc of the previous character) whether it
307 can be combined with the first character.
308 3. If only one character is left in sortbuf, check
309 whether it can be combined with the next character
310 (also a starter). */
311 if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
313 for (j = 1; j < sortbuf_count; )
315 if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
317 ucs4_t combined =
318 filter->composer (sortbuf[0].code, sortbuf[j].code);
319 if (combined)
321 size_t k;
323 sortbuf[0].code = combined;
324 /* sortbuf[0].ccc = 0, still valid. */
325 for (k = j + 1; k < sortbuf_count; k++)
326 sortbuf[k - 1] = sortbuf[k];
327 sortbuf_count--;
328 continue;
331 j++;
336 for (j = 0; j < sortbuf_count; j++)
338 ucs4_t muc = sortbuf[j].code;
340 /* Output muc to the encapsulated stream. */
341 int ret = filter->stream_func (filter->stream_data, muc);
342 if (ret < 0)
344 /* errno is set here. */
345 filter->sortbuf_count = 0;
346 return -1;
350 /* sortbuf is now empty. */
351 filter->sortbuf_count = 0;
353 return 0;
356 /* Bring data buffered in the filter to its destination, the encapsulated
357 stream, then close and free the filter.
358 Return 0 if successful, or -1 with errno set upon failure. */
360 uninorm_filter_free (struct uninorm_filter *filter)
362 int ret = uninorm_filter_flush (filter);
364 if (ret < 0)
365 /* errno is set here. */
366 return -1;
368 if (filter->sortbuf_count > 0)
369 abort ();
370 if (filter->sortbuf != filter->sortbuf_preallocated)
371 free (filter->sortbuf);
372 free (filter);
374 return 0;