1 /* Stream-based normalization of Unicode strings.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
5 This program is free software: you can redistribute it and/or modify it
6 under the terms of the GNU Lesser General Public License as published
7 by the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
29 #include "normalize-internal.h"
30 #include "uninorm/decompose-internal.h"
35 /* Characteristics of the normalization form. */
36 int (*decomposer
) (ucs4_t uc
, ucs4_t
*decomposition
);
37 ucs4_t (*composer
) (ucs4_t uc1
, ucs4_t uc2
);
39 /* The encapsulated stream. */
40 int (*stream_func
) (void *stream_data
, ucs4_t uc
);
43 /* The buffer for sorting. */
44 #define SORTBUF_PREALLOCATED 64
45 struct ucs4_with_ccc sortbuf_preallocated
[2 * SORTBUF_PREALLOCATED
];
46 struct ucs4_with_ccc
*sortbuf
; /* array of size 2 * sortbuf_allocated */
47 size_t sortbuf_allocated
;
51 struct uninorm_filter
*
52 uninorm_filter_create (uninorm_t nf
,
53 int (*stream_func
) (void *stream_data
, ucs4_t uc
),
56 struct uninorm_filter
*filter
=
57 (struct uninorm_filter
*) malloc (sizeof (struct uninorm_filter
));
60 /* errno is ENOMEM. */
63 filter
->decomposer
= nf
->decomposer
;
64 filter
->composer
= nf
->composer
;
65 filter
->stream_func
= stream_func
;
66 filter
->stream_data
= stream_data
;
67 filter
->sortbuf
= filter
->sortbuf_preallocated
;
68 filter
->sortbuf_allocated
= SORTBUF_PREALLOCATED
;
69 filter
->sortbuf_count
= 0;
75 uninorm_filter_write (struct uninorm_filter
*filter
, ucs4_t uc_arg
)
77 ucs4_t decomposed
[UC_DECOMPOSITION_MAX_LENGTH
];
80 /* Accept the next character. */
81 decomposed
[0] = uc_arg
;
84 /* Decompose it, recursively.
85 It would be possible to precompute the recursive decomposition
86 and store it in a table. But this would significantly increase
87 the size of the decomposition tables, because for example for
88 U+1FC1 the recursive canonical decomposition and the recursive
89 compatibility decomposition are different. */
93 for (curr
= 0; curr
< decomposed_count
; )
95 /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
96 all elements are atomic. */
97 ucs4_t curr_decomposed
[UC_DECOMPOSITION_MAX_LENGTH
];
98 int curr_decomposed_count
;
100 curr_decomposed_count
=
101 filter
->decomposer (decomposed
[curr
], curr_decomposed
);
102 if (curr_decomposed_count
>= 0)
104 /* Move curr_decomposed[0..curr_decomposed_count-1] over
105 decomposed[curr], making room. It's not worth using
106 memcpy() here, since the counts are so small. */
107 int shift
= curr_decomposed_count
- 1;
115 decomposed_count
+= shift
;
116 if (decomposed_count
> UC_DECOMPOSITION_MAX_LENGTH
)
118 for (j
= decomposed_count
- 1 - shift
; j
> curr
; j
--)
119 decomposed
[j
+ shift
] = decomposed
[j
];
121 for (; shift
>= 0; shift
--)
122 decomposed
[curr
+ shift
] = curr_decomposed
[shift
];
126 /* decomposed[curr] is atomic. */
133 /* Cache sortbuf and sortbuf_count in local register variables. */
134 struct ucs4_with_ccc
*sortbuf
= filter
->sortbuf
;
135 size_t sortbuf_count
= filter
->sortbuf_count
;
138 for (i
= 0; i
< decomposed_count
; i
++)
140 /* Fetch the next character from the decomposition. */
141 ucs4_t uc
= decomposed
[i
];
142 int ccc
= uc_combining_class (uc
);
148 /* Apply the canonical ordering algorithm to the accumulated
149 sequence of characters. */
150 if (sortbuf_count
> 1)
151 gl_uninorm_decompose_merge_sort_inplace (sortbuf
, sortbuf_count
,
152 sortbuf
+ sortbuf_count
);
154 if (filter
->composer
!= NULL
)
156 /* Attempt to combine decomposed characters, as specified
157 in the Unicode Standard Annex #15 "Unicode Normalization
158 Forms". We need to check
159 1. whether the first accumulated character is a
160 "starter" (i.e. has ccc = 0). This is usually the
161 case. But when the string starts with a
162 non-starter, the sortbuf also starts with a
163 non-starter. Btw, this check could also be
164 omitted, because the composition table has only
165 entries (code1, code2) for which code1 is a
166 starter; if the first accumulated character is not
167 a starter, no lookup will succeed.
168 2. If the sortbuf has more than one character, check
169 for each of these characters that are not "blocked"
170 from the starter (i.e. have a ccc that is higher
171 than the ccc of the previous character) whether it
172 can be combined with the first character.
173 3. If only one character is left in sortbuf, check
174 whether it can be combined with the next character
176 if (sortbuf_count
> 0 && sortbuf
[0].ccc
== 0)
178 for (j
= 1; j
< sortbuf_count
; )
180 if (sortbuf
[j
].ccc
> sortbuf
[j
- 1].ccc
)
183 filter
->composer (sortbuf
[0].code
, sortbuf
[j
].code
);
188 sortbuf
[0].code
= combined
;
189 /* sortbuf[0].ccc = 0, still valid. */
190 for (k
= j
+ 1; k
< sortbuf_count
; k
++)
191 sortbuf
[k
- 1] = sortbuf
[k
];
198 if (sortbuf_count
== 1)
201 filter
->composer (sortbuf
[0].code
, uc
);
206 /* uc could be further combined with subsequent
207 characters. So don't put it into sortbuf[0] in
208 this round, only in the next round. */
215 for (j
= 0; j
< sortbuf_count
; j
++)
217 ucs4_t muc
= sortbuf
[j
].code
;
219 /* Output muc to the encapsulated stream. */
220 int ret
= filter
->stream_func (filter
->stream_data
, muc
);
223 /* errno is set here. */
224 filter
->sortbuf_count
= 0;
229 /* sortbuf is now empty. */
233 /* Append (uc, ccc) to sortbuf. */
234 if (sortbuf_count
== filter
->sortbuf_allocated
)
236 struct ucs4_with_ccc
*new_sortbuf
;
238 filter
->sortbuf_allocated
= 2 * filter
->sortbuf_allocated
;
239 if (filter
->sortbuf_allocated
< sortbuf_count
) /* integer overflow? */
242 (struct ucs4_with_ccc
*)
243 malloc (2 * filter
->sortbuf_allocated
* sizeof (struct ucs4_with_ccc
));
244 if (new_sortbuf
== NULL
)
246 /* errno is ENOMEM. */
247 filter
->sortbuf_count
= sortbuf_count
;
250 memcpy (new_sortbuf
, filter
->sortbuf
,
251 sortbuf_count
* sizeof (struct ucs4_with_ccc
));
252 if (filter
->sortbuf
!= filter
->sortbuf_preallocated
)
253 free (filter
->sortbuf
);
254 filter
->sortbuf
= new_sortbuf
;
255 /* Update cache of filter->sortbuf. */
256 sortbuf
= filter
->sortbuf
;
258 sortbuf
[sortbuf_count
].code
= uc
;
259 sortbuf
[sortbuf_count
].ccc
= ccc
;
263 filter
->sortbuf_count
= sortbuf_count
;
269 /* Bring data buffered in the filter to its destination, the encapsulated
271 Return 0 if successful, or -1 with errno set upon failure.
272 Note! If after calling this function, additional characters are written
273 into the filter, the resulting character sequence in the encapsulated stream
274 will not necessarily be normalized. */
276 uninorm_filter_flush (struct uninorm_filter
*filter
)
278 /* Cache sortbuf and sortbuf_count in local register variables. */
279 struct ucs4_with_ccc
* const sortbuf
= filter
->sortbuf
;
280 size_t sortbuf_count
= filter
->sortbuf_count
;
283 /* Apply the canonical ordering algorithm to the accumulated
284 sequence of characters. */
285 if (sortbuf_count
> 1)
286 gl_uninorm_decompose_merge_sort_inplace (sortbuf
, sortbuf_count
,
287 sortbuf
+ sortbuf_count
);
289 if (filter
->composer
!= NULL
)
291 /* Attempt to combine decomposed characters, as specified
292 in the Unicode Standard Annex #15 "Unicode Normalization
293 Forms". We need to check
294 1. whether the first accumulated character is a
295 "starter" (i.e. has ccc = 0). This is usually the
296 case. But when the string starts with a
297 non-starter, the sortbuf also starts with a
298 non-starter. Btw, this check could also be
299 omitted, because the composition table has only
300 entries (code1, code2) for which code1 is a
301 starter; if the first accumulated character is not
302 a starter, no lookup will succeed.
303 2. If the sortbuf has more than one character, check
304 for each of these characters that are not "blocked"
305 from the starter (i.e. have a ccc that is higher
306 than the ccc of the previous character) whether it
307 can be combined with the first character.
308 3. If only one character is left in sortbuf, check
309 whether it can be combined with the next character
311 if (sortbuf_count
> 0 && sortbuf
[0].ccc
== 0)
313 for (j
= 1; j
< sortbuf_count
; )
315 if (sortbuf
[j
].ccc
> sortbuf
[j
- 1].ccc
)
318 filter
->composer (sortbuf
[0].code
, sortbuf
[j
].code
);
323 sortbuf
[0].code
= combined
;
324 /* sortbuf[0].ccc = 0, still valid. */
325 for (k
= j
+ 1; k
< sortbuf_count
; k
++)
326 sortbuf
[k
- 1] = sortbuf
[k
];
336 for (j
= 0; j
< sortbuf_count
; j
++)
338 ucs4_t muc
= sortbuf
[j
].code
;
340 /* Output muc to the encapsulated stream. */
341 int ret
= filter
->stream_func (filter
->stream_data
, muc
);
344 /* errno is set here. */
345 filter
->sortbuf_count
= 0;
350 /* sortbuf is now empty. */
351 filter
->sortbuf_count
= 0;
356 /* Bring data buffered in the filter to its destination, the encapsulated
357 stream, then close and free the filter.
358 Return 0 if successful, or -1 with errno set upon failure. */
360 uninorm_filter_free (struct uninorm_filter
*filter
)
362 int ret
= uninorm_filter_flush (filter
);
365 /* errno is set here. */
368 if (filter
->sortbuf_count
> 0)
370 if (filter
->sortbuf
!= filter
->sortbuf_preallocated
)
371 free (filter
->sortbuf
);