Update copyright dates with scripts/update-copyrights.
[glibc.git] / string / strxfrm_l.c
blob98494470b6e397a4b4f4e665b63704e94eea1459
1 /* Copyright (C) 1995-2015 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
19 #include <assert.h>
20 #include <langinfo.h>
21 #include <locale.h>
22 #include <stddef.h>
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <sys/param.h>
28 #ifndef STRING_TYPE
29 # define STRING_TYPE char
30 # define USTRING_TYPE unsigned char
31 # define STRXFRM __strxfrm_l
32 # define STRCMP strcmp
33 # define STRLEN strlen
34 # define STPNCPY __stpncpy
35 # define WEIGHT_H "../locale/weight.h"
36 # define SUFFIX MB
37 # define L(arg) arg
38 #endif
40 #define CONCAT(a,b) CONCAT1(a,b)
41 #define CONCAT1(a,b) a##b
43 #include "../locale/localeinfo.h"
44 #include WEIGHT_H
47 #ifndef WIDE_CHAR_VERSION
49 /* We need UTF-8 encoding of numbers. */
50 static int
51 utf8_encode (char *buf, int val)
53 int retval;
55 if (val < 0x80)
57 *buf++ = (char) val;
58 retval = 1;
60 else
62 int step;
64 for (step = 2; step < 6; ++step)
65 if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
66 break;
67 retval = step;
69 *buf = (unsigned char) (~0xff >> step);
70 --step;
73 buf[step] = 0x80 | (val & 0x3f);
74 val >>= 6;
76 while (--step > 0);
77 *buf |= val;
80 return retval;
82 #endif
85 size_t
86 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
88 struct __locale_data *current = l->__locales[LC_COLLATE];
89 uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
90 /* We don't assign the following values right away since it might be
91 unnecessary in case there are no rules. */
92 const unsigned char *rulesets;
93 const int32_t *table;
94 const USTRING_TYPE *weights;
95 const USTRING_TYPE *extra;
96 const int32_t *indirect;
97 uint_fast32_t pass;
98 size_t needed;
99 size_t last_needed;
100 const USTRING_TYPE *usrc;
101 size_t srclen = STRLEN (src);
102 int32_t *idxarr;
103 unsigned char *rulearr;
104 size_t idxmax;
105 size_t idxcnt;
106 int use_malloc;
108 if (nrules == 0)
110 if (n != 0)
111 STPNCPY (dest, src, MIN (srclen + 1, n));
113 return srclen;
116 rulesets = (const unsigned char *)
117 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
118 table = (const int32_t *)
119 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
120 weights = (const USTRING_TYPE *)
121 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
122 extra = (const USTRING_TYPE *)
123 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
124 indirect = (const int32_t *)
125 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
126 use_malloc = 0;
128 assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
129 assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
130 assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
131 assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
133 /* Handle an empty string as a special case. */
134 if (srclen == 0)
136 if (n != 0)
137 *dest = L('\0');
138 return 0;
141 /* We need the elements of the string as unsigned values since they
142 are used as indeces. */
143 usrc = (const USTRING_TYPE *) src;
145 /* Perform the first pass over the string and while doing this find
146 and store the weights for each character. Since we want this to
147 be as fast as possible we are using `alloca' to store the temporary
148 values. But since there is no limit on the length of the string
149 we have to use `malloc' if the string is too long. We should be
150 very conservative here. */
151 if (! __libc_use_alloca ((srclen + 1) * (sizeof (int32_t) + 1)))
153 idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
154 rulearr = (unsigned char *) &idxarr[srclen];
156 if (idxarr == NULL)
157 /* No memory. Well, go with the stack then.
159 XXX Once this implementation is stable we will handle this
160 differently. Instead of precomputing the indeces we will
161 do this in time. This means, though, that this happens for
162 every pass again. */
163 goto try_stack;
164 use_malloc = 1;
166 else
168 try_stack:
169 idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
170 rulearr = (unsigned char *) alloca (srclen + 1);
173 idxmax = 0;
176 int32_t tmp = findidx (table, indirect, extra, &usrc, -1);
177 rulearr[idxmax] = tmp >> 24;
178 idxarr[idxmax] = tmp & 0xffffff;
180 ++idxmax;
182 while (*usrc != L('\0'));
184 /* This element is only read, the value never used but to determine
185 another value which then is ignored. */
186 rulearr[idxmax] = '\0';
188 /* Now the passes over the weights. We now use the indeces we found
189 before. */
190 needed = 0;
191 for (pass = 0; pass < nrules; ++pass)
193 size_t backw_stop = ~0ul;
194 int rule = rulesets[rulearr[0] * nrules + pass];
195 /* We assume that if a rule has defined `position' in one section
196 this is true for all of them. */
197 int position = rule & sort_position;
199 last_needed = needed;
200 if (position == 0)
202 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
204 if ((rule & sort_forward) != 0)
206 size_t len;
208 if (backw_stop != ~0ul)
210 /* Handle the pushed elements now. */
211 size_t backw;
213 for (backw = idxcnt; backw > backw_stop; )
215 --backw;
216 len = weights[idxarr[backw]++];
218 if (needed + len < n)
219 while (len-- > 0)
220 dest[needed++] = weights[idxarr[backw]++];
221 else
223 /* No more characters fit into the buffer. */
224 needed += len;
225 idxarr[backw] += len;
229 backw_stop = ~0ul;
232 /* Now handle the forward element. */
233 len = weights[idxarr[idxcnt]++];
234 if (needed + len < n)
235 while (len-- > 0)
236 dest[needed++] = weights[idxarr[idxcnt]++];
237 else
239 /* No more characters fit into the buffer. */
240 needed += len;
241 idxarr[idxcnt] += len;
244 else
246 /* Remember where the backwards series started. */
247 if (backw_stop == ~0ul)
248 backw_stop = idxcnt;
251 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
255 if (backw_stop != ~0ul)
257 /* Handle the pushed elements now. */
258 size_t backw;
260 backw = idxcnt;
261 while (backw > backw_stop)
263 size_t len = weights[idxarr[--backw]++];
265 if (needed + len < n)
266 while (len-- > 0)
267 dest[needed++] = weights[idxarr[backw]++];
268 else
270 /* No more characters fit into the buffer. */
271 needed += len;
272 idxarr[backw] += len;
277 else
279 int val = 1;
280 #ifndef WIDE_CHAR_VERSION
281 char buf[7];
282 size_t buflen;
283 #endif
284 size_t i;
286 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
288 if ((rule & sort_forward) != 0)
290 size_t len;
292 if (backw_stop != ~0ul)
294 /* Handle the pushed elements now. */
295 size_t backw;
297 for (backw = idxcnt; backw > backw_stop; )
299 --backw;
300 len = weights[idxarr[backw]++];
301 if (len != 0)
303 #ifdef WIDE_CHAR_VERSION
304 if (needed + 1 + len < n)
306 dest[needed] = val;
307 for (i = 0; i < len; ++i)
308 dest[needed + 1 + i] =
309 weights[idxarr[backw] + i];
311 needed += 1 + len;
312 #else
313 buflen = utf8_encode (buf, val);
314 if (needed + buflen + len < n)
316 for (i = 0; i < buflen; ++i)
317 dest[needed + i] = buf[i];
318 for (i = 0; i < len; ++i)
319 dest[needed + buflen + i] =
320 weights[idxarr[backw] + i];
322 needed += buflen + len;
323 #endif
324 idxarr[backw] += len;
325 val = 1;
327 else
328 ++val;
331 backw_stop = ~0ul;
334 /* Now handle the forward element. */
335 len = weights[idxarr[idxcnt]++];
336 if (len != 0)
338 #ifdef WIDE_CHAR_VERSION
339 if (needed + 1+ len < n)
341 dest[needed] = val;
342 for (i = 0; i < len; ++i)
343 dest[needed + 1 + i] =
344 weights[idxarr[idxcnt] + i];
346 needed += 1 + len;
347 #else
348 buflen = utf8_encode (buf, val);
349 if (needed + buflen + len < n)
351 for (i = 0; i < buflen; ++i)
352 dest[needed + i] = buf[i];
353 for (i = 0; i < len; ++i)
354 dest[needed + buflen + i] =
355 weights[idxarr[idxcnt] + i];
357 needed += buflen + len;
358 #endif
359 idxarr[idxcnt] += len;
360 val = 1;
362 else
363 /* Note that we don't have to increment `idxarr[idxcnt]'
364 since the length is zero. */
365 ++val;
367 else
369 /* Remember where the backwards series started. */
370 if (backw_stop == ~0ul)
371 backw_stop = idxcnt;
374 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
377 if (backw_stop != ~0ul)
379 /* Handle the pushed elements now. */
380 size_t backw;
382 backw = idxmax - 1;
383 while (backw > backw_stop)
385 size_t len = weights[idxarr[--backw]++];
386 if (len != 0)
388 #ifdef WIDE_CHAR_VERSION
389 if (needed + 1 + len < n)
391 dest[needed] = val;
392 for (i = 0; i < len; ++i)
393 dest[needed + 1 + i] =
394 weights[idxarr[backw] + i];
396 needed += 1 + len;
397 #else
398 buflen = utf8_encode (buf, val);
399 if (needed + buflen + len < n)
401 for (i = 0; i < buflen; ++i)
402 dest[needed + i] = buf[i];
403 for (i = 0; i < len; ++i)
404 dest[needed + buflen + i] =
405 weights[idxarr[backw] + i];
407 needed += buflen + len;
408 #endif
409 idxarr[backw] += len;
410 val = 1;
412 else
413 ++val;
418 /* Finally store the byte to separate the passes or terminate
419 the string. */
420 if (needed < n)
421 dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
422 ++needed;
425 /* This is a little optimization: many collation specifications have
426 a `position' rule at the end and if no non-ignored character
427 is found the last \1 byte is immediately followed by a \0 byte
428 signalling this. We can avoid the \1 byte(s). */
429 if (needed > 2 && needed == last_needed + 1)
431 /* Remove the \1 byte. */
432 if (--needed <= n)
433 dest[needed - 1] = L('\0');
436 /* Free the memory if needed. */
437 if (use_malloc)
438 free (idxarr);
440 /* Return the number of bytes/words we need, but don't count the NUL
441 byte/word at the end. */
442 return needed - 1;
444 libc_hidden_def (STRXFRM)
446 #ifndef WIDE_CHAR_VERSION
447 weak_alias (__strxfrm_l, strxfrm_l)
448 #endif