Fix memory handling in strxfrm_l [BZ #16009]
[glibc.git] / string / strxfrm_l.c
blob921d1f76c47d440f7e8de054f97b00a1334fc994
1 /* Copyright (C) 1995-2015 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
19 #include <assert.h>
20 #include <langinfo.h>
21 #include <locale.h>
22 #include <stddef.h>
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <sys/param.h>
28 #ifndef STRING_TYPE
29 # define STRING_TYPE char
30 # define USTRING_TYPE unsigned char
31 # define STRXFRM __strxfrm_l
32 # define STRCMP strcmp
33 # define STRLEN strlen
34 # define STPNCPY __stpncpy
35 # define WEIGHT_H "../locale/weight.h"
36 # define SUFFIX MB
37 # define L(arg) arg
38 #endif
40 #define CONCAT(a,b) CONCAT1(a,b)
41 #define CONCAT1(a,b) a##b
43 /* Maximum string size that is calculated with cached indices. Right now this
44 is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be
45 lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */
46 #define SMALL_STR_SIZE 4095
48 #include "../locale/localeinfo.h"
49 #include WEIGHT_H
51 /* Group locale data for shorter parameter lists. */
52 typedef struct
54 uint_fast32_t nrules;
55 unsigned char *rulesets;
56 USTRING_TYPE *weights;
57 int32_t *table;
58 USTRING_TYPE *extra;
59 int32_t *indirect;
60 } locale_data_t;
62 #ifndef WIDE_CHAR_VERSION
64 /* We need UTF-8 encoding of numbers. */
65 static int
66 utf8_encode (char *buf, int val)
68 int retval;
70 if (val < 0x80)
72 *buf++ = (char) val;
73 retval = 1;
75 else
77 int step;
79 for (step = 2; step < 6; ++step)
80 if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
81 break;
82 retval = step;
84 *buf = (unsigned char) (~0xff >> step);
85 --step;
88 buf[step] = 0x80 | (val & 0x3f);
89 val >>= 6;
91 while (--step > 0);
92 *buf |= val;
95 return retval;
97 #endif
99 /* Find next weight and rule index. Inlined since called for every char. */
100 static __always_inline size_t
101 find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
102 unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
104 int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
105 -1);
106 *rule_idx = tmp >> 24;
107 int32_t idx = tmp & 0xffffff;
108 size_t len = l_data->weights[idx++];
110 /* Skip over indices of previous levels. */
111 for (int i = 0; i < pass; i++)
113 idx += len;
114 len = l_data->weights[idx++];
117 *weight_idx = idx;
118 return len;
121 static int
122 find_position (const USTRING_TYPE *us, const locale_data_t *l_data,
123 const int pass)
125 int32_t weight_idx;
126 unsigned char rule_idx;
127 const USTRING_TYPE *usrc = us;
129 find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
130 return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
133 /* Do the transformation. */
134 static size_t
135 do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n,
136 const locale_data_t *l_data)
138 int32_t weight_idx;
139 unsigned char rule_idx;
140 uint_fast32_t pass;
141 size_t needed = 0;
142 size_t last_needed;
144 /* Now the passes over the weights. */
145 for (pass = 0; pass < l_data->nrules; ++pass)
147 size_t backw_len = 0;
148 last_needed = needed;
149 const USTRING_TYPE *cur = usrc;
150 const USTRING_TYPE *backw_start = NULL;
152 /* We assume that if a rule has defined `position' in one section
153 this is true for all of them. */
154 int position = find_position (cur, l_data, pass);
156 if (position == 0)
158 while (*cur != L('\0'))
160 const USTRING_TYPE *pos = cur;
161 size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
162 pass);
163 int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
165 if ((rule & sort_forward) != 0)
167 /* Handle the pushed backward sequence. */
168 if (backw_start != NULL)
170 for (size_t i = backw_len; i > 0; )
172 int32_t weight_idx;
173 unsigned char rule_idx;
174 size_t len = find_idx (&backw_start, &weight_idx,
175 &rule_idx, l_data, pass);
176 if (needed + i < n)
177 for (size_t j = len; j > 0; j--)
178 dest[needed + i - j] =
179 l_data->weights[weight_idx++];
181 i -= len;
184 needed += backw_len;
185 backw_start = NULL;
186 backw_len = 0;
189 /* Now handle the forward element. */
190 if (needed + len < n)
191 while (len-- > 0)
192 dest[needed++] = l_data->weights[weight_idx++];
193 else
194 /* No more characters fit into the buffer. */
195 needed += len;
197 else
199 /* Remember start of the backward sequence & track length. */
200 if (backw_start == NULL)
201 backw_start = pos;
202 backw_len += len;
207 /* Handle the pushed backward sequence. */
208 if (backw_start != NULL)
210 for (size_t i = backw_len; i > 0; )
212 size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
213 l_data, pass);
214 if (needed + i < n)
215 for (size_t j = len; j > 0; j--)
216 dest[needed + i - j] =
217 l_data->weights[weight_idx++];
219 i -= len;
222 needed += backw_len;
225 else
227 int val = 1;
228 #ifndef WIDE_CHAR_VERSION
229 char buf[7];
230 size_t buflen;
231 #endif
232 size_t i;
234 while (*cur != L('\0'))
236 const USTRING_TYPE *pos = cur;
237 size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
238 pass);
239 int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
241 if ((rule & sort_forward) != 0)
243 /* Handle the pushed backward sequence. */
244 if (backw_start != NULL)
246 for (size_t p = backw_len; p > 0; p--)
248 size_t len;
249 int32_t weight_idx;
250 unsigned char rule_idx;
251 const USTRING_TYPE *backw_cur = backw_start;
253 /* To prevent a warning init the used vars. */
254 len = find_idx (&backw_cur, &weight_idx,
255 &rule_idx, l_data, pass);
257 for (i = 1; i < p; i++)
258 len = find_idx (&backw_cur, &weight_idx,
259 &rule_idx, l_data, pass);
261 if (len != 0)
263 #ifdef WIDE_CHAR_VERSION
264 if (needed + 1 + len < n)
266 dest[needed] = val;
267 for (i = 0; i < len; ++i)
268 dest[needed + 1 + i] =
269 l_data->weights[weight_idx + i];
271 needed += 1 + len;
272 #else
273 buflen = utf8_encode (buf, val);
274 if (needed + buflen + len < n)
276 for (i = 0; i < buflen; ++i)
277 dest[needed + i] = buf[i];
278 for (i = 0; i < len; ++i)
279 dest[needed + buflen + i] =
280 l_data->weights[weight_idx + i];
282 needed += buflen + len;
283 #endif
284 val = 1;
286 else
287 ++val;
290 backw_start = NULL;
291 backw_len = 0;
294 /* Now handle the forward element. */
295 if (len != 0)
297 #ifdef WIDE_CHAR_VERSION
298 if (needed + 1 + len < n)
300 dest[needed] = val;
301 for (i = 0; i < len; ++i)
302 dest[needed + 1 + i] =
303 l_data->weights[weight_idx + i];
305 needed += 1 + len;
306 #else
307 buflen = utf8_encode (buf, val);
308 if (needed + buflen + len < n)
310 for (i = 0; i < buflen; ++i)
311 dest[needed + i] = buf[i];
312 for (i = 0; i < len; ++i)
313 dest[needed + buflen + i] =
314 l_data->weights[weight_idx + i];
316 needed += buflen + len;
317 #endif
318 val = 1;
320 else
321 ++val;
323 else
325 /* Remember start of the backward sequence & track length. */
326 if (backw_start == NULL)
327 backw_start = pos;
328 backw_len++;
332 /* Handle the pushed backward sequence. */
333 if (backw_start != NULL)
335 for (size_t p = backw_len; p > 0; p--)
337 size_t len;
338 int32_t weight_idx;
339 unsigned char rule_idx;
340 const USTRING_TYPE *backw_cur = backw_start;
342 /* To prevent a warning init the used vars. */
343 len = find_idx (&backw_cur, &weight_idx,
344 &rule_idx, l_data, pass);
346 for (i = 1; i < p; i++)
347 len = find_idx (&backw_cur, &weight_idx,
348 &rule_idx, l_data, pass);
350 if (len != 0)
352 #ifdef WIDE_CHAR_VERSION
353 if (needed + 1 + len < n)
355 dest[needed] = val;
356 for (i = 0; i < len; ++i)
357 dest[needed + 1 + i] =
358 l_data->weights[weight_idx + i];
360 needed += 1 + len;
361 #else
362 buflen = utf8_encode (buf, val);
363 if (needed + buflen + len < n)
365 for (i = 0; i < buflen; ++i)
366 dest[needed + i] = buf[i];
367 for (i = 0; i < len; ++i)
368 dest[needed + buflen + i] =
369 l_data->weights[weight_idx + i];
371 needed += buflen + len;
372 #endif
373 val = 1;
375 else
376 ++val;
381 /* Finally store the byte to separate the passes or terminate
382 the string. */
383 if (needed < n)
384 dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
385 ++needed;
388 /* This is a little optimization: many collation specifications have
389 a `position' rule at the end and if no non-ignored character
390 is found the last \1 byte is immediately followed by a \0 byte
391 signalling this. We can avoid the \1 byte(s). */
392 if (needed > 2 && needed == last_needed + 1)
394 /* Remove the \1 byte. */
395 if (--needed <= n)
396 dest[needed - 1] = L('\0');
399 /* Return the number of bytes/words we need, but don't count the NUL
400 byte/word at the end. */
401 return needed - 1;
404 /* Do the transformation using weight-index and rule cache. */
405 static size_t
406 do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data,
407 size_t idxmax, int32_t *idxarr, const unsigned char *rulearr)
409 uint_fast32_t nrules = l_data->nrules;
410 unsigned char *rulesets = l_data->rulesets;
411 USTRING_TYPE *weights = l_data->weights;
412 uint_fast32_t pass;
413 size_t needed = 0;
414 size_t last_needed;
415 size_t idxcnt;
417 /* Now the passes over the weights. */
418 for (pass = 0; pass < nrules; ++pass)
420 size_t backw_stop = ~0ul;
421 int rule = rulesets[rulearr[0] * nrules + pass];
422 /* We assume that if a rule has defined `position' in one section
423 this is true for all of them. */
424 int position = rule & sort_position;
426 last_needed = needed;
427 if (position == 0)
429 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
431 if ((rule & sort_forward) != 0)
433 size_t len;
435 if (backw_stop != ~0ul)
437 /* Handle the pushed elements now. */
438 size_t backw;
440 for (backw = idxcnt; backw > backw_stop; )
442 --backw;
443 len = weights[idxarr[backw]++];
445 if (needed + len < n)
446 while (len-- > 0)
447 dest[needed++] = weights[idxarr[backw]++];
448 else
450 /* No more characters fit into the buffer. */
451 needed += len;
452 idxarr[backw] += len;
456 backw_stop = ~0ul;
459 /* Now handle the forward element. */
460 len = weights[idxarr[idxcnt]++];
461 if (needed + len < n)
462 while (len-- > 0)
463 dest[needed++] = weights[idxarr[idxcnt]++];
464 else
466 /* No more characters fit into the buffer. */
467 needed += len;
468 idxarr[idxcnt] += len;
471 else
473 /* Remember where the backwards series started. */
474 if (backw_stop == ~0ul)
475 backw_stop = idxcnt;
478 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
482 if (backw_stop != ~0ul)
484 /* Handle the pushed elements now. */
485 size_t backw;
487 backw = idxcnt;
488 while (backw > backw_stop)
490 size_t len = weights[idxarr[--backw]++];
492 if (needed + len < n)
493 while (len-- > 0)
494 dest[needed++] = weights[idxarr[backw]++];
495 else
497 /* No more characters fit into the buffer. */
498 needed += len;
499 idxarr[backw] += len;
504 else
506 int val = 1;
507 #ifndef WIDE_CHAR_VERSION
508 char buf[7];
509 size_t buflen;
510 #endif
511 size_t i;
513 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
515 if ((rule & sort_forward) != 0)
517 size_t len;
519 if (backw_stop != ~0ul)
521 /* Handle the pushed elements now. */
522 size_t backw;
524 for (backw = idxcnt; backw > backw_stop; )
526 --backw;
527 len = weights[idxarr[backw]++];
528 if (len != 0)
530 #ifdef WIDE_CHAR_VERSION
531 if (needed + 1 + len < n)
533 dest[needed] = val;
534 for (i = 0; i < len; ++i)
535 dest[needed + 1 + i] =
536 weights[idxarr[backw] + i];
538 needed += 1 + len;
539 #else
540 buflen = utf8_encode (buf, val);
541 if (needed + buflen + len < n)
543 for (i = 0; i < buflen; ++i)
544 dest[needed + i] = buf[i];
545 for (i = 0; i < len; ++i)
546 dest[needed + buflen + i] =
547 weights[idxarr[backw] + i];
549 needed += buflen + len;
550 #endif
551 idxarr[backw] += len;
552 val = 1;
554 else
555 ++val;
558 backw_stop = ~0ul;
561 /* Now handle the forward element. */
562 len = weights[idxarr[idxcnt]++];
563 if (len != 0)
565 #ifdef WIDE_CHAR_VERSION
566 if (needed + 1+ len < n)
568 dest[needed] = val;
569 for (i = 0; i < len; ++i)
570 dest[needed + 1 + i] =
571 weights[idxarr[idxcnt] + i];
573 needed += 1 + len;
574 #else
575 buflen = utf8_encode (buf, val);
576 if (needed + buflen + len < n)
578 for (i = 0; i < buflen; ++i)
579 dest[needed + i] = buf[i];
580 for (i = 0; i < len; ++i)
581 dest[needed + buflen + i] =
582 weights[idxarr[idxcnt] + i];
584 needed += buflen + len;
585 #endif
586 idxarr[idxcnt] += len;
587 val = 1;
589 else
590 /* Note that we don't have to increment `idxarr[idxcnt]'
591 since the length is zero. */
592 ++val;
594 else
596 /* Remember where the backwards series started. */
597 if (backw_stop == ~0ul)
598 backw_stop = idxcnt;
601 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
604 if (backw_stop != ~0ul)
606 /* Handle the pushed elements now. */
607 size_t backw;
609 backw = idxmax - 1;
610 while (backw > backw_stop)
612 size_t len = weights[idxarr[--backw]++];
613 if (len != 0)
615 #ifdef WIDE_CHAR_VERSION
616 if (needed + 1 + len < n)
618 dest[needed] = val;
619 for (i = 0; i < len; ++i)
620 dest[needed + 1 + i] =
621 weights[idxarr[backw] + i];
623 needed += 1 + len;
624 #else
625 buflen = utf8_encode (buf, val);
626 if (needed + buflen + len < n)
628 for (i = 0; i < buflen; ++i)
629 dest[needed + i] = buf[i];
630 for (i = 0; i < len; ++i)
631 dest[needed + buflen + i] =
632 weights[idxarr[backw] + i];
634 needed += buflen + len;
635 #endif
636 idxarr[backw] += len;
637 val = 1;
639 else
640 ++val;
645 /* Finally store the byte to separate the passes or terminate
646 the string. */
647 if (needed < n)
648 dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
649 ++needed;
652 /* This is a little optimization: many collation specifications have
653 a `position' rule at the end and if no non-ignored character
654 is found the last \1 byte is immediately followed by a \0 byte
655 signalling this. We can avoid the \1 byte(s). */
656 if (needed > 2 && needed == last_needed + 1)
658 /* Remove the \1 byte. */
659 if (--needed <= n)
660 dest[needed - 1] = L('\0');
663 /* Return the number of bytes/words we need, but don't count the NUL
664 byte/word at the end. */
665 return needed - 1;
668 size_t
669 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
671 locale_data_t l_data;
672 struct __locale_data *current = l->__locales[LC_COLLATE];
673 l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
675 /* Handle byte comparison case. */
676 if (l_data.nrules == 0)
678 size_t srclen = STRLEN (src);
680 if (n != 0)
681 STPNCPY (dest, src, MIN (srclen + 1, n));
683 return srclen;
686 /* Handle an empty string, code hereafter relies on strlen (src) > 0. */
687 if (*src == L('\0'))
689 if (n != 0)
690 *dest = L('\0');
691 return 0;
694 /* Get the locale data. */
695 l_data.rulesets = (unsigned char *)
696 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
697 l_data.table = (int32_t *)
698 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
699 l_data.weights = (USTRING_TYPE *)
700 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
701 l_data.extra = (USTRING_TYPE *)
702 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
703 l_data.indirect = (int32_t *)
704 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
706 assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
707 assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
708 assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
709 assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);
711 /* We need the elements of the string as unsigned values since they
712 are used as indeces. */
713 const USTRING_TYPE *usrc = (const USTRING_TYPE *) src;
715 /* Allocate cache for small strings on the stack and fill it with weight and
716 rule indices. If the cache size is not sufficient, continue with the
717 uncached xfrm version. */
718 size_t idxmax = 0;
719 const USTRING_TYPE *cur = usrc;
720 int32_t *idxarr = alloca (SMALL_STR_SIZE * sizeof (int32_t));
721 unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1);
725 int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
726 -1);
727 rulearr[idxmax] = tmp >> 24;
728 idxarr[idxmax] = tmp & 0xffffff;
730 ++idxmax;
732 while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);
734 /* This element is only read, the value never used but to determine
735 another value which then is ignored. */
736 rulearr[idxmax] = '\0';
738 /* Do the transformation. */
739 if (*cur == L('\0'))
740 return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
741 else
742 return do_xfrm (usrc, dest, n, &l_data);
744 libc_hidden_def (STRXFRM)
746 #ifndef WIDE_CHAR_VERSION
747 weak_alias (__strxfrm_l, strxfrm_l)
748 #endif