Pashto yesstr/nostr locale are missing
[glibc.git] / string / strxfrm_l.c
blob7e04bfc14a9992c846effef04f9b8044beb832d8
1 /* Copyright (C) 1995-2017 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
19 #include <assert.h>
20 #include <langinfo.h>
21 #include <locale.h>
22 #include <stddef.h>
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <sys/param.h>
28 #ifndef STRING_TYPE
29 # define STRING_TYPE char
30 # define USTRING_TYPE unsigned char
31 # define STRXFRM __strxfrm_l
32 # define STRLEN strlen
33 # define STPNCPY __stpncpy
34 # define WEIGHT_H "../locale/weight.h"
35 # define SUFFIX MB
36 # define L(arg) arg
37 #endif
39 #define CONCAT(a,b) CONCAT1(a,b)
40 #define CONCAT1(a,b) a##b
42 /* Maximum string size that is calculated with cached indices. Right now this
43 is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be
44 lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */
45 #define SMALL_STR_SIZE 4095
47 #include "../locale/localeinfo.h"
48 #include WEIGHT_H
50 /* Group locale data for shorter parameter lists. */
51 typedef struct
53 uint_fast32_t nrules;
54 unsigned char *rulesets;
55 USTRING_TYPE *weights;
56 int32_t *table;
57 USTRING_TYPE *extra;
58 int32_t *indirect;
59 } locale_data_t;
61 #ifndef WIDE_CHAR_VERSION
63 /* We need UTF-8 encoding of numbers. */
64 static int
65 utf8_encode (char *buf, int val)
67 int retval;
69 if (val < 0x80)
71 *buf++ = (char) val;
72 retval = 1;
74 else
76 int step;
78 for (step = 2; step < 6; ++step)
79 if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
80 break;
81 retval = step;
83 *buf = (unsigned char) (~0xff >> step);
84 --step;
87 buf[step] = 0x80 | (val & 0x3f);
88 val >>= 6;
90 while (--step > 0);
91 *buf |= val;
94 return retval;
96 #endif
98 /* Find next weight and rule index. Inlined since called for every char. */
99 static __always_inline size_t
100 find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
101 unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
103 int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
104 -1);
105 *rule_idx = tmp >> 24;
106 int32_t idx = tmp & 0xffffff;
107 size_t len = l_data->weights[idx++];
109 /* Skip over indices of previous levels. */
110 for (int i = 0; i < pass; i++)
112 idx += len;
113 len = l_data->weights[idx++];
116 *weight_idx = idx;
117 return len;
120 static int
121 find_position (const USTRING_TYPE *us, const locale_data_t *l_data,
122 const int pass)
124 int32_t weight_idx;
125 unsigned char rule_idx;
126 const USTRING_TYPE *usrc = us;
128 find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
129 return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
132 /* Do the transformation. */
133 static size_t
134 do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n,
135 const locale_data_t *l_data)
137 int32_t weight_idx;
138 unsigned char rule_idx;
139 uint_fast32_t pass;
140 size_t needed = 0;
141 size_t last_needed;
143 /* Now the passes over the weights. */
144 for (pass = 0; pass < l_data->nrules; ++pass)
146 size_t backw_len = 0;
147 last_needed = needed;
148 const USTRING_TYPE *cur = usrc;
149 const USTRING_TYPE *backw_start = NULL;
151 /* We assume that if a rule has defined `position' in one section
152 this is true for all of them. */
153 int position = find_position (cur, l_data, pass);
155 if (position == 0)
157 while (*cur != L('\0'))
159 const USTRING_TYPE *pos = cur;
160 size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
161 pass);
162 int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
164 if ((rule & sort_forward) != 0)
166 /* Handle the pushed backward sequence. */
167 if (backw_start != NULL)
169 for (size_t i = backw_len; i > 0; )
171 int32_t weight_idx;
172 unsigned char rule_idx;
173 size_t len = find_idx (&backw_start, &weight_idx,
174 &rule_idx, l_data, pass);
175 if (needed + i < n)
176 for (size_t j = len; j > 0; j--)
177 dest[needed + i - j] =
178 l_data->weights[weight_idx++];
180 i -= len;
183 needed += backw_len;
184 backw_start = NULL;
185 backw_len = 0;
188 /* Now handle the forward element. */
189 if (needed + len < n)
190 while (len-- > 0)
191 dest[needed++] = l_data->weights[weight_idx++];
192 else
193 /* No more characters fit into the buffer. */
194 needed += len;
196 else
198 /* Remember start of the backward sequence & track length. */
199 if (backw_start == NULL)
200 backw_start = pos;
201 backw_len += len;
206 /* Handle the pushed backward sequence. */
207 if (backw_start != NULL)
209 for (size_t i = backw_len; i > 0; )
211 size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
212 l_data, pass);
213 if (needed + i < n)
214 for (size_t j = len; j > 0; j--)
215 dest[needed + i - j] =
216 l_data->weights[weight_idx++];
218 i -= len;
221 needed += backw_len;
224 else
226 int val = 1;
227 #ifndef WIDE_CHAR_VERSION
228 char buf[7];
229 size_t buflen;
230 #endif
231 size_t i;
233 while (*cur != L('\0'))
235 const USTRING_TYPE *pos = cur;
236 size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
237 pass);
238 int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
240 if ((rule & sort_forward) != 0)
242 /* Handle the pushed backward sequence. */
243 if (backw_start != NULL)
245 for (size_t p = backw_len; p > 0; p--)
247 size_t len;
248 int32_t weight_idx;
249 unsigned char rule_idx;
250 const USTRING_TYPE *backw_cur = backw_start;
252 /* To prevent a warning init the used vars. */
253 len = find_idx (&backw_cur, &weight_idx,
254 &rule_idx, l_data, pass);
256 for (i = 1; i < p; i++)
257 len = find_idx (&backw_cur, &weight_idx,
258 &rule_idx, l_data, pass);
260 if (len != 0)
262 #ifdef WIDE_CHAR_VERSION
263 if (needed + 1 + len < n)
265 dest[needed] = val;
266 for (i = 0; i < len; ++i)
267 dest[needed + 1 + i] =
268 l_data->weights[weight_idx + i];
270 needed += 1 + len;
271 #else
272 buflen = utf8_encode (buf, val);
273 if (needed + buflen + len < n)
275 for (i = 0; i < buflen; ++i)
276 dest[needed + i] = buf[i];
277 for (i = 0; i < len; ++i)
278 dest[needed + buflen + i] =
279 l_data->weights[weight_idx + i];
281 needed += buflen + len;
282 #endif
283 val = 1;
285 else
286 ++val;
289 backw_start = NULL;
290 backw_len = 0;
293 /* Now handle the forward element. */
294 if (len != 0)
296 #ifdef WIDE_CHAR_VERSION
297 if (needed + 1 + len < n)
299 dest[needed] = val;
300 for (i = 0; i < len; ++i)
301 dest[needed + 1 + i] =
302 l_data->weights[weight_idx + i];
304 needed += 1 + len;
305 #else
306 buflen = utf8_encode (buf, val);
307 if (needed + buflen + len < n)
309 for (i = 0; i < buflen; ++i)
310 dest[needed + i] = buf[i];
311 for (i = 0; i < len; ++i)
312 dest[needed + buflen + i] =
313 l_data->weights[weight_idx + i];
315 needed += buflen + len;
316 #endif
317 val = 1;
319 else
320 ++val;
322 else
324 /* Remember start of the backward sequence & track length. */
325 if (backw_start == NULL)
326 backw_start = pos;
327 backw_len++;
331 /* Handle the pushed backward sequence. */
332 if (backw_start != NULL)
334 for (size_t p = backw_len; p > 0; p--)
336 size_t len;
337 int32_t weight_idx;
338 unsigned char rule_idx;
339 const USTRING_TYPE *backw_cur = backw_start;
341 /* To prevent a warning init the used vars. */
342 len = find_idx (&backw_cur, &weight_idx,
343 &rule_idx, l_data, pass);
345 for (i = 1; i < p; i++)
346 len = find_idx (&backw_cur, &weight_idx,
347 &rule_idx, l_data, pass);
349 if (len != 0)
351 #ifdef WIDE_CHAR_VERSION
352 if (needed + 1 + len < n)
354 dest[needed] = val;
355 for (i = 0; i < len; ++i)
356 dest[needed + 1 + i] =
357 l_data->weights[weight_idx + i];
359 needed += 1 + len;
360 #else
361 buflen = utf8_encode (buf, val);
362 if (needed + buflen + len < n)
364 for (i = 0; i < buflen; ++i)
365 dest[needed + i] = buf[i];
366 for (i = 0; i < len; ++i)
367 dest[needed + buflen + i] =
368 l_data->weights[weight_idx + i];
370 needed += buflen + len;
371 #endif
372 val = 1;
374 else
375 ++val;
380 /* Finally store the byte to separate the passes or terminate
381 the string. */
382 if (needed < n)
383 dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
384 ++needed;
387 /* This is a little optimization: many collation specifications have
388 a `position' rule at the end and if no non-ignored character
389 is found the last \1 byte is immediately followed by a \0 byte
390 signalling this. We can avoid the \1 byte(s). */
391 if (needed > 2 && needed == last_needed + 1)
393 /* Remove the \1 byte. */
394 if (--needed <= n)
395 dest[needed - 1] = L('\0');
398 /* Return the number of bytes/words we need, but don't count the NUL
399 byte/word at the end. */
400 return needed - 1;
403 /* Do the transformation using weight-index and rule cache. */
404 static size_t
405 do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data,
406 size_t idxmax, int32_t *idxarr, const unsigned char *rulearr)
408 uint_fast32_t nrules = l_data->nrules;
409 unsigned char *rulesets = l_data->rulesets;
410 USTRING_TYPE *weights = l_data->weights;
411 uint_fast32_t pass;
412 size_t needed = 0;
413 size_t last_needed;
414 size_t idxcnt;
416 /* Now the passes over the weights. */
417 for (pass = 0; pass < nrules; ++pass)
419 size_t backw_stop = ~0ul;
420 int rule = rulesets[rulearr[0] * nrules + pass];
421 /* We assume that if a rule has defined `position' in one section
422 this is true for all of them. */
423 int position = rule & sort_position;
425 last_needed = needed;
426 if (position == 0)
428 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
430 if ((rule & sort_forward) != 0)
432 size_t len;
434 if (backw_stop != ~0ul)
436 /* Handle the pushed elements now. */
437 size_t backw;
439 for (backw = idxcnt; backw > backw_stop; )
441 --backw;
442 len = weights[idxarr[backw]++];
444 if (needed + len < n)
445 while (len-- > 0)
446 dest[needed++] = weights[idxarr[backw]++];
447 else
449 /* No more characters fit into the buffer. */
450 needed += len;
451 idxarr[backw] += len;
455 backw_stop = ~0ul;
458 /* Now handle the forward element. */
459 len = weights[idxarr[idxcnt]++];
460 if (needed + len < n)
461 while (len-- > 0)
462 dest[needed++] = weights[idxarr[idxcnt]++];
463 else
465 /* No more characters fit into the buffer. */
466 needed += len;
467 idxarr[idxcnt] += len;
470 else
472 /* Remember where the backwards series started. */
473 if (backw_stop == ~0ul)
474 backw_stop = idxcnt;
477 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
481 if (backw_stop != ~0ul)
483 /* Handle the pushed elements now. */
484 size_t backw;
486 backw = idxcnt;
487 while (backw > backw_stop)
489 size_t len = weights[idxarr[--backw]++];
491 if (needed + len < n)
492 while (len-- > 0)
493 dest[needed++] = weights[idxarr[backw]++];
494 else
496 /* No more characters fit into the buffer. */
497 needed += len;
498 idxarr[backw] += len;
503 else
505 int val = 1;
506 #ifndef WIDE_CHAR_VERSION
507 char buf[7];
508 size_t buflen;
509 #endif
510 size_t i;
512 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
514 if ((rule & sort_forward) != 0)
516 size_t len;
518 if (backw_stop != ~0ul)
520 /* Handle the pushed elements now. */
521 size_t backw;
523 for (backw = idxcnt; backw > backw_stop; )
525 --backw;
526 len = weights[idxarr[backw]++];
527 if (len != 0)
529 #ifdef WIDE_CHAR_VERSION
530 if (needed + 1 + len < n)
532 dest[needed] = val;
533 for (i = 0; i < len; ++i)
534 dest[needed + 1 + i] =
535 weights[idxarr[backw] + i];
537 needed += 1 + len;
538 #else
539 buflen = utf8_encode (buf, val);
540 if (needed + buflen + len < n)
542 for (i = 0; i < buflen; ++i)
543 dest[needed + i] = buf[i];
544 for (i = 0; i < len; ++i)
545 dest[needed + buflen + i] =
546 weights[idxarr[backw] + i];
548 needed += buflen + len;
549 #endif
550 idxarr[backw] += len;
551 val = 1;
553 else
554 ++val;
557 backw_stop = ~0ul;
560 /* Now handle the forward element. */
561 len = weights[idxarr[idxcnt]++];
562 if (len != 0)
564 #ifdef WIDE_CHAR_VERSION
565 if (needed + 1+ len < n)
567 dest[needed] = val;
568 for (i = 0; i < len; ++i)
569 dest[needed + 1 + i] =
570 weights[idxarr[idxcnt] + i];
572 needed += 1 + len;
573 #else
574 buflen = utf8_encode (buf, val);
575 if (needed + buflen + len < n)
577 for (i = 0; i < buflen; ++i)
578 dest[needed + i] = buf[i];
579 for (i = 0; i < len; ++i)
580 dest[needed + buflen + i] =
581 weights[idxarr[idxcnt] + i];
583 needed += buflen + len;
584 #endif
585 idxarr[idxcnt] += len;
586 val = 1;
588 else
589 /* Note that we don't have to increment `idxarr[idxcnt]'
590 since the length is zero. */
591 ++val;
593 else
595 /* Remember where the backwards series started. */
596 if (backw_stop == ~0ul)
597 backw_stop = idxcnt;
600 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
603 if (backw_stop != ~0ul)
605 /* Handle the pushed elements now. */
606 size_t backw;
608 backw = idxmax - 1;
609 while (backw > backw_stop)
611 size_t len = weights[idxarr[--backw]++];
612 if (len != 0)
614 #ifdef WIDE_CHAR_VERSION
615 if (needed + 1 + len < n)
617 dest[needed] = val;
618 for (i = 0; i < len; ++i)
619 dest[needed + 1 + i] =
620 weights[idxarr[backw] + i];
622 needed += 1 + len;
623 #else
624 buflen = utf8_encode (buf, val);
625 if (needed + buflen + len < n)
627 for (i = 0; i < buflen; ++i)
628 dest[needed + i] = buf[i];
629 for (i = 0; i < len; ++i)
630 dest[needed + buflen + i] =
631 weights[idxarr[backw] + i];
633 needed += buflen + len;
634 #endif
635 idxarr[backw] += len;
636 val = 1;
638 else
639 ++val;
644 /* Finally store the byte to separate the passes or terminate
645 the string. */
646 if (needed < n)
647 dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
648 ++needed;
651 /* This is a little optimization: many collation specifications have
652 a `position' rule at the end and if no non-ignored character
653 is found the last \1 byte is immediately followed by a \0 byte
654 signalling this. We can avoid the \1 byte(s). */
655 if (needed > 2 && needed == last_needed + 1)
657 /* Remove the \1 byte. */
658 if (--needed <= n)
659 dest[needed - 1] = L('\0');
662 /* Return the number of bytes/words we need, but don't count the NUL
663 byte/word at the end. */
664 return needed - 1;
667 size_t
668 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, locale_t l)
670 locale_data_t l_data;
671 struct __locale_data *current = l->__locales[LC_COLLATE];
672 l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
674 /* Handle byte comparison case. */
675 if (l_data.nrules == 0)
677 size_t srclen = STRLEN (src);
679 if (n != 0)
680 STPNCPY (dest, src, MIN (srclen + 1, n));
682 return srclen;
685 /* Handle an empty string, code hereafter relies on strlen (src) > 0. */
686 if (*src == L('\0'))
688 if (n != 0)
689 *dest = L('\0');
690 return 0;
693 /* Get the locale data. */
694 l_data.rulesets = (unsigned char *)
695 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
696 l_data.table = (int32_t *)
697 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
698 l_data.weights = (USTRING_TYPE *)
699 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
700 l_data.extra = (USTRING_TYPE *)
701 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
702 l_data.indirect = (int32_t *)
703 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
705 assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
706 assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
707 assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
708 assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);
710 /* We need the elements of the string as unsigned values since they
711 are used as indeces. */
712 const USTRING_TYPE *usrc = (const USTRING_TYPE *) src;
714 /* Allocate cache for small strings on the stack and fill it with weight and
715 rule indices. If the cache size is not sufficient, continue with the
716 uncached xfrm version. */
717 size_t idxmax = 0;
718 const USTRING_TYPE *cur = usrc;
719 int32_t *idxarr = alloca (SMALL_STR_SIZE * sizeof (int32_t));
720 unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1);
724 int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
725 -1);
726 rulearr[idxmax] = tmp >> 24;
727 idxarr[idxmax] = tmp & 0xffffff;
729 ++idxmax;
731 while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);
733 /* This element is only read, the value never used but to determine
734 another value which then is ignored. */
735 rulearr[idxmax] = '\0';
737 /* Do the transformation. */
738 if (*cur == L('\0'))
739 return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
740 else
741 return do_xfrm (usrc, dest, n, &l_data);
743 libc_hidden_def (STRXFRM)
745 #ifndef WIDE_CHAR_VERSION
746 weak_alias (__strxfrm_l, strxfrm_l)
747 #endif