1 /* Copyright (C) 1995-2021 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
26 #include <sys/param.h>
29 # define STRING_TYPE char
30 # define USTRING_TYPE unsigned char
31 # define STRXFRM __strxfrm_l
32 # define STRLEN strlen
33 # define STPNCPY __stpncpy
34 # define WEIGHT_H "../locale/weight.h"
39 #define CONCAT(a,b) CONCAT1(a,b)
40 #define CONCAT1(a,b) a##b
42 /* Maximum string size that is calculated with cached indices. Right now this
43 is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be
44 lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */
45 #define SMALL_STR_SIZE 4095
47 #include "../locale/localeinfo.h"
50 /* Group locale data for shorter parameter lists. */
54 unsigned char *rulesets
;
55 USTRING_TYPE
*weights
;
61 #ifndef WIDE_CHAR_VERSION
63 /* We need UTF-8 encoding of numbers. */
65 utf8_encode (char *buf
, int val
)
78 for (step
= 2; step
< 6; ++step
)
79 if ((val
& (~(uint32_t)0 << (5 * step
+ 1))) == 0)
83 *buf
= (unsigned char) (~0xff >> step
);
87 buf
[step
] = 0x80 | (val
& 0x3f);
98 /* Find next weight and rule index. Inlined since called for every char. */
99 static __always_inline
size_t
100 find_idx (const USTRING_TYPE
**us
, int32_t *weight_idx
,
101 unsigned char *rule_idx
, const locale_data_t
*l_data
, const int pass
)
103 int32_t tmp
= findidx (l_data
->table
, l_data
->indirect
, l_data
->extra
, us
,
105 *rule_idx
= tmp
>> 24;
106 int32_t idx
= tmp
& 0xffffff;
107 size_t len
= l_data
->weights
[idx
++];
109 /* Skip over indices of previous levels. */
110 for (int i
= 0; i
< pass
; i
++)
113 len
= l_data
->weights
[idx
++];
121 find_position (const USTRING_TYPE
*us
, const locale_data_t
*l_data
,
125 unsigned char rule_idx
;
126 const USTRING_TYPE
*usrc
= us
;
128 find_idx (&usrc
, &weight_idx
, &rule_idx
, l_data
, pass
);
129 return l_data
->rulesets
[rule_idx
* l_data
->nrules
+ pass
] & sort_position
;
132 /* Do the transformation. */
134 do_xfrm (const USTRING_TYPE
*usrc
, STRING_TYPE
*dest
, size_t n
,
135 const locale_data_t
*l_data
)
138 unsigned char rule_idx
;
143 /* Now the passes over the weights. */
144 for (pass
= 0; pass
< l_data
->nrules
; ++pass
)
146 size_t backw_len
= 0;
147 last_needed
= needed
;
148 const USTRING_TYPE
*cur
= usrc
;
149 const USTRING_TYPE
*backw_start
= NULL
;
151 /* We assume that if a rule has defined `position' in one section
152 this is true for all of them. */
153 int position
= find_position (cur
, l_data
, pass
);
157 while (*cur
!= L('\0'))
159 const USTRING_TYPE
*pos
= cur
;
160 size_t len
= find_idx (&cur
, &weight_idx
, &rule_idx
, l_data
,
162 int rule
= l_data
->rulesets
[rule_idx
* l_data
->nrules
+ pass
];
164 if ((rule
& sort_forward
) != 0)
166 /* Handle the pushed backward sequence. */
167 if (backw_start
!= NULL
)
169 for (size_t i
= backw_len
; i
> 0; )
172 unsigned char rule_idx
;
173 size_t len
= find_idx (&backw_start
, &weight_idx
,
174 &rule_idx
, l_data
, pass
);
176 for (size_t j
= len
; j
> 0; j
--)
177 dest
[needed
+ i
- j
] =
178 l_data
->weights
[weight_idx
++];
188 /* Now handle the forward element. */
189 if (needed
+ len
< n
)
191 dest
[needed
++] = l_data
->weights
[weight_idx
++];
193 /* No more characters fit into the buffer. */
198 /* Remember start of the backward sequence & track length. */
199 if (backw_start
== NULL
)
206 /* Handle the pushed backward sequence. */
207 if (backw_start
!= NULL
)
209 for (size_t i
= backw_len
; i
> 0; )
211 size_t len
= find_idx (&backw_start
, &weight_idx
, &rule_idx
,
214 for (size_t j
= len
; j
> 0; j
--)
215 dest
[needed
+ i
- j
] =
216 l_data
->weights
[weight_idx
++];
227 #ifndef WIDE_CHAR_VERSION
233 while (*cur
!= L('\0'))
235 const USTRING_TYPE
*pos
= cur
;
236 size_t len
= find_idx (&cur
, &weight_idx
, &rule_idx
, l_data
,
238 int rule
= l_data
->rulesets
[rule_idx
* l_data
->nrules
+ pass
];
240 if ((rule
& sort_forward
) != 0)
242 /* Handle the pushed backward sequence. */
243 if (backw_start
!= NULL
)
245 for (size_t p
= backw_len
; p
> 0; p
--)
249 unsigned char rule_idx
;
250 const USTRING_TYPE
*backw_cur
= backw_start
;
252 /* To prevent a warning init the used vars. */
253 len
= find_idx (&backw_cur
, &weight_idx
,
254 &rule_idx
, l_data
, pass
);
256 for (i
= 1; i
< p
; i
++)
257 len
= find_idx (&backw_cur
, &weight_idx
,
258 &rule_idx
, l_data
, pass
);
262 #ifdef WIDE_CHAR_VERSION
263 if (needed
+ 1 + len
< n
)
266 for (i
= 0; i
< len
; ++i
)
267 dest
[needed
+ 1 + i
] =
268 l_data
->weights
[weight_idx
+ i
];
272 buflen
= utf8_encode (buf
, val
);
273 if (needed
+ buflen
+ len
< n
)
275 for (i
= 0; i
< buflen
; ++i
)
276 dest
[needed
+ i
] = buf
[i
];
277 for (i
= 0; i
< len
; ++i
)
278 dest
[needed
+ buflen
+ i
] =
279 l_data
->weights
[weight_idx
+ i
];
281 needed
+= buflen
+ len
;
293 /* Now handle the forward element. */
296 #ifdef WIDE_CHAR_VERSION
297 if (needed
+ 1 + len
< n
)
300 for (i
= 0; i
< len
; ++i
)
301 dest
[needed
+ 1 + i
] =
302 l_data
->weights
[weight_idx
+ i
];
306 buflen
= utf8_encode (buf
, val
);
307 if (needed
+ buflen
+ len
< n
)
309 for (i
= 0; i
< buflen
; ++i
)
310 dest
[needed
+ i
] = buf
[i
];
311 for (i
= 0; i
< len
; ++i
)
312 dest
[needed
+ buflen
+ i
] =
313 l_data
->weights
[weight_idx
+ i
];
315 needed
+= buflen
+ len
;
324 /* Remember start of the backward sequence & track length. */
325 if (backw_start
== NULL
)
331 /* Handle the pushed backward sequence. */
332 if (backw_start
!= NULL
)
334 for (size_t p
= backw_len
; p
> 0; p
--)
338 unsigned char rule_idx
;
339 const USTRING_TYPE
*backw_cur
= backw_start
;
341 /* To prevent a warning init the used vars. */
342 len
= find_idx (&backw_cur
, &weight_idx
,
343 &rule_idx
, l_data
, pass
);
345 for (i
= 1; i
< p
; i
++)
346 len
= find_idx (&backw_cur
, &weight_idx
,
347 &rule_idx
, l_data
, pass
);
351 #ifdef WIDE_CHAR_VERSION
352 if (needed
+ 1 + len
< n
)
355 for (i
= 0; i
< len
; ++i
)
356 dest
[needed
+ 1 + i
] =
357 l_data
->weights
[weight_idx
+ i
];
361 buflen
= utf8_encode (buf
, val
);
362 if (needed
+ buflen
+ len
< n
)
364 for (i
= 0; i
< buflen
; ++i
)
365 dest
[needed
+ i
] = buf
[i
];
366 for (i
= 0; i
< len
; ++i
)
367 dest
[needed
+ buflen
+ i
] =
368 l_data
->weights
[weight_idx
+ i
];
370 needed
+= buflen
+ len
;
380 /* Finally store the byte to separate the passes or terminate
383 dest
[needed
] = pass
+ 1 < l_data
->nrules
? L('\1') : L('\0');
387 /* This is a little optimization: many collation specifications have
388 a `position' rule at the end and if no non-ignored character
389 is found the last \1 byte is immediately followed by a \0 byte
390 signalling this. We can avoid the \1 byte(s). */
391 if (needed
> 2 && needed
== last_needed
+ 1)
393 /* Remove the \1 byte. */
395 dest
[needed
- 1] = L('\0');
398 /* Return the number of bytes/words we need, but don't count the NUL
399 byte/word at the end. */
403 /* Do the transformation using weight-index and rule cache. */
405 do_xfrm_cached (STRING_TYPE
*dest
, size_t n
, const locale_data_t
*l_data
,
406 size_t idxmax
, int32_t *idxarr
, const unsigned char *rulearr
)
408 uint_fast32_t nrules
= l_data
->nrules
;
409 unsigned char *rulesets
= l_data
->rulesets
;
410 USTRING_TYPE
*weights
= l_data
->weights
;
416 /* Now the passes over the weights. */
417 for (pass
= 0; pass
< nrules
; ++pass
)
419 size_t backw_stop
= ~0ul;
420 int rule
= rulesets
[rulearr
[0] * nrules
+ pass
];
421 /* We assume that if a rule has defined `position' in one section
422 this is true for all of them. */
423 int position
= rule
& sort_position
;
425 last_needed
= needed
;
428 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
430 if ((rule
& sort_forward
) != 0)
434 if (backw_stop
!= ~0ul)
436 /* Handle the pushed elements now. */
439 for (backw
= idxcnt
; backw
> backw_stop
; )
442 len
= weights
[idxarr
[backw
]++];
444 if (needed
+ len
< n
)
446 dest
[needed
++] = weights
[idxarr
[backw
]++];
449 /* No more characters fit into the buffer. */
451 idxarr
[backw
] += len
;
458 /* Now handle the forward element. */
459 len
= weights
[idxarr
[idxcnt
]++];
460 if (needed
+ len
< n
)
462 dest
[needed
++] = weights
[idxarr
[idxcnt
]++];
465 /* No more characters fit into the buffer. */
467 idxarr
[idxcnt
] += len
;
472 /* Remember where the backwards series started. */
473 if (backw_stop
== ~0ul)
477 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
481 if (backw_stop
!= ~0ul)
483 /* Handle the pushed elements now. */
487 while (backw
> backw_stop
)
489 size_t len
= weights
[idxarr
[--backw
]++];
491 if (needed
+ len
< n
)
493 dest
[needed
++] = weights
[idxarr
[backw
]++];
496 /* No more characters fit into the buffer. */
498 idxarr
[backw
] += len
;
506 #ifndef WIDE_CHAR_VERSION
512 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
514 if ((rule
& sort_forward
) != 0)
518 if (backw_stop
!= ~0ul)
520 /* Handle the pushed elements now. */
523 for (backw
= idxcnt
; backw
> backw_stop
; )
526 len
= weights
[idxarr
[backw
]++];
529 #ifdef WIDE_CHAR_VERSION
530 if (needed
+ 1 + len
< n
)
533 for (i
= 0; i
< len
; ++i
)
534 dest
[needed
+ 1 + i
] =
535 weights
[idxarr
[backw
] + i
];
539 buflen
= utf8_encode (buf
, val
);
540 if (needed
+ buflen
+ len
< n
)
542 for (i
= 0; i
< buflen
; ++i
)
543 dest
[needed
+ i
] = buf
[i
];
544 for (i
= 0; i
< len
; ++i
)
545 dest
[needed
+ buflen
+ i
] =
546 weights
[idxarr
[backw
] + i
];
548 needed
+= buflen
+ len
;
550 idxarr
[backw
] += len
;
560 /* Now handle the forward element. */
561 len
= weights
[idxarr
[idxcnt
]++];
564 #ifdef WIDE_CHAR_VERSION
565 if (needed
+ 1+ len
< n
)
568 for (i
= 0; i
< len
; ++i
)
569 dest
[needed
+ 1 + i
] =
570 weights
[idxarr
[idxcnt
] + i
];
574 buflen
= utf8_encode (buf
, val
);
575 if (needed
+ buflen
+ len
< n
)
577 for (i
= 0; i
< buflen
; ++i
)
578 dest
[needed
+ i
] = buf
[i
];
579 for (i
= 0; i
< len
; ++i
)
580 dest
[needed
+ buflen
+ i
] =
581 weights
[idxarr
[idxcnt
] + i
];
583 needed
+= buflen
+ len
;
585 idxarr
[idxcnt
] += len
;
589 /* Note that we don't have to increment `idxarr[idxcnt]'
590 since the length is zero. */
595 /* Remember where the backwards series started. */
596 if (backw_stop
== ~0ul)
600 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
603 if (backw_stop
!= ~0ul)
605 /* Handle the pushed elements now. */
609 while (backw
> backw_stop
)
611 size_t len
= weights
[idxarr
[--backw
]++];
614 #ifdef WIDE_CHAR_VERSION
615 if (needed
+ 1 + len
< n
)
618 for (i
= 0; i
< len
; ++i
)
619 dest
[needed
+ 1 + i
] =
620 weights
[idxarr
[backw
] + i
];
624 buflen
= utf8_encode (buf
, val
);
625 if (needed
+ buflen
+ len
< n
)
627 for (i
= 0; i
< buflen
; ++i
)
628 dest
[needed
+ i
] = buf
[i
];
629 for (i
= 0; i
< len
; ++i
)
630 dest
[needed
+ buflen
+ i
] =
631 weights
[idxarr
[backw
] + i
];
633 needed
+= buflen
+ len
;
635 idxarr
[backw
] += len
;
644 /* Finally store the byte to separate the passes or terminate
647 dest
[needed
] = pass
+ 1 < nrules
? L('\1') : L('\0');
651 /* This is a little optimization: many collation specifications have
652 a `position' rule at the end and if no non-ignored character
653 is found the last \1 byte is immediately followed by a \0 byte
654 signalling this. We can avoid the \1 byte(s). */
655 if (needed
> 2 && needed
== last_needed
+ 1)
657 /* Remove the \1 byte. */
659 dest
[needed
- 1] = L('\0');
662 /* Return the number of bytes/words we need, but don't count the NUL
663 byte/word at the end. */
668 STRXFRM (STRING_TYPE
*dest
, const STRING_TYPE
*src
, size_t n
, locale_t l
)
670 locale_data_t l_data
;
671 struct __locale_data
*current
= l
->__locales
[LC_COLLATE
];
672 l_data
.nrules
= current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_NRULES
)].word
;
674 /* Handle byte comparison case. */
675 if (l_data
.nrules
== 0)
677 size_t srclen
= STRLEN (src
);
680 STPNCPY (dest
, src
, MIN (srclen
+ 1, n
));
685 /* Handle an empty string, code hereafter relies on strlen (src) > 0. */
693 /* Get the locale data. */
694 l_data
.rulesets
= (unsigned char *)
695 current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS
)].string
;
696 l_data
.table
= (int32_t *)
697 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE
,SUFFIX
))].string
;
698 l_data
.weights
= (USTRING_TYPE
*)
699 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT
,SUFFIX
))].string
;
700 l_data
.extra
= (USTRING_TYPE
*)
701 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA
,SUFFIX
))].string
;
702 l_data
.indirect
= (int32_t *)
703 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT
,SUFFIX
))].string
;
705 assert (((uintptr_t) l_data
.table
) % __alignof__ (l_data
.table
[0]) == 0);
706 assert (((uintptr_t) l_data
.weights
) % __alignof__ (l_data
.weights
[0]) == 0);
707 assert (((uintptr_t) l_data
.extra
) % __alignof__ (l_data
.extra
[0]) == 0);
708 assert (((uintptr_t) l_data
.indirect
) % __alignof__ (l_data
.indirect
[0]) == 0);
710 /* We need the elements of the string as unsigned values since they
711 are used as indices. */
712 const USTRING_TYPE
*usrc
= (const USTRING_TYPE
*) src
;
714 /* Allocate cache for small strings on the stack and fill it with weight and
715 rule indices. If the cache size is not sufficient, continue with the
716 uncached xfrm version. */
718 const USTRING_TYPE
*cur
= usrc
;
719 int32_t *idxarr
= alloca (SMALL_STR_SIZE
* sizeof (int32_t));
720 unsigned char *rulearr
= alloca (SMALL_STR_SIZE
+ 1);
724 int32_t tmp
= findidx (l_data
.table
, l_data
.indirect
, l_data
.extra
, &cur
,
726 rulearr
[idxmax
] = tmp
>> 24;
727 idxarr
[idxmax
] = tmp
& 0xffffff;
731 while (*cur
!= L('\0') && idxmax
< SMALL_STR_SIZE
);
733 /* This element is only read, the value never used but to determine
734 another value which then is ignored. */
735 rulearr
[idxmax
] = '\0';
737 /* Do the transformation. */
739 return do_xfrm_cached (dest
, n
, &l_data
, idxmax
, idxarr
, rulearr
);
741 return do_xfrm (usrc
, dest
, n
, &l_data
);
743 libc_hidden_def (STRXFRM
)
745 #ifndef WIDE_CHAR_VERSION
746 weak_alias (__strxfrm_l
, strxfrm_l
)