1 /* Copyright (C) 1995-2015 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
26 #include <sys/param.h>
29 # define STRING_TYPE char
30 # define USTRING_TYPE unsigned char
31 # define STRXFRM __strxfrm_l
32 # define STRCMP strcmp
33 # define STRLEN strlen
34 # define STPNCPY __stpncpy
35 # define WEIGHT_H "../locale/weight.h"
40 #define CONCAT(a,b) CONCAT1(a,b)
41 #define CONCAT1(a,b) a##b
43 /* Maximum string size that is calculated with cached indices. Right now this
44 is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be
45 lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */
46 #define SMALL_STR_SIZE 4095
48 #include "../locale/localeinfo.h"
51 /* Group locale data for shorter parameter lists. */
55 unsigned char *rulesets
;
56 USTRING_TYPE
*weights
;
62 #ifndef WIDE_CHAR_VERSION
64 /* We need UTF-8 encoding of numbers. */
66 utf8_encode (char *buf
, int val
)
79 for (step
= 2; step
< 6; ++step
)
80 if ((val
& (~(uint32_t)0 << (5 * step
+ 1))) == 0)
84 *buf
= (unsigned char) (~0xff >> step
);
88 buf
[step
] = 0x80 | (val
& 0x3f);
99 /* Find next weight and rule index. Inlined since called for every char. */
100 static __always_inline
size_t
101 find_idx (const USTRING_TYPE
**us
, int32_t *weight_idx
,
102 unsigned char *rule_idx
, const locale_data_t
*l_data
, const int pass
)
104 int32_t tmp
= findidx (l_data
->table
, l_data
->indirect
, l_data
->extra
, us
,
106 *rule_idx
= tmp
>> 24;
107 int32_t idx
= tmp
& 0xffffff;
108 size_t len
= l_data
->weights
[idx
++];
110 /* Skip over indices of previous levels. */
111 for (int i
= 0; i
< pass
; i
++)
114 len
= l_data
->weights
[idx
++];
122 find_position (const USTRING_TYPE
*us
, const locale_data_t
*l_data
,
126 unsigned char rule_idx
;
127 const USTRING_TYPE
*usrc
= us
;
129 find_idx (&usrc
, &weight_idx
, &rule_idx
, l_data
, pass
);
130 return l_data
->rulesets
[rule_idx
* l_data
->nrules
+ pass
] & sort_position
;
133 /* Do the transformation. */
135 do_xfrm (const USTRING_TYPE
*usrc
, STRING_TYPE
*dest
, size_t n
,
136 const locale_data_t
*l_data
)
139 unsigned char rule_idx
;
144 /* Now the passes over the weights. */
145 for (pass
= 0; pass
< l_data
->nrules
; ++pass
)
147 size_t backw_len
= 0;
148 last_needed
= needed
;
149 const USTRING_TYPE
*cur
= usrc
;
150 const USTRING_TYPE
*backw_start
= NULL
;
152 /* We assume that if a rule has defined `position' in one section
153 this is true for all of them. */
154 int position
= find_position (cur
, l_data
, pass
);
158 while (*cur
!= L('\0'))
160 const USTRING_TYPE
*pos
= cur
;
161 size_t len
= find_idx (&cur
, &weight_idx
, &rule_idx
, l_data
,
163 int rule
= l_data
->rulesets
[rule_idx
* l_data
->nrules
+ pass
];
165 if ((rule
& sort_forward
) != 0)
167 /* Handle the pushed backward sequence. */
168 if (backw_start
!= NULL
)
170 for (size_t i
= backw_len
; i
> 0; )
173 unsigned char rule_idx
;
174 size_t len
= find_idx (&backw_start
, &weight_idx
,
175 &rule_idx
, l_data
, pass
);
177 for (size_t j
= len
; j
> 0; j
--)
178 dest
[needed
+ i
- j
] =
179 l_data
->weights
[weight_idx
++];
189 /* Now handle the forward element. */
190 if (needed
+ len
< n
)
192 dest
[needed
++] = l_data
->weights
[weight_idx
++];
194 /* No more characters fit into the buffer. */
199 /* Remember start of the backward sequence & track length. */
200 if (backw_start
== NULL
)
207 /* Handle the pushed backward sequence. */
208 if (backw_start
!= NULL
)
210 for (size_t i
= backw_len
; i
> 0; )
212 size_t len
= find_idx (&backw_start
, &weight_idx
, &rule_idx
,
215 for (size_t j
= len
; j
> 0; j
--)
216 dest
[needed
+ i
- j
] =
217 l_data
->weights
[weight_idx
++];
228 #ifndef WIDE_CHAR_VERSION
234 while (*cur
!= L('\0'))
236 const USTRING_TYPE
*pos
= cur
;
237 size_t len
= find_idx (&cur
, &weight_idx
, &rule_idx
, l_data
,
239 int rule
= l_data
->rulesets
[rule_idx
* l_data
->nrules
+ pass
];
241 if ((rule
& sort_forward
) != 0)
243 /* Handle the pushed backward sequence. */
244 if (backw_start
!= NULL
)
246 for (size_t p
= backw_len
; p
> 0; p
--)
250 unsigned char rule_idx
;
251 const USTRING_TYPE
*backw_cur
= backw_start
;
253 /* To prevent a warning init the used vars. */
254 len
= find_idx (&backw_cur
, &weight_idx
,
255 &rule_idx
, l_data
, pass
);
257 for (i
= 1; i
< p
; i
++)
258 len
= find_idx (&backw_cur
, &weight_idx
,
259 &rule_idx
, l_data
, pass
);
263 #ifdef WIDE_CHAR_VERSION
264 if (needed
+ 1 + len
< n
)
267 for (i
= 0; i
< len
; ++i
)
268 dest
[needed
+ 1 + i
] =
269 l_data
->weights
[weight_idx
+ i
];
273 buflen
= utf8_encode (buf
, val
);
274 if (needed
+ buflen
+ len
< n
)
276 for (i
= 0; i
< buflen
; ++i
)
277 dest
[needed
+ i
] = buf
[i
];
278 for (i
= 0; i
< len
; ++i
)
279 dest
[needed
+ buflen
+ i
] =
280 l_data
->weights
[weight_idx
+ i
];
282 needed
+= buflen
+ len
;
294 /* Now handle the forward element. */
297 #ifdef WIDE_CHAR_VERSION
298 if (needed
+ 1 + len
< n
)
301 for (i
= 0; i
< len
; ++i
)
302 dest
[needed
+ 1 + i
] =
303 l_data
->weights
[weight_idx
+ i
];
307 buflen
= utf8_encode (buf
, val
);
308 if (needed
+ buflen
+ len
< n
)
310 for (i
= 0; i
< buflen
; ++i
)
311 dest
[needed
+ i
] = buf
[i
];
312 for (i
= 0; i
< len
; ++i
)
313 dest
[needed
+ buflen
+ i
] =
314 l_data
->weights
[weight_idx
+ i
];
316 needed
+= buflen
+ len
;
325 /* Remember start of the backward sequence & track length. */
326 if (backw_start
== NULL
)
332 /* Handle the pushed backward sequence. */
333 if (backw_start
!= NULL
)
335 for (size_t p
= backw_len
; p
> 0; p
--)
339 unsigned char rule_idx
;
340 const USTRING_TYPE
*backw_cur
= backw_start
;
342 /* To prevent a warning init the used vars. */
343 len
= find_idx (&backw_cur
, &weight_idx
,
344 &rule_idx
, l_data
, pass
);
346 for (i
= 1; i
< p
; i
++)
347 len
= find_idx (&backw_cur
, &weight_idx
,
348 &rule_idx
, l_data
, pass
);
352 #ifdef WIDE_CHAR_VERSION
353 if (needed
+ 1 + len
< n
)
356 for (i
= 0; i
< len
; ++i
)
357 dest
[needed
+ 1 + i
] =
358 l_data
->weights
[weight_idx
+ i
];
362 buflen
= utf8_encode (buf
, val
);
363 if (needed
+ buflen
+ len
< n
)
365 for (i
= 0; i
< buflen
; ++i
)
366 dest
[needed
+ i
] = buf
[i
];
367 for (i
= 0; i
< len
; ++i
)
368 dest
[needed
+ buflen
+ i
] =
369 l_data
->weights
[weight_idx
+ i
];
371 needed
+= buflen
+ len
;
381 /* Finally store the byte to separate the passes or terminate
384 dest
[needed
] = pass
+ 1 < l_data
->nrules
? L('\1') : L('\0');
388 /* This is a little optimization: many collation specifications have
389 a `position' rule at the end and if no non-ignored character
390 is found the last \1 byte is immediately followed by a \0 byte
391 signalling this. We can avoid the \1 byte(s). */
392 if (needed
> 2 && needed
== last_needed
+ 1)
394 /* Remove the \1 byte. */
396 dest
[needed
- 1] = L('\0');
399 /* Return the number of bytes/words we need, but don't count the NUL
400 byte/word at the end. */
404 /* Do the transformation using weight-index and rule cache. */
406 do_xfrm_cached (STRING_TYPE
*dest
, size_t n
, const locale_data_t
*l_data
,
407 size_t idxmax
, int32_t *idxarr
, const unsigned char *rulearr
)
409 uint_fast32_t nrules
= l_data
->nrules
;
410 unsigned char *rulesets
= l_data
->rulesets
;
411 USTRING_TYPE
*weights
= l_data
->weights
;
417 /* Now the passes over the weights. */
418 for (pass
= 0; pass
< nrules
; ++pass
)
420 size_t backw_stop
= ~0ul;
421 int rule
= rulesets
[rulearr
[0] * nrules
+ pass
];
422 /* We assume that if a rule has defined `position' in one section
423 this is true for all of them. */
424 int position
= rule
& sort_position
;
426 last_needed
= needed
;
429 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
431 if ((rule
& sort_forward
) != 0)
435 if (backw_stop
!= ~0ul)
437 /* Handle the pushed elements now. */
440 for (backw
= idxcnt
; backw
> backw_stop
; )
443 len
= weights
[idxarr
[backw
]++];
445 if (needed
+ len
< n
)
447 dest
[needed
++] = weights
[idxarr
[backw
]++];
450 /* No more characters fit into the buffer. */
452 idxarr
[backw
] += len
;
459 /* Now handle the forward element. */
460 len
= weights
[idxarr
[idxcnt
]++];
461 if (needed
+ len
< n
)
463 dest
[needed
++] = weights
[idxarr
[idxcnt
]++];
466 /* No more characters fit into the buffer. */
468 idxarr
[idxcnt
] += len
;
473 /* Remember where the backwards series started. */
474 if (backw_stop
== ~0ul)
478 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
482 if (backw_stop
!= ~0ul)
484 /* Handle the pushed elements now. */
488 while (backw
> backw_stop
)
490 size_t len
= weights
[idxarr
[--backw
]++];
492 if (needed
+ len
< n
)
494 dest
[needed
++] = weights
[idxarr
[backw
]++];
497 /* No more characters fit into the buffer. */
499 idxarr
[backw
] += len
;
507 #ifndef WIDE_CHAR_VERSION
513 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
515 if ((rule
& sort_forward
) != 0)
519 if (backw_stop
!= ~0ul)
521 /* Handle the pushed elements now. */
524 for (backw
= idxcnt
; backw
> backw_stop
; )
527 len
= weights
[idxarr
[backw
]++];
530 #ifdef WIDE_CHAR_VERSION
531 if (needed
+ 1 + len
< n
)
534 for (i
= 0; i
< len
; ++i
)
535 dest
[needed
+ 1 + i
] =
536 weights
[idxarr
[backw
] + i
];
540 buflen
= utf8_encode (buf
, val
);
541 if (needed
+ buflen
+ len
< n
)
543 for (i
= 0; i
< buflen
; ++i
)
544 dest
[needed
+ i
] = buf
[i
];
545 for (i
= 0; i
< len
; ++i
)
546 dest
[needed
+ buflen
+ i
] =
547 weights
[idxarr
[backw
] + i
];
549 needed
+= buflen
+ len
;
551 idxarr
[backw
] += len
;
561 /* Now handle the forward element. */
562 len
= weights
[idxarr
[idxcnt
]++];
565 #ifdef WIDE_CHAR_VERSION
566 if (needed
+ 1+ len
< n
)
569 for (i
= 0; i
< len
; ++i
)
570 dest
[needed
+ 1 + i
] =
571 weights
[idxarr
[idxcnt
] + i
];
575 buflen
= utf8_encode (buf
, val
);
576 if (needed
+ buflen
+ len
< n
)
578 for (i
= 0; i
< buflen
; ++i
)
579 dest
[needed
+ i
] = buf
[i
];
580 for (i
= 0; i
< len
; ++i
)
581 dest
[needed
+ buflen
+ i
] =
582 weights
[idxarr
[idxcnt
] + i
];
584 needed
+= buflen
+ len
;
586 idxarr
[idxcnt
] += len
;
590 /* Note that we don't have to increment `idxarr[idxcnt]'
591 since the length is zero. */
596 /* Remember where the backwards series started. */
597 if (backw_stop
== ~0ul)
601 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
604 if (backw_stop
!= ~0ul)
606 /* Handle the pushed elements now. */
610 while (backw
> backw_stop
)
612 size_t len
= weights
[idxarr
[--backw
]++];
615 #ifdef WIDE_CHAR_VERSION
616 if (needed
+ 1 + len
< n
)
619 for (i
= 0; i
< len
; ++i
)
620 dest
[needed
+ 1 + i
] =
621 weights
[idxarr
[backw
] + i
];
625 buflen
= utf8_encode (buf
, val
);
626 if (needed
+ buflen
+ len
< n
)
628 for (i
= 0; i
< buflen
; ++i
)
629 dest
[needed
+ i
] = buf
[i
];
630 for (i
= 0; i
< len
; ++i
)
631 dest
[needed
+ buflen
+ i
] =
632 weights
[idxarr
[backw
] + i
];
634 needed
+= buflen
+ len
;
636 idxarr
[backw
] += len
;
645 /* Finally store the byte to separate the passes or terminate
648 dest
[needed
] = pass
+ 1 < nrules
? L('\1') : L('\0');
652 /* This is a little optimization: many collation specifications have
653 a `position' rule at the end and if no non-ignored character
654 is found the last \1 byte is immediately followed by a \0 byte
655 signalling this. We can avoid the \1 byte(s). */
656 if (needed
> 2 && needed
== last_needed
+ 1)
658 /* Remove the \1 byte. */
660 dest
[needed
- 1] = L('\0');
663 /* Return the number of bytes/words we need, but don't count the NUL
664 byte/word at the end. */
669 STRXFRM (STRING_TYPE
*dest
, const STRING_TYPE
*src
, size_t n
, __locale_t l
)
671 locale_data_t l_data
;
672 struct __locale_data
*current
= l
->__locales
[LC_COLLATE
];
673 l_data
.nrules
= current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_NRULES
)].word
;
675 /* Handle byte comparison case. */
676 if (l_data
.nrules
== 0)
678 size_t srclen
= STRLEN (src
);
681 STPNCPY (dest
, src
, MIN (srclen
+ 1, n
));
686 /* Handle an empty string, code hereafter relies on strlen (src) > 0. */
694 /* Get the locale data. */
695 l_data
.rulesets
= (unsigned char *)
696 current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS
)].string
;
697 l_data
.table
= (int32_t *)
698 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE
,SUFFIX
))].string
;
699 l_data
.weights
= (USTRING_TYPE
*)
700 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT
,SUFFIX
))].string
;
701 l_data
.extra
= (USTRING_TYPE
*)
702 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA
,SUFFIX
))].string
;
703 l_data
.indirect
= (int32_t *)
704 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT
,SUFFIX
))].string
;
706 assert (((uintptr_t) l_data
.table
) % __alignof__ (l_data
.table
[0]) == 0);
707 assert (((uintptr_t) l_data
.weights
) % __alignof__ (l_data
.weights
[0]) == 0);
708 assert (((uintptr_t) l_data
.extra
) % __alignof__ (l_data
.extra
[0]) == 0);
709 assert (((uintptr_t) l_data
.indirect
) % __alignof__ (l_data
.indirect
[0]) == 0);
711 /* We need the elements of the string as unsigned values since they
712 are used as indeces. */
713 const USTRING_TYPE
*usrc
= (const USTRING_TYPE
*) src
;
715 /* Allocate cache for small strings on the stack and fill it with weight and
716 rule indices. If the cache size is not sufficient, continue with the
717 uncached xfrm version. */
719 const USTRING_TYPE
*cur
= usrc
;
720 int32_t *idxarr
= alloca (SMALL_STR_SIZE
* sizeof (int32_t));
721 unsigned char *rulearr
= alloca (SMALL_STR_SIZE
+ 1);
725 int32_t tmp
= findidx (l_data
.table
, l_data
.indirect
, l_data
.extra
, &cur
,
727 rulearr
[idxmax
] = tmp
>> 24;
728 idxarr
[idxmax
] = tmp
& 0xffffff;
732 while (*cur
!= L('\0') && idxmax
< SMALL_STR_SIZE
);
734 /* This element is only read, the value never used but to determine
735 another value which then is ignored. */
736 rulearr
[idxmax
] = '\0';
738 /* Do the transformation. */
740 return do_xfrm_cached (dest
, n
, &l_data
, idxmax
, idxarr
, rulearr
);
742 return do_xfrm (usrc
, dest
, n
, &l_data
);
744 libc_hidden_def (STRXFRM
)
746 #ifndef WIDE_CHAR_VERSION
747 weak_alias (__strxfrm_l
, strxfrm_l
)