1 /* Copyright (C) 1995-2014 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
26 #include <sys/param.h>
29 # define STRING_TYPE char
30 # define USTRING_TYPE unsigned char
31 # define STRXFRM __strxfrm_l
32 # define STRCMP strcmp
33 # define STRLEN strlen
34 # define STPNCPY __stpncpy
35 # define WEIGHT_H "../locale/weight.h"
40 #define CONCAT(a,b) CONCAT1(a,b)
41 #define CONCAT1(a,b) a##b
43 #include "../locale/localeinfo.h"
46 #ifndef WIDE_CHAR_VERSION
48 /* We need UTF-8 encoding of numbers. */
50 utf8_encode (char *buf
, int val
)
63 for (step
= 2; step
< 6; ++step
)
64 if ((val
& (~(uint32_t)0 << (5 * step
+ 1))) == 0)
68 *buf
= (unsigned char) (~0xff >> step
);
72 buf
[step
] = 0x80 | (val
& 0x3f);
85 STRXFRM (STRING_TYPE
*dest
, const STRING_TYPE
*src
, size_t n
, __locale_t l
)
87 struct __locale_data
*current
= l
->__locales
[LC_COLLATE
];
88 uint_fast32_t nrules
= current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_NRULES
)].word
;
89 /* We don't assign the following values right away since it might be
90 unnecessary in case there are no rules. */
91 const unsigned char *rulesets
;
93 const USTRING_TYPE
*weights
;
94 const USTRING_TYPE
*extra
;
95 const int32_t *indirect
;
99 const USTRING_TYPE
*usrc
;
100 size_t srclen
= STRLEN (src
);
102 unsigned char *rulearr
;
112 STPNCPY (dest
, src
, MIN (srclen
+ 1, n
));
117 rulesets
= (const unsigned char *)
118 current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS
)].string
;
119 table
= (const int32_t *)
120 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE
,SUFFIX
))].string
;
121 weights
= (const USTRING_TYPE
*)
122 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT
,SUFFIX
))].string
;
123 extra
= (const USTRING_TYPE
*)
124 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA
,SUFFIX
))].string
;
125 indirect
= (const int32_t *)
126 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT
,SUFFIX
))].string
;
129 assert (((uintptr_t) table
) % __alignof__ (table
[0]) == 0);
130 assert (((uintptr_t) weights
) % __alignof__ (weights
[0]) == 0);
131 assert (((uintptr_t) extra
) % __alignof__ (extra
[0]) == 0);
132 assert (((uintptr_t) indirect
) % __alignof__ (indirect
[0]) == 0);
134 /* Handle an empty string as a special case. */
142 /* We need the elements of the string as unsigned values since they
143 are used as indeces. */
144 usrc
= (const USTRING_TYPE
*) src
;
146 /* Perform the first pass over the string and while doing this find
147 and store the weights for each character. Since we want this to
148 be as fast as possible we are using `alloca' to store the temporary
149 values. But since there is no limit on the length of the string
150 we have to use `malloc' if the string is too long. We should be
151 very conservative here. */
152 if (! __libc_use_alloca ((srclen
+ 1) * (sizeof (int32_t) + 1)))
154 idxarr
= (int32_t *) malloc ((srclen
+ 1) * (sizeof (int32_t) + 1));
155 rulearr
= (unsigned char *) &idxarr
[srclen
];
158 /* No memory. Well, go with the stack then.
160 XXX Once this implementation is stable we will handle this
161 differently. Instead of precomputing the indeces we will
162 do this in time. This means, though, that this happens for
170 idxarr
= (int32_t *) alloca (srclen
* sizeof (int32_t));
171 rulearr
= (unsigned char *) alloca (srclen
+ 1);
177 int32_t tmp
= findidx (&usrc
, -1);
178 rulearr
[idxmax
] = tmp
>> 24;
179 idxarr
[idxmax
] = tmp
& 0xffffff;
183 while (*usrc
!= L('\0'));
185 /* This element is only read, the value never used but to determine
186 another value which then is ignored. */
187 rulearr
[idxmax
] = '\0';
189 /* Now the passes over the weights. We now use the indeces we found
192 for (pass
= 0; pass
< nrules
; ++pass
)
194 size_t backw_stop
= ~0ul;
195 int rule
= rulesets
[rulearr
[0] * nrules
+ pass
];
196 /* We assume that if a rule has defined `position' in one section
197 this is true for all of them. */
198 int position
= rule
& sort_position
;
200 last_needed
= needed
;
203 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
205 if ((rule
& sort_forward
) != 0)
209 if (backw_stop
!= ~0ul)
211 /* Handle the pushed elements now. */
214 for (backw
= idxcnt
; backw
> backw_stop
; )
217 len
= weights
[idxarr
[backw
]++];
219 if (needed
+ len
< n
)
221 dest
[needed
++] = weights
[idxarr
[backw
]++];
224 /* No more characters fit into the buffer. */
226 idxarr
[backw
] += len
;
233 /* Now handle the forward element. */
234 len
= weights
[idxarr
[idxcnt
]++];
235 if (needed
+ len
< n
)
237 dest
[needed
++] = weights
[idxarr
[idxcnt
]++];
240 /* No more characters fit into the buffer. */
242 idxarr
[idxcnt
] += len
;
247 /* Remember where the backwards series started. */
248 if (backw_stop
== ~0ul)
252 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
256 if (backw_stop
!= ~0ul)
258 /* Handle the pushed elements now. */
262 while (backw
> backw_stop
)
264 size_t len
= weights
[idxarr
[--backw
]++];
266 if (needed
+ len
< n
)
268 dest
[needed
++] = weights
[idxarr
[backw
]++];
271 /* No more characters fit into the buffer. */
273 idxarr
[backw
] += len
;
281 #ifndef WIDE_CHAR_VERSION
287 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
289 if ((rule
& sort_forward
) != 0)
293 if (backw_stop
!= ~0ul)
295 /* Handle the pushed elements now. */
298 for (backw
= idxcnt
; backw
> backw_stop
; )
301 len
= weights
[idxarr
[backw
]++];
304 #ifdef WIDE_CHAR_VERSION
305 if (needed
+ 1 + len
< n
)
308 for (i
= 0; i
< len
; ++i
)
309 dest
[needed
+ 1 + i
] =
310 weights
[idxarr
[backw
] + i
];
314 buflen
= utf8_encode (buf
, val
);
315 if (needed
+ buflen
+ len
< n
)
317 for (i
= 0; i
< buflen
; ++i
)
318 dest
[needed
+ i
] = buf
[i
];
319 for (i
= 0; i
< len
; ++i
)
320 dest
[needed
+ buflen
+ i
] =
321 weights
[idxarr
[backw
] + i
];
323 needed
+= buflen
+ len
;
325 idxarr
[backw
] += len
;
335 /* Now handle the forward element. */
336 len
= weights
[idxarr
[idxcnt
]++];
339 #ifdef WIDE_CHAR_VERSION
340 if (needed
+ 1+ len
< n
)
343 for (i
= 0; i
< len
; ++i
)
344 dest
[needed
+ 1 + i
] =
345 weights
[idxarr
[idxcnt
] + i
];
349 buflen
= utf8_encode (buf
, val
);
350 if (needed
+ buflen
+ len
< n
)
352 for (i
= 0; i
< buflen
; ++i
)
353 dest
[needed
+ i
] = buf
[i
];
354 for (i
= 0; i
< len
; ++i
)
355 dest
[needed
+ buflen
+ i
] =
356 weights
[idxarr
[idxcnt
] + i
];
358 needed
+= buflen
+ len
;
360 idxarr
[idxcnt
] += len
;
364 /* Note that we don't have to increment `idxarr[idxcnt]'
365 since the length is zero. */
370 /* Remember where the backwards series started. */
371 if (backw_stop
== ~0ul)
375 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
378 if (backw_stop
!= ~0ul)
380 /* Handle the pushed elements now. */
384 while (backw
> backw_stop
)
386 size_t len
= weights
[idxarr
[--backw
]++];
389 #ifdef WIDE_CHAR_VERSION
390 if (needed
+ 1 + len
< n
)
393 for (i
= 0; i
< len
; ++i
)
394 dest
[needed
+ 1 + i
] =
395 weights
[idxarr
[backw
] + i
];
399 buflen
= utf8_encode (buf
, val
);
400 if (needed
+ buflen
+ len
< n
)
402 for (i
= 0; i
< buflen
; ++i
)
403 dest
[needed
+ i
] = buf
[i
];
404 for (i
= 0; i
< len
; ++i
)
405 dest
[needed
+ buflen
+ i
] =
406 weights
[idxarr
[backw
] + i
];
408 needed
+= buflen
+ len
;
410 idxarr
[backw
] += len
;
419 /* Finally store the byte to separate the passes or terminate
422 dest
[needed
] = pass
+ 1 < nrules
? L('\1') : L('\0');
426 /* This is a little optimization: many collation specifications have
427 a `position' rule at the end and if no non-ignored character
428 is found the last \1 byte is immediately followed by a \0 byte
429 signalling this. We can avoid the \1 byte(s). */
430 if (needed
> 2 && needed
== last_needed
+ 1)
432 /* Remove the \1 byte. */
434 dest
[needed
- 1] = L('\0');
437 /* Free the memory if needed. */
441 /* Return the number of bytes/words we need, but don't count the NUL
442 byte/word at the end. */
445 libc_hidden_def (STRXFRM
)
447 #ifndef WIDE_CHAR_VERSION
448 weak_alias (__strxfrm_l
, strxfrm_l
)