Update.
[glibc.git] / string / strxfrm_l.c
blob44b605168af503ff2ad44440f17bbb3702c4c71e
1 /* Copyright (C) 1995,96,97,2002, 2004 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
20 #include <assert.h>
21 #include <langinfo.h>
22 #include <locale.h>
23 #include <stddef.h>
24 #include <stdint.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <sys/param.h>
29 #ifndef STRING_TYPE
30 # define STRING_TYPE char
31 # define USTRING_TYPE unsigned char
32 # define STRXFRM __strxfrm_l
33 # define STRCMP strcmp
34 # define STRLEN strlen
35 # define STPNCPY __stpncpy
36 # define WEIGHT_H "../locale/weight.h"
37 # define SUFFIX MB
38 # define L(arg) arg
39 #endif
41 #define CONCAT(a,b) CONCAT1(a,b)
42 #define CONCAT1(a,b) a##b
44 #include "../locale/localeinfo.h"
47 #ifndef WIDE_CHAR_VERSION
49 /* We need UTF-8 encoding of numbers. */
50 static int
51 utf8_encode (char *buf, int val)
53 int retval;
55 if (val < 0x80)
57 *buf++ = (char) val;
58 retval = 1;
60 else
62 int step;
64 for (step = 2; step < 6; ++step)
65 if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
66 break;
67 retval = step;
69 *buf = (unsigned char) (~0xff >> step);
70 --step;
73 buf[step] = 0x80 | (val & 0x3f);
74 val >>= 6;
76 while (--step > 0);
77 *buf |= val;
80 return retval;
82 #endif
85 size_t
86 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
88 struct locale_data *current = l->__locales[LC_COLLATE];
89 uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
90 /* We don't assign the following values right away since it might be
91 unnecessary in case there are no rules. */
92 const unsigned char *rulesets;
93 const int32_t *table;
94 const USTRING_TYPE *weights;
95 const USTRING_TYPE *extra;
96 const int32_t *indirect;
97 uint_fast32_t pass;
98 size_t needed;
99 const USTRING_TYPE *usrc;
100 size_t srclen = STRLEN (src);
101 int32_t *idxarr;
102 unsigned char *rulearr;
103 size_t idxmax;
104 size_t idxcnt;
105 int use_malloc;
107 #include WEIGHT_H
109 if (nrules == 0)
111 if (n != 0)
112 STPNCPY (dest, src, MIN (srclen + 1, n));
114 return srclen;
117 rulesets = (const unsigned char *)
118 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
119 table = (const int32_t *)
120 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
121 weights = (const USTRING_TYPE *)
122 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
123 extra = (const USTRING_TYPE *)
124 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
125 indirect = (const int32_t *)
126 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
127 use_malloc = 0;
129 assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
130 assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
131 assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
132 assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
134 /* Handle an empty string as a special case. */
135 if (srclen == 0)
137 if (n != 0)
138 *dest = L('\0');
139 return 0;
142 /* We need the elements of the string as unsigned values since they
143 are used as indeces. */
144 usrc = (const USTRING_TYPE *) src;
146 /* Perform the first pass over the string and while doing this find
147 and store the weights for each character. Since we want this to
148 be as fast as possible we are using `alloca' to store the temporary
149 values. But since there is no limit on the length of the string
150 we have to use `malloc' if the string is too long. We should be
151 very conservative here. */
152 if (! __libc_use_alloca (srclen))
154 idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
155 rulearr = (unsigned char *) &idxarr[srclen];
157 if (idxarr == NULL)
158 /* No memory. Well, go with the stack then.
160 XXX Once this implementation is stable we will handle this
161 differently. Instead of precomputing the indeces we will
162 do this in time. This means, though, that this happens for
163 every pass again. */
164 goto try_stack;
165 use_malloc = 1;
167 else
169 try_stack:
170 idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
171 rulearr = (unsigned char *) alloca (srclen + 1);
174 idxmax = 0;
177 int32_t tmp = findidx (&usrc);
178 rulearr[idxmax] = tmp >> 24;
179 idxarr[idxmax] = tmp & 0xffffff;
181 ++idxmax;
183 while (*usrc != L('\0'));
185 /* This element is only read, the value never used but to determine
186 another value which then is ignored. */
187 rulearr[idxmax] = '\0';
189 /* Now the passes over the weights. We now use the indeces we found
190 before. */
191 needed = 0;
192 for (pass = 0; pass < nrules; ++pass)
194 size_t backw_stop = ~0ul;
195 int rule = rulesets[rulearr[0] * nrules + pass];
196 /* We assume that if a rule has defined `position' in one section
197 this is true for all of them. */
198 int position = rule & sort_position;
200 if (position == 0)
202 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
204 if ((rule & sort_forward) != 0)
206 size_t len;
208 if (backw_stop != ~0ul)
210 /* Handle the pushed elements now. */
211 size_t backw;
213 for (backw = idxcnt - 1; backw >= backw_stop; --backw)
215 len = weights[idxarr[backw]++];
217 if (needed + len < n)
218 while (len-- > 0)
219 dest[needed++] = weights[idxarr[backw]++];
220 else
222 /* No more characters fit into the buffer. */
223 needed += len;
224 idxarr[backw] += len;
228 backw_stop = ~0ul;
231 /* Now handle the forward element. */
232 len = weights[idxarr[idxcnt]++];
233 if (needed + len < n)
234 while (len-- > 0)
235 dest[needed++] = weights[idxarr[idxcnt]++];
236 else
238 /* No more characters fit into the buffer. */
239 needed += len;
240 idxarr[idxcnt] += len;
243 else
245 /* Remember where the backwards series started. */
246 if (backw_stop == ~0ul)
247 backw_stop = idxcnt;
250 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
254 if (backw_stop != ~0ul)
256 /* Handle the pushed elements now. */
257 size_t backw;
259 backw = idxcnt;
260 while (backw > backw_stop)
262 size_t len = weights[idxarr[--backw]++];
264 if (needed + len < n)
265 while (len-- > 0)
266 dest[needed++] = weights[idxarr[backw]++];
267 else
269 /* No more characters fit into the buffer. */
270 needed += len;
271 idxarr[backw] += len;
276 else
278 int val = 1;
279 #ifndef WIDE_CHAR_VERSION
280 char buf[7];
281 size_t buflen;
282 #endif
283 size_t i;
285 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
287 if ((rule & sort_forward) != 0)
289 size_t len;
291 if (backw_stop != ~0ul)
293 /* Handle the pushed elements now. */
294 size_t backw;
296 for (backw = idxcnt - 1; backw >= backw_stop; --backw)
298 len = weights[idxarr[backw]++];
299 if (len != 0)
301 #ifdef WIDE_CHAR_VERSION
302 if (needed + 1 + len < n)
304 dest[needed] = val;
305 for (i = 0; i < len; ++i)
306 dest[needed + 1 + i] =
307 weights[idxarr[backw] + i];
309 needed += 1 + len;
310 #else
311 buflen = utf8_encode (buf, val);
312 if (needed + buflen + len < n)
314 for (i = 0; i < buflen; ++i)
315 dest[needed + i] = buf[i];
316 for (i = 0; i < len; ++i)
317 dest[needed + buflen + i] =
318 weights[idxarr[backw] + i];
320 needed += buflen + len;
321 #endif
322 idxarr[backw] += len;
323 val = 1;
325 else
326 ++val;
329 backw_stop = ~0ul;
332 /* Now handle the forward element. */
333 len = weights[idxarr[idxcnt]++];
334 if (len != 0)
336 #ifdef WIDE_CHAR_VERSION
337 if (needed + 1+ len < n)
339 dest[needed] = val;
340 for (i = 0; i < len; ++i)
341 dest[needed + 1 + i] =
342 weights[idxarr[idxcnt] + i];
344 needed += 1 + len;
345 #else
346 buflen = utf8_encode (buf, val);
347 if (needed + buflen + len < n)
349 for (i = 0; i < buflen; ++i)
350 dest[needed + i] = buf[i];
351 for (i = 0; i < len; ++i)
352 dest[needed + buflen + i] =
353 weights[idxarr[idxcnt] + i];
355 needed += buflen + len;
356 #endif
357 idxarr[idxcnt] += len;
358 val = 1;
360 else
361 /* Note that we don't have to increment `idxarr[idxcnt]'
362 since the length is zero. */
363 ++val;
365 else
367 /* Remember where the backwards series started. */
368 if (backw_stop == ~0ul)
369 backw_stop = idxcnt;
372 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
375 if (backw_stop != ~0ul)
377 /* Handle the pushed elements now. */
378 size_t backw;
380 backw = idxmax - 1;
381 while (backw > backw_stop)
383 size_t len = weights[idxarr[--backw]++];
384 if (len != 0)
386 #ifdef WIDE_CHAR_VERSION
387 if (needed + 1 + len < n)
389 dest[needed] = val;
390 for (i = 0; i < len; ++i)
391 dest[needed + 1 + i] =
392 weights[idxarr[backw] + i];
394 needed += 1 + len;
395 #else
396 buflen = utf8_encode (buf, val);
397 if (needed + buflen + len < n)
399 for (i = 0; i < buflen; ++i)
400 dest[needed + i] = buf[i];
401 for (i = 0; i < len; ++i)
402 dest[needed + buflen + i] =
403 weights[idxarr[backw] + i];
405 needed += buflen + len;
406 #endif
407 idxarr[backw] += len;
408 val = 1;
410 else
411 ++val;
416 /* Finally store the byte to separate the passes or terminate
417 the string. */
418 if (needed < n)
419 dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
420 ++needed;
423 /* This is a little optimization: many collation specifications have
424 a `position' rule at the end and if no non-ignored character
425 is found the last \1 byte is immediately followed by a \0 byte
426 signalling this. We can avoid the \1 byte(s). */
427 if (needed <= n && needed > 2 && dest[needed - 2] == L('\1'))
429 /* Remove the \1 byte. */
430 --needed;
431 dest[needed - 1] = L('\0');
434 /* Free the memory if needed. */
435 if (use_malloc)
436 free (idxarr);
438 /* Return the number of bytes/words we need, but don't count the NUL
439 byte/word at the end. */
440 return needed - 1;
442 libc_hidden_def (STRXFRM)
444 #ifndef WIDE_CHAR_VERSION
445 weak_alias (__strxfrm_l, strxfrm_l)
446 #endif