2003-05-16 Roland McGrath <roland@redhat.com>
[glibc.git] / string / strxfrm.c
blob9db57829d9522d956c363a14cdf72971925d5b00
1 /* Copyright (C) 1995-1999,2000,2001,2002,2003 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
20 #include <assert.h>
21 #include <langinfo.h>
22 #include <locale.h>
23 #include <stddef.h>
24 #include <stdint.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <sys/param.h>
29 #ifndef STRING_TYPE
30 # define STRING_TYPE char
31 # define USTRING_TYPE unsigned char
32 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
33 # define STRXFRM __strxfrm_l
34 # else
35 # define STRXFRM strxfrm
36 # endif
37 # define STRCMP strcmp
38 # define STRLEN strlen
39 # define STPNCPY __stpncpy
40 # define WEIGHT_H "../locale/weight.h"
41 # define SUFFIX MB
42 # define L(arg) arg
43 #endif
45 #define CONCAT(a,b) CONCAT1(a,b)
46 #define CONCAT1(a,b) a##b
48 #include "../locale/localeinfo.h"
51 #ifndef WIDE_CHAR_VERSION
53 /* We need UTF-8 encoding of numbers. */
54 static inline int
55 utf8_encode (char *buf, int val)
57 int retval;
59 if (val < 0x80)
61 *buf++ = (char) val;
62 retval = 1;
64 else
66 int step;
68 for (step = 2; step < 6; ++step)
69 if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
70 break;
71 retval = step;
73 *buf = (unsigned char) (~0xff >> step);
74 --step;
77 buf[step] = 0x80 | (val & 0x3f);
78 val >>= 6;
80 while (--step > 0);
81 *buf |= val;
84 return retval;
86 #endif
89 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
90 size_t
91 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
92 #else
93 size_t
94 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
95 #endif
97 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
98 struct locale_data *current = l->__locales[LC_COLLATE];
99 uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
100 #else
101 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
102 #endif
103 /* We don't assign the following values right away since it might be
104 unnecessary in case there are no rules. */
105 const unsigned char *rulesets;
106 const int32_t *table;
107 const USTRING_TYPE *weights;
108 const USTRING_TYPE *extra;
109 const int32_t *indirect;
110 uint_fast32_t pass;
111 size_t needed;
112 const USTRING_TYPE *usrc;
113 size_t srclen = STRLEN (src);
114 int32_t *idxarr;
115 unsigned char *rulearr;
116 size_t idxmax;
117 size_t idxcnt;
118 int use_malloc;
120 #include WEIGHT_H
122 if (nrules == 0)
124 if (n != 0)
125 STPNCPY (dest, src, MIN (srclen + 1, n));
127 return srclen;
130 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
131 rulesets = (const unsigned char *)
132 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
133 table = (const int32_t *)
134 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
135 weights = (const USTRING_TYPE *)
136 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
137 extra = (const USTRING_TYPE *)
138 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
139 indirect = (const int32_t *)
140 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
141 #else
142 rulesets = (const unsigned char *)
143 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
144 table = (const int32_t *)
145 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_TABLE,SUFFIX));
146 weights = (const USTRING_TYPE *)
147 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_WEIGHT,SUFFIX));
148 extra = (const USTRING_TYPE *)
149 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_EXTRA,SUFFIX));
150 indirect = (const int32_t *)
151 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_INDIRECT,SUFFIX));
152 #endif
153 use_malloc = 0;
155 assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
156 assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
157 assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
158 assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
160 /* Handle an empty string as a special case. */
161 if (srclen == 0)
163 if (n != 0)
164 *dest = L('\0');
165 return 0;
168 /* We need the elements of the string as unsigned values since they
169 are used as indeces. */
170 usrc = (const USTRING_TYPE *) src;
172 /* Perform the first pass over the string and while doing this find
173 and store the weights for each character. Since we want this to
174 be as fast as possible we are using `alloca' to store the temporary
175 values. But since there is no limit on the length of the string
176 we have to use `malloc' if the string is too long. We should be
177 very conservative here. */
178 if (! __libc_use_alloca (srclen))
180 idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
181 rulearr = (unsigned char *) &idxarr[srclen];
183 if (idxarr == NULL)
184 /* No memory. Well, go with the stack then.
186 XXX Once this implementation is stable we will handle this
187 differently. Instead of precomputing the indeces we will
188 do this in time. This means, though, that this happens for
189 every pass again. */
190 goto try_stack;
191 use_malloc = 1;
193 else
195 try_stack:
196 idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
197 rulearr = (unsigned char *) alloca (srclen + 1);
200 idxmax = 0;
203 int32_t tmp = findidx (&usrc);
204 rulearr[idxmax] = tmp >> 24;
205 idxarr[idxmax] = tmp & 0xffffff;
207 ++idxmax;
209 while (*usrc != L('\0'));
211 /* This element is only read, the value never used but to determine
212 another value which then is ignored. */
213 rulearr[idxmax] = '\0';
215 /* Now the passes over the weights. We now use the indeces we found
216 before. */
217 needed = 0;
218 for (pass = 0; pass < nrules; ++pass)
220 size_t backw_stop = ~0ul;
221 int rule = rulesets[rulearr[0] * nrules + pass];
222 /* We assume that if a rule has defined `position' in one section
223 this is true for all of them. */
224 int position = rule & sort_position;
226 if (position == 0)
228 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
230 if ((rule & sort_forward) != 0)
232 size_t len;
234 if (backw_stop != ~0ul)
236 /* Handle the pushed elements now. */
237 size_t backw;
239 for (backw = idxcnt - 1; backw >= backw_stop; --backw)
241 len = weights[idxarr[backw]++];
243 if (needed + len < n)
244 while (len-- > 0)
245 dest[needed++] = weights[idxarr[backw]++];
246 else
248 /* No more characters fit into the buffer. */
249 needed += len;
250 idxarr[backw] += len;
254 backw_stop = ~0ul;
257 /* Now handle the forward element. */
258 len = weights[idxarr[idxcnt]++];
259 if (needed + len < n)
260 while (len-- > 0)
261 dest[needed++] = weights[idxarr[idxcnt]++];
262 else
264 /* No more characters fit into the buffer. */
265 needed += len;
266 idxarr[idxcnt] += len;
269 else
271 /* Remember where the backwards series started. */
272 if (backw_stop == ~0ul)
273 backw_stop = idxcnt;
276 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
280 if (backw_stop != ~0ul)
282 /* Handle the pushed elements now. */
283 size_t backw;
285 backw = idxcnt;
286 while (backw > backw_stop)
288 size_t len = weights[idxarr[--backw]++];
290 if (needed + len < n)
291 while (len-- > 0)
292 dest[needed++] = weights[idxarr[backw]++];
293 else
295 /* No more characters fit into the buffer. */
296 needed += len;
297 idxarr[backw] += len;
302 else
304 int val = 1;
305 #ifndef WIDE_CHAR_VERSION
306 char buf[7];
307 size_t buflen;
308 #endif
309 size_t i;
311 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
313 if ((rule & sort_forward) != 0)
315 size_t len;
317 if (backw_stop != ~0ul)
319 /* Handle the pushed elements now. */
320 size_t backw;
322 for (backw = idxcnt - 1; backw >= backw_stop; --backw)
324 len = weights[idxarr[backw]++];
325 if (len != 0)
327 #ifdef WIDE_CHAR_VERSION
328 if (needed + 1 + len < n)
330 dest[needed] = val;
331 for (i = 0; i < len; ++i)
332 dest[needed + 1 + i] =
333 weights[idxarr[backw] + i];
335 needed += 1 + len;
336 #else
337 buflen = utf8_encode (buf, val);
338 if (needed + buflen + len < n)
340 for (i = 0; i < buflen; ++i)
341 dest[needed + i] = buf[i];
342 for (i = 0; i < len; ++i)
343 dest[needed + buflen + i] =
344 weights[idxarr[backw] + i];
346 needed += buflen + len;
347 #endif
348 idxarr[backw] += len;
349 val = 1;
351 else
352 ++val;
355 backw_stop = ~0ul;
358 /* Now handle the forward element. */
359 len = weights[idxarr[idxcnt]++];
360 if (len != 0)
362 #ifdef WIDE_CHAR_VERSION
363 if (needed + 1+ len < n)
365 dest[needed] = val;
366 for (i = 0; i < len; ++i)
367 dest[needed + 1 + i] =
368 weights[idxarr[idxcnt] + i];
370 needed += 1 + len;
371 #else
372 buflen = utf8_encode (buf, val);
373 if (needed + buflen + len < n)
375 for (i = 0; i < buflen; ++i)
376 dest[needed + i] = buf[i];
377 for (i = 0; i < len; ++i)
378 dest[needed + buflen + i] =
379 weights[idxarr[idxcnt] + i];
381 needed += buflen + len;
382 #endif
383 idxarr[idxcnt] += len;
384 val = 1;
386 else
387 /* Note that we don't have to increment `idxarr[idxcnt]'
388 since the length is zero. */
389 ++val;
391 else
393 /* Remember where the backwards series started. */
394 if (backw_stop == ~0ul)
395 backw_stop = idxcnt;
398 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
401 if (backw_stop != ~0ul)
403 /* Handle the pushed elements now. */
404 size_t backw;
406 backw = idxmax - 1;
407 while (backw > backw_stop)
409 size_t len = weights[idxarr[--backw]++];
410 if (len != 0)
412 #ifdef WIDE_CHAR_VERSION
413 if (needed + 1 + len < n)
415 dest[needed] = val;
416 for (i = 0; i < len; ++i)
417 dest[needed + 1 + i] =
418 weights[idxarr[backw] + i];
420 needed += 1 + len;
421 #else
422 buflen = utf8_encode (buf, val);
423 if (needed + buflen + len < n)
425 for (i = 0; i < buflen; ++i)
426 dest[needed + i] = buf[i];
427 for (i = 0; i < len; ++i)
428 dest[needed + buflen + i] =
429 weights[idxarr[backw] + i];
431 needed += buflen + len;
432 #endif
433 idxarr[backw] += len;
434 val = 1;
436 else
437 ++val;
442 /* Finally store the byte to separate the passes or terminate
443 the string. */
444 if (needed < n)
445 dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
446 ++needed;
449 /* This is a little optimization: many collation specifications have
450 a `position' rule at the end and if no non-ignored character
451 is found the last \1 byte is immediately followed by a \0 byte
452 signalling this. We can avoid the \1 byte(s). */
453 if (needed <= n && needed > 2 && dest[needed - 2] == L('\1'))
455 /* Remove the \1 byte. */
456 --needed;
457 dest[needed - 1] = L('\0');
460 /* Free the memory if needed. */
461 if (use_malloc)
462 free (idxarr);
464 /* Return the number of bytes/words we need, but don't count the NUL
465 byte/word at the end. */
466 return needed - 1;