Update.
[glibc.git] / string / strxfrm.c
blob7ddc5f3f813eb5cecea304bfc7d78ef450b56c0c
1 /* Copyright (C) 1995-1999, 2000 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #include <langinfo.h>
21 #include <stddef.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <string.h>
26 #ifndef STRING_TYPE
27 # define STRING_TYPE char
28 # define USTRING_TYPE unsigned char
29 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
30 # define STRXFRM __strxfrm_l
31 # else
32 # define STRXFRM strxfrm
33 # endif
34 # define STRCMP strcmp
35 # define STRLEN strlen
36 # define STPNCPY __stpncpy
37 # define WEIGHT_H "../locale/weight.h"
38 # define SUFFIX MB
39 # define L(arg) arg
40 #endif
42 #define CONCAT(a,b) CONCAT1(a,b)
43 #define CONCAT1(a,b) a##b
45 #include "../locale/localeinfo.h"
48 #ifndef WIDE_CHAR_VERSION
49 /* These are definitions used by some of the functions for handling
50 UTF-8 encoding below. */
51 static const uint32_t encoding_mask[] =
53 ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
56 static const unsigned char encoding_byte[] =
58 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
62 /* We need UTF-8 encoding of numbers. */
63 static inline int
64 utf8_encode (char *buf, int val)
66 char *startp = buf;
67 int retval;
69 if (val < 0x80)
71 *buf++ = (char) val;
72 retval = 1;
74 else
76 int step;
78 for (step = 2; step < 6; ++step)
79 if ((val & encoding_mask[step - 2]) == 0)
80 break;
81 retval = step;
83 *buf = encoding_byte[step - 2];
84 --step;
87 buf[step] = 0x80 | (val & 0x3f);
88 val >>= 6;
90 while (--step > 0);
91 *buf |= val;
94 return buf - startp;
96 #endif
99 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
100 size_t
101 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
102 #else
103 size_t
104 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
105 #endif
107 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
108 struct locale_data *current = l->__locales[LC_COLLATE];
109 uint_fast32_t nrules = *((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
110 #else
111 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
112 #endif
113 /* We don't assign the following values right away since it might be
114 unnecessary in case there are no rules. */
115 const unsigned char *rulesets;
116 const int32_t *table;
117 const USTRING_TYPE *weights;
118 const USTRING_TYPE *extra;
119 const int32_t *indirect;
120 uint_fast32_t pass;
121 size_t needed;
122 const USTRING_TYPE *usrc;
123 size_t srclen = STRLEN (src);
124 int32_t *idxarr;
125 unsigned char *rulearr;
126 size_t idxmax;
127 size_t idxcnt;
128 int use_malloc;
129 #ifdef WIDE_CHAR_VERSION
130 size_t size;
131 size_t layers;
132 const wint_t *names;
133 #endif
135 #include WEIGHT_H
137 if (nrules == 0)
139 if (n != 0)
140 STPNCPY (dest, src, n);
142 return srclen;
145 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
146 rulesets = (const unsigned char *)
147 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
148 table = (const int32_t *)
149 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
150 weights = (const USTRING_TYPE *)
151 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
152 extra = (const USTRING_TYPE *)
153 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
154 indirect = (const int32_t *)
155 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
156 # ifdef WIDE_CHAR_VERSION
157 names = (const wint_t *)
158 current->values[_NL_ITEM_INDEX (_NL_COLLATE_NAMES)].string;
159 size = current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].word;
160 layers = current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].word;
161 # endif
162 #else
163 rulesets = (const unsigned char *)
164 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
165 table = (const int32_t *)
166 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_TABLE,SUFFIX));
167 weights = (const USTRING_TYPE *)
168 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_WEIGHT,SUFFIX));
169 extra = (const USTRING_TYPE *)
170 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_EXTRA,SUFFIX));
171 indirect = (const int32_t *)
172 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_INDIRECT,SUFFIX));
173 # ifdef WIDE_CHAR_VERSION
174 names = (const wint_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_NAMES);
175 size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_SIZE);
176 layers = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_LAYERS);
177 # endif
178 #endif
179 use_malloc = 0;
181 /* Handle an empty string as a special case. */
182 if (srclen == 0)
184 if (n != 0)
185 *dest = L('\0');
186 return 1;
189 /* We need the elements of the string as unsigned values since they
190 are used as indeces. */
191 usrc = (const USTRING_TYPE *) src;
193 /* Perform the first pass over the string and while doing this find
194 and store the weights for each character. Since we want this to
195 be as fast as possible we are using `alloca' to store the temporary
196 values. But since there is no limit on the length of the string
197 we have to use `malloc' if the string is too long. We should be
198 very conservative here. */
199 if (srclen >= 16384)
201 idxarr = (int32_t *) malloc (srclen * (sizeof (int32_t) + 1));
202 rulearr = (unsigned char *) &idxarr[srclen];
204 if (idxarr == NULL)
205 /* No memory. Well, go with the stack then.
207 XXX Once this implementation is stable we will handle this
208 differently. Instead of precomputing the indeces we will
209 do this in time. This means, though, that this happens for
210 every pass again. */
211 goto try_stack;
212 use_malloc = 1;
214 else
216 try_stack:
217 idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
218 rulearr = (unsigned char *) alloca (srclen);
221 idxmax = 0;
224 int32_t tmp = findidx (&usrc);
225 rulearr[idxmax] = tmp >> 24;
226 idxarr[idxmax] = tmp & 0xffffff;
228 ++idxmax;
230 while (*usrc != L('\0'));
232 /* Now the passes over the weights. We now use the indeces we found
233 before. */
234 needed = 0;
235 for (pass = 0; pass < nrules; ++pass)
237 size_t backw_stop = ~0ul;
238 int rule = rulesets[rulearr[0] * nrules + pass];
239 /* We assume that if a rule has defined `position' in one section
240 this is true for all of them. */
241 int position = rule & sort_position;
243 if (position == 0)
245 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
247 if ((rule & sort_forward) != 0)
249 size_t len;
251 if (backw_stop != ~0ul)
253 /* Handle the pushed elements now. */
254 size_t backw;
256 for (backw = idxcnt - 1; backw >= backw_stop; --backw)
258 len = weights[idxarr[backw]++];
260 if (needed + len < n)
261 while (len-- > 0)
262 dest[needed++] = weights[idxarr[backw]++];
263 else
265 /* No more characters fit into the buffer. */
266 needed += len;
267 idxarr[backw] += len;
271 backw_stop = ~0ul;
274 /* Now handle the forward element. */
275 len = weights[idxarr[idxcnt]++];
276 if (needed + len < n)
277 while (len-- > 0)
278 dest[needed++] = weights[idxarr[idxcnt]++];
279 else
281 /* No more characters fit into the buffer. */
282 needed += len;
283 idxarr[idxcnt] += len;
286 else
288 /* Remember where the backwards series started. */
289 if (backw_stop == ~0ul)
290 backw_stop = idxcnt;
293 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
297 if (backw_stop != ~0ul)
299 /* Handle the pushed elements now. */
300 size_t backw;
302 backw = idxcnt;
303 while (backw > backw_stop)
305 size_t len = weights[idxarr[--backw]++];
307 if (needed + len < n)
308 while (len-- > 0)
309 dest[needed++] = weights[idxarr[backw]++];
310 else
312 /* No more characters fit into the buffer. */
313 needed += len;
314 idxarr[backw] += len;
319 else
321 int val = 1;
322 #ifndef WIDE_CHAR_VERSION
323 char buf[7];
324 size_t buflen;
325 #endif
326 size_t i;
328 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
330 if ((rule & sort_forward) != 0)
332 size_t len;
334 if (backw_stop != ~0ul)
336 /* Handle the pushed elements now. */
337 size_t backw;
339 for (backw = idxcnt - 1; backw >= backw_stop; --backw)
341 len = weights[idxarr[backw]++];
342 if (len != 0)
344 #ifdef WIDE_CHAR_VERSION
345 if (needed + 1 + len < n)
347 dest[needed] = val;
348 for (i = 0; i < len; ++i)
349 dest[needed + 1 + i] =
350 weights[idxarr[backw] + i];
352 needed += 1 + len;
353 #else
354 buflen = utf8_encode (buf, val);
355 if (needed + buflen + len < n)
357 for (i = 0; i < buflen; ++i)
358 dest[needed + i] = buf[i];
359 for (i = 0; i < len; ++i)
360 dest[needed + buflen + i] =
361 weights[idxarr[backw] + i];
363 needed += buflen + len;
364 #endif
365 idxarr[backw] += len;
366 val = 1;
368 else
369 ++val;
372 backw_stop = ~0ul;
375 /* Now handle the forward element. */
376 len = weights[idxarr[idxcnt]++];
377 if (len != 0)
379 #ifdef WIDE_CHAR_VERSION
380 if (needed + 1+ len < n)
382 dest[needed] = val;
383 for (i = 0; i < len; ++i)
384 dest[needed + 1 + i] =
385 weights[idxarr[idxcnt] + i];
387 needed += 1 + len;
388 #else
389 buflen = utf8_encode (buf, val);
390 if (needed + buflen + len < n)
392 for (i = 0; i < buflen; ++i)
393 dest[needed + i] = buf[i];
394 for (i = 0; i < len; ++i)
395 dest[needed + buflen + i] =
396 weights[idxarr[idxcnt] + i];
398 needed += buflen + len;
399 #endif
400 idxarr[idxcnt] += len;
401 val = 1;
403 else
404 /* Note that we don't have to increment `idxarr[idxcnt]'
405 since the length is zero. */
406 ++val;
408 else
410 /* Remember where the backwards series started. */
411 if (backw_stop == ~0ul)
412 backw_stop = idxcnt;
415 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
418 if (backw_stop != ~0)
420 /* Handle the pushed elements now. */
421 size_t backw;
423 backw = idxmax - 1;
424 while (backw > backw_stop)
426 size_t len = weights[idxarr[--backw]++];
427 if (len != 0)
429 #ifdef WIDE_CHAR_VERSION
430 if (needed + 1 + len < n)
432 dest[needed] = val;
433 for (i = 0; i < len; ++i)
434 dest[needed + 1 + i] =
435 weights[idxarr[backw] + i];
437 needed += 1 + len;
438 #else
439 buflen = utf8_encode (buf, val);
440 if (needed + buflen + len < n)
442 for (i = 0; i < buflen; ++i)
443 dest[needed + i] = buf[i];
444 for (i = 0; i < len; ++i)
445 dest[needed + buflen + i] =
446 weights[idxarr[backw] + i];
448 needed += buflen + len;
449 #endif
450 idxarr[backw] += len;
451 val = 1;
453 else
454 ++val;
459 /* Finally store the byte to separate the passes or terminate
460 the string. */
461 if (needed < n)
462 dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
463 ++needed;
466 /* This is a little optimization: many collation specifications have
467 a `position' rule at the end and if no non-ignored character
468 is found the last \1 byte is immediately followed by a \0 byte
469 signalling this. We can avoid the \1 byte(s). */
470 if (needed <= n && needed > 2 && dest[needed - 2] == L('\1'))
472 /* Remove the \1 byte. */
473 --needed;
474 dest[needed - 1] = L('\0');
477 /* Free the memory if needed. */
478 if (use_malloc)
479 free (idxarr);
481 /* Return the number of bytes/words we need, but don't count the NUL
482 byte/word at the end. */
483 return needed - 1;