1 /* Copyright (C) 1995, 1996, 1997, 2002, 2004, 2005, 2006
2 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28 #include <sys/param.h>
31 # define STRING_TYPE char
32 # define USTRING_TYPE unsigned char
33 # define STRXFRM __strxfrm_l
34 # define STRCMP strcmp
35 # define STRLEN strlen
36 # define STPNCPY __stpncpy
37 # define WEIGHT_H "../locale/weight.h"
42 #define CONCAT(a,b) CONCAT1(a,b)
43 #define CONCAT1(a,b) a##b
45 #include "../locale/localeinfo.h"
48 #ifndef WIDE_CHAR_VERSION
50 /* We need UTF-8 encoding of numbers. */
52 utf8_encode (char *buf
, int val
)
65 for (step
= 2; step
< 6; ++step
)
66 if ((val
& (~(uint32_t)0 << (5 * step
+ 1))) == 0)
70 *buf
= (unsigned char) (~0xff >> step
);
74 buf
[step
] = 0x80 | (val
& 0x3f);
87 STRXFRM (STRING_TYPE
*dest
, const STRING_TYPE
*src
, size_t n
, __locale_t l
)
89 struct locale_data
*current
= l
->__locales
[LC_COLLATE
];
90 uint_fast32_t nrules
= current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_NRULES
)].word
;
91 /* We don't assign the following values right away since it might be
92 unnecessary in case there are no rules. */
93 const unsigned char *rulesets
;
95 const USTRING_TYPE
*weights
;
96 const USTRING_TYPE
*extra
;
97 const int32_t *indirect
;
101 const USTRING_TYPE
*usrc
;
102 size_t srclen
= STRLEN (src
);
104 unsigned char *rulearr
;
114 STPNCPY (dest
, src
, MIN (srclen
+ 1, n
));
119 rulesets
= (const unsigned char *)
120 current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS
)].string
;
121 table
= (const int32_t *)
122 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE
,SUFFIX
))].string
;
123 weights
= (const USTRING_TYPE
*)
124 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT
,SUFFIX
))].string
;
125 extra
= (const USTRING_TYPE
*)
126 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA
,SUFFIX
))].string
;
127 indirect
= (const int32_t *)
128 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT
,SUFFIX
))].string
;
131 assert (((uintptr_t) table
) % __alignof__ (table
[0]) == 0);
132 assert (((uintptr_t) weights
) % __alignof__ (weights
[0]) == 0);
133 assert (((uintptr_t) extra
) % __alignof__ (extra
[0]) == 0);
134 assert (((uintptr_t) indirect
) % __alignof__ (indirect
[0]) == 0);
136 /* Handle an empty string as a special case. */
144 /* We need the elements of the string as unsigned values since they
145 are used as indeces. */
146 usrc
= (const USTRING_TYPE
*) src
;
148 /* Perform the first pass over the string and while doing this find
149 and store the weights for each character. Since we want this to
150 be as fast as possible we are using `alloca' to store the temporary
151 values. But since there is no limit on the length of the string
152 we have to use `malloc' if the string is too long. We should be
153 very conservative here. */
154 if (! __libc_use_alloca (srclen
))
156 idxarr
= (int32_t *) malloc ((srclen
+ 1) * (sizeof (int32_t) + 1));
157 rulearr
= (unsigned char *) &idxarr
[srclen
];
160 /* No memory. Well, go with the stack then.
162 XXX Once this implementation is stable we will handle this
163 differently. Instead of precomputing the indeces we will
164 do this in time. This means, though, that this happens for
172 idxarr
= (int32_t *) alloca (srclen
* sizeof (int32_t));
173 rulearr
= (unsigned char *) alloca (srclen
+ 1);
179 int32_t tmp
= findidx (&usrc
);
180 rulearr
[idxmax
] = tmp
>> 24;
181 idxarr
[idxmax
] = tmp
& 0xffffff;
185 while (*usrc
!= L('\0'));
187 /* This element is only read, the value never used but to determine
188 another value which then is ignored. */
189 rulearr
[idxmax
] = '\0';
191 /* Now the passes over the weights. We now use the indeces we found
194 for (pass
= 0; pass
< nrules
; ++pass
)
196 size_t backw_stop
= ~0ul;
197 int rule
= rulesets
[rulearr
[0] * nrules
+ pass
];
198 /* We assume that if a rule has defined `position' in one section
199 this is true for all of them. */
200 int position
= rule
& sort_position
;
202 last_needed
= needed
;
205 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
207 if ((rule
& sort_forward
) != 0)
211 if (backw_stop
!= ~0ul)
213 /* Handle the pushed elements now. */
216 for (backw
= idxcnt
; backw
> backw_stop
; )
219 len
= weights
[idxarr
[backw
]++];
221 if (needed
+ len
< n
)
223 dest
[needed
++] = weights
[idxarr
[backw
]++];
226 /* No more characters fit into the buffer. */
228 idxarr
[backw
] += len
;
235 /* Now handle the forward element. */
236 len
= weights
[idxarr
[idxcnt
]++];
237 if (needed
+ len
< n
)
239 dest
[needed
++] = weights
[idxarr
[idxcnt
]++];
242 /* No more characters fit into the buffer. */
244 idxarr
[idxcnt
] += len
;
249 /* Remember where the backwards series started. */
250 if (backw_stop
== ~0ul)
254 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
258 if (backw_stop
!= ~0ul)
260 /* Handle the pushed elements now. */
264 while (backw
> backw_stop
)
266 size_t len
= weights
[idxarr
[--backw
]++];
268 if (needed
+ len
< n
)
270 dest
[needed
++] = weights
[idxarr
[backw
]++];
273 /* No more characters fit into the buffer. */
275 idxarr
[backw
] += len
;
283 #ifndef WIDE_CHAR_VERSION
289 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
291 if ((rule
& sort_forward
) != 0)
295 if (backw_stop
!= ~0ul)
297 /* Handle the pushed elements now. */
300 for (backw
= idxcnt
; backw
> backw_stop
; )
303 len
= weights
[idxarr
[backw
]++];
306 #ifdef WIDE_CHAR_VERSION
307 if (needed
+ 1 + len
< n
)
310 for (i
= 0; i
< len
; ++i
)
311 dest
[needed
+ 1 + i
] =
312 weights
[idxarr
[backw
] + i
];
316 buflen
= utf8_encode (buf
, val
);
317 if (needed
+ buflen
+ len
< n
)
319 for (i
= 0; i
< buflen
; ++i
)
320 dest
[needed
+ i
] = buf
[i
];
321 for (i
= 0; i
< len
; ++i
)
322 dest
[needed
+ buflen
+ i
] =
323 weights
[idxarr
[backw
] + i
];
325 needed
+= buflen
+ len
;
327 idxarr
[backw
] += len
;
337 /* Now handle the forward element. */
338 len
= weights
[idxarr
[idxcnt
]++];
341 #ifdef WIDE_CHAR_VERSION
342 if (needed
+ 1+ len
< n
)
345 for (i
= 0; i
< len
; ++i
)
346 dest
[needed
+ 1 + i
] =
347 weights
[idxarr
[idxcnt
] + i
];
351 buflen
= utf8_encode (buf
, val
);
352 if (needed
+ buflen
+ len
< n
)
354 for (i
= 0; i
< buflen
; ++i
)
355 dest
[needed
+ i
] = buf
[i
];
356 for (i
= 0; i
< len
; ++i
)
357 dest
[needed
+ buflen
+ i
] =
358 weights
[idxarr
[idxcnt
] + i
];
360 needed
+= buflen
+ len
;
362 idxarr
[idxcnt
] += len
;
366 /* Note that we don't have to increment `idxarr[idxcnt]'
367 since the length is zero. */
372 /* Remember where the backwards series started. */
373 if (backw_stop
== ~0ul)
377 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
380 if (backw_stop
!= ~0ul)
382 /* Handle the pushed elements now. */
386 while (backw
> backw_stop
)
388 size_t len
= weights
[idxarr
[--backw
]++];
391 #ifdef WIDE_CHAR_VERSION
392 if (needed
+ 1 + len
< n
)
395 for (i
= 0; i
< len
; ++i
)
396 dest
[needed
+ 1 + i
] =
397 weights
[idxarr
[backw
] + i
];
401 buflen
= utf8_encode (buf
, val
);
402 if (needed
+ buflen
+ len
< n
)
404 for (i
= 0; i
< buflen
; ++i
)
405 dest
[needed
+ i
] = buf
[i
];
406 for (i
= 0; i
< len
; ++i
)
407 dest
[needed
+ buflen
+ i
] =
408 weights
[idxarr
[backw
] + i
];
410 needed
+= buflen
+ len
;
412 idxarr
[backw
] += len
;
421 /* Finally store the byte to separate the passes or terminate
424 dest
[needed
] = pass
+ 1 < nrules
? L('\1') : L('\0');
428 /* This is a little optimization: many collation specifications have
429 a `position' rule at the end and if no non-ignored character
430 is found the last \1 byte is immediately followed by a \0 byte
431 signalling this. We can avoid the \1 byte(s). */
432 if (needed
> 2 && needed
== last_needed
+ 1)
434 /* Remove the \1 byte. */
436 dest
[needed
- 1] = L('\0');
439 /* Free the memory if needed. */
443 /* Return the number of bytes/words we need, but don't count the NUL
444 byte/word at the end. */
447 libc_hidden_def (STRXFRM
)
449 #ifndef WIDE_CHAR_VERSION
450 weak_alias (__strxfrm_l
, strxfrm_l
)