1 /* Copyright (C) 1995-1999,2000,2001,2002 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
27 #include <sys/param.h>
30 # define STRING_TYPE char
31 # define USTRING_TYPE unsigned char
32 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
33 # define STRXFRM __strxfrm_l
35 # define STRXFRM strxfrm
37 # define STRCMP strcmp
38 # define STRLEN strlen
39 # define STPNCPY __stpncpy
40 # define WEIGHT_H "../locale/weight.h"
45 #define CONCAT(a,b) CONCAT1(a,b)
46 #define CONCAT1(a,b) a##b
48 #include "../locale/localeinfo.h"
51 #ifndef WIDE_CHAR_VERSION
53 /* We need UTF-8 encoding of numbers. */
55 utf8_encode (char *buf
, int val
)
68 for (step
= 2; step
< 6; ++step
)
69 if ((val
& (~(uint32_t)0 << (5 * step
+ 1))) == 0)
73 *buf
= (unsigned char) (~0xff >> step
);
77 buf
[step
] = 0x80 | (val
& 0x3f);
89 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
91 STRXFRM (STRING_TYPE
*dest
, const STRING_TYPE
*src
, size_t n
)
94 STRXFRM (STRING_TYPE
*dest
, const STRING_TYPE
*src
, size_t n
, __locale_t l
)
97 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
98 struct locale_data
*current
= l
->__locales
[LC_COLLATE
];
99 uint_fast32_t nrules
= current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_NRULES
)].word
;
101 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
103 /* We don't assign the following values right away since it might be
104 unnecessary in case there are no rules. */
105 const unsigned char *rulesets
;
106 const int32_t *table
;
107 const USTRING_TYPE
*weights
;
108 const USTRING_TYPE
*extra
;
109 const int32_t *indirect
;
112 const USTRING_TYPE
*usrc
;
113 size_t srclen
= STRLEN (src
);
115 unsigned char *rulearr
;
125 STPNCPY (dest
, src
, MIN (srclen
+ 1, n
));
130 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
131 rulesets
= (const unsigned char *)
132 current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS
)].string
;
133 table
= (const int32_t *)
134 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE
,SUFFIX
))].string
;
135 weights
= (const USTRING_TYPE
*)
136 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT
,SUFFIX
))].string
;
137 extra
= (const USTRING_TYPE
*)
138 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA
,SUFFIX
))].string
;
139 indirect
= (const int32_t *)
140 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT
,SUFFIX
))].string
;
142 rulesets
= (const unsigned char *)
143 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_RULESETS
);
144 table
= (const int32_t *)
145 _NL_CURRENT (LC_COLLATE
, CONCAT(_NL_COLLATE_TABLE
,SUFFIX
));
146 weights
= (const USTRING_TYPE
*)
147 _NL_CURRENT (LC_COLLATE
, CONCAT(_NL_COLLATE_WEIGHT
,SUFFIX
));
148 extra
= (const USTRING_TYPE
*)
149 _NL_CURRENT (LC_COLLATE
, CONCAT(_NL_COLLATE_EXTRA
,SUFFIX
));
150 indirect
= (const int32_t *)
151 _NL_CURRENT (LC_COLLATE
, CONCAT(_NL_COLLATE_INDIRECT
,SUFFIX
));
155 assert (((uintptr_t) table
) % __alignof__ (table
[0]) == 0);
156 assert (((uintptr_t) weights
) % __alignof__ (weights
[0]) == 0);
157 assert (((uintptr_t) extra
) % __alignof__ (extra
[0]) == 0);
158 assert (((uintptr_t) indirect
) % __alignof__ (indirect
[0]) == 0);
160 /* Handle an empty string as a special case. */
168 /* We need the elements of the string as unsigned values since they
169 are used as indeces. */
170 usrc
= (const USTRING_TYPE
*) src
;
172 /* Perform the first pass over the string and while doing this find
173 and store the weights for each character. Since we want this to
174 be as fast as possible we are using `alloca' to store the temporary
175 values. But since there is no limit on the length of the string
176 we have to use `malloc' if the string is too long. We should be
177 very conservative here. */
180 idxarr
= (int32_t *) malloc ((srclen
+ 1) * (sizeof (int32_t) + 1));
181 rulearr
= (unsigned char *) &idxarr
[srclen
];
184 /* No memory. Well, go with the stack then.
186 XXX Once this implementation is stable we will handle this
187 differently. Instead of precomputing the indeces we will
188 do this in time. This means, though, that this happens for
196 idxarr
= (int32_t *) alloca (srclen
* sizeof (int32_t));
197 rulearr
= (unsigned char *) alloca (srclen
+ 1);
199 /* This element is only read, the value never used but to determine
200 another value which then is ignored. */
201 rulearr
[srclen
] = '\0';
206 int32_t tmp
= findidx (&usrc
);
207 rulearr
[idxmax
] = tmp
>> 24;
208 idxarr
[idxmax
] = tmp
& 0xffffff;
212 while (*usrc
!= L('\0'));
214 /* Now the passes over the weights. We now use the indeces we found
217 for (pass
= 0; pass
< nrules
; ++pass
)
219 size_t backw_stop
= ~0ul;
220 int rule
= rulesets
[rulearr
[0] * nrules
+ pass
];
221 /* We assume that if a rule has defined `position' in one section
222 this is true for all of them. */
223 int position
= rule
& sort_position
;
227 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
229 if ((rule
& sort_forward
) != 0)
233 if (backw_stop
!= ~0ul)
235 /* Handle the pushed elements now. */
238 for (backw
= idxcnt
- 1; backw
>= backw_stop
; --backw
)
240 len
= weights
[idxarr
[backw
]++];
242 if (needed
+ len
< n
)
244 dest
[needed
++] = weights
[idxarr
[backw
]++];
247 /* No more characters fit into the buffer. */
249 idxarr
[backw
] += len
;
256 /* Now handle the forward element. */
257 len
= weights
[idxarr
[idxcnt
]++];
258 if (needed
+ len
< n
)
260 dest
[needed
++] = weights
[idxarr
[idxcnt
]++];
263 /* No more characters fit into the buffer. */
265 idxarr
[idxcnt
] += len
;
270 /* Remember where the backwards series started. */
271 if (backw_stop
== ~0ul)
275 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
279 if (backw_stop
!= ~0ul)
281 /* Handle the pushed elements now. */
285 while (backw
> backw_stop
)
287 size_t len
= weights
[idxarr
[--backw
]++];
289 if (needed
+ len
< n
)
291 dest
[needed
++] = weights
[idxarr
[backw
]++];
294 /* No more characters fit into the buffer. */
296 idxarr
[backw
] += len
;
304 #ifndef WIDE_CHAR_VERSION
310 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
312 if ((rule
& sort_forward
) != 0)
316 if (backw_stop
!= ~0ul)
318 /* Handle the pushed elements now. */
321 for (backw
= idxcnt
- 1; backw
>= backw_stop
; --backw
)
323 len
= weights
[idxarr
[backw
]++];
326 #ifdef WIDE_CHAR_VERSION
327 if (needed
+ 1 + len
< n
)
330 for (i
= 0; i
< len
; ++i
)
331 dest
[needed
+ 1 + i
] =
332 weights
[idxarr
[backw
] + i
];
336 buflen
= utf8_encode (buf
, val
);
337 if (needed
+ buflen
+ len
< n
)
339 for (i
= 0; i
< buflen
; ++i
)
340 dest
[needed
+ i
] = buf
[i
];
341 for (i
= 0; i
< len
; ++i
)
342 dest
[needed
+ buflen
+ i
] =
343 weights
[idxarr
[backw
] + i
];
345 needed
+= buflen
+ len
;
347 idxarr
[backw
] += len
;
357 /* Now handle the forward element. */
358 len
= weights
[idxarr
[idxcnt
]++];
361 #ifdef WIDE_CHAR_VERSION
362 if (needed
+ 1+ len
< n
)
365 for (i
= 0; i
< len
; ++i
)
366 dest
[needed
+ 1 + i
] =
367 weights
[idxarr
[idxcnt
] + i
];
371 buflen
= utf8_encode (buf
, val
);
372 if (needed
+ buflen
+ len
< n
)
374 for (i
= 0; i
< buflen
; ++i
)
375 dest
[needed
+ i
] = buf
[i
];
376 for (i
= 0; i
< len
; ++i
)
377 dest
[needed
+ buflen
+ i
] =
378 weights
[idxarr
[idxcnt
] + i
];
380 needed
+= buflen
+ len
;
382 idxarr
[idxcnt
] += len
;
386 /* Note that we don't have to increment `idxarr[idxcnt]'
387 since the length is zero. */
392 /* Remember where the backwards series started. */
393 if (backw_stop
== ~0ul)
397 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
400 if (backw_stop
!= ~0ul)
402 /* Handle the pushed elements now. */
406 while (backw
> backw_stop
)
408 size_t len
= weights
[idxarr
[--backw
]++];
411 #ifdef WIDE_CHAR_VERSION
412 if (needed
+ 1 + len
< n
)
415 for (i
= 0; i
< len
; ++i
)
416 dest
[needed
+ 1 + i
] =
417 weights
[idxarr
[backw
] + i
];
421 buflen
= utf8_encode (buf
, val
);
422 if (needed
+ buflen
+ len
< n
)
424 for (i
= 0; i
< buflen
; ++i
)
425 dest
[needed
+ i
] = buf
[i
];
426 for (i
= 0; i
< len
; ++i
)
427 dest
[needed
+ buflen
+ i
] =
428 weights
[idxarr
[backw
] + i
];
430 needed
+= buflen
+ len
;
432 idxarr
[backw
] += len
;
441 /* Finally store the byte to separate the passes or terminate
444 dest
[needed
] = pass
+ 1 < nrules
? L('\1') : L('\0');
448 /* This is a little optimization: many collation specifications have
449 a `position' rule at the end and if no non-ignored character
450 is found the last \1 byte is immediately followed by a \0 byte
451 signalling this. We can avoid the \1 byte(s). */
452 if (needed
<= n
&& needed
> 2 && dest
[needed
- 2] == L('\1'))
454 /* Remove the \1 byte. */
456 dest
[needed
- 1] = L('\0');
459 /* Free the memory if needed. */
463 /* Return the number of bytes/words we need, but don't count the NUL
464 byte/word at the end. */