1 /* Copyright (C) 1995-1999, 2000 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
/* Definitions for the `char' flavour of the function: the generic type
   and helper names used further down are mapped onto the narrow-string
   types and routines.  */
27 # define STRING_TYPE char
28 # define USTRING_TYPE unsigned char
/* The name of the function being defined depends on
   USE_IN_EXTENDED_LOCALE_MODEL.
   NOTE(review): two STRXFRM definitions follow; the #else/#endif lines
   that would separate them are not visible in this excerpt -- confirm
   against the full file.  */
29 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
30 # define STRXFRM __strxfrm_l
32 # define STRXFRM strxfrm
34 # define STRCMP strcmp
35 # define STRLEN strlen
36 # define STPNCPY __stpncpy
37 # define WEIGHT_H "../locale/weight.h"
/* Two-level token pasting: routing through CONCAT1 lets macro arguments
   (such as SUFFIX in the uses below) be expanded before `##' joins
   them.  */
42 #define CONCAT(a,b) CONCAT1(a,b)
43 #define CONCAT1(a,b) a##b
45 #include "../locale/localeinfo.h"
48 #ifndef WIDE_CHAR_VERSION
49 /* These are definitions used by some of the functions for handling
50 UTF-8 encoding below. */
/* Masks tested by utf8_encode, indexed by STEP - 2.  Each entry has all
   bits set above a payload width of 11, 16, 21 and 26 bits
   respectively, so `val & mask == 0' means VAL fits in that width.  */
51 static const uint32_t encoding_mask
[] =
53 ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
/* Values utf8_encode stores into the first output byte, indexed by
   STEP - 2.  */
56 static const unsigned char encoding_byte
[] =
58 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
62 /* We need UTF-8 encoding of numbers. */
/* Write an encoding of VAL into BUF.  The visible code tests VAL
   against encoding_mask for STEP = 2..5, stores encoding_byte[STEP - 2]
   as the first byte, and writes bytes of the form 0x80 | (six low bits
   of VAL) at index STEP.
   NOTE(review): the return type, braces, any single-byte case, the
   loop exits and the return statement are not visible in this excerpt.
   Callers assign the result to `buflen' and copy that many bytes out
   of the buffer, so it presumably returns the number of bytes written
   -- confirm against the full file.  */
64 utf8_encode (char *buf
, int val
)
/* Test VAL against the mask for each candidate sequence length.  */
78 for (step
= 2; step
< 6; ++step
)
79 if ((val
& encoding_mask
[step
- 2]) == 0)
/* First byte comes from the table entry for the current STEP.  */
83 *buf
= encoding_byte
[step
- 2];
/* Trailing byte: high bit set, low six bits taken from VAL.  */
87 buf
[step
] = 0x80 | (val
& 0x3f);
99 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
101 STRXFRM (STRING_TYPE
*dest
, const STRING_TYPE
*src
, size_t n
)
104 STRXFRM (STRING_TYPE
*dest
, const STRING_TYPE
*src
, size_t n
, __locale_t l
)
107 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
108 struct locale_data
*current
= l
->__locales
[LC_COLLATE
];
109 uint_fast32_t nrules
= *((uint32_t *) current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_NRULES
)].string
);
111 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
113 /* We don't assign the following values right away since it might be
114 unnecessary in case there are no rules. */
115 const unsigned char *rulesets
;
116 const int32_t *table
;
117 const USTRING_TYPE
*weights
;
118 const USTRING_TYPE
*extra
;
119 const int32_t *indirect
;
122 const USTRING_TYPE
*usrc
;
123 size_t srclen
= STRLEN (src
);
125 unsigned char *rulearr
;
129 #ifdef WIDE_CHAR_VERSION
140 STPNCPY (dest
, src
, n
);
145 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
146 rulesets
= (const unsigned char *)
147 current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS
)].string
;
148 table
= (const int32_t *)
149 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE
,SUFFIX
))].string
;
150 weights
= (const USTRING_TYPE
*)
151 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT
,SUFFIX
))].string
;
152 extra
= (const USTRING_TYPE
*)
153 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA
,SUFFIX
))].string
;
154 indirect
= (const int32_t *)
155 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT
,SUFFIX
))].string
;
156 # ifdef WIDE_CHAR_VERSION
157 names
= (const wint_t *)
158 current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_NAMES
)].string
;
159 size
= current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE
)].word
;
160 layers
= current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS
)].word
;
163 rulesets
= (const unsigned char *)
164 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_RULESETS
);
165 table
= (const int32_t *)
166 _NL_CURRENT (LC_COLLATE
, CONCAT(_NL_COLLATE_TABLE
,SUFFIX
));
167 weights
= (const USTRING_TYPE
*)
168 _NL_CURRENT (LC_COLLATE
, CONCAT(_NL_COLLATE_WEIGHT
,SUFFIX
));
169 extra
= (const USTRING_TYPE
*)
170 _NL_CURRENT (LC_COLLATE
, CONCAT(_NL_COLLATE_EXTRA
,SUFFIX
));
171 indirect
= (const int32_t *)
172 _NL_CURRENT (LC_COLLATE
, CONCAT(_NL_COLLATE_INDIRECT
,SUFFIX
));
173 # ifdef WIDE_CHAR_VERSION
174 names
= (const wint_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_NAMES
);
175 size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_HASH_SIZE
);
176 layers
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_HASH_LAYERS
);
181 /* Handle an empty string as a special case. */
189 /* We need the elements of the string as unsigned values since they
190 are used as indices. */
191 usrc
= (const USTRING_TYPE
*) src
;
193 /* Perform the first pass over the string and while doing this find
194 and store the weights for each character. Since we want this to
195 be as fast as possible we are using `alloca' to store the temporary
196 values. But since there is no limit on the length of the string
197 we have to use `malloc' if the string is too long. We should be
198 very conservative here. */
201 idxarr
= (int32_t *) malloc (srclen
* (sizeof (int32_t) + 1));
202 rulearr
= (unsigned char *) &idxarr
[srclen
];
205 /* No memory. Well, go with the stack then.
207 XXX Once this implementation is stable we will handle this
208 differently. Instead of precomputing the indices we will
209 do this in time. This means, though, that this happens for
217 idxarr
= (int32_t *) alloca (srclen
* sizeof (int32_t));
218 rulearr
= (unsigned char *) alloca (srclen
);
224 int32_t tmp
= findidx (&usrc
);
225 rulearr
[idxmax
] = tmp
>> 24;
226 idxarr
[idxmax
] = tmp
& 0xffffff;
230 while (*usrc
!= L('\0'));
232 /* Now the passes over the weights. We now use the indices we found
235 for (pass
= 0; pass
< nrules
; ++pass
)
237 size_t backw_stop
= ~0ul;
238 int rule
= rulesets
[rulearr
[0] * nrules
+ pass
];
239 /* We assume that if a rule has defined `position' in one section
240 this is true for all of them. */
241 int position
= rule
& sort_position
;
245 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
247 if ((rule
& sort_forward
) != 0)
251 if (backw_stop
!= ~0ul)
253 /* Handle the pushed elements now. */
256 for (backw
= idxcnt
- 1; backw
>= backw_stop
; --backw
)
258 len
= weights
[idxarr
[backw
]++];
260 if (needed
+ len
< n
)
262 dest
[needed
++] = weights
[idxarr
[backw
]++];
265 /* No more characters fit into the buffer. */
267 idxarr
[backw
] += len
;
274 /* Now handle the forward element. */
275 len
= weights
[idxarr
[idxcnt
]++];
276 if (needed
+ len
< n
)
278 dest
[needed
++] = weights
[idxarr
[idxcnt
]++];
281 /* No more characters fit into the buffer. */
283 idxarr
[idxcnt
] += len
;
288 /* Remember where the backwards series started. */
289 if (backw_stop
== ~0ul)
293 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
297 if (backw_stop
!= ~0ul)
299 /* Handle the pushed elements now. */
303 while (backw
> backw_stop
)
305 size_t len
= weights
[idxarr
[--backw
]++];
307 if (needed
+ len
< n
)
309 dest
[needed
++] = weights
[idxarr
[backw
]++];
312 /* No more characters fit into the buffer. */
314 idxarr
[backw
] += len
;
322 #ifndef WIDE_CHAR_VERSION
328 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
330 if ((rule
& sort_forward
) != 0)
334 if (backw_stop
!= ~0ul)
336 /* Handle the pushed elements now. */
339 for (backw
= idxcnt
- 1; backw
>= backw_stop
; --backw
)
341 len
= weights
[idxarr
[backw
]++];
344 #ifdef WIDE_CHAR_VERSION
345 if (needed
+ 1 + len
< n
)
348 for (i
= 0; i
< len
; ++i
)
349 dest
[needed
+ 1 + i
] =
350 weights
[idxarr
[backw
] + i
];
354 buflen
= utf8_encode (buf
, val
);
355 if (needed
+ buflen
+ len
< n
)
357 for (i
= 0; i
< buflen
; ++i
)
358 dest
[needed
+ i
] = buf
[i
];
359 for (i
= 0; i
< len
; ++i
)
360 dest
[needed
+ buflen
+ i
] =
361 weights
[idxarr
[backw
] + i
];
363 needed
+= buflen
+ len
;
365 idxarr
[backw
] += len
;
375 /* Now handle the forward element. */
376 len
= weights
[idxarr
[idxcnt
]++];
379 #ifdef WIDE_CHAR_VERSION
380 if (needed
+ 1+ len
< n
)
383 for (i
= 0; i
< len
; ++i
)
384 dest
[needed
+ 1 + i
] =
385 weights
[idxarr
[idxcnt
] + i
];
389 buflen
= utf8_encode (buf
, val
);
390 if (needed
+ buflen
+ len
< n
)
392 for (i
= 0; i
< buflen
; ++i
)
393 dest
[needed
+ i
] = buf
[i
];
394 for (i
= 0; i
< len
; ++i
)
395 dest
[needed
+ buflen
+ i
] =
396 weights
[idxarr
[idxcnt
] + i
];
398 needed
+= buflen
+ len
;
400 idxarr
[idxcnt
] += len
;
404 /* Note that we don't have to increment `idxarr[idxcnt]'
405 since the length is zero. */
410 /* Remember where the backwards series started. */
411 if (backw_stop
== ~0ul)
415 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
418 if (backw_stop
!= ~0)
420 /* Handle the pushed elements now. */
424 while (backw
> backw_stop
)
426 size_t len
= weights
[idxarr
[--backw
]++];
429 #ifdef WIDE_CHAR_VERSION
430 if (needed
+ 1 + len
< n
)
433 for (i
= 0; i
< len
; ++i
)
434 dest
[needed
+ 1 + i
] =
435 weights
[idxarr
[backw
] + i
];
439 buflen
= utf8_encode (buf
, val
);
440 if (needed
+ buflen
+ len
< n
)
442 for (i
= 0; i
< buflen
; ++i
)
443 dest
[needed
+ i
] = buf
[i
];
444 for (i
= 0; i
< len
; ++i
)
445 dest
[needed
+ buflen
+ i
] =
446 weights
[idxarr
[backw
] + i
];
448 needed
+= buflen
+ len
;
450 idxarr
[backw
] += len
;
459 /* Finally store the byte to separate the passes or terminate
462 dest
[needed
] = pass
+ 1 < nrules
? L('\1') : L('\0');
466 /* This is a little optimization: many collation specifications have
467 a `position' rule at the end and if no non-ignored character
468 is found the last \1 byte is immediately followed by a \0 byte
469 signalling this. We can avoid the \1 byte(s). */
470 if (needed
<= n
&& needed
> 2 && dest
[needed
- 2] == L('\1'))
472 /* Remove the \1 byte. */
474 dest
[needed
- 1] = L('\0');
477 /* Free the memory if needed. */
481 /* Return the number of bytes/words we need, but don't count the NUL
482 byte/word at the end. */