/*
 * Unicode sort key generation
 *
 * Copyright 2003 Dmitry Timoshkov
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
 */
20 #include "wine/unicode.h"
22 extern unsigned int wine_decompose( WCHAR ch
, WCHAR
*dst
, unsigned int dstlen
);
23 extern const unsigned int collation_table
[];
26 * flags - normalization NORM_* flags
28 * FIXME: 'variable' flag not handled
30 int wine_get_sortkey(int flags
, const WCHAR
*src
, int srclen
, char *dst
, int dstlen
)
32 WCHAR dummy
[4]; /* no decomposition is larger than 4 chars */
35 const WCHAR
*src_save
= src
;
36 int srclen_save
= srclen
;
38 key_len
[0] = key_len
[1] = key_len
[2] = key_len
[3] = 0;
39 for (; srclen
; srclen
--, src
++)
41 unsigned int i
, decomposed_len
= 1;/*wine_decompose(*src, dummy, 4);*/
45 for (i
= 0; i
< decomposed_len
; i
++)
50 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
51 * and skips white space and punctuation characters for
54 if ((flags
& NORM_IGNORESYMBOLS
) && (get_char_typeW(wch
) & (C1_PUNCT
| C1_SPACE
)))
57 if (flags
& NORM_IGNORECASE
) wch
= tolowerW(wch
);
59 ce
= collation_table
[collation_table
[wch
>> 8] + (wch
& 0xff)];
60 if (ce
!= (unsigned int)-1)
62 if (ce
>> 16) key_len
[0] += 2;
63 if ((ce
>> 8) & 0xff) key_len
[1]++;
64 if ((ce
>> 4) & 0x0f) key_len
[2]++;
67 if (wch
>> 8) key_len
[3]++;
74 if (wch
>> 8) key_len
[0]++;
75 if (wch
& 0xff) key_len
[0]++;
81 if (!dstlen
) /* compute length */
82 /* 4 * '\1' + 1 * '\0' + key length */
83 return key_len
[0] + key_len
[1] + key_len
[2] + key_len
[3] + 4 + 1;
85 if (dstlen
< key_len
[0] + key_len
[1] + key_len
[2] + key_len
[3] + 4 + 1)
86 return 0; /* overflow */
92 key_ptr
[1] = key_ptr
[0] + key_len
[0] + 1;
93 key_ptr
[2] = key_ptr
[1] + key_len
[1] + 1;
94 key_ptr
[3] = key_ptr
[2] + key_len
[2] + 1;
96 for (; srclen
; srclen
--, src
++)
98 unsigned int i
, decomposed_len
= 1;/*wine_decompose(*src, dummy, 4);*/
102 for (i
= 0; i
< decomposed_len
; i
++)
104 WCHAR wch
= dummy
[i
];
107 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
108 * and skips white space and punctuation characters for
109 * NORM_IGNORESYMBOLS.
111 if ((flags
& NORM_IGNORESYMBOLS
) && (get_char_typeW(wch
) & (C1_PUNCT
| C1_SPACE
)))
114 if (flags
& NORM_IGNORECASE
) wch
= tolowerW(wch
);
116 ce
= collation_table
[collation_table
[wch
>> 8] + (wch
& 0xff)];
117 if (ce
!= (unsigned int)-1)
120 if ((key
= ce
>> 16))
122 *key_ptr
[0]++ = key
>> 8;
123 *key_ptr
[0]++ = key
& 0xff;
125 /* make key 1 start from 2 */
126 if ((key
= (ce
>> 8) & 0xff)) *key_ptr
[1]++ = key
+ 1;
127 /* make key 2 start from 2 */
128 if ((key
= (ce
>> 4) & 0x0f)) *key_ptr
[2]++ = key
+ 1;
129 /* key 3 is always a character code */
132 if (wch
>> 8) *key_ptr
[3]++ = wch
>> 8;
133 if (wch
& 0xff) *key_ptr
[3]++ = wch
& 0xff;
138 *key_ptr
[0]++ = 0xff;
139 *key_ptr
[0]++ = 0xfe;
140 if (wch
>> 8) *key_ptr
[0]++ = wch
>> 8;
141 if (wch
& 0xff) *key_ptr
[0]++ = wch
& 0xff;
150 *key_ptr
[3]++ = '\1';
153 return key_ptr
[3] - dst
;
156 static inline int compare_unicode_weights(int flags
, const WCHAR
*str1
, int len1
,
157 const WCHAR
*str2
, int len2
)
159 unsigned int ce1
, ce2
;
162 /* 32-bit collation element table format:
163 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
164 * case weight - high 4 bit of low 8 bit.
166 while (len1
> 0 && len2
> 0)
168 if (flags
& NORM_IGNORESYMBOLS
)
171 /* FIXME: not tested */
172 if (get_char_typeW(*str1
) & (C1_PUNCT
| C1_SPACE
))
178 if (get_char_typeW(*str2
) & (C1_PUNCT
| C1_SPACE
))
187 /* hyphen and apostrophe are treated differently depending on
188 * whether SORT_STRINGSORT specified or not
190 if (!(flags
& SORT_STRINGSORT
))
192 if (*str1
== '-' || *str1
== '\'')
194 if (*str2
!= '-' && *str2
!= '\'')
201 else if (*str2
== '-' || *str2
== '\'')
209 ce1
= collation_table
[collation_table
[*str1
>> 8] + (*str1
& 0xff)];
210 ce2
= collation_table
[collation_table
[*str2
>> 8] + (*str2
& 0xff)];
212 if (ce1
!= (unsigned int)-1 && ce2
!= (unsigned int)-1)
213 ret
= (ce1
>> 16) - (ce2
>> 16);
224 while (len1
&& !*str1
)
229 while (len2
&& !*str2
)
237 static inline int compare_diacritic_weights(int flags
, const WCHAR
*str1
, int len1
,
238 const WCHAR
*str2
, int len2
)
240 unsigned int ce1
, ce2
;
243 /* 32-bit collation element table format:
244 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
245 * case weight - high 4 bit of low 8 bit.
247 while (len1
> 0 && len2
> 0)
249 if (flags
& NORM_IGNORESYMBOLS
)
252 /* FIXME: not tested */
253 if (get_char_typeW(*str1
) & (C1_PUNCT
| C1_SPACE
))
259 if (get_char_typeW(*str2
) & (C1_PUNCT
| C1_SPACE
))
268 ce1
= collation_table
[collation_table
[*str1
>> 8] + (*str1
& 0xff)];
269 ce2
= collation_table
[collation_table
[*str2
>> 8] + (*str2
& 0xff)];
271 if (ce1
!= (unsigned int)-1 && ce2
!= (unsigned int)-1)
272 ret
= ((ce1
>> 8) & 0xff) - ((ce2
>> 8) & 0xff);
283 while (len1
&& !*str1
)
288 while (len2
&& !*str2
)
296 static inline int compare_case_weights(int flags
, const WCHAR
*str1
, int len1
,
297 const WCHAR
*str2
, int len2
)
299 unsigned int ce1
, ce2
;
302 /* 32-bit collation element table format:
303 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
304 * case weight - high 4 bit of low 8 bit.
306 while (len1
> 0 && len2
> 0)
308 if (flags
& NORM_IGNORESYMBOLS
)
311 /* FIXME: not tested */
312 if (get_char_typeW(*str1
) & (C1_PUNCT
| C1_SPACE
))
318 if (get_char_typeW(*str2
) & (C1_PUNCT
| C1_SPACE
))
327 ce1
= collation_table
[collation_table
[*str1
>> 8] + (*str1
& 0xff)];
328 ce2
= collation_table
[collation_table
[*str2
>> 8] + (*str2
& 0xff)];
330 if (ce1
!= (unsigned int)-1 && ce2
!= (unsigned int)-1)
331 ret
= ((ce1
>> 4) & 0x0f) - ((ce2
>> 4) & 0x0f);
342 while (len1
&& !*str1
)
347 while (len2
&& !*str2
)
355 int wine_compare_string(int flags
, const WCHAR
*str1
, int len1
,
356 const WCHAR
*str2
, int len2
)
360 ret
= compare_unicode_weights(flags
, str1
, len1
, str2
, len2
);
363 if (!(flags
& NORM_IGNORENONSPACE
))
364 ret
= compare_diacritic_weights(flags
, str1
, len1
, str2
, len2
);
365 if (!ret
&& !(flags
& NORM_IGNORECASE
))
366 ret
= compare_case_weights(flags
, str1
, len1
, str2
, len2
);