2 * Unicode sort key generation
4 * Copyright 2003 Dmitry Timoshkov
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20 #include "wine/unicode.h"
22 extern int get_decomposition(WCHAR src
, WCHAR
*dst
, unsigned int dstlen
);
23 extern const unsigned int collation_table
[];
26 * flags - normalization NORM_* flags
28 * FIXME: 'variable' flag not handled
30 int wine_get_sortkey(int flags
, const WCHAR
*src
, int srclen
, char *dst
, int dstlen
)
32 WCHAR dummy
[4]; /* no decomposition is larger than 4 chars */
35 const WCHAR
*src_save
= src
;
36 int srclen_save
= srclen
;
38 key_len
[0] = key_len
[1] = key_len
[2] = key_len
[3] = 0;
39 for (; srclen
; srclen
--, src
++)
41 int decomposed_len
= 1;/*get_decomposition(*src, dummy, 4);*/
46 for (i
= 0; i
< decomposed_len
; i
++)
51 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
52 * and skips white space and punctuation characters for
55 if ((flags
& NORM_IGNORESYMBOLS
) && (get_char_typeW(wch
) & (C1_PUNCT
| C1_SPACE
)))
58 if (flags
& NORM_IGNORECASE
) wch
= tolowerW(wch
);
60 ce
= collation_table
[collation_table
[wch
>> 8] + (wch
& 0xff)];
61 if (ce
!= (unsigned int)-1)
63 if (ce
>> 16) key_len
[0] += 2;
64 if ((ce
>> 8) & 0xff) key_len
[1]++;
65 if ((ce
>> 4) & 0x0f) key_len
[2]++;
68 if (wch
>> 8) key_len
[3]++;
75 if (wch
>> 8) key_len
[0]++;
76 if (wch
& 0xff) key_len
[0]++;
82 if (!dstlen
) /* compute length */
83 /* 4 * '\1' + 1 * '\0' + key length */
84 return key_len
[0] + key_len
[1] + key_len
[2] + key_len
[3] + 4 + 1;
86 if (dstlen
< key_len
[0] + key_len
[1] + key_len
[2] + key_len
[3] + 4 + 1)
87 return 0; /* overflow */
93 key_ptr
[1] = key_ptr
[0] + key_len
[0] + 1;
94 key_ptr
[2] = key_ptr
[1] + key_len
[1] + 1;
95 key_ptr
[3] = key_ptr
[2] + key_len
[2] + 1;
97 for (; srclen
; srclen
--, src
++)
99 int decomposed_len
= 1;/*get_decomposition(*src, dummy, 4);*/
104 for (i
= 0; i
< decomposed_len
; i
++)
106 WCHAR wch
= dummy
[i
];
109 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
110 * and skips white space and punctuation characters for
111 * NORM_IGNORESYMBOLS.
113 if ((flags
& NORM_IGNORESYMBOLS
) && (get_char_typeW(wch
) & (C1_PUNCT
| C1_SPACE
)))
116 if (flags
& NORM_IGNORECASE
) wch
= tolowerW(wch
);
118 ce
= collation_table
[collation_table
[wch
>> 8] + (wch
& 0xff)];
119 if (ce
!= (unsigned int)-1)
122 if ((key
= ce
>> 16))
124 *key_ptr
[0]++ = key
>> 8;
125 *key_ptr
[0]++ = key
& 0xff;
127 /* make key 1 start from 2 */
128 if ((key
= (ce
>> 8) & 0xff)) *key_ptr
[1]++ = key
+ 1;
129 /* make key 2 start from 2 */
130 if ((key
= (ce
>> 4) & 0x0f)) *key_ptr
[2]++ = key
+ 1;
131 /* key 3 is always a character code */
134 if (wch
>> 8) *key_ptr
[3]++ = wch
>> 8;
135 if (wch
& 0xff) *key_ptr
[3]++ = wch
& 0xff;
140 *key_ptr
[0]++ = 0xff;
141 *key_ptr
[0]++ = 0xfe;
142 if (wch
>> 8) *key_ptr
[0]++ = wch
>> 8;
143 if (wch
& 0xff) *key_ptr
[0]++ = wch
& 0xff;
152 *key_ptr
[3]++ = '\1';
155 return key_ptr
[3] - dst
;
158 static inline int compare_unicode_weights(int flags
, const WCHAR
*str1
, int len1
,
159 const WCHAR
*str2
, int len2
)
161 unsigned int ce1
, ce2
;
164 /* 32-bit collation element table format:
165 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
166 * case weight - high 4 bit of low 8 bit.
168 while (len1
> 0 && len2
> 0)
170 if (flags
& NORM_IGNORESYMBOLS
)
173 /* FIXME: not tested */
174 if (get_char_typeW(*str1
) & (C1_PUNCT
| C1_SPACE
))
180 if (get_char_typeW(*str2
) & (C1_PUNCT
| C1_SPACE
))
189 /* hyphen and apostrophe are treated differently depending on
190 * whether SORT_STRINGSORT specified or not
192 if (!(flags
& SORT_STRINGSORT
))
194 if (*str1
== '-' || *str1
== '\'')
196 if (*str2
!= '-' && *str2
!= '\'')
203 else if (*str2
== '-' || *str2
== '\'')
211 ce1
= collation_table
[collation_table
[*str1
>> 8] + (*str1
& 0xff)];
212 ce2
= collation_table
[collation_table
[*str2
>> 8] + (*str2
& 0xff)];
214 if (ce1
!= (unsigned int)-1 && ce2
!= (unsigned int)-1)
215 ret
= (ce1
>> 16) - (ce2
>> 16);
229 static inline int compare_diacritic_weights(int flags
, const WCHAR
*str1
, int len1
,
230 const WCHAR
*str2
, int len2
)
232 unsigned int ce1
, ce2
;
235 /* 32-bit collation element table format:
236 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
237 * case weight - high 4 bit of low 8 bit.
239 while (len1
> 0 && len2
> 0)
241 if (flags
& NORM_IGNORESYMBOLS
)
244 /* FIXME: not tested */
245 if (get_char_typeW(*str1
) & (C1_PUNCT
| C1_SPACE
))
251 if (get_char_typeW(*str2
) & (C1_PUNCT
| C1_SPACE
))
260 ce1
= collation_table
[collation_table
[*str1
>> 8] + (*str1
& 0xff)];
261 ce2
= collation_table
[collation_table
[*str2
>> 8] + (*str2
& 0xff)];
263 if (ce1
!= (unsigned int)-1 && ce2
!= (unsigned int)-1)
264 ret
= ((ce1
>> 8) & 0xff) - ((ce2
>> 8) & 0xff);
278 static inline int compare_case_weights(int flags
, const WCHAR
*str1
, int len1
,
279 const WCHAR
*str2
, int len2
)
281 unsigned int ce1
, ce2
;
284 /* 32-bit collation element table format:
285 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
286 * case weight - high 4 bit of low 8 bit.
288 while (len1
> 0 && len2
> 0)
290 if (flags
& NORM_IGNORESYMBOLS
)
293 /* FIXME: not tested */
294 if (get_char_typeW(*str1
) & (C1_PUNCT
| C1_SPACE
))
300 if (get_char_typeW(*str2
) & (C1_PUNCT
| C1_SPACE
))
309 ce1
= collation_table
[collation_table
[*str1
>> 8] + (*str1
& 0xff)];
310 ce2
= collation_table
[collation_table
[*str2
>> 8] + (*str2
& 0xff)];
312 if (ce1
!= (unsigned int)-1 && ce2
!= (unsigned int)-1)
313 ret
= ((ce1
>> 4) & 0x0f) - ((ce2
>> 4) & 0x0f);
327 static inline int real_length(const WCHAR
*str
, int len
)
329 while (len
&& !str
[len
- 1]) len
--;
333 int wine_compare_string(int flags
, const WCHAR
*str1
, int len1
,
334 const WCHAR
*str2
, int len2
)
338 len1
= real_length(str1
, len1
);
339 len2
= real_length(str2
, len2
);
341 ret
= compare_unicode_weights(flags
, str1
, len1
, str2
, len2
);
344 if (!(flags
& NORM_IGNORENONSPACE
))
345 ret
= compare_diacritic_weights(flags
, str1
, len1
, str2
, len2
);
346 if (!ret
&& !(flags
& NORM_IGNORECASE
))
347 ret
= compare_case_weights(flags
, str1
, len1
, str2
, len2
);