2 * Unicode sort key generation
4 * Copyright 2003 Dmitry Timoshkov
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20 #include "wine/unicode.h"
/*
 * Symbols provided by other translation units.
 *
 * wine_decompose: called in this file as wine_decompose(0, ch, buf, 4);
 * the value it returns is stored as the decomposed length of buf.
 * collation_table: always read in two steps in this file, as
 * collation_table[collation_table[c >> 8] + (c & 0xff)].
 */
22 extern unsigned int wine_decompose( int flags
, WCHAR ch
, WCHAR
*dst
, unsigned int dstlen
);
23 extern const unsigned int collation_table
[];
26 * flags - normalization NORM_* flags
28 * FIXME: 'variable' flag not handled
/*
 * wine_get_sortkey - build a multi-level sort key for a WCHAR string.
 *
 * flags   NORM_ flags; the visible code tests NORM_IGNORESYMBOLS (skip
 *         characters classified C1_PUNCT or C1_SPACE) and NORM_IGNORECASE
 *         (fold through tolowerW before the table lookup).
 * src     source characters; srclen counts the WCHARs to process.
 * dst     output buffer; dstlen is its size.  dstlen == 0 asks for the
 *         key length only.
 *
 * Returns the sum of the four level lengths plus 4 when dstlen is 0,
 * 0 when dst is too small, otherwise key_ptr[3] - dst after writing.
 *
 * The string is walked twice: once to size key_len[0..3], once to store
 * bytes through key_ptr[0..3].  Each collation element is split as
 * ce >> 16 (two bytes, level 0), (ce >> 8) & 0xff (level 1, stored + 1)
 * and (ce >> 4) & 0x0f (level 2, stored + 1); level 3 receives raw
 * character bytes.
 *
 * NOTE(review): this listing has listing line numbers fused onto the code
 * and several statements are not visible (for example the declarations of
 * key_len and key_ptr, the branch structure around the ce tests, and the
 * restore of src and srclen from src_save and srclen_save before the
 * second pass).  Confirm against the pristine file before relying on it.
 */
30 int wine_get_sortkey(int flags
, const WCHAR
*src
, int srclen
, char *dst
, int dstlen
)
32 WCHAR dummy
[4]; /* no decomposition is larger than 4 chars */
35 const WCHAR
*src_save
= src
;
36 int srclen_save
= srclen
;
/* First pass: only count how many bytes each of the four levels needs. */
38 key_len
[0] = key_len
[1] = key_len
[2] = key_len
[3] = 0;
39 for (; srclen
; srclen
--, src
++)
41 unsigned int i
, decomposed_len
= 1;/*wine_decompose(*src, dummy, 4);*/
45 for (i
= 0; i
< decomposed_len
; i
++)
50 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
51 * and skips white space and punctuation characters for
54 if ((flags
& NORM_IGNORESYMBOLS
) && (get_char_typeW(wch
) & (C1_PUNCT
| C1_SPACE
)))
57 if (flags
& NORM_IGNORECASE
) wch
= tolowerW(wch
);
59 ce
= collation_table
[collation_table
[wch
>> 8] + (wch
& 0xff)];
60 if (ce
!= (unsigned int)-1)
62 if (ce
>> 16) key_len
[0] += 2;
63 if ((ce
>> 8) & 0xff) key_len
[1]++;
64 if ((ce
>> 4) & 0x0f) key_len
[2]++;
67 if (wch
>> 8) key_len
[3]++;
74 if (wch
>> 8) key_len
[0]++;
75 if (wch
& 0xff) key_len
[0]++;
81 if (!dstlen
) /* compute length */
82 /* 4 * '\1' + key length */
83 return key_len
[0] + key_len
[1] + key_len
[2] + key_len
[3] + 4;
/* Requires one byte beyond the four levels and four separators
 * (presumably a terminator - TODO confirm, the store is not visible). */
85 if (dstlen
< key_len
[0] + key_len
[1] + key_len
[2] + key_len
[3] + 4 + 1)
86 return 0; /* overflow */
/* Each level's write cursor starts one byte past the end of the
 * previous level's region, leaving room for a separator byte. */
92 key_ptr
[1] = key_ptr
[0] + key_len
[0] + 1;
93 key_ptr
[2] = key_ptr
[1] + key_len
[1] + 1;
94 key_ptr
[3] = key_ptr
[2] + key_len
[2] + 1;
/* Second pass: same walk as above, now storing the key bytes. */
96 for (; srclen
; srclen
--, src
++)
98 unsigned int i
, decomposed_len
= 1;/*wine_decompose(*src, dummy, 4);*/
102 for (i
= 0; i
< decomposed_len
; i
++)
104 WCHAR wch
= dummy
[i
];
107 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
108 * and skips white space and punctuation characters for
109 * NORM_IGNORESYMBOLS.
111 if ((flags
& NORM_IGNORESYMBOLS
) && (get_char_typeW(wch
) & (C1_PUNCT
| C1_SPACE
)))
114 if (flags
& NORM_IGNORECASE
) wch
= tolowerW(wch
);
116 ce
= collation_table
[collation_table
[wch
>> 8] + (wch
& 0xff)];
117 if (ce
!= (unsigned int)-1)
120 if ((key
= ce
>> 16))
122 *key_ptr
[0]++ = key
>> 8;
123 *key_ptr
[0]++ = key
& 0xff;
125 /* make key 1 start from 2 */
126 if ((key
= (ce
>> 8) & 0xff)) *key_ptr
[1]++ = key
+ 1;
127 /* make key 2 start from 2 */
128 if ((key
= (ce
>> 4) & 0x0f)) *key_ptr
[2]++ = key
+ 1;
129 /* key 3 is always a character code */
132 if (wch
>> 8) *key_ptr
[3]++ = wch
>> 8;
133 if (wch
& 0xff) *key_ptr
[3]++ = wch
& 0xff;
/* NOTE(review): the stores below emit 0xff 0xfe followed by the non-zero
 * bytes of the character itself into level 0 - presumably the branch taken
 * when ce is (unsigned int)-1; the enclosing else is not visible here. */
138 *key_ptr
[0]++ = 0xff;
139 *key_ptr
[0]++ = 0xfe;
140 if (wch
>> 8) *key_ptr
[0]++ = wch
>> 8;
141 if (wch
& 0xff) *key_ptr
[0]++ = wch
& 0xff;
150 *key_ptr
[3]++ = '\1';
153 return key_ptr
[3] - dst
;
/*
 * get_weight - fetch one weight field of a character's collation element.
 *
 * The element is read with the two-step lookup
 * collation_table[collation_table[ch >> 8] + (ch & 0xff)].  For
 * DIACRITIC_WEIGHT the result is (ret >> 8) & 0xff; a further visible
 * return yields (ret >> 4) & 0x0f.
 *
 * NOTE(review): the declaration of ret, the action taken when the lookup
 * gives (unsigned int)-1, and the remaining case labels are not visible
 * in this listing - confirm against the pristine file.
 */
163 static unsigned int get_weight(WCHAR ch
, enum weight type
)
167 ret
= collation_table
[collation_table
[ch
>> 8] + (ch
& 0xff)];
168 if (ret
== (unsigned int)-1)
175 case DIACRITIC_WEIGHT
:
176 return (ret
>> 8) & 0xff;
179 return (ret
>> 4) & 0x0f;
/*
 * inc_str_pos - cursor helper; every argument is passed by pointer so the
 * callee can update it (presumably the string cursor, its remaining
 * length, and the position and length inside the current decomposition -
 * TODO confirm).
 *
 * NOTE(review): only the signature is visible in this listing; the body
 * is missing, so the exact update rule must be checked against the
 * pristine file.
 */
183 static void inc_str_pos(const WCHAR
**str
, int *len
, int *dpos
, int *dlen
)
/*
 * compare_weights - compare two WCHAR strings on a single weight kind.
 *
 * flags        NORM_IGNORESYMBOLS makes characters classified C1_PUNCT or
 *              C1_SPACE advance past via inc_str_pos; SORT_STRINGSORT
 *              disables the hyphen and apostrophe special case below.
 * str1, len1   first string and its length; the main loop runs while
 * str2, len2   both len1 and len2 are greater than 0.
 * type         which weight get_weight extracts for each character.
 *
 * Each source character is expanded with wine_decompose(0, ch, buf, 4)
 * into a 4-WCHAR buffer whenever the matching dlen is 0; dpos indexes the
 * current character inside that buffer.  For UNICODE_WEIGHT without
 * SORT_STRINGSORT, a '-' or '\'' on one side that is not matched on the
 * other side is advanced past.  The first non-zero ce1 - ce2 is returned.
 *
 * NOTE(review): after the main loop the listing shows each string being
 * decomposed, weighed and advanced on its own, but the surrounding loops,
 * conditions and the final return are not visible; the same holds for
 * the bodies of the branches above.  Confirm against the pristine file.
 */
194 static inline int compare_weights(int flags
, const WCHAR
*str1
, int len1
,
195 const WCHAR
*str2
, int len2
, enum weight type
)
197 int dpos1
= 0, dpos2
= 0, dlen1
= 0, dlen2
= 0;
198 WCHAR dstr1
[4], dstr2
[4];
199 unsigned int ce1
, ce2
;
201 /* 32-bit collation element table format:
202 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
203 * case weight - high 4 bit of low 8 bit.
205 while (len1
> 0 && len2
> 0)
207 if (!dlen1
) dlen1
= wine_decompose(0, *str1
, dstr1
, 4);
208 if (!dlen2
) dlen2
= wine_decompose(0, *str2
, dstr2
, 4);
210 if (flags
& NORM_IGNORESYMBOLS
)
213 /* FIXME: not tested */
214 if (get_char_typeW(dstr1
[dpos1
]) & (C1_PUNCT
| C1_SPACE
))
216 inc_str_pos(&str1
, &len1
, &dpos1
, &dlen1
);
219 if (get_char_typeW(dstr2
[dpos2
]) & (C1_PUNCT
| C1_SPACE
))
221 inc_str_pos(&str2
, &len2
, &dpos2
, &dlen2
);
227 /* hyphen and apostrophe are treated differently depending on
228 * whether SORT_STRINGSORT specified or not
230 if (type
== UNICODE_WEIGHT
&& !(flags
& SORT_STRINGSORT
))
232 if (dstr1
[dpos1
] == '-' || dstr1
[dpos1
] == '\'')
234 if (dstr2
[dpos2
] != '-' && dstr2
[dpos2
] != '\'')
236 inc_str_pos(&str1
, &len1
, &dpos1
, &dlen1
);
240 else if (dstr2
[dpos2
] == '-' || dstr2
[dpos2
] == '\'')
242 inc_str_pos(&str2
, &len2
, &dpos2
, &dlen2
);
247 ce1
= get_weight(dstr1
[dpos1
], type
);
250 inc_str_pos(&str1
, &len1
, &dpos1
, &dlen1
);
253 ce2
= get_weight(dstr2
[dpos2
], type
);
256 inc_str_pos(&str2
, &len2
, &dpos2
, &dlen2
);
260 if (ce1
- ce2
) return ce1
- ce2
;
262 inc_str_pos(&str1
, &len1
, &dpos1
, &dlen1
);
263 inc_str_pos(&str2
, &len2
, &dpos2
, &dlen2
);
267 if (!dlen1
) dlen1
= wine_decompose(0, *str1
, dstr1
, 4);
269 ce1
= get_weight(dstr1
[dpos1
], type
);
271 inc_str_pos(&str1
, &len1
, &dpos1
, &dlen1
);
275 if (!dlen2
) dlen2
= wine_decompose(0, *str2
, dstr2
, 4);
277 ce2
= get_weight(dstr2
[dpos2
], type
);
279 inc_str_pos(&str2
, &len2
, &dpos2
, &dlen2
);
/*
 * wine_compare_string - compare two WCHAR strings level by level.
 *
 * The strings are first compared on UNICODE_WEIGHT.  A DIACRITIC_WEIGHT
 * comparison follows unless NORM_IGNORENONSPACE is set, and a CASE_WEIGHT
 * comparison follows when ret is still 0 and NORM_IGNORECASE is not set.
 * All three passes forward flags, str1, len1, str2 and len2 unchanged to
 * compare_weights.
 *
 * NOTE(review): the declaration of ret, any guard between the first and
 * second comparison, and the final return are not visible in this
 * listing - confirm against the pristine file.
 */
284 int wine_compare_string(int flags
, const WCHAR
*str1
, int len1
,
285 const WCHAR
*str2
, int len2
)
289 ret
= compare_weights(flags
, str1
, len1
, str2
, len2
, UNICODE_WEIGHT
);
292 if (!(flags
& NORM_IGNORENONSPACE
))
293 ret
= compare_weights(flags
, str1
, len1
, str2
, len2
, DIACRITIC_WEIGHT
);
294 if (!ret
&& !(flags
& NORM_IGNORECASE
))
295 ret
= compare_weights(flags
, str1
, len1
, str2
, len2
, CASE_WEIGHT
);