2 * Unicode sort key generation
4 * Copyright 2003 Dmitry Timoshkov
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
/* Lookup tables defined outside this file.
 *
 * collation_table is read in this file through a three-step index (high
 * byte of the character, then bits 4-7, then bits 0-3) and yields an
 * unsigned int collation element; the value (unsigned int)-1 is tested
 * for separately by the readers.
 * nfd_table is read by get_decomposition() with the same kind of index and
 * also holds the data that function returns a pointer into.
 */
27 extern const unsigned int collation_table
[];
28 extern const unsigned short nfd_table
[] DECLSPEC_HIDDEN
;
/* Look up the decomposition of ch in nfd_table.
 *
 * ch  - character to look up
 * len - receives end - start, the number of table entries in the range
 *       found for ch
 *
 * Returns nfd_table + start when that range is non-empty.
 * NOTE(review): the statements reached for an empty range are not visible
 * in this listing; callers in this file handle a NULL result by falling
 * back to the source string - confirm against the complete file.
 */
30 static const WCHAR
*get_decomposition( WCHAR ch
, unsigned int *len
)
/* three-step index: high byte of ch, then bits 4-7, then bits 0-3 */
32 unsigned short offset
= nfd_table
[nfd_table
[ch
>> 8] + ((ch
>> 4) & 0xf)] + (ch
& 0xf);
/* two consecutive entries give the start and end of the range for ch */
33 unsigned short start
= nfd_table
[offset
];
34 unsigned short end
= nfd_table
[offset
+ 1];
/* the length is stored through len and doubles as the non-empty test */
36 if ((*len
= end
- start
)) return nfd_table
+ start
;
42 * flags - normalization NORM_* flags
44 * FIXME: 'variable' flag not handled
/* Build a sort key for src in dst.
 *
 * flags  - NORM_IGNORESYMBOLS and NORM_IGNORECASE are the flags tested here
 * src    - source characters
 * srclen - number of characters in src
 * dst    - output buffer for the key bytes
 * dstlen - size of dst; 0 asks for the required length only
 *
 * Works in two passes over src.  The first pass adds up the byte counts of
 * four key sections in key_len[]; the second pass writes the sections
 * through the four cursors in key_ptr[].
 *
 * Returns the summed section lengths + 4 when dstlen is 0, 0 when dstlen is
 * smaller than that sum + 1, and key_ptr[3] - dst after writing.
 *
 * NOTE(review): this listing has lost lines (braces, the declarations of
 * key_len/key_ptr/wch/ce/key, and some statements), so the nesting of the
 * tests below cannot be read off here - confirm against the complete file.
 */
46 int wine_get_sortkey_obsolete(int flags
, const WCHAR
*src
, int srclen
, char *dst
, int dstlen
)
48 WCHAR dummy
[4]; /* no decomposition is larger than 4 chars */
/* saved so the second pass can start over from the beginning */
51 const WCHAR
*src_save
= src
;
52 int srclen_save
= srclen
;
/* pass 1: measure the four key sections */
54 key_len
[0] = key_len
[1] = key_len
[2] = key_len
[3] = 0;
55 for (; srclen
; srclen
--, src
++)
/* decomposition is switched off: the count is fixed at 1 and the
 * wine_decompose() call is commented out */
57 unsigned int i
, decomposed_len
= 1;/*wine_decompose(*src, dummy, 4);*/
61 for (i
= 0; i
< decomposed_len
; i
++)
66 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
67 * and skips white space and punctuation characters for
 * NORM_IGNORESYMBOLS.
70 if ((flags
& NORM_IGNORESYMBOLS
) && (get_char_typeW(wch
) & (C1_PUNCT
| C1_SPACE
)))
73 if (flags
& NORM_IGNORECASE
) wch
= tolowerW(wch
);
75 ce
= collation_table
[collation_table
[collation_table
[wch
>> 8] + ((wch
>> 4) & 0x0f)] + (wch
& 0xf)];
76 if (ce
!= (unsigned int)-1)
78 if (ce
>> 16) key_len
[0] += 2;
79 if ((ce
>> 8) & 0xff) key_len
[1]++;
80 if ((ce
>> 4) & 0x0f) key_len
[2]++;
83 if (wch
>> 8) key_len
[3]++;
90 if (wch
>> 8) key_len
[0]++;
91 if (wch
& 0xff) key_len
[0]++;
97 if (!dstlen
) /* compute length */
98 /* 4 * '\1' + key length */
99 return key_len
[0] + key_len
[1] + key_len
[2] + key_len
[3] + 4;
/* one byte more than the reported length is required; presumably room
 * for a terminator - TODO confirm */
101 if (dstlen
< key_len
[0] + key_len
[1] + key_len
[2] + key_len
[3] + 4 + 1)
102 return 0; /* overflow */
/* pass 2: restart from the saved length and emit the key bytes */
105 srclen
= srclen_save
;
/* each section starts one byte past the end of the previous one, leaving
 * room for the separator byte counted in the length above */
108 key_ptr
[1] = key_ptr
[0] + key_len
[0] + 1;
109 key_ptr
[2] = key_ptr
[1] + key_len
[1] + 1;
110 key_ptr
[3] = key_ptr
[2] + key_len
[2] + 1;
112 for (; srclen
; srclen
--, src
++)
114 unsigned int i
, decomposed_len
= 1;/*wine_decompose(*src, dummy, 4);*/
118 for (i
= 0; i
< decomposed_len
; i
++)
120 WCHAR wch
= dummy
[i
];
123 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
124 * and skips white space and punctuation characters for
125 * NORM_IGNORESYMBOLS.
127 if ((flags
& NORM_IGNORESYMBOLS
) && (get_char_typeW(wch
) & (C1_PUNCT
| C1_SPACE
)))
130 if (flags
& NORM_IGNORECASE
) wch
= tolowerW(wch
);
132 ce
= collation_table
[collation_table
[collation_table
[wch
>> 8] + ((wch
>> 4) & 0x0f)] + (wch
& 0xf)];
133 if (ce
!= (unsigned int)-1)
136 if ((key
= ce
>> 16))
138 *key_ptr
[0]++ = key
>> 8;
139 *key_ptr
[0]++ = key
& 0xff;
141 /* make key 1 start from 2 */
142 if ((key
= (ce
>> 8) & 0xff)) *key_ptr
[1]++ = key
+ 1;
143 /* make key 2 start from 2 */
144 if ((key
= (ce
>> 4) & 0x0f)) *key_ptr
[2]++ = key
+ 1;
145 /* key 3 is always a character code */
148 if (wch
>> 8) *key_ptr
[3]++ = wch
>> 8;
149 if (wch
& 0xff) *key_ptr
[3]++ = wch
& 0xff;
/* NOTE(review): presumably the branch for characters without a collation
 * element (ce equal to (unsigned int)-1): 0xff 0xfe followed by the
 * non-zero bytes of the character go into section 0 - the else itself is
 * not visible here */
154 *key_ptr
[0]++ = 0xff;
155 *key_ptr
[0]++ = 0xfe;
156 if (wch
>> 8) *key_ptr
[0]++ = wch
>> 8;
157 if (wch
& 0xff) *key_ptr
[0]++ = wch
& 0xff;
/* separator byte (value 1) after the last section */
166 *key_ptr
[3]++ = '\1';
/* number of bytes between dst and the final write cursor */
169 return key_ptr
[3] - dst
;
/* Return one weight of ch, selected by type, taken from its collation
 * element in collation_table.
 *
 * ch   - character to look up
 * type - which weight to extract
 *
 * DIACRITIC_WEIGHT yields bits 8-15 of the element; the last return yields
 * bits 4-7.
 * NOTE(review): the statement taken when the element is (unsigned int)-1
 * and the remaining case labels are not visible in this listing - confirm
 * against the complete file.
 */
179 static unsigned int get_weight(WCHAR ch
, enum weight type
)
/* same three-step index as the sort key code: high byte, bits 4-7, bits 0-3 */
183 ret
= collation_table
[collation_table
[collation_table
[ch
>> 8] + ((ch
>> 4) & 0x0f)] + (ch
& 0xf)];
184 if (ret
== (unsigned int)-1)
191 case DIACRITIC_WEIGHT
:
192 return (ret
>> 8) & 0xff;
195 return (ret
>> 4) & 0x0f;
/* NOTE(review): only the signature survives in this listing.  compare_weights()
 * calls this with the addresses of its string pointer, remaining length,
 * decomposition position and decomposition length, so it presumably steps
 * that reading state forward by one position - confirm against the body in
 * the complete file.
 */
199 static void inc_str_pos(const WCHAR
**str
, int *len
, unsigned int *dpos
, unsigned int *dlen
)
/* Compare two strings using only the weight selected by type.
 *
 * flags      - NORM_IGNORESYMBOLS and SORT_STRINGSORT are the flags tested here
 * str1, len1 - first string and the number of characters left in it
 * str2, len2 - second string and the number of characters left in it
 * type       - which weight get_weight() extracts for each position
 *
 * While both strings have input left, each side is read through
 * dstrN[dposN]: dstrN is the pointer returned by get_decomposition() for the
 * current character, or the source string itself when that call returns
 * NULL.  With NORM_IGNORESYMBOLS, positions classified C1_PUNCT or C1_SPACE
 * are stepped over.  For UNICODE_WEIGHT without SORT_STRINGSORT, hyphen and
 * apostrophe get the special handling shown below.  The first non-zero
 * ce1 - ce2 is returned.
 *
 * NOTE(review): lines are missing from this listing (braces, the conditions
 * around several inc_str_pos() calls, the loops that appear to drain the
 * longer string at the end, and the final return) - confirm the control
 * flow against the complete file.
 */
210 static inline int compare_weights(int flags
, const WCHAR
*str1
, int len1
,
211 const WCHAR
*str2
, int len2
, enum weight type
)
213 unsigned int ce1
, ce2
, dpos1
= 0, dpos2
= 0, dlen1
= 0, dlen2
= 0;
214 const WCHAR
*dstr1
= NULL
, *dstr2
= NULL
;
216 /* 32-bit collation element table format:
217 * unicode weight - high 16 bits, diacritic weight - high 8 bits of low 16 bits,
218 * case weight - high 4 bits of low 8 bits.
220 while (len1
> 0 && len2
> 0)
222 if (!dlen1
&& !(dstr1
= get_decomposition( *str1
, &dlen1
))) dstr1
= str1
;
223 if (!dlen2
&& !(dstr2
= get_decomposition( *str2
, &dlen2
))) dstr2
= str2
;
225 if (flags
& NORM_IGNORESYMBOLS
)
228 /* FIXME: not tested */
229 if (get_char_typeW(dstr1
[dpos1
]) & (C1_PUNCT
| C1_SPACE
))
231 inc_str_pos(&str1
, &len1
, &dpos1
, &dlen1
);
234 if (get_char_typeW(dstr2
[dpos2
]) & (C1_PUNCT
| C1_SPACE
))
236 inc_str_pos(&str2
, &len2
, &dpos2
, &dlen2
);
242 /* hyphen and apostrophe are treated differently depending on
243 * whether SORT_STRINGSORT specified or not
245 if (type
== UNICODE_WEIGHT
&& !(flags
& SORT_STRINGSORT
))
247 if (dstr1
[dpos1
] == '-' || dstr1
[dpos1
] == '\'')
249 if (dstr2
[dpos2
] != '-' && dstr2
[dpos2
] != '\'')
251 inc_str_pos(&str1
, &len1
, &dpos1
, &dlen1
);
255 else if (dstr2
[dpos2
] == '-' || dstr2
[dpos2
] == '\'')
257 inc_str_pos(&str2
, &len2
, &dpos2
, &dlen2
);
262 ce1
= get_weight(dstr1
[dpos1
], type
);
265 inc_str_pos(&str1
, &len1
, &dpos1
, &dlen1
);
268 ce2
= get_weight(dstr2
[dpos2
], type
);
271 inc_str_pos(&str2
, &len2
, &dpos2
, &dlen2
);
275 if (ce1
- ce2
) return ce1
- ce2
;
277 inc_str_pos(&str1
, &len1
, &dpos1
, &dlen1
);
278 inc_str_pos(&str2
, &len2
, &dpos2
, &dlen2
);
282 if (!dlen1
&& !(dstr1
= get_decomposition( *str1
, &dlen1
))) dstr1
= str1
;
283 ce1
= get_weight(dstr1
[dpos1
], type
);
285 inc_str_pos(&str1
, &len1
, &dpos1
, &dlen1
);
289 if (!dlen2
&& !(dstr2
= get_decomposition( *str2
, &dlen2
))) dstr2
= str2
;
290 ce2
= get_weight(dstr2
[dpos2
], type
);
292 inc_str_pos(&str2
, &len2
, &dpos2
, &dlen2
);
/* Compare two strings by running compare_weights() for up to three weight
 * types in turn.
 *
 * flags      - passed through to compare_weights(); NORM_IGNORENONSPACE
 *              disables the diacritic comparison and NORM_IGNORECASE the
 *              case comparison
 * str1, len1 - first string and its length
 * str2, len2 - second string and its length
 *
 * NOTE(review): the declaration of ret, the test between the first and
 * second comparison and the return statement are not visible in this
 * listing - confirm against the complete file.
 */
297 int wine_compare_string_obsolete(int flags
, const WCHAR
*str1
, int len1
,
298 const WCHAR
*str2
, int len2
)
/* first comparison: unicode weights */
302 ret
= compare_weights(flags
, str1
, len1
, str2
, len2
, UNICODE_WEIGHT
);
/* diacritic weights, unless the caller asked to ignore them */
305 if (!(flags
& NORM_IGNORENONSPACE
))
306 ret
= compare_weights(flags
, str1
, len1
, str2
, len2
, DIACRITIC_WEIGHT
);
/* case weights only when still equal and case is not ignored */
307 if (!ret
&& !(flags
& NORM_IGNORECASE
))
308 ret
= compare_weights(flags
, str1
, len1
, str2
, len2
, CASE_WEIGHT
);
/* NOTE(review): __ASM_OBSOLETE is defined outside this file.  Presumably
 * these two invocations publish the _obsolete functions above under their
 * unsuffixed names - confirm against the macro definition.  The #endif
 * closes a conditional whose opening line is not visible in this listing.
 */
313 __ASM_OBSOLETE(wine_get_sortkey
);
314 __ASM_OBSOLETE(wine_compare_string
);
316 #endif /* __ASM_OBSOLETE */