1 /** @file api_unicode.cc
2 * @brief Test the Unicode and UTF-8 classes and functions.
4 /* Copyright (C) 2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "api_unicode.h"
28 #include "testutils.h"
// Table of (a, b) string pairs consumed by the utf8iterator1 test:
// `a` is a byte sequence which need not be valid UTF-8 and `b` is the
// corresponding valid UTF-8.  In every entry visible here, ASCII bytes are
// the same in both strings, and each byte >= 0x80 in `a` appears in `b` as
// the two-byte UTF-8 encoding of the code point with that value (e.g. "\xa0"
// pairs with "\xc2\xa0", which is U+00A0).
// NOTE(review): the element type and the end of the table aren't visible
// here; utf8iterator1 stops at the first entry whose `a` is null, so the
// table presumably ends with a null sentinel - confirm.
38 static const testcase testcases
[] = {
39 { "abcd", "abcd" }, // Sanity check!
40 { "a\x80""bcd", "a\xc2\x80""bcd" },
41 { "a\xa0", "a\xc2\xa0" },
42 { "a\xa0z", "a\xc2\xa0z" },
43 { "x\xc1yz", "x\xc3\x81yz" },
44 { "\xc2z", "\xc3\x82z" },
45 { "\xc2", "\xc3\x82" },
46 { "xy\xc3z", "xy\xc3\x83z" },
47 { "xy\xc3\xc3z", "xy\xc3\x83\xc3\x83z" },
48 { "xy\xc3\xc3", "xy\xc3\x83\xc3\x83" },
49 { "\xe0", "\xc3\xa0" },
50 { "\xe0\x80", "\xc3\xa0\xc2\x80" },
51 { "\xe0\xc0", "\xc3\xa0\xc3\x80" },
52 { "\xe0\xc0z", "\xc3\xa0\xc3\x80z" },
53 { "\xe0\xc0zz", "\xc3\xa0\xc3\x80zz" },
54 { "\xe0\xc0\x81", "\xc3\xa0\xc3\x80\xc2\x81" },
55 { "\xe0\x82\xc1", "\xc3\xa0\xc2\x82\xc3\x81" },
56 { "\xe0\xc5\xc7", "\xc3\xa0\xc3\x85\xc3\x87" },
57 { "\xf0", "\xc3\xb0" },
58 { "\xf0\x80", "\xc3\xb0\xc2\x80" },
59 { "\xf0\xc0", "\xc3\xb0\xc3\x80" },
60 { "\xf0\xc0z", "\xc3\xb0\xc3\x80z" },
61 { "\xf0\xc0zz", "\xc3\xb0\xc3\x80zz" },
62 { "\xf0\xc0\x81", "\xc3\xb0\xc3\x80\xc2\x81" },
63 { "\xf0\x82\xc1", "\xc3\xb0\xc2\x82\xc3\x81" },
64 { "\xf0\xc5\xc7", "\xc3\xb0\xc3\x85\xc3\x87" },
65 { "\xf0\xc0\x81\xc9", "\xc3\xb0\xc3\x80\xc2\x81\xc3\x89" },
66 { "\xf0\x82\xc1\xc8", "\xc3\xb0\xc2\x82\xc3\x81\xc3\x88" },
67 { "\xf0\xc5\xc7\xc6", "\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
68 { "\xf0\xc0\x81\x89", "\xc3\xb0\xc3\x80\xc2\x81\xc2\x89" },
69 { "\xf0\x82\xc1\x88", "\xc3\xb0\xc2\x82\xc3\x81\xc2\x88" },
// NOTE(review): the next entry is byte-for-byte identical to the entry three
// above it, so it adds no coverage - possibly a copy-and-paste slip; confirm
// what variant was intended before changing it.
70 { "\xf0\xc5\xc7\xc6", "\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
71 { "\xf4P\x80\x80", "\xc3\xb4P\xc2\x80\xc2\x80" },
72 { "\xf4\x80P\x80", "\xc3\xb4\xc2\x80P\xc2\x80" },
73 { "\xf4\x80\x80P", "\xc3\xb4\xc2\x80\xc2\x80P" },
74 { "\xfe\xffxyzzy", "\xc3\xbe\xc3\xbfxyzzy" },
75 // Overlong encodings:
76 { "\xc0\x80", "\xc3\x80\xc2\x80" },
77 { "\xc0\xbf", "\xc3\x80\xc2\xbf" },
78 { "\xc1\x80", "\xc3\x81\xc2\x80" },
79 { "\xc1\xbf", "\xc3\x81\xc2\xbf" },
80 { "\xe0\x80\x80", "\xc3\xa0\xc2\x80\xc2\x80" },
81 { "\xe0\x9f\xbf", "\xc3\xa0\xc2\x9f\xc2\xbf" },
82 { "\xf0\x80\x80\x80", "\xc3\xb0\xc2\x80\xc2\x80\xc2\x80" },
83 { "\xf0\x8f\xbf\xbf", "\xc3\xb0\xc2\x8f\xc2\xbf\xc2\xbf" },
// "\xf4\x90\x80\x80" has the shape of a four-byte sequence for 0x110000,
// which is one past the last Unicode code point (U+10FFFF).
85 { "\xf4\x90\x80\x80", "\xc3\xb4\xc2\x90\xc2\x80\xc2\x80" },
89 // Test handling of invalid UTF-8 is as desired.
//
// For each entry in testcases (stopping at the first entry whose `a` member
// is null), both strings are written to tout and a Utf8Iterator is created
// over each one using an explicit strlen()-derived length.  The two
// iterators are then walked for as long as neither has reached the end, and
// afterwards both are required to be at the end.
// NOTE(review): the declaration of p and the body of the while loop aren't
// visible here - presumably the loop compares *a with *b and advances both
// iterators; confirm.
90 DEFINE_TESTCASE(utf8iterator1
, !backend
) {
92 for (p
= testcases
; p
->a
; ++p
) {
// Log the pair being checked.
94 tout
<< '"' << p
->a
<< "\" and \"" << p
->b
<< '"' << endl
;
95 size_t a_len
= strlen(p
->a
);
96 Xapian::Utf8Iterator
a(p
->a
, a_len
);
98 size_t b_len
= strlen(p
->b
);
99 Xapian::Utf8Iterator
b(p
->b
, b_len
);
// A default-constructed Utf8Iterator is used as the end marker.
101 while (a
!= Xapian::Utf8Iterator() && b
!= Xapian::Utf8Iterator()) {
107 // Test that we don't reach the end of one before the other.
108 TEST(a
== Xapian::Utf8Iterator());
109 TEST(b
== Xapian::Utf8Iterator());
// Table of (a, n) pairs consumed by the utf8iterator2 test: `a` is a UTF-8
// string and `n` is the code point it is expected to decode to.
// NOTE(review): the element type, the first entries and the end of the table
// aren't visible here; utf8iterator2 stops at the first entry whose `a` is
// null, so the table presumably ends with a null sentinel - confirm.
119 static const testcase2 testcases2
[] = {
125 { "\xe0\xa0\x80", 0x0800 },
126 { "\xe1\x80\x80", 0x1000 },
// 166415 is 0x28A0F.
127 { "\xf0\xa8\xa8\x8f", 166415 },
128 { "\xf3\x80\x80\x80", 0x0c0000 },
129 { "\xf4\x80\x80\x80", 0x100000 },
133 // Test decoding of UTF-8.
//
// For each entry in testcases2 (stopping at the first entry whose `a` member
// is null), checks that a Utf8Iterator over the string is not at the end,
// that dereferencing it gives the expected code point `n`, and that a single
// increment reaches the end - i.e. the string decodes to exactly one
// character.
// NOTE(review): the declaration of p isn't visible here.
134 DEFINE_TESTCASE(utf8iterator2
, !backend
) {
136 for (p
= testcases2
; p
->a
; ++p
) {
137 Xapian::Utf8Iterator
a(p
->a
);
139 TEST(a
!= Xapian::Utf8Iterator());
140 TEST_EQUAL(*a
, p
->n
);
// One increment must be enough to reach the end.
141 TEST(++a
== Xapian::Utf8Iterator());
146 // Test Unicode categorisation.
//
// Each check asserts the category which Unicode::get_category() reports for
// a single code point.  Most checks are annotated with the Unicode version
// which added the character or changed its category.  The final checks use
// values beyond U+10FFFF, which are expected to be reported as UNASSIGNED.
147 DEFINE_TESTCASE(unicode1
, !backend
) {
148 using namespace Xapian
;
149 TEST_EQUAL(Unicode::get_category('a'), Unicode::LOWERCASE_LETTER
);
150 TEST_EQUAL(Unicode::get_category('0'), Unicode::DECIMAL_DIGIT_NUMBER
);
151 TEST_EQUAL(Unicode::get_category('$'), Unicode::CURRENCY_SYMBOL
);
152 TEST_EQUAL(Unicode::get_category(0xa3), Unicode::CURRENCY_SYMBOL
);
153 // U+0242 was added in Unicode 5.0.0.
154 TEST_EQUAL(Unicode::get_category(0x242), Unicode::LOWERCASE_LETTER
);
155 // U+0526 was added in Unicode 6.0.0.
156 TEST_EQUAL(Unicode::get_category(0x0526), Unicode::UPPERCASE_LETTER
);
157 // U+0527 was added in Unicode 6.0.0.
158 TEST_EQUAL(Unicode::get_category(0x0527), Unicode::LOWERCASE_LETTER
);
159 // U+0620 was added in Unicode 6.0.0.
160 TEST_EQUAL(Unicode::get_category(0x0620), Unicode::OTHER_LETTER
);
161 // U+065F was added in Unicode 6.0.0.
162 TEST_EQUAL(Unicode::get_category(0x065F), Unicode::NON_SPACING_MARK
);
163 // U+06DE changed category in Unicode 6.0.0.
164 TEST_EQUAL(Unicode::get_category(0x06DE), Unicode::OTHER_SYMBOL
);
165 // U+0840 was added in Unicode 6.0.0.
166 TEST_EQUAL(Unicode::get_category(0x0840), Unicode::OTHER_LETTER
);
167 // U+093A was added in Unicode 6.0.0.
168 TEST_EQUAL(Unicode::get_category(0x093A), Unicode::NON_SPACING_MARK
);
169 // U+093B was added in Unicode 6.0.0.
170 TEST_EQUAL(Unicode::get_category(0x093B), Unicode::COMBINING_SPACING_MARK
);
171 // U+0CF1 changed category in Unicode 6.0.0.
172 TEST_EQUAL(Unicode::get_category(0x0CF1), Unicode::OTHER_LETTER
);
173 // U+0CF2 changed category in Unicode 6.0.0.
174 TEST_EQUAL(Unicode::get_category(0x0CF2), Unicode::OTHER_LETTER
);
175 // U+11A7 was added in Unicode 5.2.0.
176 TEST_EQUAL(Unicode::get_category(0x11A7), Unicode::OTHER_LETTER
);
177 // U+9FCB was added in Unicode 5.2.0.
178 TEST_EQUAL(Unicode::get_category(0x9FCB), Unicode::OTHER_LETTER
);
179 // U+FA6C was added in Unicode 5.2.0.
180 TEST_EQUAL(Unicode::get_category(0xFA6C), Unicode::OTHER_LETTER
);
181 TEST_EQUAL(Unicode::get_category(0xFFFF), Unicode::UNASSIGNED
);
182 // Test characters outside BMP.
183 TEST_EQUAL(Unicode::get_category(0x10345), Unicode::OTHER_LETTER
);
184 TEST_EQUAL(Unicode::get_category(0x10FFFD), Unicode::PRIVATE_USE
);
185 TEST_EQUAL(Unicode::get_category(0x10FFFF), Unicode::UNASSIGNED
);
186 // U+1109A was added in Unicode 5.2.0.
187 TEST_EQUAL(Unicode::get_category(0x1109a), Unicode::OTHER_LETTER
);
188 // U+1F773 was added in Unicode 6.0.0.
189 TEST_EQUAL(Unicode::get_category(0x1F773), Unicode::OTHER_SYMBOL
);
190 // U+2B740 was added in Unicode 6.0.0.
191 TEST_EQUAL(Unicode::get_category(0x2B740), Unicode::OTHER_LETTER
);
192 // U+2B81D was added in Unicode 6.0.0.
193 TEST_EQUAL(Unicode::get_category(0x2B81D), Unicode::OTHER_LETTER
);
194 // U+00A7 changed category in Unicode 6.1.0 (was OTHER_SYMBOL).
195 TEST_EQUAL(Unicode::get_category(0xA7), Unicode::OTHER_PUNCTUATION
);
196 // U+00AA changed category in Unicode 6.1.0 (was LOWERCASE_LETTER).
197 TEST_EQUAL(Unicode::get_category(0xAA), Unicode::OTHER_LETTER
);
198 // U+00B6 changed category in Unicode 6.1.0 (was OTHER_SYMBOL).
199 TEST_EQUAL(Unicode::get_category(0xB6), Unicode::OTHER_PUNCTUATION
);
200 // U+00BA changed category in Unicode 6.1.0 (was LOWERCASE_LETTER).
201 TEST_EQUAL(Unicode::get_category(0xBA), Unicode::OTHER_LETTER
);
202 // U+058F was added in Unicode 6.1.0.
203 TEST_EQUAL(Unicode::get_category(0x058F), Unicode::CURRENCY_SYMBOL
);
204 // U+0604 was added in Unicode 6.1.0.
205 TEST_EQUAL(Unicode::get_category(0x0604), Unicode::FORMAT
);
206 // U+08A0 was added in Unicode 6.1.0.
207 TEST_EQUAL(Unicode::get_category(0x08A0), Unicode::OTHER_LETTER
);
208 // U+08E4 was added in Unicode 6.1.0.
209 TEST_EQUAL(Unicode::get_category(0x08E4), Unicode::NON_SPACING_MARK
);
210 // U+0AF0 was added in Unicode 6.1.0.
211 TEST_EQUAL(Unicode::get_category(0x0AF0), Unicode::OTHER_PUNCTUATION
);
212 // U+9FCC was added in Unicode 6.1.0.
213 TEST_EQUAL(Unicode::get_category(0x9FCC), Unicode::OTHER_LETTER
);
214 // U+A7F9 was added in Unicode 6.1.0.
215 TEST_EQUAL(Unicode::get_category(0xA7F9), Unicode::MODIFIER_LETTER
);
216 // U+110F0 was added in Unicode 6.1.0.
217 TEST_EQUAL(Unicode::get_category(0x110F0), Unicode::DECIMAL_DIGIT_NUMBER
);
218 // U+11100 was added in Unicode 6.1.0.
219 TEST_EQUAL(Unicode::get_category(0x11100), Unicode::NON_SPACING_MARK
);
220 // U+1EEF0 was added in Unicode 6.1.0.
221 TEST_EQUAL(Unicode::get_category(0x1EEF0), Unicode::MATH_SYMBOL
);
222 // U+1F634 was added in Unicode 6.1.0.
223 TEST_EQUAL(Unicode::get_category(0x1F634), Unicode::OTHER_SYMBOL
);
224 // U+20BA was added in Unicode 6.2.0.
225 TEST_EQUAL(Unicode::get_category(0x20BA), Unicode::CURRENCY_SYMBOL
);
226 // U+061C was added in Unicode 6.3.0.
227 TEST_EQUAL(Unicode::get_category(0x61C), Unicode::FORMAT
);
228 // U+037F "GREEK CAPITAL LETTER YOT" was added in Unicode 7.0.0.
229 TEST_EQUAL(Unicode::get_category(0x37F), Unicode::UPPERCASE_LETTER
);
231 // Added or changed in Unicode 8.0.0:
232 // U+08B3 "ARABIC LETTER AIN WITH THREE DOTS BELOW".
233 TEST_EQUAL(Unicode::get_category(0x8B3), Unicode::OTHER_LETTER
);
234 // U+0AF9 "GUJARATI LETTER ZHA".
235 TEST_EQUAL(Unicode::get_category(0xAF9), Unicode::OTHER_LETTER
);
236 // U+0C5A "TELUGU LETTER RRRA".
237 TEST_EQUAL(Unicode::get_category(0xC5A), Unicode::OTHER_LETTER
);
238 // U+0D5F "MALAYALAM LETTER ARCHAIC II".
239 TEST_EQUAL(Unicode::get_category(0xD5F), Unicode::OTHER_LETTER
);
240 // U+13F5 "CHEROKEE LETTER MV".
241 TEST_EQUAL(Unicode::get_category(0x13F5), Unicode::UPPERCASE_LETTER
);
242 // U+13F8 "CHEROKEE SMALL LETTER YE".
243 TEST_EQUAL(Unicode::get_category(0x13F8), Unicode::LOWERCASE_LETTER
);
244 // U+19B7 "NEW TAI LUE VOWEL SIGN O" changed to be OTHER_LETTER in 8.0.0.
245 TEST_EQUAL(Unicode::get_category(0x19B7), Unicode::OTHER_LETTER
);
246 // U+20BE "LARI SIGN".
247 TEST_EQUAL(Unicode::get_category(0x20BE), Unicode::CURRENCY_SYMBOL
);
248 // U+218A "TURNED DIGIT TWO".
249 TEST_EQUAL(Unicode::get_category(0x218A), Unicode::OTHER_SYMBOL
);
250 // U+10C9C "OLD HUNGARIAN CAPITAL LETTER OO".
251 TEST_EQUAL(Unicode::get_category(0x10C9C), Unicode::UPPERCASE_LETTER
);
252 // U+12399 "CUNEIFORM SIGN U U".
253 TEST_EQUAL(Unicode::get_category(0x12399), Unicode::OTHER_LETTER
);
254 // U+1D800 "SIGNWRITING HAND-FIST INDEX".
255 TEST_EQUAL(Unicode::get_category(0x1D800), Unicode::OTHER_SYMBOL
);
257 // Added or changed in Unicode 9.0.0:
258 // U+08B6 "ARABIC LETTER BEH WITH SMALL MEEM ABOVE"
259 TEST_EQUAL(Unicode::get_category(0x8B6), Unicode::OTHER_LETTER
);
260 // U+08E2 "ARABIC DISPUTED END OF AYAH"
261 TEST_EQUAL(Unicode::get_category(0x8E2), Unicode::FORMAT
);
262 // U+0C80 "KANNADA SIGN SPACING CANDRABINDU"
263 TEST_EQUAL(Unicode::get_category(0xC80), Unicode::OTHER_LETTER
);
264 // U+0D56 "MALAYALAM LETTER CHILLU LLL"
265 TEST_EQUAL(Unicode::get_category(0xD56), Unicode::OTHER_LETTER
);
266 // U+0D58 "MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH"
267 TEST_EQUAL(Unicode::get_category(0xD58), Unicode::OTHER_NUMBER
);
268 // U+1885 "MONGOLIAN LETTER ALI GALI BALUDA"
269 TEST_EQUAL(Unicode::get_category(0x1885), Unicode::NON_SPACING_MARK
);
270 // U+1886 "MONGOLIAN LETTER ALI GALI THREE BALUDA"
271 TEST_EQUAL(Unicode::get_category(0x1886), Unicode::NON_SPACING_MARK
);
272 // U+104FB "OSAGE SMALL LETTER ZHA"
273 TEST_EQUAL(Unicode::get_category(0x104FB), Unicode::LOWERCASE_LETTER
);
274 // U+1141F "NEWA LETTER TA"
275 TEST_EQUAL(Unicode::get_category(0x1141F), Unicode::OTHER_LETTER
);
// NOTE(review): no character name is visible for U+1F989; the check sits in
// the Unicode 9.0.0 group.
277 TEST_EQUAL(Unicode::get_category(0x1F989), Unicode::OTHER_SYMBOL
);
279 // Added in Unicode 10.0.0:
280 // U+20BF "BITCOIN SIGN"
281 TEST_EQUAL(Unicode::get_category(0x20BF), Unicode::CURRENCY_SYMBOL
);
282 // U+23FF "OBSERVER EYE SYMBOL"
283 TEST_EQUAL(Unicode::get_category(0x23FF), Unicode::OTHER_SYMBOL
);
284 // U+1032D "OLD ITALIC LETTER YE"
285 TEST_EQUAL(Unicode::get_category(0x1032D), Unicode::OTHER_LETTER
);
286 // U+11A34 "ZANABAZAR SQUARE SIGN VIRAMA"
287 TEST_EQUAL(Unicode::get_category(0x11A34), Unicode::NON_SPACING_MARK
);
288 // U+1F6F8 "FLYING SAUCER"
289 TEST_EQUAL(Unicode::get_category(0x1F6F8), Unicode::OTHER_SYMBOL
);
// NOTE(review): no character name is visible for U+1F9E6; the check sits in
// the Unicode 10.0.0 group.
291 TEST_EQUAL(Unicode::get_category(0x1F9E6), Unicode::OTHER_SYMBOL
);
293 // Added in Unicode 11.0.0:
294 // U+0560 "ARMENIAN SMALL LETTER TURNED AYB"
295 TEST_EQUAL(Unicode::get_category(0x0560), Unicode::LOWERCASE_LETTER
);
296 // U+05EF "HEBREW YOD TRIANGLE"
297 TEST_EQUAL(Unicode::get_category(0x05EF), Unicode::OTHER_LETTER
);
298 // U+07FF "NKO TAMAN SIGN"
299 TEST_EQUAL(Unicode::get_category(0x07FF), Unicode::CURRENCY_SYMBOL
);
300 // U+08D3 "ARABIC SMALL LOW WAW"
301 TEST_EQUAL(Unicode::get_category(0x08D3), Unicode::NON_SPACING_MARK
);
302 // U+1878 "MONGOLIAN LETTER CHA WITH TWO DOTS"
303 TEST_EQUAL(Unicode::get_category(0x1878), Unicode::OTHER_LETTER
);
304 // U+1F12F "COPYLEFT SYMBOL"
305 TEST_EQUAL(Unicode::get_category(0x1F12F), Unicode::OTHER_SYMBOL
);
307 // Changed category in Unicode 11.0.0:
308 // U+10D0 "GEORGIAN LETTER AN"
309 TEST_EQUAL(Unicode::get_category(0x10D0), Unicode::LOWERCASE_LETTER
);
311 // Test some invalid Unicode values.
// 0x110000 is one past U+10FFFF and 0xFFFFFFFF is the largest 32-bit value;
// both are expected to be reported as UNASSIGNED.
312 TEST_EQUAL(Unicode::get_category(0x110000), Unicode::UNASSIGNED
);
313 TEST_EQUAL(Unicode::get_category(0xFFFFFFFF), Unicode::UNASSIGNED
);
// Test Unicode::tolower() and Unicode::toupper().
//
// Covers every value below 128 (checked against the unqualified tolower()
// and toupper() - presumably the C library functions; confirm), a selection
// of characters outside that range, and values beyond U+10FFFF, which are
// expected to be returned unchanged.
317 DEFINE_TESTCASE(caseconvert1
, !backend
) {
318 using namespace Xapian
;
319 for (unsigned ch
= 0; ch
< 128; ++ch
) {
320 TEST_EQUAL(Unicode::tolower(ch
), unsigned(tolower(ch
)));
321 TEST_EQUAL(Unicode::toupper(ch
), unsigned(toupper(ch
)));
324 // U+0242 was added in Unicode 5.0.0 as a lowercase form of U+0241.
325 TEST_EQUAL(Unicode::tolower(0x242), 0x242);
326 TEST_EQUAL(Unicode::toupper(0x242), 0x241);
327 TEST_EQUAL(Unicode::toupper(0x241), 0x241);
328 TEST_EQUAL(Unicode::tolower(0x241), 0x242);
330 // Regression test for bug fixed in 1.2.17.
331 TEST_EQUAL(Unicode::tolower(0x1c5), 0x1c6);
332 TEST_EQUAL(Unicode::tolower(0x1c8), 0x1c9);
333 TEST_EQUAL(Unicode::tolower(0x1cb), 0x1cc);
334 TEST_EQUAL(Unicode::tolower(0x1f2), 0x1f3);
336 // Pound currency symbol:
337 TEST_EQUAL(Unicode::tolower(0xa3), 0xa3);
338 TEST_EQUAL(Unicode::toupper(0xa3), 0xa3);
340 TEST_EQUAL(Unicode::tolower(0xFFFF), 0xFFFF);
341 TEST_EQUAL(Unicode::toupper(0xFFFF), 0xFFFF);
342 // Test characters outside BMP.
343 TEST_EQUAL(Unicode::tolower(0x10345), 0x10345);
344 TEST_EQUAL(Unicode::toupper(0x10345), 0x10345);
345 TEST_EQUAL(Unicode::tolower(0x10FFFD), 0x10FFFD);
346 TEST_EQUAL(Unicode::toupper(0x10FFFD), 0x10FFFD);
347 TEST_EQUAL(Unicode::tolower(0x10FFFF), 0x10FFFF);
348 TEST_EQUAL(Unicode::toupper(0x10FFFF), 0x10FFFF);
349 // Test some invalid Unicode values.
350 TEST_EQUAL(Unicode::tolower(0x110000), 0x110000);
351 TEST_EQUAL(Unicode::toupper(0x110000), 0x110000);
352 TEST_EQUAL(Unicode::tolower(0xFFFFFFFF), 0xFFFFFFFF);
353 TEST_EQUAL(Unicode::toupper(0xFFFFFFFF), 0xFFFFFFFF);
358 /// Test Unicode 5.1 and later support.
///
/// Checks case conversion (and a few categories) for characters which were
/// added, or whose case mappings changed, in the Unicode versions named in
/// the comments below.
359 DEFINE_TESTCASE(caseconvert2
, !backend
) {
360 using namespace Xapian
;
362 TEST_EQUAL(Unicode::toupper(0x250), 0x2c6f);
363 TEST_EQUAL(Unicode::toupper(0x251), 0x2c6d);
364 TEST_EQUAL(Unicode::toupper(0x271), 0x2c6e);
366 TEST_EQUAL(Unicode::get_category(0x2ec), Unicode::MODIFIER_LETTER
);
367 TEST_EQUAL(Unicode::get_category(0x374), Unicode::MODIFIER_LETTER
);
368 TEST_EQUAL(Unicode::get_category(0x487), Unicode::NON_SPACING_MARK
);
369 TEST_EQUAL(Unicode::get_category(0x5be), Unicode::DASH_PUNCTUATION
);
370 TEST_EQUAL(Unicode::get_category(0x1f093), Unicode::OTHER_SYMBOL
);
372 // U+0526, U+0527 and U+A78D were added in Unicode 6.0.0:
373 TEST_EQUAL(Unicode::toupper(0x265), 0xa78d);
374 TEST_EQUAL(Unicode::tolower(0xa78d), 0x265);
375 TEST_EQUAL(Unicode::tolower(0x526), 0x527);
376 TEST_EQUAL(Unicode::toupper(0x527), 0x526);
378 // U+A7AA was added in Unicode 6.1.0:
379 TEST_EQUAL(Unicode::toupper(0x266), 0xa7aa);
380 TEST_EQUAL(Unicode::tolower(0xa7aa), 0x266);
// NOTE(review): the next two checks repeat the U+0526/U+0527 checks from the
// Unicode 6.0.0 group just above, so they add no coverage here.
381 TEST_EQUAL(Unicode::tolower(0x526), 0x527);
382 TEST_EQUAL(Unicode::toupper(0x527), 0x526);
384 TEST_EQUAL(Unicode::tolower(0x370), 0x371);
385 TEST_EQUAL(Unicode::toupper(0x371), 0x370);
386 TEST_EQUAL(Unicode::tolower(0x372), 0x373);
387 TEST_EQUAL(Unicode::toupper(0x373), 0x372);
388 TEST_EQUAL(Unicode::tolower(0x376), 0x377);
389 TEST_EQUAL(Unicode::toupper(0x377), 0x376);
390 TEST_EQUAL(Unicode::tolower(0x3cf), 0x3d7);
391 TEST_EQUAL(Unicode::toupper(0x3d7), 0x3cf);
393 // U+20BA was added in Unicode 6.2.0:
394 TEST_EQUAL(Unicode::toupper(0x20ba), 0x20ba);
395 TEST_EQUAL(Unicode::tolower(0x20ba), 0x20ba);
397 // U+061C was added in Unicode 6.3.0:
398 TEST_EQUAL(Unicode::toupper(0x61c), 0x61c);
399 TEST_EQUAL(Unicode::tolower(0x61c), 0x61c);
// U+0514 to U+0523 are checked as upper/lower case pairs: each even code
// point must be an uppercase letter whose lowercase form is the following
// odd code point, and vice versa.
// NOTE(review): the declaration of u isn't visible here.
402 for (u
= 0x514; u
< 0x524; u
+= 2) {
403 TEST_EQUAL(Unicode::get_category(u
), Unicode::UPPERCASE_LETTER
);
404 TEST_EQUAL(Unicode::get_category(u
+ 1), Unicode::LOWERCASE_LETTER
);
405 TEST_EQUAL(Unicode::tolower(u
), u
+ 1);
406 TEST_EQUAL(Unicode::toupper(u
+ 1), u
);
409 // U+A7B1 was added in Unicode 8.0.0 as an uppercase form of U+0287.
410 TEST_EQUAL(Unicode::tolower(0xA7B1), 0x0287);
411 TEST_EQUAL(Unicode::toupper(0xA7B1), 0xA7B1);
412 TEST_EQUAL(Unicode::tolower(0x0287), 0x0287);
413 TEST_EQUAL(Unicode::toupper(0x0287), 0xA7B1);
415 // U+A7B4 (capital) and U+A7B5 (small) were added in Unicode 8.0.0.
416 TEST_EQUAL(Unicode::tolower(0xA7B4), 0xA7B5);
417 TEST_EQUAL(Unicode::toupper(0xA7B4), 0xA7B4);
418 TEST_EQUAL(Unicode::tolower(0xA7B5), 0xA7B5);
419 TEST_EQUAL(Unicode::toupper(0xA7B5), 0xA7B4);
421 // U+A7AE was added in Unicode 9.0.0 as an uppercase form of U+026A.
422 TEST_EQUAL(Unicode::tolower(0xA7AE), 0x026A);
423 TEST_EQUAL(Unicode::toupper(0xA7AE), 0xA7AE);
424 TEST_EQUAL(Unicode::tolower(0x026A), 0x026A);
425 TEST_EQUAL(Unicode::toupper(0x026A), 0xA7AE);
// NOTE(review): the following group is a verbatim repeat of the U+A7AE group
// immediately above - possibly a copy-and-paste slip where a different
// character was meant to be tested; confirm.
427 // U+A7AE was added in Unicode 9.0.0 as an uppercase form of U+026A.
428 TEST_EQUAL(Unicode::tolower(0xA7AE), 0x026A);
429 TEST_EQUAL(Unicode::toupper(0xA7AE), 0xA7AE);
430 TEST_EQUAL(Unicode::tolower(0x026A), 0x026A);
431 TEST_EQUAL(Unicode::toupper(0x026A), 0xA7AE);
433 // U+0560 was added in Unicode 11.0.0 (lowercase, no other forms).
434 TEST_EQUAL(Unicode::tolower(0x0560), 0x0560);
435 TEST_EQUAL(Unicode::toupper(0x0560), 0x0560);
437 // U+10D0 changed to be lowercase in Unicode 11.0.0 and U+1C90 was added.
438 TEST_EQUAL(Unicode::tolower(0x10D0), 0x10D0);
439 TEST_EQUAL(Unicode::toupper(0x10D0), 0x1C90);
440 TEST_EQUAL(Unicode::tolower(0x1C90), 0x10D0);
441 TEST_EQUAL(Unicode::toupper(0x1C90), 0x1C90);
// Test Unicode::append_utf8().
//
// Appends a series of code points to s - from ASCII up to characters outside
// the BMP, plus 0xFFFFFFFF which is not a valid Unicode code point - and
// then compares s with the expected string.
// NOTE(review): the declaration of s and everything after the first piece
// ("a") of the expected string aren't visible here.
446 DEFINE_TESTCASE(utf8convert1
, !backend
) {
448 Xapian::Unicode::append_utf8(s
, 'a');
449 Xapian::Unicode::append_utf8(s
, 128);
450 Xapian::Unicode::append_utf8(s
, 160);
451 Xapian::Unicode::append_utf8(s
, 0xFFFF);
// 166415 is 0x28A0F.
452 Xapian::Unicode::append_utf8(s
, 166415);
453 Xapian::Unicode::append_utf8(s
, 0x10345);
454 Xapian::Unicode::append_utf8(s
, 0x10FFFD);
455 Xapian::Unicode::append_utf8(s
, 0xFFFFFFFF);
456 Xapian::Unicode::append_utf8(s
, 'z');
457 TEST_STRINGS_EQUAL(s
, "a"
// Test the Unicode::is_wordchar(), Unicode::is_currency() and
// Unicode::is_whitespace() predicates.
//
// Four tables of code points are defined - wordchars, currency, whitespace
// and other - and every entry of each table is checked against all three
// predicates: entries of the first three tables must satisfy only the
// predicate matching their table, and entries of `other` must satisfy none.
// NOTE(review): each loop stops at the first zero entry, so each table
// presumably ends with a 0 sentinel; the ends of the tables (and some of
// their entries) aren't visible here - confirm.
471 DEFINE_TESTCASE(unicodepredicates1
, !backend
) {
// Code points for which only is_wordchar() is expected to be true.
472 static const unsigned wordchars
[] = {
473 // DECIMAL_DIGIT_NUMBER
475 0x10D30, // (added in Unicode 11.0.0)
476 0x11D51, // (added in Unicode 10.0.0)
477 0x11DA9, // (added in Unicode 11.0.0)
479 0x1ECB3, // (added in Unicode 11.0.0)
481 'a', 'z', 0x250, 0x251, 0x271, 0x3d7,
482 0x242, // (added in Unicode 5.0.0)
// NOTE(review): the caseconvert2 test asserts that the even code points
// 0x514, 0x516, ... 0x522 are UPPERCASE_LETTER and the odd ones 0x515 ...
// 0x523 are LOWERCASE_LETTER, so those entries look to be listed under the
// wrong one of the two headings below.  This doesn't change what is tested,
// since all of them are only checked as word characters here.
483 // LOWERCASE_LETTER (added in Unicode 5.1.0)
484 0x371, 0x373, 0x377, 0x514, 0x516, 0x518, 0x51a, 0x51c, 0x51e,
486 0x16E78, // (added in Unicode 11.0.0)
489 // UPPERCASE_LETTER (added in Unicode 5.1.0)
490 0x370, 0x372, 0x376, 0x3cf, 0x515, 0x517, 0x519, 0x51b, 0x51d, 0x51f,
491 0x521, 0x523, 0x2c6d, 0x2c6e, 0x2c6f,
492 0x16E45, // (added in Unicode 11.0.0)
494 0x8bb, // Added in Unicode 9.0.0
495 0xc80, // Added in Unicode 9.0.0
496 0x312e, // Added in Unicode 10.0.0
499 0x2ec, // Added in Unicode 5.1.0
500 0x374, // Added in Unicode 5.1.0
501 0x16fe1, // Added in Unicode 10.0.0
502 // NON_SPACING_MARK (added to is_wordchar() in 1.1.0)
504 0x487, // Added in Unicode 5.1.0
505 0x8d3, // Added in Unicode 11.0.0
506 0x8db, // Added in Unicode 9.0.0
507 0x11d47, // Added in Unicode 10.0.0
// Code points for which only is_currency() is expected to be true.
510 static const unsigned currency
[] = {
513 // CURRENCY_SYMBOL (added in Unicode 6.2.0)
515 // CURRENCY_SYMBOL (added in Unicode 8.0.0)
517 // CURRENCY_SYMBOL (added in Unicode 10.0.0)
519 // CURRENCY_SYMBOL (added in Unicode 11.0.0)
// Code points for which only is_whitespace() is expected to be true.
523 static const unsigned whitespace
[] = {
525 '\t', '\n', '\f', '\r',
// Code points for which none of the three predicates is expected to be true.
530 static const unsigned other
[] = {
531 // DASH_PUNCTUATION (added in Unicode 5.1.0)
534 0xd4f, // Added in Unicode 9.0.0
535 0x1f093, // Added in Unicode 5.1.0
536 0x1f263, // Added in Unicode 10.0.0
537 0x1fa62, // Added in Unicode 11.0.0
539 0x61c, // Added in Unicode 6.3.0
540 0x8e2, // Added in Unicode 9.0.0
542 0xffff, 0x10ffff, 0x110000, 0xFFFFFFFF,
// Word characters: only is_wordchar() may be true.
548 for (const unsigned * p
= wordchars
; *p
; ++p
) {
549 TEST(Xapian::Unicode::is_wordchar(*p
));
550 TEST(!Xapian::Unicode::is_currency(*p
));
551 TEST(!Xapian::Unicode::is_whitespace(*p
));
// Currency symbols: only is_currency() may be true.
554 for (const unsigned * p
= currency
; *p
; ++p
) {
555 TEST(!Xapian::Unicode::is_wordchar(*p
));
556 TEST(Xapian::Unicode::is_currency(*p
));
557 TEST(!Xapian::Unicode::is_whitespace(*p
));
// Whitespace: only is_whitespace() may be true.
560 for (const unsigned * p
= whitespace
; *p
; ++p
) {
561 TEST(!Xapian::Unicode::is_wordchar(*p
));
562 TEST(!Xapian::Unicode::is_currency(*p
));
563 TEST(Xapian::Unicode::is_whitespace(*p
));
// Everything else: all three predicates must be false.
566 for (const unsigned * p
= other
; *p
; ++p
) {
567 TEST(!Xapian::Unicode::is_wordchar(*p
));
568 TEST(!Xapian::Unicode::is_currency(*p
));
569 TEST(!Xapian::Unicode::is_whitespace(*p
));