1 /* utf8converttest.cc: test convert_to_utf8()
3 * Copyright (C) 2008,2009,2013 Olly Betts
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
27 #include "utf8convert.h"
// Table of convert_to_utf8() test cases consumed by the test loop.
// As used by that loop, each entry supplies: the source charset name
// (.charset), the input bytes (.dump), an explicit input length (.len) and
// the expected UTF-8 result (.utf8).  The initializers presumably list the
// members in that order - the struct definition is not visible here, so
// confirm against it.
//
// In the entries visible here .len is non-zero exactly when the input
// contains embedded NUL bytes (the UTF-16/UCS-2 encodings of "Test"), and 0
// when the input can be read as an ordinary NUL-terminated string.
38 static const testcase tests
[] = {
// Pure ASCII text is expected to come through unchanged.
39 { "utf8", "Hello world", 0, "Hello world" },
40 { "iso-8859-1", "Hello world", 0, "Hello world" },
41 { "us-ascii", "Hello world", 0, "Hello world" },
// Spelling variants of the ISO-8859-1 charset name: byte 0xa0 is expected
// to become U+00A0, i.e. the UTF-8 sequence c2 a0.
42 { "iso-8859-1", "Hello\xa0world", 0, "Hello\xc2\xa0world" },
43 { "ISO-8859-1", "Hello\xa0world", 0, "Hello\xc2\xa0world" },
44 { "ISO8859-1", "Hello\xa0world", 0, "Hello\xc2\xa0world" },
45 #if !defined USE_ICONV || defined __GNU_LIBRARY__
46 // "8859_1" is not understood by Solaris iconv, for example.
47 { "8859_1", "Hello\xa0world", 0, "Hello\xc2\xa0world" },
// "Test" encoded as UTF-16 / UCS-2 in both byte orders, under several
// spellings of the charset name.  Inputs named plain "UTF16" start with a
// byte order mark (fe ff = big-endian, ff fe = little-endian), which is why
// their length is 10 rather than 8.
49 { "UTF16BE", "\0T\0e\0s\0t", 8, "Test" },
50 { "UTF16", "\xfe\xff\0T\0e\0s\0t", 10, "Test" },
51 { "UTF_16BE", "\0T\0e\0s\0t", 8, "Test" },
52 { "UTF 16BE", "\0T\0e\0s\0t", 8, "Test" },
53 { "UTF16LE", "T\0e\0s\0t\0", 8, "Test" },
54 { "UTF16", "\xff\xfeT\0e\0s\0t\0", 10, "Test" },
55 { "UCS-2BE", "\0T\0e\0s\0t", 8, "Test" },
56 { "UCS2BE", "\0T\0e\0s\0t", 8, "Test" },
57 { "UCS_2BE", "\0T\0e\0s\0t", 8, "Test" },
58 { "UCS 2BE", "\0T\0e\0s\0t", 8, "Test" },
59 { "UCS-2LE", "T\0e\0s\0t\0", 8, "Test" },
60 { "UCS2LE", "T\0e\0s\0t\0", 8, "Test" },
// Surrogate pair handling: the UTF-16 code units DBFF DFFD encode U+10FFFD,
// whose UTF-8 form is f4 8f bf bd.
61 { "UTF16BE", "\xdb\xff\xdf\xfd", 0, "\xf4\x8f\xbf\xbd" },
62 { "UTF16", "\xfe\xff\xdb\xff\xdf\xfd", 0, "\xf4\x8f\xbf\xbd" },
63 { "UTF-16", "\xfe\xff\xdb\xff\xdf\xfd", 0, "\xf4\x8f\xbf\xbd" },
64 { "UTF16LE", "\xff\xdb\xfd\xdf", 0, "\xf4\x8f\xbf\xbd" },
65 { "UTF16", "\xff\xfe\xff\xdb\xfd\xdf", 0, "\xf4\x8f\xbf\xbd" },
// Unsuffixed UCS-2 names with a leading byte order mark in each byte order.
66 { "UCS-2", "\xfe\xff\0T\0e\0s\0t", 10, "Test" },
67 { "UCS-2", "\xff\xfeT\0e\0s\0t\0", 10, "Test" },
68 { "UCS2", "\xfe\xff\0T\0e\0s\0t", 10, "Test" },
69 { "UCS2", "\xff\xfeT\0e\0s\0t\0", 10, "Test" },
70 // If there's no BOM, we're supposed to assume BE.
71 { "UTF16", "\xdb\xff\xdf\xfd", 0, "\xf4\x8f\xbf\xbd" },
72 // Test "promoting" charset to windows-1252:
// Bytes 0x80-0x9f of input labelled ISO-8859-1 are expected to be decoded
// with their windows-1252 meanings (e.g. 0x80 -> U+20AC, the euro sign,
// UTF-8 e2 82 ac).  The five bytes with no expected windows-1252 mapping in
// these cases (0x81, 0x8d, 0x8f, 0x90, 0x9d) are expected to come out as the
// same-numbered code points U+0081 etc. (UTF-8 c2 81, c2 8d, ...).
73 { "iso-8859-1", "Price: \x80""20", 0, "Price: \xe2\x82\xac""20" },
74 { "ISO-8859-1", "\x80\x81\x82\x83\x84\x85\x86\x87", 0, "\xe2\x82\xac\xc2\x81\xe2\x80\x9a\xc6\x92\xe2\x80\x9e\xe2\x80\xa6\xe2\x80\xa0\xe2\x80\xa1" },
75 { "ISO-8859-1", "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f", 0, "\xcb\x86\xe2\x80\xb0\xc5\xa0\xe2\x80\xb9\xc5\x92\xc2\x8d\xc5\xbd\xc2\x8f" },
76 { "ISO-8859-1", "\x90\x91\x92\x93\x94\x95\x96\x97", 0, "\xc2\x90\xe2\x80\x98\xe2\x80\x99\xe2\x80\x9c\xe2\x80\x9d\xe2\x80\xa2\xe2\x80\x93\xe2\x80\x94" },
77 { "ISO-8859-1", "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", 0, "\xcb\x9c\xe2\x84\xa2\xc5\xa1\xe2\x80\xba\xc5\x93\xc2\x9d\xc5\xbe\xc5\xb8" },
// Boundary check: the bytes on either side of the 0x80-0x9f range (0x7e,
// 0x7f, 0xa0, 0xa1) are expected to convert as ordinary Latin-1.
78 { "ISO-8859-1", "\x7e\x7f\xa0\xa1", 0, "\x7e\x7f\xc2\xa0\xc2\xa1" },
// Run every entry of tests[] in turn.  The loop stops at the first entry
// whose charset pointer is null (the terminating entry is presumably at the
// end of the table - it is not visible in this view).
85 for (size_t i
= 0; tests
[i
].charset
; ++i
) {
// Explicit input length recorded for this test case.
86 size_t len
= tests
[i
].len
;
// Load the input using the explicit byte count, so that embedded NUL bytes
// are kept.  NOTE(review): `dump` is presumably a std::string declared on a
// line not visible here, and the condition choosing between this assign()
// and the next one is likewise not visible - confirm it is a check on len.
89 dump
.assign(tests
[i
].dump
, len
);
// Load the input as an ordinary NUL-terminated string.
91 dump
.assign(tests
[i
].dump
);
// Convert from the test case's charset.  `dump` is what gets compared and
// reported afterwards, so the result is presumably written back into it -
// confirm against utf8convert.h.
93 convert_to_utf8(dump
, tests
[i
].charset
);
// On a mismatch with the expected UTF-8, report the charset, the expected
// text and the text actually produced on standard output.
94 if (tests
[i
].utf8
!= dump
) {
95 cout
<< "Converting from " << tests
[i
].charset
<< "\n"
96 "Expected [" << tests
[i
].utf8
<< "]\n"
97 "Got [" << dump
<< "]" << endl
;