1 /*****************************************************************************
2 * utf8.c: Test for UTF-8 encoding/decoding stuff
3 *****************************************************************************
4 * Copyright (C) 2006 Rémi Denis-Courmont
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU Lesser General Public License as published by
9 * the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this program; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
20 *****************************************************************************/
26 #include <vlc_common.h>
27 #include <vlc_charset.h>
/*
 * Runs vlc_towc() on one input string and reports mismatches on stdout.
 *
 * in:       NUL-terminated byte string passed to vlc_towc().
 * want_len: expected length; (size_t)-1 marks the input as one that is
 *           announced as an invalid sequence.
 * want_cp:  expected code point; only compared with the decoded value when
 *           vlc_towc() did not return (size_t)-1.
 *
 * NOTE(review): the declarations of len and cp, the block braces, the else
 * keyword and the condition guarding the "length mismatch" message are not
 * visible in this excerpt - presumably len is compared with want_len;
 * confirm against the full file.
 */
33 static void test_towc(const char *in
, size_t want_len
, uint32_t want_cp
)
/* Announce what is expected for this input before running the check. */
38 if (want_len
!= (size_t)-1)
39 printf("\"%s\" is U+%04"PRIX32
" (%zu bytes)\n", in
, want_cp
, want_len
);
41 printf("Invalid sequence of %zu bytes\n", strlen(in
));
/* Call the function under test; cp is passed by address and checked below. */
43 len
= vlc_towc(in
, &cp
);
/*
 * NOTE(review): %zd is the signed conversion, while len is compared with
 * (size_t)-1 below and so looks like a size_t - confirm the specifier
 * matches the declared type of len.
 */
47 printf(" ERROR: length mismatch: %zd\n", len
);
/* The code point is only meaningful when the call did not report failure. */
51 if (len
!= (size_t)-1 && want_cp
!= cp
)
53 printf(" ERROR: code point mismatch: %04"PRIX32
"\n", cp
);
/*
 * Checks IsUTF8() and EnsureUTF8() on one input string.
 *
 * in:  string to check; it is duplicated with strdup() and EnsureUTF8() is
 *      called on the copy, never on in itself.
 * out: expected content of the copy after EnsureUTF8(). When out compares
 *      equal to in (strcmp), the input is expected to be accepted by both
 *      functions; otherwise both are expected to return NULL on first use.
 *
 * NOTE(review): no NULL check of the strdup() result, no free(str), and no
 * braces or else keywords are visible in this excerpt - confirm against the
 * full file.
 */
58 static void test (const char *in
, const char *out
)
/* The input counts as valid exactly when no rewriting is expected. */
60 bool isutf8
= !strcmp (in
, out
);
61 char *str
= strdup (in
);
66 printf ("\"%s\" should be accepted...\n", in
);
68 printf ("\"%s\" should be rewritten as \"%s\"...\n", in
, out
);
/* IsUTF8() must return non-NULL if and only if the input is expected valid. */
70 if ((IsUTF8 (in
) != NULL
) != isutf8
)
72 printf (" ERROR: IsUTF8 (%s) failed\n", in
);
/* Same expectation for the return value of EnsureUTF8() on the copy. */
76 if ((EnsureUTF8 (str
) != NULL
) != isutf8
)
78 printf (" ERROR: EnsureUTF8 (%s) failed\n", in
);
/* After EnsureUTF8() the copy must match the expected output exactly. */
82 if (strcmp (str
, out
))
84 printf (" ERROR: got \"%s\"\n", str
);
/* Second pass: the already processed copy must now be accepted by both. */
88 if ((EnsureUTF8 (str
) == NULL
) || IsUTF8 (str
) == NULL
)
90 printf (" ERROR: EnsureUTF8 (%s) is not UTF-8\n", in
);
/*
 * Checks the result of vlc_strcasestr(h, n) against an expected position.
 *
 * h:      string that is searched.
 * n:      string that is searched for.
 * offset: expected distance (ret - h) of the returned pointer from h, or -1
 *         when the announcement says n should not be found.
 *
 * NOTE(review): the conditions selecting between the "got", "not found" and
 * "instead of" messages are only partly visible, and the last printf() call
 * is cut off before its arguments - confirm against the full file.
 */
96 static void test_strcasestr (const char *h
, const char *n
, ssize_t offset
)
/* Announce the expectation; -1 selects the "not " wording. */
98 printf ("\"%s\" should %sbe found in \"%s\"...\n", n
,
99 (offset
!= -1) ? "" : "not ", h
);
101 const char *ret
= vlc_strcasestr (h
, n
);
106 printf ("ERROR: got \"%s\"\n", ret
);
114 printf ("ERROR: not found\n");
/* The match position is compared as a pointer difference from h. */
117 if ((ret
- h
) != offset
)
119 printf ("ERROR: got \"%s\" instead of \"%s\"\n",
/* Make stdout unbuffered (_IONBF); the return value is deliberately ignored. */
129 (void)setvbuf (stdout
, NULL
, _IONBF
, 0);
131 /* Valid sequences */
133 test_towc("\n", 1, '\n');
134 test_towc("\x7F", 1, 0x7F);
135 test_towc("\xC3\xA9", 2, 0xE9);
136 test_towc("\xDF\xBF", 2, 0x7FF);
137 test_towc("\xE2\x82\xAC", 3, 0x20AC);
138 test_towc("\xEF\xBF\xBF", 3, 0xFFFF);
139 test_towc("\xF0\x90\x80\x81", 4, 0x10001);
140 test_towc("\xF4\x80\x80\x81", 4, 0x100001);
141 test_towc("\xF4\x8F\xBF\xBF", 4, 0x10FFFF);
/*
 * Overlong encodings: each sequence spells its code point with more bytes
 * than necessary. The -1 length converts to (size_t)-1, which test_towc()
 * treats as "invalid sequence".
 */
143 test_towc("\xC0\x80", -1, 0);
144 test_towc("\xC1\xBF", -1, 0x7F);
145 test_towc("\xE0\x80\x80", -1, 0);
146 test_towc("\xE0\x9F\xBF", -1, 0x7FF);
147 test_towc("\xF0\x80\x80\x80", -1, 0);
148 test_towc("\xF0\x8F\xBF\xBF", -1, 0xFFFF);
/* Values above U+10FFFF: expected to be rejected. */
150 test_towc("\xF4\x90\x80\x80", -1, 0x110000);
151 test_towc("\xF7\xBF\xBF\xBF", -1, 0x1FFFFF);
/*
 * Surrogate range boundaries: U+D7FF and U+E000 are expected to decode,
 * U+D800 and U+DFFF are expected to be rejected.
 */
153 test_towc("\xED\x9F\xBF", 3, 0xD7FF);
154 test_towc("\xED\xA0\x80", -1, 0xD800);
155 test_towc("\xED\xBF\xBF", -1, 0xDFFF);
156 test_towc("\xEE\x80\x80", 3, 0xE000);
157 /* Spurious continuation byte */
158 test_towc("\x80", -1, 0);
159 test_towc("\xBF", -1, 0);
160 /* Missing continuation byte */
161 test_towc("\xDF", -1, 0x7FF);
162 test_towc("\xEF", -1, 0xFFFF);
163 test_towc("\xF4", -1, 0x10FFFF);
164 test_towc("\xEF\xBF", -1, 0xFFFF);
165 test_towc("\xF4\xBF\xBF", -1, 0x10FFFF);
/*
 * IsUTF8() / EnsureUTF8() checks: first argument is the input, second the
 * expected string afterwards (identical to the input when it is valid).
 */
168 test ("this_should_not_be_modified_1234",
169 "this_should_not_be_modified_1234");
171 test ("\xFF", "?"); // invalid byte
172 test ("\xEF\xBB\xBFHello", "\xEF\xBB\xBFHello"); // BOM
173 test ("\x00\xE9", ""); // no conversion past end of string
175 test ("T\xC3\xA9l\xC3\xA9vision \xE2\x82\xAC", "Télévision €");
176 test ("T\xE9l\xE9vision", "T?l?vision");
177 test ("\xC1\x94\xC3\xa9l\xC3\xA9vision", "??élévision"); /* overlong */
179 test ("Hel\xF0\x83\x85\x87lo", "Hel????lo"); /* more overlong */
/*
 * vlc_strcasestr() checks: searched string, string searched for, and the
 * expected offset of the match from the start (or -1 for no match).
 */
181 test_strcasestr ("", "", 0);
182 test_strcasestr ("", "a", -1);
183 test_strcasestr ("a", "", 0);
184 test_strcasestr ("heLLo", "l", 2);
185 test_strcasestr ("heLLo", "lo", 3);
186 test_strcasestr ("heLLo", "llo", 2);
187 test_strcasestr ("heLLo", "la", -1);
188 test_strcasestr ("heLLo", "oa", -1);
/* Non-ASCII cases: the leading 'T' is one byte, so matches start at 1. */
189 test_strcasestr ("Télé", "é", 1);
190 test_strcasestr ("Télé", "élé", 1);
191 test_strcasestr ("Télé", "léé", -1);