src/test/utf8.c

   1 /*****************************************************************************
   2  * utf8.c: Test for UTF-8 encoding/decoding stuff
   3  *****************************************************************************
   4  * Copyright (C) 2006 Rémi Denis-Courmont
   5  * $Id$
   6  *
   7  * This program is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU Lesser General Public License as published by
   9  * the Free Software Foundation; either version 2.1 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  * GNU Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public License
  18  * along with this program; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  20  *****************************************************************************/
  21
  22 #ifdef HAVE_CONFIG_H
  23 # include "config.h"
  24 #endif
  25
  26 #include <vlc_common.h>
  27 #include <vlc_charset.h>
  28
  29 #include <stdio.h>
  30 #include <stdlib.h>
  31 #include <stdbool.h>
  32
  33 static void test_towc(const char *in, size_t want_len, uint32_t want_cp)
  34 {
  35     uint32_t cp;
  36     size_t len;
  37
  38     if (want_len != (size_t)-1)
  39         printf("\"%s\" is U+%04"PRIX32" (%zu bytes)\n", in, want_cp, want_len);
  40     else
  41         printf("Invalid sequence of %zu bytes\n", strlen(in));
  42
  43     len = vlc_towc(in, &cp);
  44
  45     if (len != want_len)
  46     {
  47         printf(" ERROR: length mismatch: %zd\n", len);
  48         exit(1);
  49     }
  50
  51     if (len != (size_t)-1 && want_cp != cp)
  52     {
  53         printf(" ERROR: code point mismatch: %04"PRIX32"\n", cp);
  54         exit(1);
  55     }
  56 }
  57
  58 static void test (const char *in, const char *out)
  59 {
  60     bool isutf8 = !strcmp (in, out);
  61     char *str = strdup (in);
  62     if (str == NULL)
  63         abort ();
  64
  65     if (isutf8)
  66         printf ("\"%s\" should be accepted...\n", in);
  67     else
  68         printf ("\"%s\" should be rewritten as \"%s\"...\n", in, out);
  69
  70     if ((IsUTF8 (in) != NULL) != isutf8)
  71     {
  72         printf (" ERROR: IsUTF8 (%s) failed\n", in);
  73         exit (1);
  74     }
  75
  76     if ((EnsureUTF8 (str) != NULL) != isutf8)
  77     {
  78         printf (" ERROR: EnsureUTF8 (%s) failed\n", in);
  79         exit (2);
  80     }
  81
  82     if (strcmp (str, out))
  83     {
  84         printf (" ERROR: got \"%s\"\n", str);
  85         exit (3);
  86     }
  87
  88     if ((EnsureUTF8 (str) == NULL) || IsUTF8 (str) == NULL)
  89     {
  90         printf (" ERROR: EnsureUTF8 (%s) is not UTF-8\n", in);
  91         exit (4);
  92     }
  93     free (str);
  94 }
  95
  96 static void test_strcasestr (const char *h, const char *n, ssize_t offset)
  97 {
  98     printf ("\"%s\" should %sbe found in \"%s\"...\n", n,
  99             (offset != -1) ? "" : "not ", h);
 100
 101     const char *ret = vlc_strcasestr (h, n);
 102     if (offset == -1)
 103     {
 104         if (ret != NULL)
 105         {
 106             printf ("ERROR: got \"%s\"\n", ret);
 107             exit (10);
 108         }
 109     }
 110     else
 111     {
 112         if (ret == NULL)
 113         {
 114             printf ("ERROR: not found\n");
 115             exit (11);
 116         }
 117         if ((ret - h) != offset)
 118         {
 119             printf ("ERROR: got \"%s\" instead of \"%s\"\n",
 120                     ret, h + offset);
 121             exit (12);
 122         }
 123     }
 124 }
 125
 126
 127 int main (void)
 128 {
 129     (void)setvbuf (stdout, NULL, _IONBF, 0);
 130
 131     /* Valid sequences */
 132     test_towc("", 0, 0);
 133     test_towc("\n", 1, '\n');
 134     test_towc("\x7F", 1, 0x7F);
 135     test_towc("\xC3\xA9", 2, 0xE9);
 136     test_towc("\xDF\xBF", 2, 0x7FF);
 137     test_towc("\xE2\x82\xAC", 3, 0x20AC);
 138     test_towc("\xEF\xBF\xBF", 3, 0xFFFF);
 139     test_towc("\xF0\x90\x80\x81", 4, 0x10001);
 140     test_towc("\xF4\x80\x80\x81", 4, 0x100001);
 141     test_towc("\xF4\x8F\xBF\xBF", 4, 0x10FFFF);
 142     /* Overlongs */
 143     test_towc("\xC0\x80", -1, 0);
 144     test_towc("\xC1\xBF", -1, 0x7F);
 145     test_towc("\xE0\x80\x80", -1, 0);
 146     test_towc("\xE0\x9F\xBF", -1, 0x7FF);
 147     test_towc("\xF0\x80\x80\x80", -1, 0);
 148     test_towc("\xF0\x8F\xBF\xBF", -1, 0xFFFF);
 149     /* Out of range */
 150     test_towc("\xF4\x90\x80\x80", -1, 0x110000);
 151     test_towc("\xF7\xBF\xBF\xBF", -1, 0x1FFFFF);
 152     /* Surrogates */
 153     test_towc("\xED\x9F\xBF", 3, 0xD7FF);
 154     test_towc("\xED\xA0\x80", -1, 0xD800);
 155     test_towc("\xED\xBF\xBF", -1, 0xDFFF);
 156     test_towc("\xEE\x80\x80", 3, 0xE000);
 157     /* Spurious continuation byte */
 158     test_towc("\x80", -1, 0);
 159     test_towc("\xBF", -1, 0);
 160     /* Missing continuation byte */
 161     test_towc("\xDF", -1, 0x7FF);
 162     test_towc("\xEF", -1, 0xFFFF);
 163     test_towc("\xF4", -1, 0x10FFFF);
 164     test_towc("\xEF\xBF", -1, 0xFFFF);
 165     test_towc("\xF4\xBF\xBF", -1, 0x10FFFF);
 166
 167     test ("", "");
 168     test ("this_should_not_be_modified_1234",
 169           "this_should_not_be_modified_1234");
 170
 171     test ("\xFF", "?"); // invalid byte
 172     test ("\xEF\xBB\xBFHello", "\xEF\xBB\xBFHello"); // BOM
 173     test ("\x00\xE9", ""); // no conversion past end of string
 174
 175     test ("T\xC3\xA9l\xC3\xA9vision \xE2\x82\xAC", "Télévision €");
 176     test ("T\xE9l\xE9vision", "T?l?vision");
 177     test ("\xC1\x94\xC3\xa9l\xC3\xA9vision", "??élévision"); /* overlong */
 178
 179     test ("Hel\xF0\x83\x85\x87lo", "Hel????lo"); /* more overlong */
 180
 181     test_strcasestr ("", "", 0);
 182     test_strcasestr ("", "a", -1);
 183     test_strcasestr ("a", "", 0);
 184     test_strcasestr ("heLLo", "l", 2);
 185     test_strcasestr ("heLLo", "lo", 3);
 186     test_strcasestr ("heLLo", "llo", 2);
 187     test_strcasestr ("heLLo", "la", -1);
 188     test_strcasestr ("heLLo", "oa", -1);
 189     test_strcasestr ("Télé", "é", 1);
 190     test_strcasestr ("Télé", "élé", 1);
 191     test_strcasestr ("Télé", "léé", -1);
 192
 193     return 0;
 194 }