xapian-applications/omega/utf8converttest.cc

   1 /* utf8converttest.cc: test convert_to_utf8()
   2  *
   3  * Copyright (C) 2008,2009,2013 Olly Betts
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation; either version 2 of the
   8  * License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  18  * USA
  19  */
  20
  21 #include <config.h>
  22
  23 #include <cstdlib>
  24 #include <iostream>
  25 #include <string>
  26
  27 #include "utf8convert.h"
  28
  29 using namespace std;
  30
  31 struct testcase {
  32     const char * charset;
  33     const char * dump;
  34     size_t len;
  35     const char * utf8;
  36 };
  37
  38 static const testcase tests[] = {
  39     { "utf8", "Hello world", 0, "Hello world" },
  40     { "iso-8859-1", "Hello world", 0, "Hello world" },
  41     { "us-ascii", "Hello world", 0, "Hello world" },
  42     { "iso-8859-1", "Hello\xa0world", 0, "Hello\xc2\xa0world" },
  43     { "ISO-8859-1", "Hello\xa0world", 0, "Hello\xc2\xa0world" },
  44     { "ISO8859-1", "Hello\xa0world", 0, "Hello\xc2\xa0world" },
  45 #if !defined USE_ICONV || defined __GNU_LIBRARY__
  46     // "8859_1" is not understood by Solaris iconv, for example.
  47     { "8859_1", "Hello\xa0world", 0, "Hello\xc2\xa0world" },
  48 #endif
  49     { "UTF16BE", "\0T\0e\0s\0t", 8, "Test" },
  50     { "UTF16", "\xfe\xff\0T\0e\0s\0t", 10, "Test" },
  51     { "UTF_16BE", "\0T\0e\0s\0t", 8, "Test" },
  52     { "UTF 16BE", "\0T\0e\0s\0t", 8, "Test" },
  53     { "UTF16LE", "T\0e\0s\0t\0", 8, "Test" },
  54     { "UTF16", "\xff\xfeT\0e\0s\0t\0", 10, "Test" },
  55     { "UCS-2BE", "\0T\0e\0s\0t", 8, "Test" },
  56     { "UCS2BE", "\0T\0e\0s\0t", 8, "Test" },
  57     { "UCS_2BE", "\0T\0e\0s\0t", 8, "Test" },
  58     { "UCS 2BE", "\0T\0e\0s\0t", 8, "Test" },
  59     { "UCS-2LE", "T\0e\0s\0t\0", 8, "Test" },
  60     { "UCS2LE", "T\0e\0s\0t\0", 8, "Test" },
  61     { "UTF16BE", "\xdb\xff\xdf\xfd", 0, "\xf4\x8f\xbf\xbd" },
  62     { "UTF16", "\xfe\xff\xdb\xff\xdf\xfd", 0, "\xf4\x8f\xbf\xbd" },
  63     { "UTF-16", "\xfe\xff\xdb\xff\xdf\xfd", 0, "\xf4\x8f\xbf\xbd" },
  64     { "UTF16LE", "\xff\xdb\xfd\xdf", 0, "\xf4\x8f\xbf\xbd" },
  65     { "UTF16", "\xff\xfe\xff\xdb\xfd\xdf", 0, "\xf4\x8f\xbf\xbd" },
  66     { "UCS-2", "\xfe\xff\0T\0e\0s\0t", 10, "Test" },
  67     { "UCS-2", "\xff\xfeT\0e\0s\0t\0", 10, "Test" },
  68     { "UCS2", "\xfe\xff\0T\0e\0s\0t", 10, "Test" },
  69     { "UCS2", "\xff\xfeT\0e\0s\0t\0", 10, "Test" },
  70     // If there's no BOM, we're supposed to assume BE.
  71     { "UTF16", "\xdb\xff\xdf\xfd", 0, "\xf4\x8f\xbf\xbd" },
  72     // Test "promoting" charset to windows-1252:
  73     { "iso-8859-1", "Price: \x80""20", 0, "Price: \xe2\x82\xac""20" },
  74     { "ISO-8859-1", "\x80\x81\x82\x83\x84\x85\x86\x87", 0, "\xe2\x82\xac\xc2\x81\xe2\x80\x9a\xc6\x92\xe2\x80\x9e\xe2\x80\xa6\xe2\x80\xa0\xe2\x80\xa1" },
  75     { "ISO-8859-1", "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f", 0, "\xcb\x86\xe2\x80\xb0\xc5\xa0\xe2\x80\xb9\xc5\x92\xc2\x8d\xc5\xbd\xc2\x8f" },
  76     { "ISO-8859-1", "\x90\x91\x92\x93\x94\x95\x96\x97", 0, "\xc2\x90\xe2\x80\x98\xe2\x80\x99\xe2\x80\x9c\xe2\x80\x9d\xe2\x80\xa2\xe2\x80\x93\xe2\x80\x94" },
  77     { "ISO-8859-1", "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", 0, "\xcb\x9c\xe2\x84\xa2\xc5\xa1\xe2\x80\xba\xc5\x93\xc2\x9d\xc5\xbe\xc5\xb8" },
  78     { "ISO-8859-1", "\x7e\x7f\xa0\xa1", 0, "\x7e\x7f\xc2\xa0\xc2\xa1" },
  79     { 0, 0, 0, 0 }
  80 };
  81
  82 int
  83 main()
  84 {
  85     for (size_t i = 0; tests[i].charset; ++i) {
  86         size_t len = tests[i].len;
  87         string dump;
  88         if (len) {
  89             dump.assign(tests[i].dump, len);
  90         } else {
  91             dump.assign(tests[i].dump);
  92         }
  93         convert_to_utf8(dump, tests[i].charset);
  94         if (tests[i].utf8 != dump) {
  95             cout << "Converting from " << tests[i].charset << "\n"
  96                     "Expected [" << tests[i].utf8 << "]\n"
  97                     "Got      [" << dump << "]" << endl;
  98             exit(1);
  99         }
 100     }
 101 }