xapian-applications/omega/utf8convert.cc

   1 /* utf8convert.cc: convert a string to UTF-8 encoding.
   2  *
   3  * Copyright (C) 2006,2007,2008,2010,2013,2017 Olly Betts
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  18  */
  19
  20 #include <config.h>
  21
  22 #include "utf8convert.h"
  23
  24 #include <algorithm>
  25 #include <string>
  26
  27 #include "safeerrno.h"
  28 #ifdef USE_ICONV
  29 # include <iconv.h>
  30 #endif
  31 #include <xapian.h>
  32 #include "strcasecmp.h"
  33 #include "stringutils.h"
  34
  35 using namespace std;
  36
  37 void
  38 convert_to_utf8(string & text, const string & charset)
  39 {
  40     // Shortcut if it's already in utf8!
  41     if (charset.size() == 5 && strcasecmp(charset.c_str(), "utf-8") == 0)
  42         return;
  43     if (charset.size() == 4 && strcasecmp(charset.c_str(), "utf8") == 0)
  44         return;
  45
  46     // Nobody has told us what charset it's in, so do as little work as
  47     // possible!
  48     if (charset.empty())
  49         return;
  50
  51     char buf[1024];
  52     string tmp;
  53
  54     /* Handle iso-8859-1/windows-1252/cp-1252, utf-16/ucs-2,
  55      * utf-16be/ucs-2be, and utf-16le/ucs-2le. */
  56     const char * p = charset.c_str();
  57
  58     bool utf16 = false;
  59     if (strncasecmp(p, "utf", 3) == 0) {
  60         p += 3;
  61         if (*p == '-' || *p == '_' || *p == ' ') ++p;
  62         if (*p != '1' || p[1] != '6') goto try_iconv;
  63         p += 2;
  64         utf16 = true;
  65     } else if (strncasecmp(p, "ucs", 3) == 0) {
  66         p += 3;
  67         if (*p == '-' || *p == '_' || *p == ' ') ++p;
  68         if (*p != '2') goto try_iconv;
  69         ++p;
  70         utf16 = true;
  71     }
  72
  73     if (utf16) {
  74         if (text.size() < 2) return;
  75
  76         bool big_endian = true;
  77         string::const_iterator i = text.begin();
  78         if (*p == '\0') {
  79             // GNU iconv doesn't seem to handle BOMs.
  80             if (startswith(text, "\xfe\xff")) {
  81                 i += 2;
  82             } else if (startswith(text, "\xff\xfe")) {
  83                 big_endian = false;
  84                 i += 2;
  85             }
  86             // UTF-16 with no BOM is meant to be assumed to be BE.  Strictly
  87             // speaking, we're not meant to assume anything for UCS-2 with
  88             // no BOM, but we've got to do something, so we might as well
  89             // assume it's UTF-16 mislabelled, which is easy and sane.
  90         } else if (strcasecmp(p, "LE") == 0) {
  91             big_endian = false;
  92         } else if (!(strcasecmp(p, "BE") == 0)) {
  93             goto try_iconv;
  94         }
  95
  96         tmp.reserve(text.size() / 2);
  97
  98         size_t start = 0;
  99         if (text.size() & 1) {
 100             // If there's a half-character at the end, nuke it now to make the
 101             // conversion loop below simpler.
 102             text.resize(text.size() - 1);
 103         }
 104
 105         while (i != text.end()) {
 106             unsigned ch = static_cast<unsigned char>(*i++);
 107             unsigned ch2 = static_cast<unsigned char>(*i++);
 108             if (big_endian) {
 109                 ch = (ch << 8) | ch2;
 110             } else {
 111                 ch = (ch2 << 8) | ch;
 112             }
 113             if (ch >> 10 == 0xd800 >> 10) {
 114                 // Surrogate pair.
 115                 if (i == text.end()) break;
 116                 unsigned hi = (ch & 0x3ff);
 117                 ch = static_cast<unsigned char>(*i++);
 118                 ch2 = static_cast<unsigned char>(*i++);
 119                 if (big_endian) {
 120                     ch = (ch << 8) | ch2;
 121                 } else {
 122                     ch = (ch2 << 8) | ch;
 123                 }
 124                 if (ch >> 10 == 0xdc00 >> 10) {
 125                     ch &= 0x3ff;
 126                     ch |= (hi << 10);
 127                     ch += 0x10000;
 128                 }
 129             }
 130             start += Xapian::Unicode::to_utf8(ch, buf + start);
 131             if (start >= sizeof(buf) - 4) {
 132                 tmp.append(buf, start);
 133                 start = 0;
 134             }
 135         }
 136         if (start) tmp.append(buf, start);
 137     } else {
 138         // Assume windows-1252 if iso-8859-1 is specified.  The only
 139         // differences are in the range 128-159 which are control characters in
 140         // iso-8859-1, and a lot of content is mislabelled.  We use our own
 141         // conversion code for this case, as GNU iconv fails if it sees one of
 142         // the unassigned code points in windows-1252, whereas it would accept
 143         // the same input as iso-8859-1, and it seems undesirable to be
 144         // rejecting input due to this behind-the-scenes character set
 145         // shenanigans.
 146         const char * q = NULL;
 147         if (strncasecmp(p, "windows", 7) == 0) {
 148             q = p + 7;
 149         } else if (strncasecmp(p, "cp", 2) == 0) {
 150             q = p + 2;
 151         }
 152         if (q) {
 153             if (*q == '-' || *q == '_' || *q == ' ') ++q;
 154             if (strcmp(q, "1252") != 0)
 155                 goto try_iconv;
 156         } else {
 157             if (strncasecmp(p, "iso", 3) == 0) {
 158                 p += 3;
 159                 if (*p == '-' || *p == '_' || *p == ' ') ++p;
 160             }
 161             if (strncmp(p, "8859", 4) != 0) goto try_iconv;
 162             p += 4;
 163             if (*p == '-' || *p == '_' || *p == ' ') ++p;
 164             if (strcmp(p, "1") != 0) goto try_iconv;
 165         }
 166
 167         // FIXME: pull this out as a standard "normalise utf-8" function?
 168         tmp.reserve(text.size());
 169
 170         size_t start = 0;
 171         for (string::const_iterator i = text.begin(); i != text.end(); ++i) {
 172             static const unsigned cp1252_to_unicode[32] = {
 173                 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
 174                 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
 175                 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
 176                 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
 177             };
 178             unsigned ch = static_cast<unsigned char>(*i);
 179             if (ch - 128 < sizeof(cp1252_to_unicode) / sizeof(*cp1252_to_unicode))
 180                 ch = cp1252_to_unicode[ch - 128];
 181             start += Xapian::Unicode::to_utf8(ch, buf + start);
 182             if (start >= sizeof(buf) - 4) {
 183                 tmp.append(buf, start);
 184                 start = 0;
 185             }
 186         }
 187         if (start) tmp.append(buf, start);
 188     }
 189
 190     if (false) {
 191 try_iconv:
 192 #ifdef USE_ICONV
 193         iconv_t conv = iconv_open("UTF-8", charset.c_str());
 194         if (conv == reinterpret_cast<iconv_t>(-1))
 195             return;
 196         ICONV_INPUT_TYPE in = const_cast<char *>(text.c_str());
 197         size_t in_len = text.size();
 198         while (in_len) {
 199             char * out = buf;
 200             size_t out_len = sizeof(buf);
 201             if (iconv(conv, &in, &in_len, &out, &out_len) == size_t(-1) &&
 202                 errno != E2BIG) {
 203                 // FIXME: how to handle this?
 204                 break;
 205             }
 206             tmp.append(buf, out - buf);
 207         }
 208
 209         (void)iconv_close(conv);
 210 #else
 211         return;
 212 #endif
 213     }
 214
 215     swap(text, tmp);
 216 }