Make PostList subclasses return PostList* not Internal*
[xapian.git] / xapian-applications / omega / utf8convert.cc
blob8a554a4db5020f20fad06d667a2c9996cc9f8295
1 /* utf8convert.cc: convert a string to UTF-8 encoding.
3 * Copyright (C) 2006,2007,2008,2010,2013,2017 Olly Betts
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 #include <config.h>
22 #include "utf8convert.h"
24 #include <algorithm>
25 #include <string>
27 #include "safeerrno.h"
28 #ifdef USE_ICONV
29 # include <iconv.h>
30 #endif
31 #include <xapian.h>
32 #include "strcasecmp.h"
33 #include "stringutils.h"
35 using namespace std;
37 void
38 convert_to_utf8(string & text, const string & charset)
40 // Shortcut if it's already in utf8!
41 if (charset.size() == 5 && strcasecmp(charset.c_str(), "utf-8") == 0)
42 return;
43 if (charset.size() == 4 && strcasecmp(charset.c_str(), "utf8") == 0)
44 return;
46 // Nobody has told us what charset it's in, so do as little work as
47 // possible!
48 if (charset.empty())
49 return;
51 char buf[1024];
52 string tmp;
54 /* Handle iso-8859-1/windows-1252/cp-1252, utf-16/ucs-2,
55 * utf-16be/ucs-2be, and utf-16le/ucs-2le. */
56 const char * p = charset.c_str();
58 bool utf16 = false;
59 if (strncasecmp(p, "utf", 3) == 0) {
60 p += 3;
61 if (*p == '-' || *p == '_' || *p == ' ') ++p;
62 if (*p != '1' || p[1] != '6') goto try_iconv;
63 p += 2;
64 utf16 = true;
65 } else if (strncasecmp(p, "ucs", 3) == 0) {
66 p += 3;
67 if (*p == '-' || *p == '_' || *p == ' ') ++p;
68 if (*p != '2') goto try_iconv;
69 ++p;
70 utf16 = true;
73 if (utf16) {
74 if (text.size() < 2) return;
76 bool big_endian = true;
77 string::const_iterator i = text.begin();
78 if (*p == '\0') {
79 // GNU iconv doesn't seem to handle BOMs.
80 if (startswith(text, "\xfe\xff")) {
81 i += 2;
82 } else if (startswith(text, "\xff\xfe")) {
83 big_endian = false;
84 i += 2;
86 // UTF-16 with no BOM is meant to be assumed to be BE. Strictly
87 // speaking, we're not meant to assume anything for UCS-2 with
88 // no BOM, but we've got to do something, so we might as well
89 // assume it's UTF-16 mislabelled, which is easy and sane.
90 } else if (strcasecmp(p, "LE") == 0) {
91 big_endian = false;
92 } else if (!(strcasecmp(p, "BE") == 0)) {
93 goto try_iconv;
96 tmp.reserve(text.size() / 2);
98 size_t start = 0;
99 if (text.size() & 1) {
100 // If there's a half-character at the end, nuke it now to make the
101 // conversion loop below simpler.
102 text.resize(text.size() - 1);
105 while (i != text.end()) {
106 unsigned ch = static_cast<unsigned char>(*i++);
107 unsigned ch2 = static_cast<unsigned char>(*i++);
108 if (big_endian) {
109 ch = (ch << 8) | ch2;
110 } else {
111 ch = (ch2 << 8) | ch;
113 if (ch >> 10 == 0xd800 >> 10) {
114 // Surrogate pair.
115 if (i == text.end()) break;
116 unsigned hi = (ch & 0x3ff);
117 ch = static_cast<unsigned char>(*i++);
118 ch2 = static_cast<unsigned char>(*i++);
119 if (big_endian) {
120 ch = (ch << 8) | ch2;
121 } else {
122 ch = (ch2 << 8) | ch;
124 if (ch >> 10 == 0xdc00 >> 10) {
125 ch &= 0x3ff;
126 ch |= (hi << 10);
127 ch += 0x10000;
130 start += Xapian::Unicode::to_utf8(ch, buf + start);
131 if (start >= sizeof(buf) - 4) {
132 tmp.append(buf, start);
133 start = 0;
136 if (start) tmp.append(buf, start);
137 } else {
138 // Assume windows-1252 if iso-8859-1 is specified. The only
139 // differences are in the range 128-159 which are control characters in
140 // iso-8859-1, and a lot of content is mislabelled. We use our own
141 // conversion code for this case, as GNU iconv fails if it sees one of
142 // the unassigned code points in windows-1252, whereas it would accept
143 // the same input as iso-8859-1, and it seems undesirable to be
144 // rejecting input due to this behind-the-scenes character set
145 // shenanigans.
146 const char * q = NULL;
147 if (strncasecmp(p, "windows", 7) == 0) {
148 q = p + 7;
149 } else if (strncasecmp(p, "cp", 2) == 0) {
150 q = p + 2;
152 if (q) {
153 if (*q == '-' || *q == '_' || *q == ' ') ++q;
154 if (strcmp(q, "1252") != 0)
155 goto try_iconv;
156 } else {
157 if (strncasecmp(p, "iso", 3) == 0) {
158 p += 3;
159 if (*p == '-' || *p == '_' || *p == ' ') ++p;
161 if (strncmp(p, "8859", 4) != 0) goto try_iconv;
162 p += 4;
163 if (*p == '-' || *p == '_' || *p == ' ') ++p;
164 if (strcmp(p, "1") != 0) goto try_iconv;
167 // FIXME: pull this out as a standard "normalise utf-8" function?
168 tmp.reserve(text.size());
170 size_t start = 0;
171 for (string::const_iterator i = text.begin(); i != text.end(); ++i) {
172 static const unsigned cp1252_to_unicode[32] = {
173 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
174 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
175 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
176 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
178 unsigned ch = static_cast<unsigned char>(*i);
179 if (ch - 128 < sizeof(cp1252_to_unicode) / sizeof(*cp1252_to_unicode))
180 ch = cp1252_to_unicode[ch - 128];
181 start += Xapian::Unicode::to_utf8(ch, buf + start);
182 if (start >= sizeof(buf) - 4) {
183 tmp.append(buf, start);
184 start = 0;
187 if (start) tmp.append(buf, start);
190 if (false) {
191 try_iconv:
192 #ifdef USE_ICONV
193 iconv_t conv = iconv_open("UTF-8", charset.c_str());
194 if (conv == reinterpret_cast<iconv_t>(-1))
195 return;
196 ICONV_INPUT_TYPE in = const_cast<char *>(text.c_str());
197 size_t in_len = text.size();
198 while (in_len) {
199 char * out = buf;
200 size_t out_len = sizeof(buf);
201 if (iconv(conv, &in, &in_len, &out, &out_len) == size_t(-1) &&
202 errno != E2BIG) {
203 // FIXME: how to handle this?
204 break;
206 tmp.append(buf, out - buf);
209 (void)iconv_close(conv);
210 #else
211 return;
212 #endif
215 swap(text, tmp);