1 /* utf8convert.cc: convert a string to UTF-8 encoding.
3 * Copyright (C) 2006,2007,2008,2010,2013,2017 Olly Betts
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "utf8convert.h"
27 #include "safeerrno.h"
32 #include "strcasecmp.h"
33 #include "stringutils.h"
38 convert_to_utf8(string
& text
, const string
& charset
)
40 // Shortcut if it's already in utf8!
41 if (charset
.size() == 5 && strcasecmp(charset
.c_str(), "utf-8") == 0)
43 if (charset
.size() == 4 && strcasecmp(charset
.c_str(), "utf8") == 0)
46 // Nobody has told us what charset it's in, so do as little work as possible.
54 /* Handle iso-8859-1/windows-1252/cp-1252, utf-16/ucs-2,
55 * utf-16be/ucs-2be, and utf-16le/ucs-2le. */
56 const char * p
= charset
.c_str();
59 if (strncasecmp(p
, "utf", 3) == 0) {
61 if (*p
== '-' || *p
== '_' || *p
== ' ') ++p
;
62 if (*p
!= '1' || p
[1] != '6') goto try_iconv
;
65 } else if (strncasecmp(p
, "ucs", 3) == 0) {
67 if (*p
== '-' || *p
== '_' || *p
== ' ') ++p
;
68 if (*p
!= '2') goto try_iconv
;
74 if (text
.size() < 2) return;
76 bool big_endian
= true;
77 string::const_iterator i
= text
.begin();
79 // GNU iconv doesn't seem to handle BOMs.
80 if (startswith(text
, "\xfe\xff")) {
82 } else if (startswith(text
, "\xff\xfe")) {
86 // UTF-16 with no BOM is meant to be assumed to be BE. Strictly
87 // speaking, we're not meant to assume anything for UCS-2 with
88 // no BOM, but we've got to do something, so we might as well
89 // assume it's UTF-16 mislabelled, which is easy and sane.
90 } else if (strcasecmp(p
, "LE") == 0) {
92 } else if (!(strcasecmp(p
, "BE") == 0)) {
96 tmp
.reserve(text
.size() / 2);
99 if (text
.size() & 1) {
100 // If there's a half-character at the end, nuke it now to make the
101 // conversion loop below simpler.
102 text
.resize(text
.size() - 1);
105 while (i
!= text
.end()) {
106 unsigned ch
= static_cast<unsigned char>(*i
++);
107 unsigned ch2
= static_cast<unsigned char>(*i
++);
109 ch
= (ch
<< 8) | ch2
;
111 ch
= (ch2
<< 8) | ch
;
113 if (ch
>> 10 == 0xd800 >> 10) {
115 if (i
== text
.end()) break;
116 unsigned hi
= (ch
& 0x3ff);
117 ch
= static_cast<unsigned char>(*i
++);
118 ch2
= static_cast<unsigned char>(*i
++);
120 ch
= (ch
<< 8) | ch2
;
122 ch
= (ch2
<< 8) | ch
;
124 if (ch
>> 10 == 0xdc00 >> 10) {
130 start
+= Xapian::Unicode::to_utf8(ch
, buf
+ start
);
131 if (start
>= sizeof(buf
) - 4) {
132 tmp
.append(buf
, start
);
136 if (start
) tmp
.append(buf
, start
);
138 // Assume windows-1252 if iso-8859-1 is specified. The only
139 // differences are in the range 128-159 which are control characters in
140 // iso-8859-1, and a lot of content is mislabelled. We use our own
141 // conversion code for this case, as GNU iconv fails if it sees one of
142 // the unassigned code points in windows-1252, whereas it would accept
143 // the same input as iso-8859-1, and it seems undesirable to be
144 // rejecting input due to this behind-the-scenes character set substitution.
146 const char * q
= NULL
;
147 if (strncasecmp(p
, "windows", 7) == 0) {
149 } else if (strncasecmp(p
, "cp", 2) == 0) {
153 if (*q
== '-' || *q
== '_' || *q
== ' ') ++q
;
154 if (strcmp(q
, "1252") != 0)
157 if (strncasecmp(p
, "iso", 3) == 0) {
159 if (*p
== '-' || *p
== '_' || *p
== ' ') ++p
;
161 if (strncmp(p
, "8859", 4) != 0) goto try_iconv
;
163 if (*p
== '-' || *p
== '_' || *p
== ' ') ++p
;
164 if (strcmp(p
, "1") != 0) goto try_iconv
;
167 // FIXME: pull this out as a standard "normalise utf-8" function?
168 tmp
.reserve(text
.size());
171 for (string::const_iterator i
= text
.begin(); i
!= text
.end(); ++i
) {
172 static const unsigned cp1252_to_unicode
[32] = {
173 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
174 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
175 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
176 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
178 unsigned ch
= static_cast<unsigned char>(*i
);
179 if (ch
- 128 < sizeof(cp1252_to_unicode
) / sizeof(*cp1252_to_unicode
))
180 ch
= cp1252_to_unicode
[ch
- 128];
181 start
+= Xapian::Unicode::to_utf8(ch
, buf
+ start
);
182 if (start
>= sizeof(buf
) - 4) {
183 tmp
.append(buf
, start
);
187 if (start
) tmp
.append(buf
, start
);
193 iconv_t conv
= iconv_open("UTF-8", charset
.c_str());
194 if (conv
== reinterpret_cast<iconv_t
>(-1))
196 ICONV_INPUT_TYPE in
= const_cast<char *>(text
.c_str());
197 size_t in_len
= text
.size();
200 size_t out_len
= sizeof(buf
);
201 if (iconv(conv
, &in
, &in_len
, &out
, &out_len
) == size_t(-1) &&
203 // FIXME: how to handle this?
206 tmp
.append(buf
, out
- buf
);
209 (void)iconv_close(conv
);