ext/scintilla/src/UniConversion.cxx

   1 // Scintilla source code edit control
   2 /** @file UniConversion.cxx
   3  ** Functions to handle UTF-8 and UTF-16 strings.
   4  **/
   5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
   6 // The License.txt file describes the conditions under which this software may be distributed.
   7
   8 #include <stdlib.h>
   9
  10 #include <stdexcept>
  11 #include <string>
  12
  13 #include "UniConversion.h"
  14
  15 #ifdef SCI_NAMESPACE
  16 using namespace Scintilla;
  17 #endif
  18
  19 #ifdef SCI_NAMESPACE
  20 namespace Scintilla {
  21 #endif
  22
  23 unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {
  24         unsigned int len = 0;
  25         for (unsigned int i = 0; i < tlen && uptr[i];) {
  26                 unsigned int uch = uptr[i];
  27                 if (uch < 0x80) {
  28                         len++;
  29                 } else if (uch < 0x800) {
  30                         len += 2;
  31                 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
  32                         (uch <= SURROGATE_TRAIL_LAST)) {
  33                         len += 4;
  34                         i++;
  35                 } else {
  36                         len += 3;
  37                 }
  38                 i++;
  39         }
  40         return len;
  41 }
  42
  43 void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
  44         unsigned int k = 0;
  45         for (unsigned int i = 0; i < tlen && uptr[i];) {
  46                 unsigned int uch = uptr[i];
  47                 if (uch < 0x80) {
  48                         putf[k++] = static_cast<char>(uch);
  49                 } else if (uch < 0x800) {
  50                         putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
  51                         putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  52                 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
  53                         (uch <= SURROGATE_TRAIL_LAST)) {
  54                         // Half a surrogate pair
  55                         i++;
  56                         unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
  57                         putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
  58                         putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
  59                         putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
  60                         putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
  61                 } else {
  62                         putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
  63                         putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
  64                         putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  65                 }
  66                 i++;
  67         }
  68         if (k < len)
  69                 putf[k] = '\0';
  70 }
  71
  72 unsigned int UTF8CharLength(unsigned char ch) {
  73         if (ch < 0x80) {
  74                 return 1;
  75         } else if (ch < 0x80 + 0x40 + 0x20) {
  76                 return 2;
  77         } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
  78                 return 3;
  79         } else {
  80                 return 4;
  81         }
  82 }
  83
  84 size_t UTF16Length(const char *s, size_t len) {
  85         size_t ulen = 0;
  86         size_t charLen;
  87         for (size_t i = 0; i<len;) {
  88                 unsigned char ch = static_cast<unsigned char>(s[i]);
  89                 if (ch < 0x80) {
  90                         charLen = 1;
  91                 } else if (ch < 0x80 + 0x40 + 0x20) {
  92                         charLen = 2;
  93                 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
  94                         charLen = 3;
  95                 } else {
  96                         charLen = 4;
  97                         ulen++;
  98                 }
  99                 i += charLen;
 100                 ulen++;
 101         }
 102         return ulen;
 103 }
 104
 105 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
 106         size_t ui = 0;
 107         const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
 108         size_t i = 0;
 109         while ((i<len) && (ui<tlen)) {
 110                 unsigned char ch = us[i++];
 111                 if (ch < 0x80) {
 112                         tbuf[ui] = ch;
 113                 } else if (ch < 0x80 + 0x40 + 0x20) {
 114                         tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
 115                         ch = us[i++];
 116                         tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
 117                 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
 118                         tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
 119                         ch = us[i++];
 120                         tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));
 121                         ch = us[i++];
 122                         tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
 123                 } else {
 124                         // Outside the BMP so need two surrogates
 125                         int val = (ch & 0x7) << 18;
 126                         ch = us[i++];
 127                         val += (ch & 0x3F) << 12;
 128                         ch = us[i++];
 129                         val += (ch & 0x3F) << 6;
 130                         ch = us[i++];
 131                         val += (ch & 0x3F);
 132                         tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
 133                         ui++;
 134                         tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
 135                 }
 136                 ui++;
 137         }
 138         return ui;
 139 }
 140
 141 unsigned int UTF32FromUTF8(const char *s, unsigned int len, unsigned int *tbuf, unsigned int tlen) {
 142         unsigned int ui=0;
 143         const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
 144         unsigned int i=0;
 145         while ((i<len) && (ui<tlen)) {
 146                 unsigned char ch = us[i++];
 147                 unsigned int value = 0;
 148                 if (ch < 0x80) {
 149                         value = ch;
 150                 } else if (((len-i) >= 1) && (ch < 0x80 + 0x40 + 0x20)) {
 151                         value = (ch & 0x1F) << 6;
 152                         ch = us[i++];
 153                         value += ch & 0x7F;
 154                 } else if (((len-i) >= 2) && (ch < 0x80 + 0x40 + 0x20 + 0x10)) {
 155                         value = (ch & 0xF) << 12;
 156                         ch = us[i++];
 157                         value += (ch & 0x7F) << 6;
 158                         ch = us[i++];
 159                         value += ch & 0x7F;
 160                 } else if ((len-i) >= 3) {
 161                         value = (ch & 0x7) << 18;
 162                         ch = us[i++];
 163                         value += (ch & 0x3F) << 12;
 164                         ch = us[i++];
 165                         value += (ch & 0x3F) << 6;
 166                         ch = us[i++];
 167                         value += ch & 0x3F;
 168                 }
 169                 tbuf[ui] = value;
 170                 ui++;
 171         }
 172         return ui;
 173 }
 174
 175 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
 176         if (val < SUPPLEMENTAL_PLANE_FIRST) {
 177                 tbuf[0] = static_cast<wchar_t>(val);
 178                 return 1;
 179         } else {
 180                 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
 181                 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
 182                 return 2;
 183         }
 184 }
 185
 186 int UTF8BytesOfLead[256];
 187 static bool initialisedBytesOfLead = false;
 188
 189 static int BytesFromLead(int leadByte) {
 190         if (leadByte < 0xC2) {
 191                 // Single byte or invalid
 192                 return 1;
 193         } else if (leadByte < 0xE0) {
 194                 return 2;
 195         } else if (leadByte < 0xF0) {
 196                 return 3;
 197         } else if (leadByte < 0xF5) {
 198                 return 4;
 199         } else {
 200                 // Characters longer than 4 bytes not possible in current UTF-8
 201                 return 1;
 202         }
 203 }
 204
 205 void UTF8BytesOfLeadInitialise() {
 206         if (!initialisedBytesOfLead) {
 207                 for (int i=0; i<256; i++) {
 208                         UTF8BytesOfLead[i] = BytesFromLead(i);
 209                 }
 210                 initialisedBytesOfLead = true;
 211         }
 212 }
 213
 214 // Return both the width of the first character in the string and a status
 215 // saying whether it is valid or invalid.
 216 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
 217 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
 218 // reasonably treated as code points in some circumstances. They will, however,
 219 // not have associated glyphs.
 220 int UTF8Classify(const unsigned char *us, int len) {
 221         // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 222         if (*us < 0x80) {
 223                 // Single bytes easy
 224                 return 1;
 225         } else if (*us > 0xf4) {
 226                 // Characters longer than 4 bytes not possible in current UTF-8
 227                 return UTF8MaskInvalid | 1;
 228         } else if (*us >= 0xf0) {
 229                 // 4 bytes
 230                 if (len < 4)
 231                         return UTF8MaskInvalid | 1;
 232                 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
 233                         if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
 234                                 // *FFFE or *FFFF non-character
 235                                 return UTF8MaskInvalid | 4;
 236                         }
 237                         if (*us == 0xf4) {
 238                                 // Check if encoding a value beyond the last Unicode character 10FFFF
 239                                 if (us[1] > 0x8f) {
 240                                         return UTF8MaskInvalid | 1;
 241                                 } else if (us[1] == 0x8f) {
 242                                         if (us[2] > 0xbf) {
 243                                                 return UTF8MaskInvalid | 1;
 244                                         } else if (us[2] == 0xbf) {
 245                                                 if (us[3] > 0xbf) {
 246                                                         return UTF8MaskInvalid | 1;
 247                                                 }
 248                                         }
 249                                 }
 250                         } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
 251                                 // Overlong
 252                                 return UTF8MaskInvalid | 1;
 253                         }
 254                         return 4;
 255                 } else {
 256                         return UTF8MaskInvalid | 1;
 257                 }
 258         } else if (*us >= 0xe0) {
 259                 // 3 bytes
 260                 if (len < 3)
 261                         return UTF8MaskInvalid | 1;
 262                 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
 263                         if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
 264                                 // Overlong
 265                                 return UTF8MaskInvalid | 1;
 266                         }
 267                         if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
 268                                 // Surrogate
 269                                 return UTF8MaskInvalid | 1;
 270                         }
 271                         if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
 272                                 // U+FFFE non-character - 3 bytes long
 273                                 return UTF8MaskInvalid | 3;
 274                         }
 275                         if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
 276                                 // U+FFFF non-character - 3 bytes long
 277                                 return UTF8MaskInvalid | 3;
 278                         }
 279                         if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
 280                                 // U+FDD0 .. U+FDEF
 281                                 return UTF8MaskInvalid | 3;
 282                         }
 283                         return 3;
 284                 } else {
 285                         return UTF8MaskInvalid | 1;
 286                 }
 287         } else if (*us >= 0xc2) {
 288                 // 2 bytes
 289                 if (len < 2)
 290                         return UTF8MaskInvalid | 1;
 291                 if (UTF8IsTrailByte(us[1])) {
 292                         return 2;
 293                 } else {
 294                         return UTF8MaskInvalid | 1;
 295                 }
 296         } else {
 297                 // 0xc0 .. 0xc1 is overlong encoding
 298                 // 0x80 .. 0xbf is trail byte
 299                 return UTF8MaskInvalid | 1;
 300         }
 301 }
 302
 303 int UTF8DrawBytes(const unsigned char *us, int len) {
 304         int utf8StatusNext = UTF8Classify(us, len);
 305         return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
 306 }
 307
 308 // Replace invalid bytes in UTF-8 with the replacement character
 309 std::string FixInvalidUTF8(const std::string &text) {
 310         std::string result;
 311         const unsigned char *us = reinterpret_cast<const unsigned char *>(text.c_str());
 312         size_t remaining = text.size();
 313         while (remaining > 0) {
 314                 const int utf8Status = UTF8Classify(us, static_cast<int>(remaining));
 315                 if (utf8Status & UTF8MaskInvalid) {
 316                         // Replacement character 0xFFFD = UTF8:"efbfbd".
 317                         result.append("\xef\xbf\xbd");
 318                         us++;
 319                         remaining--;
 320                 } else {
 321                         const int len = utf8Status&UTF8MaskWidth;
 322                         result.append(reinterpret_cast<const char *>(us), len);
 323                         us += len;
 324                         remaining -= len;
 325                 }
 326         }
 327         return result;
 328 }
 329
 330 #ifdef SCI_NAMESPACE
 331 }
 332 #endif