ext/scintilla/src/UniConversion.cxx

   1 // Scintilla source code edit control
   2 /** @file UniConversion.cxx
   3  ** Functions to handle UTF-8 and UTF-16 strings.
   4  **/
   5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
   6 // The License.txt file describes the conditions under which this software may be distributed.
   7
   8 #include <cstdlib>
   9
  10 #include <stdexcept>
  11 #include <string>
  12
  13 #include "UniConversion.h"
  14
  15 using namespace Scintilla;
  16
  17 namespace Scintilla {
  18
  19 size_t UTF8Length(const wchar_t *uptr, size_t tlen) {
  20         size_t len = 0;
  21         for (size_t i = 0; i < tlen && uptr[i];) {
  22                 const unsigned int uch = uptr[i];
  23                 if (uch < 0x80) {
  24                         len++;
  25                 } else if (uch < 0x800) {
  26                         len += 2;
  27                 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
  28                         (uch <= SURROGATE_TRAIL_LAST)) {
  29                         len += 4;
  30                         i++;
  31                 } else {
  32                         len += 3;
  33                 }
  34                 i++;
  35         }
  36         return len;
  37 }
  38
  39 void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) {
  40         size_t k = 0;
  41         for (size_t i = 0; i < tlen && uptr[i];) {
  42                 const unsigned int uch = uptr[i];
  43                 if (uch < 0x80) {
  44                         putf[k++] = static_cast<char>(uch);
  45                 } else if (uch < 0x800) {
  46                         putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
  47                         putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  48                 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
  49                         (uch <= SURROGATE_TRAIL_LAST)) {
  50                         // Half a surrogate pair
  51                         i++;
  52                         const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
  53                         putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
  54                         putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
  55                         putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
  56                         putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
  57                 } else {
  58                         putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
  59                         putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
  60                         putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  61                 }
  62                 i++;
  63         }
  64         if (k < len)
  65                 putf[k] = '\0';
  66 }
  67
  68 void UTF8FromUTF32Character(int uch, char *putf) {
  69         size_t k = 0;
  70         if (uch < 0x80) {
  71                 putf[k++] = static_cast<char>(uch);
  72         } else if (uch < 0x800) {
  73                 putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
  74                 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  75         } else if (uch < 0x10000) {
  76                 putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
  77                 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
  78                 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  79         } else {
  80                 putf[k++] = static_cast<char>(0xF0 | (uch >> 18));
  81                 putf[k++] = static_cast<char>(0x80 | ((uch >> 12) & 0x3f));
  82                 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
  83                 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  84         }
  85         putf[k] = '\0';
  86 }
  87
  88 size_t UTF16Length(const char *s, size_t len) {
  89         size_t ulen = 0;
  90         const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
  91         for (size_t i = 0; i < len;) {
  92                 const unsigned char ch = us[i];
  93                 const unsigned int byteCount = UTF8BytesOfLead[ch];
  94                 const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
  95                 i += byteCount;
  96                 ulen += (i > len) ? 1 : utf16Len;
  97         }
  98         return ulen;
  99 }
 100
 101 constexpr unsigned char TrailByteValue(unsigned char c) {
 102         // The top 2 bits are 0b10 to indicate a trail byte.
 103         // The lower 6 bits contain the value.
 104         return c & 0b0011'1111;
 105 }
 106
 107 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
 108         size_t ui = 0;
 109         const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
 110         for (size_t i = 0; i < len;) {
 111                 unsigned char ch = us[i];
 112                 const unsigned int byteCount = UTF8BytesOfLead[ch];
 113                 unsigned int value;
 114
 115                 if (i + byteCount > len) {
 116                         // Trying to read past end but still have space to write
 117                         if (ui < tlen) {
 118                                 tbuf[ui] = ch;
 119                                 ui++;
 120                         }
 121                         break;
 122                 }
 123
 124                 const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
 125                 if (ui + outLen > tlen) {
 126                         throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
 127                 }
 128
 129                 i++;
 130                 switch (byteCount) {
 131                 case 1:
 132                         tbuf[ui] = ch;
 133                         break;
 134                 case 2:
 135                         value = (ch & 0x1F) << 6;
 136                         ch = us[i++];
 137                         value += TrailByteValue(ch);
 138                         tbuf[ui] = static_cast<wchar_t>(value);
 139                         break;
 140                 case 3:
 141                         value = (ch & 0xF) << 12;
 142                         ch = us[i++];
 143                         value += (TrailByteValue(ch) << 6);
 144                         ch = us[i++];
 145                         value += TrailByteValue(ch);
 146                         tbuf[ui] = static_cast<wchar_t>(value);
 147                         break;
 148                 default:
 149                         // Outside the BMP so need two surrogates
 150                         value = (ch & 0x7) << 18;
 151                         ch = us[i++];
 152                         value += TrailByteValue(ch) << 12;
 153                         ch = us[i++];
 154                         value += TrailByteValue(ch) << 6;
 155                         ch = us[i++];
 156                         value += TrailByteValue(ch);
 157                         tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
 158                         ui++;
 159                         tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
 160                         break;
 161                 }
 162                 ui++;
 163         }
 164         return ui;
 165 }
 166
 167 size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) {
 168         size_t ui = 0;
 169         const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
 170         for (size_t i = 0; i < len;) {
 171                 unsigned char ch = us[i];
 172                 const unsigned int byteCount = UTF8BytesOfLead[ch];
 173                 unsigned int value;
 174
 175                 if (i + byteCount > len) {
 176                         // Trying to read past end but still have space to write
 177                         if (ui < tlen) {
 178                                 tbuf[ui] = ch;
 179                                 ui++;
 180                         }
 181                         break;
 182                 }
 183
 184                 if (ui == tlen) {
 185                         throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
 186                 }
 187
 188                 i++;
 189                 switch (byteCount) {
 190                 case 1:
 191                         value = ch;
 192                         break;
 193                 case 2:
 194                         value = (ch & 0x1F) << 6;
 195                         ch = us[i++];
 196                         value += TrailByteValue(ch);
 197                         break;
 198                 case 3:
 199                         value = (ch & 0xF) << 12;
 200                         ch = us[i++];
 201                         value += TrailByteValue(ch) << 6;
 202                         ch = us[i++];
 203                         value += TrailByteValue(ch);
 204                         break;
 205                 default:
 206                         value = (ch & 0x7) << 18;
 207                         ch = us[i++];
 208                         value += TrailByteValue(ch) << 12;
 209                         ch = us[i++];
 210                         value += TrailByteValue(ch) << 6;
 211                         ch = us[i++];
 212                         value += TrailByteValue(ch);
 213                         break;
 214                 }
 215                 tbuf[ui] = value;
 216                 ui++;
 217         }
 218         return ui;
 219 }
 220
 221 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
 222         if (val < SUPPLEMENTAL_PLANE_FIRST) {
 223                 tbuf[0] = static_cast<wchar_t>(val);
 224                 return 1;
 225         } else {
 226                 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
 227                 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
 228                 return 2;
 229         }
 230 }
 231
 232 const unsigned char UTF8BytesOfLead[256] = {
 233 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
 234 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
 235 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
 236 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
 237 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
 238 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
 239 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
 240 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
 241 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F
 242 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F
 243 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF
 244 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF
 245 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF
 246 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
 247 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
 248 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF
 249 };
 250
 251 // Return both the width of the first character in the string and a status
 252 // saying whether it is valid or invalid.
 253 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
 254 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
 255 // reasonably treated as code points in some circumstances. They will, however,
 256 // not have associated glyphs.
 257 int UTF8Classify(const unsigned char *us, int len) {
 258         // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 259         if (us[0] < 0x80) {
 260                 // ASCII
 261                 return 1;
 262         }
 263
 264         const int byteCount = UTF8BytesOfLead[us[0]];
 265         if (byteCount == 1 || byteCount > len) {
 266                 // Invalid lead byte
 267                 return UTF8MaskInvalid | 1;
 268         }
 269
 270         if (!UTF8IsTrailByte(us[1])) {
 271                 // Invalid trail byte
 272                 return UTF8MaskInvalid | 1;
 273         }
 274
 275         switch (byteCount) {
 276         case 2:
 277                 return 2;
 278
 279         case 3:
 280                 if (UTF8IsTrailByte(us[2])) {
 281                         if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
 282                                 // Overlong
 283                                 return UTF8MaskInvalid | 1;
 284                         }
 285                         if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
 286                                 // Surrogate
 287                                 return UTF8MaskInvalid | 1;
 288                         }
 289                         if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
 290                                 // U+FFFE non-character - 3 bytes long
 291                                 return UTF8MaskInvalid | 3;
 292                         }
 293                         if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
 294                                 // U+FFFF non-character - 3 bytes long
 295                                 return UTF8MaskInvalid | 3;
 296                         }
 297                         if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
 298                                 // U+FDD0 .. U+FDEF
 299                                 return UTF8MaskInvalid | 3;
 300                         }
 301                         return 3;
 302                 }
 303                 break;
 304
 305         default:
 306                 if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
 307                         if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
 308                                 // *FFFE or *FFFF non-character
 309                                 return UTF8MaskInvalid | 4;
 310                         }
 311                         if (*us == 0xf4) {
 312                                 // Check if encoding a value beyond the last Unicode character 10FFFF
 313                                 if (us[1] > 0x8f) {
 314                                         return UTF8MaskInvalid | 1;
 315                                 }
 316                         } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
 317                                 // Overlong
 318                                 return UTF8MaskInvalid | 1;
 319                         }
 320                         return 4;
 321                 }
 322                 break;
 323         }
 324
 325         return UTF8MaskInvalid | 1;
 326 }
 327
 328 int UTF8DrawBytes(const unsigned char *us, int len) {
 329         const int utf8StatusNext = UTF8Classify(us, len);
 330         return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
 331 }
 332
 333 // Replace invalid bytes in UTF-8 with the replacement character
 334 std::string FixInvalidUTF8(const std::string &text) {
 335         std::string result;
 336         const unsigned char *us = reinterpret_cast<const unsigned char *>(text.c_str());
 337         size_t remaining = text.size();
 338         while (remaining > 0) {
 339                 const int utf8Status = UTF8Classify(us, static_cast<int>(remaining));
 340                 if (utf8Status & UTF8MaskInvalid) {
 341                         // Replacement character 0xFFFD = UTF8:"efbfbd".
 342                         result.append("\xef\xbf\xbd");
 343                         us++;
 344                         remaining--;
 345                 } else {
 346                         const int len = utf8Status&UTF8MaskWidth;
 347                         result.append(reinterpret_cast<const char *>(us), len);
 348                         us += len;
 349                         remaining -= len;
 350                 }
 351         }
 352         return result;
 353 }
 354
 355 }