scintilla/src/UniConversion.cxx

   1 // Scintilla source code edit control
   2 /** @file UniConversion.cxx
   3  ** Functions to handle UTF-8 and UTF-16 strings.
   4  **/
   5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
   6 // The License.txt file describes the conditions under which this software may be distributed.
   7
   8 #include <cstdlib>
   9
  10 #include <stdexcept>
  11 #include <string>
  12 #include <string_view>
  13
  14 #include "UniConversion.h"
  15
  16 namespace Scintilla::Internal {
  17
  18 size_t UTF8Length(std::wstring_view wsv) noexcept {
  19         size_t len = 0;
  20         for (size_t i = 0; i < wsv.length() && wsv[i];) {
  21                 const unsigned int uch = wsv[i];
  22                 if (uch < 0x80) {
  23                         len++;
  24                 } else if (uch < 0x800) {
  25                         len += 2;
  26                 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
  27                         (uch <= SURROGATE_TRAIL_LAST)) {
  28                         len += 4;
  29                         i++;
  30                 } else {
  31                         len += 3;
  32                 }
  33                 i++;
  34         }
  35         return len;
  36 }
  37
  38 size_t UTF8PositionFromUTF16Position(std::string_view u8Text, size_t positionUTF16) noexcept {
  39         size_t positionUTF8 = 0;
  40         for (size_t lengthUTF16 = 0; (positionUTF8 < u8Text.length()) && (lengthUTF16 < positionUTF16);) {
  41                 const unsigned char uch = u8Text[positionUTF8];
  42                 const unsigned int byteCount = UTF8BytesOfLead[uch];
  43                 lengthUTF16 += UTF16LengthFromUTF8ByteCount(byteCount);
  44                 positionUTF8 += byteCount;
  45         }
  46
  47         return positionUTF8;
  48 }
  49
  50 void UTF8FromUTF16(std::wstring_view wsv, char *putf, size_t len) noexcept {
  51         size_t k = 0;
  52         for (size_t i = 0; i < wsv.length() && wsv[i];) {
  53                 const unsigned int uch = wsv[i];
  54                 if (uch < 0x80) {
  55                         putf[k++] = static_cast<char>(uch);
  56                 } else if (uch < 0x800) {
  57                         putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
  58                         putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  59                 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
  60                         (uch <= SURROGATE_TRAIL_LAST)) {
  61                         // Half a surrogate pair
  62                         i++;
  63                         const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (wsv[i] & 0x3ff);
  64                         putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
  65                         putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
  66                         putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
  67                         putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
  68                 } else {
  69                         putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
  70                         putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
  71                         putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  72                 }
  73                 i++;
  74         }
  75         if (k < len)
  76                 putf[k] = '\0';
  77 }
  78
  79 void UTF8FromUTF32Character(int uch, char *putf) noexcept {
  80         size_t k = 0;
  81         if (uch < 0x80) {
  82                 putf[k++] = static_cast<char>(uch);
  83         } else if (uch < 0x800) {
  84                 putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
  85                 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  86         } else if (uch < 0x10000) {
  87                 putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
  88                 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
  89                 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  90         } else {
  91                 putf[k++] = static_cast<char>(0xF0 | (uch >> 18));
  92                 putf[k++] = static_cast<char>(0x80 | ((uch >> 12) & 0x3f));
  93                 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
  94                 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  95         }
  96         putf[k] = '\0';
  97 }
  98
  99 size_t UTF16Length(std::string_view svu8) noexcept {
 100         size_t ulen = 0;
 101         for (size_t i = 0; i< svu8.length();) {
 102                 const unsigned char ch = svu8[i];
 103                 const unsigned int byteCount = UTF8BytesOfLead[ch];
 104                 const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
 105                 i += byteCount;
 106                 ulen += (i > svu8.length()) ? 1 : utf16Len;
 107         }
 108         return ulen;
 109 }
 110
 111 constexpr unsigned char TrailByteValue(unsigned char c) {
 112         // The top 2 bits are 0b10 to indicate a trail byte.
 113         // The lower 6 bits contain the value.
 114         return c & 0b0011'1111;
 115 }
 116
 117 size_t UTF16FromUTF8(std::string_view svu8, wchar_t *tbuf, size_t tlen) {
 118         size_t ui = 0;
 119         for (size_t i = 0; i < svu8.length();) {
 120                 unsigned char ch = svu8[i];
 121                 const unsigned int byteCount = UTF8BytesOfLead[ch];
 122                 unsigned int value;
 123
 124                 if (i + byteCount > svu8.length()) {
 125                         // Trying to read past end but still have space to write
 126                         if (ui < tlen) {
 127                                 tbuf[ui] = ch;
 128                                 ui++;
 129                         }
 130                         break;
 131                 }
 132
 133                 const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
 134                 if (ui + outLen > tlen) {
 135                         throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
 136                 }
 137
 138                 i++;
 139                 switch (byteCount) {
 140                 case 1:
 141                         tbuf[ui] = ch;
 142                         break;
 143                 case 2:
 144                         value = (ch & 0x1F) << 6;
 145                         ch = svu8[i++];
 146                         value += TrailByteValue(ch);
 147                         tbuf[ui] = static_cast<wchar_t>(value);
 148                         break;
 149                 case 3:
 150                         value = (ch & 0xF) << 12;
 151                         ch = svu8[i++];
 152                         value += (TrailByteValue(ch) << 6);
 153                         ch = svu8[i++];
 154                         value += TrailByteValue(ch);
 155                         tbuf[ui] = static_cast<wchar_t>(value);
 156                         break;
 157                 default:
 158                         // Outside the BMP so need two surrogates
 159                         value = (ch & 0x7) << 18;
 160                         ch = svu8[i++];
 161                         value += TrailByteValue(ch) << 12;
 162                         ch = svu8[i++];
 163                         value += TrailByteValue(ch) << 6;
 164                         ch = svu8[i++];
 165                         value += TrailByteValue(ch);
 166                         tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
 167                         ui++;
 168                         tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
 169                         break;
 170                 }
 171                 ui++;
 172         }
 173         return ui;
 174 }
 175
 176 size_t UTF32Length(std::string_view svu8) noexcept {
 177         size_t ulen = 0;
 178         for (size_t i = 0; i < svu8.length();) {
 179                 const unsigned char ch = svu8[i];
 180                 const unsigned int byteCount = UTF8BytesOfLead[ch];
 181                 i += byteCount;
 182                 ulen++;
 183         }
 184         return ulen;
 185 }
 186
 187 size_t UTF32FromUTF8(std::string_view svu8, unsigned int *tbuf, size_t tlen) {
 188         size_t ui = 0;
 189         for (size_t i = 0; i < svu8.length();) {
 190                 unsigned char ch = svu8[i];
 191                 const unsigned int byteCount = UTF8BytesOfLead[ch];
 192                 unsigned int value;
 193
 194                 if (i + byteCount > svu8.length()) {
 195                         // Trying to read past end but still have space to write
 196                         if (ui < tlen) {
 197                                 tbuf[ui] = ch;
 198                                 ui++;
 199                         }
 200                         break;
 201                 }
 202
 203                 if (ui == tlen) {
 204                         throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
 205                 }
 206
 207                 i++;
 208                 switch (byteCount) {
 209                 case 1:
 210                         value = ch;
 211                         break;
 212                 case 2:
 213                         value = (ch & 0x1F) << 6;
 214                         ch = svu8[i++];
 215                         value += TrailByteValue(ch);
 216                         break;
 217                 case 3:
 218                         value = (ch & 0xF) << 12;
 219                         ch = svu8[i++];
 220                         value += TrailByteValue(ch) << 6;
 221                         ch = svu8[i++];
 222                         value += TrailByteValue(ch);
 223                         break;
 224                 default:
 225                         value = (ch & 0x7) << 18;
 226                         ch = svu8[i++];
 227                         value += TrailByteValue(ch) << 12;
 228                         ch = svu8[i++];
 229                         value += TrailByteValue(ch) << 6;
 230                         ch = svu8[i++];
 231                         value += TrailByteValue(ch);
 232                         break;
 233                 }
 234                 tbuf[ui] = value;
 235                 ui++;
 236         }
 237         return ui;
 238 }
 239
 240 std::wstring WStringFromUTF8(std::string_view svu8) {
 241         if constexpr (sizeof(wchar_t) == 2) {
 242                 const size_t len16 = UTF16Length(svu8);
 243                 std::wstring ws(len16, 0);
 244                 UTF16FromUTF8(svu8, &ws[0], len16);
 245                 return ws;
 246         } else {
 247                 const size_t len32 = UTF32Length(svu8);
 248                 std::wstring ws(len32, 0);
 249                 UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(&ws[0]), len32);
 250                 return ws;
 251         }
 252 }
 253
 254 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
 255         if (val < SUPPLEMENTAL_PLANE_FIRST) {
 256                 tbuf[0] = static_cast<wchar_t>(val);
 257                 return 1;
 258         } else {
 259                 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
 260                 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
 261                 return 2;
 262         }
 263 }
 264
 265 const unsigned char UTF8BytesOfLead[256] = {
 266 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
 267 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
 268 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
 269 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
 270 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
 271 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
 272 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
 273 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
 274 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F
 275 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F
 276 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF
 277 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF
 278 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF
 279 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
 280 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
 281 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF
 282 };
 283
 284 // Return both the width of the first character in the string and a status
 285 // saying whether it is valid or invalid.
 286 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
 287 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
 288 // reasonably treated as code points in some circumstances. They will, however,
 289 // not have associated glyphs.
 290 int UTF8Classify(const unsigned char *us, size_t len) noexcept {
 291         // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 292         if (us[0] < 0x80) {
 293                 // ASCII
 294                 return 1;
 295         }
 296
 297         const size_t byteCount = UTF8BytesOfLead[us[0]];
 298         if (byteCount == 1 || byteCount > len) {
 299                 // Invalid lead byte
 300                 return UTF8MaskInvalid | 1;
 301         }
 302
 303         if (!UTF8IsTrailByte(us[1])) {
 304                 // Invalid trail byte
 305                 return UTF8MaskInvalid | 1;
 306         }
 307
 308         switch (byteCount) {
 309         case 2:
 310                 return 2;
 311
 312         case 3:
 313                 if (UTF8IsTrailByte(us[2])) {
 314                         if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
 315                                 // Overlong
 316                                 return UTF8MaskInvalid | 1;
 317                         }
 318                         if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
 319                                 // Surrogate
 320                                 return UTF8MaskInvalid | 1;
 321                         }
 322                         if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
 323                                 // U+FFFE non-character - 3 bytes long
 324                                 return UTF8MaskInvalid | 3;
 325                         }
 326                         if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
 327                                 // U+FFFF non-character - 3 bytes long
 328                                 return UTF8MaskInvalid | 3;
 329                         }
 330                         if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
 331                                 // U+FDD0 .. U+FDEF
 332                                 return UTF8MaskInvalid | 3;
 333                         }
 334                         return 3;
 335                 }
 336                 break;
 337
 338         default:
 339                 if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
 340                         if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
 341                                 // *FFFE or *FFFF non-character
 342                                 return UTF8MaskInvalid | 4;
 343                         }
 344                         if (*us == 0xf4) {
 345                                 // Check if encoding a value beyond the last Unicode character 10FFFF
 346                                 if (us[1] > 0x8f) {
 347                                         return UTF8MaskInvalid | 1;
 348                                 }
 349                         } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
 350                                 // Overlong
 351                                 return UTF8MaskInvalid | 1;
 352                         }
 353                         return 4;
 354                 }
 355                 break;
 356         }
 357
 358         return UTF8MaskInvalid | 1;
 359 }
 360
 361 int UTF8DrawBytes(const char *s, size_t len) noexcept {
 362         const int utf8StatusNext = UTF8Classify(reinterpret_cast<const unsigned char *>(s), len);
 363         return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
 364 }
 365
 366 bool UTF8IsValid(std::string_view svu8) noexcept {
 367         const unsigned char *us = reinterpret_cast<const unsigned char *>(svu8.data());
 368         size_t remaining = svu8.length();
 369         while (remaining > 0) {
 370                 const int utf8Status = UTF8Classify(us, remaining);
 371                 if (utf8Status & UTF8MaskInvalid) {
 372                         return false;
 373                 } else {
 374                         const int lenChar = utf8Status & UTF8MaskWidth;
 375                         us += lenChar;
 376                         remaining -= lenChar;
 377                 }
 378         }
 379         return remaining == 0;
 380 }
 381
 382 // Replace invalid bytes in UTF-8 with the replacement character
 383 std::string FixInvalidUTF8(const std::string &text) {
 384         std::string result;
 385         const char *s = text.c_str();
 386         size_t remaining = text.size();
 387         while (remaining > 0) {
 388                 const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining);
 389                 if (utf8Status & UTF8MaskInvalid) {
 390                         // Replacement character 0xFFFD = UTF8:"efbfbd".
 391                         result.append("\xef\xbf\xbd");
 392                         s++;
 393                         remaining--;
 394                 } else {
 395                         const size_t len = utf8Status & UTF8MaskWidth;
 396                         result.append(s, len);
 397                         s += len;
 398                         remaining -= len;
 399                 }
 400         }
 401         return result;
 402 }
 403
 404 }