scintilla/src/UniConversion.cxx

   1 // Scintilla source code edit control
   2 /** @file UniConversion.cxx
   3  ** Functions to handle UTF-8 and UTF-16 strings.
   4  **/
   5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
   6 // The License.txt file describes the conditions under which this software may be distributed.
   7
   8 #include <stdlib.h>
   9
  10 #include <stdexcept>
  11
  12 #include "UniConversion.h"
  13
  14 #ifdef SCI_NAMESPACE
  15 using namespace Scintilla;
  16 #endif
  17
  18 #ifdef SCI_NAMESPACE
  19 namespace Scintilla {
  20 #endif
  21
  22 enum { SURROGATE_TRAIL_FIRST = 0xDC00 };
  23 enum { SURROGATE_TRAIL_LAST = 0xDFFF };
  24 enum { SUPPLEMENTAL_PLANE_FIRST = 0x10000 };
  25
  26 unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {
  27         unsigned int len = 0;
  28         for (unsigned int i = 0; i < tlen && uptr[i];) {
  29                 unsigned int uch = uptr[i];
  30                 if (uch < 0x80) {
  31                         len++;
  32                 } else if (uch < 0x800) {
  33                         len += 2;
  34                 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
  35                         (uch <= SURROGATE_TRAIL_LAST)) {
  36                         len += 4;
  37                         i++;
  38                 } else {
  39                         len += 3;
  40                 }
  41                 i++;
  42         }
  43         return len;
  44 }
  45
  46 void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
  47         unsigned int k = 0;
  48         for (unsigned int i = 0; i < tlen && uptr[i];) {
  49                 unsigned int uch = uptr[i];
  50                 if (uch < 0x80) {
  51                         putf[k++] = static_cast<char>(uch);
  52                 } else if (uch < 0x800) {
  53                         putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
  54                         putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  55                 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
  56                         (uch <= SURROGATE_TRAIL_LAST)) {
  57                         // Half a surrogate pair
  58                         i++;
  59                         unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
  60                         putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
  61                         putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
  62                         putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
  63                         putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
  64                 } else {
  65                         putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
  66                         putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
  67                         putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  68                 }
  69                 i++;
  70         }
  71         if (k < len)
  72                 putf[k] = '\0';
  73 }
  74
  75 unsigned int UTF8CharLength(unsigned char ch) {
  76         if (ch < 0x80) {
  77                 return 1;
  78         } else if (ch < 0x80 + 0x40 + 0x20) {
  79                 return 2;
  80         } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
  81                 return 3;
  82         } else {
  83                 return 4;
  84         }
  85 }
  86
  87 size_t UTF16Length(const char *s, size_t len) {
  88         size_t ulen = 0;
  89         size_t charLen;
  90         for (size_t i = 0; i<len;) {
  91                 unsigned char ch = static_cast<unsigned char>(s[i]);
  92                 if (ch < 0x80) {
  93                         charLen = 1;
  94                 } else if (ch < 0x80 + 0x40 + 0x20) {
  95                         charLen = 2;
  96                 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
  97                         charLen = 3;
  98                 } else {
  99                         charLen = 4;
 100                         ulen++;
 101                 }
 102                 i += charLen;
 103                 ulen++;
 104         }
 105         return ulen;
 106 }
 107
 108 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
 109         size_t ui = 0;
 110         const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
 111         size_t i = 0;
 112         while ((i<len) && (ui<tlen)) {
 113                 unsigned char ch = us[i++];
 114                 if (ch < 0x80) {
 115                         tbuf[ui] = ch;
 116                 } else if (ch < 0x80 + 0x40 + 0x20) {
 117                         tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
 118                         ch = us[i++];
 119                         tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
 120                 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
 121                         tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
 122                         ch = us[i++];
 123                         tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));
 124                         ch = us[i++];
 125                         tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
 126                 } else {
 127                         // Outside the BMP so need two surrogates
 128                         int val = (ch & 0x7) << 18;
 129                         ch = us[i++];
 130                         val += (ch & 0x3F) << 12;
 131                         ch = us[i++];
 132                         val += (ch & 0x3F) << 6;
 133                         ch = us[i++];
 134                         val += (ch & 0x3F);
 135                         tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
 136                         ui++;
 137                         tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
 138                 }
 139                 ui++;
 140         }
 141         return ui;
 142 }
 143
 144 unsigned int UTF32FromUTF8(const char *s, unsigned int len, unsigned int *tbuf, unsigned int tlen) {
 145         unsigned int ui=0;
 146         const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
 147         unsigned int i=0;
 148         while ((i<len) && (ui<tlen)) {
 149                 unsigned char ch = us[i++];
 150                 unsigned int value = 0;
 151                 if (ch < 0x80) {
 152                         value = ch;
 153                 } else if (((len-i) >= 1) && (ch < 0x80 + 0x40 + 0x20)) {
 154                         value = (ch & 0x1F) << 6;
 155                         ch = us[i++];
 156                         value += ch & 0x7F;
 157                 } else if (((len-i) >= 2) && (ch < 0x80 + 0x40 + 0x20 + 0x10)) {
 158                         value = (ch & 0xF) << 12;
 159                         ch = us[i++];
 160                         value += (ch & 0x7F) << 6;
 161                         ch = us[i++];
 162                         value += ch & 0x7F;
 163                 } else if ((len-i) >= 3) {
 164                         value = (ch & 0x7) << 18;
 165                         ch = us[i++];
 166                         value += (ch & 0x3F) << 12;
 167                         ch = us[i++];
 168                         value += (ch & 0x3F) << 6;
 169                         ch = us[i++];
 170                         value += ch & 0x3F;
 171                 }
 172                 tbuf[ui] = value;
 173                 ui++;
 174         }
 175         return ui;
 176 }
 177
 178 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
 179         if (val < SUPPLEMENTAL_PLANE_FIRST) {
 180                 tbuf[0] = static_cast<wchar_t>(val);
 181                 return 1;
 182         } else {
 183                 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
 184                 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
 185                 return 2;
 186         }
 187 }
 188
 189 int UTF8BytesOfLead[256];
 190 static bool initialisedBytesOfLead = false;
 191
 192 static int BytesFromLead(int leadByte) {
 193         if (leadByte < 0xC2) {
 194                 // Single byte or invalid
 195                 return 1;
 196         } else if (leadByte < 0xE0) {
 197                 return 2;
 198         } else if (leadByte < 0xF0) {
 199                 return 3;
 200         } else if (leadByte < 0xF5) {
 201                 return 4;
 202         } else {
 203                 // Characters longer than 4 bytes not possible in current UTF-8
 204                 return 1;
 205         }
 206 }
 207
 208 void UTF8BytesOfLeadInitialise() {
 209         if (!initialisedBytesOfLead) {
 210                 for (int i=0; i<256; i++) {
 211                         UTF8BytesOfLead[i] = BytesFromLead(i);
 212                 }
 213                 initialisedBytesOfLead = true;
 214         }
 215 }
 216
 217 // Return both the width of the first character in the string and a status
 218 // saying whether it is valid or invalid.
 219 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
 220 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
 221 // reasonably treated as code points in some circumstances. They will, however,
 222 // not have associated glyphs.
 223 int UTF8Classify(const unsigned char *us, int len) {
 224         // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 225         if (*us < 0x80) {
 226                 // Single bytes easy
 227                 return 1;
 228         } else if (*us > 0xf4) {
 229                 // Characters longer than 4 bytes not possible in current UTF-8
 230                 return UTF8MaskInvalid | 1;
 231         } else if (*us >= 0xf0) {
 232                 // 4 bytes
 233                 if (len < 4)
 234                         return UTF8MaskInvalid | 1;
 235                 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
 236                         if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
 237                                 // *FFFE or *FFFF non-character
 238                                 return UTF8MaskInvalid | 4;
 239                         }
 240                         if (*us == 0xf4) {
 241                                 // Check if encoding a value beyond the last Unicode character 10FFFF
 242                                 if (us[1] > 0x8f) {
 243                                         return UTF8MaskInvalid | 1;
 244                                 } else if (us[1] == 0x8f) {
 245                                         if (us[2] > 0xbf) {
 246                                                 return UTF8MaskInvalid | 1;
 247                                         } else if (us[2] == 0xbf) {
 248                                                 if (us[3] > 0xbf) {
 249                                                         return UTF8MaskInvalid | 1;
 250                                                 }
 251                                         }
 252                                 }
 253                         } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
 254                                 // Overlong
 255                                 return UTF8MaskInvalid | 1;
 256                         }
 257                         return 4;
 258                 } else {
 259                         return UTF8MaskInvalid | 1;
 260                 }
 261         } else if (*us >= 0xe0) {
 262                 // 3 bytes
 263                 if (len < 3)
 264                         return UTF8MaskInvalid | 1;
 265                 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
 266                         if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
 267                                 // Overlong
 268                                 return UTF8MaskInvalid | 1;
 269                         }
 270                         if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
 271                                 // Surrogate
 272                                 return UTF8MaskInvalid | 1;
 273                         }
 274                         if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
 275                                 // U+FFFE non-character - 3 bytes long
 276                                 return UTF8MaskInvalid | 3;
 277                         }
 278                         if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
 279                                 // U+FFFF non-character - 3 bytes long
 280                                 return UTF8MaskInvalid | 3;
 281                         }
 282                         if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
 283                                 // U+FDD0 .. U+FDEF
 284                                 return UTF8MaskInvalid | 3;
 285                         }
 286                         return 3;
 287                 } else {
 288                         return UTF8MaskInvalid | 1;
 289                 }
 290         } else if (*us >= 0xc2) {
 291                 // 2 bytes
 292                 if (len < 2)
 293                         return UTF8MaskInvalid | 1;
 294                 if (UTF8IsTrailByte(us[1])) {
 295                         return 2;
 296                 } else {
 297                         return UTF8MaskInvalid | 1;
 298                 }
 299         } else {
 300                 // 0xc0 .. 0xc1 is overlong encoding
 301                 // 0x80 .. 0xbf is trail byte
 302                 return UTF8MaskInvalid | 1;
 303         }
 304 }
 305
 306 int UTF8DrawBytes(const unsigned char *us, int len) {
 307         int utf8StatusNext = UTF8Classify(us, len);
 308         return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
 309 }
 310
 311 #ifdef SCI_NAMESPACE
 312 }
 313 #endif