ext/scintilla/src/UniConversion.cxx

   1 // Scintilla source code edit control
   2 /** @file UniConversion.cxx
   3  ** Functions to handle UTF-8 and UTF-16 strings.
   4  **/
   5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
   6 // The License.txt file describes the conditions under which this software may be distributed.
   7
   8 #include <stdlib.h>
   9
  10 #include "UniConversion.h"
  11
  12 #ifdef SCI_NAMESPACE
  13 using namespace Scintilla;
  14 #endif
  15
  16 #ifdef SCI_NAMESPACE
  17 namespace Scintilla {
  18 #endif
  19
  20 enum { SURROGATE_LEAD_FIRST = 0xD800 };
  21 enum { SURROGATE_TRAIL_FIRST = 0xDC00 };
  22 enum { SURROGATE_TRAIL_LAST = 0xDFFF };
  23 enum { SUPPLEMENTAL_PLANE_FIRST = 0x10000 };
  24
  25 unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {
  26         unsigned int len = 0;
  27         for (unsigned int i = 0; i < tlen && uptr[i];) {
  28                 unsigned int uch = uptr[i];
  29                 if (uch < 0x80) {
  30                         len++;
  31                 } else if (uch < 0x800) {
  32                         len += 2;
  33                 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
  34                         (uch <= SURROGATE_TRAIL_LAST)) {
  35                         len += 4;
  36                         i++;
  37                 } else {
  38                         len += 3;
  39                 }
  40                 i++;
  41         }
  42         return len;
  43 }
  44
  45 void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
  46         int k = 0;
  47         for (unsigned int i = 0; i < tlen && uptr[i];) {
  48                 unsigned int uch = uptr[i];
  49                 if (uch < 0x80) {
  50                         putf[k++] = static_cast<char>(uch);
  51                 } else if (uch < 0x800) {
  52                         putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
  53                         putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  54                 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
  55                         (uch <= SURROGATE_TRAIL_LAST)) {
  56                         // Half a surrogate pair
  57                         i++;
  58                         unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
  59                         putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
  60                         putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
  61                         putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
  62                         putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
  63                 } else {
  64                         putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
  65                         putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
  66                         putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  67                 }
  68                 i++;
  69         }
  70         putf[len] = '\0';
  71 }
  72
  73 unsigned int UTF8CharLength(unsigned char ch) {
  74         if (ch < 0x80) {
  75                 return 1;
  76         } else if (ch < 0x80 + 0x40 + 0x20) {
  77                 return 2;
  78         } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
  79                 return 3;
  80         } else {
  81                 return 4;
  82         }
  83 }
  84
  85 unsigned int UTF16Length(const char *s, unsigned int len) {
  86         unsigned int ulen = 0;
  87         unsigned int charLen;
  88         for (unsigned int i=0; i<len;) {
  89                 unsigned char ch = static_cast<unsigned char>(s[i]);
  90                 if (ch < 0x80) {
  91                         charLen = 1;
  92                 } else if (ch < 0x80 + 0x40 + 0x20) {
  93                         charLen = 2;
  94                 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
  95                         charLen = 3;
  96                 } else {
  97                         charLen = 4;
  98                         ulen++;
  99                 }
 100                 i += charLen;
 101                 ulen++;
 102         }
 103         return ulen;
 104 }
 105
 106 unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) {
 107         unsigned int ui=0;
 108         const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
 109         unsigned int i=0;
 110         while ((i<len) && (ui<tlen)) {
 111                 unsigned char ch = us[i++];
 112                 if (ch < 0x80) {
 113                         tbuf[ui] = ch;
 114                 } else if (ch < 0x80 + 0x40 + 0x20) {
 115                         tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
 116                         ch = us[i++];
 117                         tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
 118                 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
 119                         tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
 120                         ch = us[i++];
 121                         tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));
 122                         ch = us[i++];
 123                         tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
 124                 } else {
 125                         // Outside the BMP so need two surrogates
 126                         int val = (ch & 0x7) << 18;
 127                         ch = us[i++];
 128                         val += (ch & 0x3F) << 12;
 129                         ch = us[i++];
 130                         val += (ch & 0x3F) << 6;
 131                         ch = us[i++];
 132                         val += (ch & 0x3F);
 133                         tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
 134                         ui++;
 135                         tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
 136                 }
 137                 ui++;
 138         }
 139         return ui;
 140 }
 141
 142 unsigned int UTF32FromUTF8(const char *s, unsigned int len, unsigned int *tbuf, unsigned int tlen) {
 143         unsigned int ui=0;
 144         const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
 145         unsigned int i=0;
 146         while ((i<len) && (ui<tlen)) {
 147                 unsigned char ch = us[i++];
 148                 wchar_t value = 0;
 149                 if (ch < 0x80) {
 150                         value = ch;
 151                 } else if (((len-i) >= 1) && (ch < 0x80 + 0x40 + 0x20)) {
 152                         value = (ch & 0x1F) << 6;
 153                         ch = us[i++];
 154                         value += ch & 0x7F;
 155                 } else if (((len-i) >= 2) && (ch < 0x80 + 0x40 + 0x20 + 0x10)) {
 156                         value = (ch & 0xF) << 12;
 157                         ch = us[i++];
 158                         value += (ch & 0x7F) << 6;
 159                         ch = us[i++];
 160                         value += ch & 0x7F;
 161                 } else if ((len-i) >= 3) {
 162                         value = (ch & 0x7) << 18;
 163                         ch = us[i++];
 164                         value += (ch & 0x3F) << 12;
 165                         ch = us[i++];
 166                         value += (ch & 0x3F) << 6;
 167                         ch = us[i++];
 168                         value += ch & 0x3F;
 169                 }
 170                 tbuf[ui] = value;
 171                 ui++;
 172         }
 173         return ui;
 174 }
 175
 176 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
 177         if (val < SUPPLEMENTAL_PLANE_FIRST) {
 178                 tbuf[0] = static_cast<wchar_t>(val);
 179                 return 1;
 180         } else {
 181                 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
 182                 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
 183                 return 2;
 184         }
 185 }
 186
 187 int UTF8BytesOfLead[256];
 188 static bool initialisedBytesOfLead = false;
 189
 190 static int BytesFromLead(int leadByte) {
 191         if (leadByte < 0xC2) {
 192                 // Single byte or invalid
 193                 return 1;
 194         } else if (leadByte < 0xE0) {
 195                 return 2;
 196         } else if (leadByte < 0xF0) {
 197                 return 3;
 198         } else if (leadByte < 0xF5) {
 199                 return 4;
 200         } else {
 201                 // Characters longer than 4 bytes not possible in current UTF-8
 202                 return 1;
 203         }
 204 }
 205
 206 void UTF8BytesOfLeadInitialise() {
 207         if (!initialisedBytesOfLead) {
 208                 for (int i=0; i<256; i++) {
 209                         UTF8BytesOfLead[i] = BytesFromLead(i);
 210                 }
 211                 initialisedBytesOfLead = true;
 212         }
 213 }
 214
 215 // Return both the width of the first character in the string and a status
 216 // saying whether it is valid or invalid.
 217 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
 218 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
 219 // reasonably treated as code points in some circumstances. They will, however,
 220 // not have associated glyphs.
 221 int UTF8Classify(const unsigned char *us, int len) {
 222         // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 223         if (*us < 0x80) {
 224                 // Single bytes easy
 225                 return 1;
 226         } else if (*us > 0xf4) {
 227                 // Characters longer than 4 bytes not possible in current UTF-8
 228                 return UTF8MaskInvalid | 1;
 229         } else if (*us >= 0xf0) {
 230                 // 4 bytes
 231                 if (len < 4)
 232                         return UTF8MaskInvalid | 1;
 233                 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
 234                         if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
 235                                 // *FFFE or *FFFF non-character
 236                                 return UTF8MaskInvalid | 4;
 237                         }
 238                         if (*us == 0xf4) {
 239                                 // Check if encoding a value beyond the last Unicode character 10FFFF
 240                                 if (us[1] > 0x8f) {
 241                                         return UTF8MaskInvalid | 1;
 242                                 } else if (us[1] == 0x8f) {
 243                                         if (us[2] > 0xbf) {
 244                                                 return UTF8MaskInvalid | 1;
 245                                         } else if (us[2] == 0xbf) {
 246                                                 if (us[3] > 0xbf) {
 247                                                         return UTF8MaskInvalid | 1;
 248                                                 }
 249                                         }
 250                                 }
 251                         } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
 252                                 // Overlong
 253                                 return UTF8MaskInvalid | 1;
 254                         }
 255                         return 4;
 256                 } else {
 257                         return UTF8MaskInvalid | 1;
 258                 }
 259         } else if (*us >= 0xe0) {
 260                 // 3 bytes
 261                 if (len < 3)
 262                         return UTF8MaskInvalid | 1;
 263                 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
 264                         if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
 265                                 // Overlong
 266                                 return UTF8MaskInvalid | 1;
 267                         }
 268                         if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
 269                                 // Surrogate
 270                                 return UTF8MaskInvalid | 1;
 271                         }
 272                         if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
 273                                 // U+FFFE non-character - 3 bytes long
 274                                 return UTF8MaskInvalid | 3;
 275                         }
 276                         if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
 277                                 // U+FFFF non-character - 3 bytes long
 278                                 return UTF8MaskInvalid | 3;
 279                         }
 280                         if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
 281                                 // U+FDD0 .. U+FDEF
 282                                 return UTF8MaskInvalid | 3;
 283                         }
 284                         return 3;
 285                 } else {
 286                         return UTF8MaskInvalid | 1;
 287                 }
 288         } else if (*us >= 0xc2) {
 289                 // 2 bytes
 290                 if (len < 2)
 291                         return UTF8MaskInvalid | 1;
 292                 if (UTF8IsTrailByte(us[1])) {
 293                         return 2;
 294                 } else {
 295                         return UTF8MaskInvalid | 1;
 296                 }
 297         } else {
 298                 // 0xc0 .. 0xc1 is overlong encoding
 299                 // 0x80 .. 0xbf is trail byte
 300                 return UTF8MaskInvalid | 1;
 301         }
 302 }
 303
 304 int UTF8DrawBytes(const unsigned char *us, int len) {
 305         int utf8StatusNext = UTF8Classify(us, len);
 306         return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
 307 }
 308
 309 #ifdef SCI_NAMESPACE
 310 }
 311 #endif