updated Scintilla to 2.29
[TortoiseGit.git] / ext / scintilla / src / UniConversion.cxx
blobe965c58105ebe9a87ec5c7eb4cb5f92afc146c94
// Scintilla source code edit control
/** @file UniConversion.cxx
 ** Functions to handle UTF-8 and UTF-16 strings.
 **/
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
8 #include <stdlib.h>
10 #include "UniConversion.h"
// Boundaries of the UTF-16 surrogate code unit range used by the converters below.
enum { SURROGATE_LEAD_FIRST = 0xD800 };		// first lead surrogate code unit
enum { SURROGATE_TRAIL_FIRST = 0xDC00 };	// first trail surrogate code unit
enum { SURROGATE_TRAIL_LAST = 0xDFFF };		// last trail surrogate code unit
16 unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {
17 unsigned int len = 0;
18 for (unsigned int i = 0; i < tlen && uptr[i];) {
19 unsigned int uch = uptr[i];
20 if (uch < 0x80) {
21 len++;
22 } else if (uch < 0x800) {
23 len += 2;
24 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
25 (uch <= SURROGATE_TRAIL_LAST)) {
26 len += 4;
27 i++;
28 } else {
29 len += 3;
31 i++;
33 return len;
36 void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
37 int k = 0;
38 for (unsigned int i = 0; i < tlen && uptr[i];) {
39 unsigned int uch = uptr[i];
40 if (uch < 0x80) {
41 putf[k++] = static_cast<char>(uch);
42 } else if (uch < 0x800) {
43 putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
44 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
45 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
46 (uch <= SURROGATE_TRAIL_LAST)) {
47 // Half a surrogate pair
48 i++;
49 unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
50 putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
51 putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
52 putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
53 putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
54 } else {
55 putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
56 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
57 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
59 i++;
61 putf[len] = '\0';
/**
 * Length in bytes of a UTF-8 sequence, judged from its first byte only.
 *
 * @param ch  First byte of the sequence.
 * @return 1 for bytes below 0x80, 2 for bytes below 0xE0, 3 for bytes
 *         below 0xF0, otherwise 4. The byte is not validated, so bytes in
 *         0x80..0xBF also report 2.
 */
unsigned int UTF8CharLength(unsigned char ch) {
	if (ch < 0x80) {
		return 1;
	} else if (ch < 0x80 + 0x40 + 0x20) {
		return 2;
	} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
		return 3;
	} else {
		return 4;
	}
}
/**
 * Count the UTF-16 code units needed to represent a UTF-8 string.
 *
 * @param s    UTF-8 bytes to measure.
 * @param len  Number of bytes in s.
 * @return Number of UTF-16 code units: one per 1-, 2- or 3-byte sequence
 *         and two (a surrogate pair) per 4-byte sequence.
 *
 * Only the first byte of each sequence is inspected; the remaining bytes
 * are skipped without being read.
 */
unsigned int UTF16Length(const char *s, unsigned int len) {
	unsigned int ulen = 0;
	unsigned int charLen;
	for (unsigned int i=0; i<len;) {
		unsigned char ch = static_cast<unsigned char>(s[i]);
		if (ch < 0x80) {
			charLen = 1;
		} else if (ch < 0x80 + 0x40 + 0x20) {
			charLen = 2;
		} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
			charLen = 3;
		} else {
			charLen = 4;
			// Outside the BMP: counts one extra unit for the surrogate pair.
			ulen++;
		}
		i += charLen;
		ulen++;
	}
	return ulen;
}
97 unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) {
98 unsigned int ui=0;
99 const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
100 unsigned int i=0;
101 while ((i<len) && (ui<tlen)) {
102 unsigned char ch = us[i++];
103 if (ch < 0x80) {
104 tbuf[ui] = ch;
105 } else if (ch < 0x80 + 0x40 + 0x20) {
106 tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
107 ch = us[i++];
108 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
109 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
110 tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
111 ch = us[i++];
112 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));
113 ch = us[i++];
114 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
115 } else {
116 // Outside the BMP so need two surrogates
117 int val = (ch & 0x7) << 18;
118 ch = us[i++];
119 val += (ch & 0x3F) << 12;
120 ch = us[i++];
121 val += (ch & 0x3F) << 6;
122 ch = us[i++];
123 val += (ch & 0x3F);
124 tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
125 ui++;
126 tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
128 ui++;
130 return ui;