CPatch: New memory management
[TortoiseGit.git] / src / Utils / UnicodeUtils.cpp
blobbb7d38868bdc1b12123470820cae4589828072f8
1 // TortoiseGit - a Windows shell extension for easy version control
3 // Copyright (C) 2009-2014, 2016 - TortoiseGit
4 // Copyright (C) 2003-2006, 2008 - TortoiseSVN
6 // This program is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU General Public License
8 // as published by the Free Software Foundation; either version 2
9 // of the License, or (at your option) any later version.
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this program; if not, write to the Free Software Foundation,
18 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 #include "stdafx.h"
21 #include "UnicodeUtils.h"
23 #if defined(_MFC_VER) || defined(CSTRING_AVAILABLE)
25 struct CodeMap
27 int m_Code;
28 TCHAR * m_CodeName;
30 int CUnicodeUtils::GetCPCode(const CString &codename)
32 static CodeMap map[]=
34 { 37, L"IBM037"},// IBM EBCDIC US-Canada
35 {437, L"IBM437"},// OEM United States
36 {500, L"IBM500"},// IBM EBCDIC International
37 {708, L"ASMO-708"},// Arabic (ASMO 708)
38 {709, L"Arabic"},// (ASMO-449+, BCON V4)
39 {710, L"Arabic"},// - Transparent Arabic
40 {720, L"DOS-720"},// Arabic (Transparent ASMO); Arabic (DOS)
41 {737, L"ibm737"},// OEM Greek (formerly 437G); Greek (DOS)
42 {775, L"ibm775"},// OEM Baltic; Baltic (DOS)
43 {850, L"ibm850"},// OEM Multilingual Latin 1; Western European (DOS)
44 {852, L"ibm852"},// OEM Latin 2; Central European (DOS)
45 {855, L"IBM855"},// OEM Cyrillic (primarily Russian)
46 {857, L"ibm857"},// OEM Turkish; Turkish (DOS)
47 {858, L"IBM00858"},// OEM Multilingual Latin 1 + Euro symbol
48 {860, L"IBM860"},// OEM Portuguese; Portuguese (DOS)
49 {861, L"ibm861"},// OEM Icelandic; Icelandic (DOS)
50 {862, L"DOS-862"},// OEM Hebrew; Hebrew (DOS)
51 {863, L"IBM863"},// OEM French Canadian; French Canadian (DOS)
52 {864, L"IBM864"},// OEM Arabic; Arabic (864)
53 {865, L"IBM865"},// OEM Nordic; Nordic (DOS)
54 {866, L"cp866"},// OEM Russian; Cyrillic (DOS)
55 {869, L"ibm869"},// OEM Modern Greek; Greek, Modern (DOS)
56 {870, L"IBM870"},// IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
57 {874, L"windows-874"},// ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows)
58 {875, L"cp875"},// IBM EBCDIC Greek Modern
59 {932, L"shift_jis"},// ANSI/OEM Japanese; Japanese (Shift-JIS)
60 {936, L"gb2312"},// ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
61 {949, L"ks_c_5601-1987"},// ANSI/OEM Korean (Unified Hangul Code)
62 {949, L"cp949"},// ANSI/OEM Korean (Unified Hangul Code)
63 {950, L"big5"},// ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
64 {1026,L"IBM1026"},// IBM EBCDIC Turkish (Latin 5)
65 {1047,L"IBM01047"},// IBM EBCDIC Latin 1/Open System
66 {1140,L"IBM01140"},// IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
67 {1141, L"IBM01141"},// IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
68 {1142, L"IBM01142"},// IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
69 {1143, L"IBM01143"},// IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
70 {1144, L"IBM01144"},// IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
71 {1145, L"IBM01145"},// IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
72 {1146, L"IBM01146"},// IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
73 {1147, L"IBM01147"},// IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
74 {1148, L"IBM01148"},// IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
75 {1149, L"IBM01149"},// IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
76 {1200, L"utf-16"},// Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
77 {1201, L"unicodeFFFE"},// Unicode UTF-16, big endian byte order; available only to managed applications
78 {1250, L"windows-1250"},// ANSI Central European; Central European (Windows)
79 {1251, L"windows-1251"},// ANSI Cyrillic; Cyrillic (Windows)
80 {1251, L"cp1251"},
81 {1251, L"cp-1251"},
82 {1251, L"cp_1251"},
83 {1252, L"windows-1252"},// ANSI Latin 1; Western European (Windows)
84 {1253, L"windows-1253"},// ANSI Greek; Greek (Windows)
85 {1254, L"windows-1254"},// ANSI Turkish; Turkish (Windows)
86 {1255, L"windows-1255"},// ANSI Hebrew; Hebrew (Windows)
87 {1256, L"windows-1256"},// ANSI Arabic; Arabic (Windows)
88 {1257, L"windows-1257"},// ANSI Baltic; Baltic (Windows)
89 {1258, L"windows-1258"},// ANSI/OEM Vietnamese; Vietnamese (Windows)
90 {1361, L"Johab"},// Korean (Johab)
91 {10000,L"macintosh"},// MAC Roman; Western European (Mac)
92 {10001, L"x-mac-japanese"},// Japanese (Mac)
93 {10002, L"x-mac-chinesetrad"},// MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
94 {10003, L"x-mac-korean"},// Korean (Mac)
95 {10004, L"x-mac-arabic"},// Arabic (Mac)
96 {10005, L"x-mac-hebrew"},// Hebrew (Mac)
97 {10006, L"x-mac-greek"},// Greek (Mac)
98 {10007, L"x-mac-cyrillic"},// Cyrillic (Mac)
99 {10008, L"x-mac-chinesesimp"},// MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
100 {10010, L"x-mac-romanian"},// Romanian (Mac)
101 {10017, L"x-mac-ukrainian"},// Ukrainian (Mac)
102 {10021, L"x-mac-thai"},// Thai (Mac)
103 {10029, L"x-mac-ce"},// MAC Latin 2; Central European (Mac)
104 {10079, L"x-mac-icelandic"},// Icelandic (Mac)
105 {10081, L"x-mac-turkish"},// Turkish (Mac)
106 {10082, L"x-mac-croatian"},// Croatian (Mac)
107 {12000, L"utf-32"},// Unicode UTF-32, little endian byte order; available only to managed applications
108 {12001, L"utf-32BE"},// Unicode UTF-32, big endian byte order; available only to managed applications
109 {20000, L"x-Chinese_CNS"},// CNS Taiwan; Chinese Traditional (CNS)
110 {20001, L"x-cp20001"},// TCA Taiwan
111 {20002, L"x_Chinese-Eten"},// Eten Taiwan; Chinese Traditional (Eten)
112 {20003, L"x-cp20003"},// IBM5550 Taiwan
113 {20004, L"x-cp20004"},// TeleText Taiwan
114 {20005, L"x-cp20005"},// Wang Taiwan
115 {20105, L"x-IA5"},// IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
116 {20106, L"x-IA5-German"},// IA5 German (7-bit)
117 {20107, L"x-IA5-Swedish"},// IA5 Swedish (7-bit)
118 {20108, L"x-IA5-Norwegian"},// IA5 Norwegian (7-bit)
119 {20127, L"us-ascii"},// US-ASCII (7-bit)
120 {20261, L"x-cp20261"},// T.61
121 {20269, L"x-cp20269"},// ISO 6937 Non-Spacing Accent
122 {20273, L"IBM273"},// IBM EBCDIC Germany
123 {20277, L"IBM277"},//IBM EBCDIC Denmark-Norway
124 {20278, L"IBM278"},// IBM EBCDIC Finland-Sweden
125 {20280, L"IBM280"},// IBM EBCDIC Italy
126 {20284, L"IBM284"},// IBM EBCDIC Latin America-Spain
127 {20285, L"IBM285"},// IBM EBCDIC United Kingdom
128 {20290, L"IBM290"},// IBM EBCDIC Japanese Katakana Extended
129 {20297, L"IBM297"},// IBM EBCDIC France
130 {20420, L"IBM420"},// IBM EBCDIC Arabic
131 {20423, L"IBM423"},// IBM EBCDIC Greek
132 {20424, L"IBM424"},// IBM EBCDIC Hebrew
133 {20833, L"x-EBCDIC-KoreanExtended"},// IBM EBCDIC Korean Extended
134 {20838, L"IBM-Thai"},// IBM EBCDIC Thai
135 {20866, L"koi8-r"},// Russian (KOI8-R); Cyrillic (KOI8-R)
136 {20871, L"IBM871"},// IBM EBCDIC Icelandic
137 {20880, L"IBM880"},// IBM EBCDIC Cyrillic Russian
138 {20905, L"IBM905"},// IBM EBCDIC Turkish
139 {20924, L"IBM00924"},// IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
140 {20932, L"EUC-JP"},// Japanese (JIS 0208-1990 and 0121-1990)
141 {20936, L"x-cp20936"},// Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
142 {20949, L"x-cp20949"},// Korean Wansung
143 {21025, L"cp1025"},// IBM EBCDIC Cyrillic Serbian-Bulgarian
144 {21027, L"21027"},// (deprecated)
145 {21866, L"koi8-u"},// Ukrainian (KOI8-U); Cyrillic (KOI8-U)
146 {28591, L"iso-8859-1"},// ISO 8859-1 Latin 1; Western European (ISO)
147 {28592, L"iso-8859-2"},// ISO 8859-2 Central European; Central European (ISO)
148 {28593, L"iso-8859-3"},// ISO 8859-3 Latin 3
149 {28594, L"iso-8859-4"},// ISO 8859-4 Baltic
150 {28595, L"iso-8859-5"},// ISO 8859-5 Cyrillic
151 {28596, L"iso-8859-6"},// ISO 8859-6 Arabic
152 {28597, L"iso-8859-7"},// ISO 8859-7 Greek
153 {28598, L"iso-8859-8"},// ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
154 {28599, L"iso-8859-9"},// ISO 8859-9 Turkish
155 {28603, L"iso-8859-13"},// ISO 8859-13 Estonian
156 {28605, L"iso-8859-15"},// ISO 8859-15 Latin 9
157 {29001, L"x-Europa"},// Europa 3
158 {38598, L"iso-8859-8-i"},// ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
159 {50220, L"iso-2022-jp"},// ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
160 {50221, L"csISO2022JP"},// ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
161 {50222, L"iso-2022-jp"},// ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
162 {50225, L"iso-2022-kr"},// ISO 2022 Korean
163 {50227, L"x-cp50227"},// ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
164 {50229, L"ISO"},// 2022 Traditional Chinese
165 {50930, L"EBCDIC"},// Japanese (Katakana) Extended
166 {50931, L"EBCDIC"},// US-Canada and Japanese
167 {50933, L"EBCDIC"},// Korean Extended and Korean
168 {50935, L"EBCDIC"},// Simplified Chinese Extended and Simplified Chinese
169 {50936, L"EBCDIC"},// Simplified Chinese
170 {50937, L"EBCDIC"},// US-Canada and Traditional Chinese
171 {50939, L"EBCDIC"},// Japanese (Latin) Extended and Japanese
172 {51932, L"euc-jp"},// EUC Japanese
173 {51936, L"EUC-CN"},// EUC Simplified Chinese; Chinese Simplified (EUC)
174 {51949, L"euc-kr"},// EUC Korean
175 {51950, L"EUC"},// Traditional Chinese
176 {52936, L"hz-gb-2312"},// HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
177 {54936, L"GB18030"},// Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
178 {57002, L"x-iscii-de"},// ISCII Devanagari
179 {57003, L"x-iscii-be"},// ISCII Bengali
180 {57004, L"x-iscii-ta"},// ISCII Tamil
181 {57005, L"x-iscii-te"},// ISCII Telugu
182 {57006, L"x-iscii-as"},// ISCII Assamese
183 {57007, L"x-iscii-or"},// ISCII Oriya
184 {57008, L"x-iscii-ka"},// ISCII Kannada
185 {57009, L"x-iscii-ma"},// ISCII Malayalam
186 {57010, L"x-iscii-gu"},// ISCII Gujarati
187 {57011, L"x-iscii-pa"},// ISCII Punjabi
188 {65000, L"utf-7"},// Unicode (UTF-7)
189 {65001, L"utf-8"},// Unicode (UTF-8)
190 {0, nullptr}
193 static CodeMap *p=map;
194 if (codename.IsEmpty())
195 return CP_UTF8;
196 CString code(codename);
197 code.MakeLower();
198 while (p->m_CodeName)
200 CString str = p->m_CodeName;
201 str=str.MakeLower();
203 if (str == code)
204 return p->m_Code;
205 ++p;
208 return CP_UTF8;
211 CStringA CUnicodeUtils::GetUTF8(const CStringW& string)
213 return GetMulti(string,CP_UTF8);
216 CStringA CUnicodeUtils::GetMulti(const CStringW& string,int acp)
218 char * buf;
219 CStringA retVal;
220 int len = string.GetLength();
221 if (len==0)
222 return retVal;
223 buf = retVal.GetBuffer(len*4 + 1);
224 int lengthIncTerminator = WideCharToMultiByte(acp, 0, string, -1, buf, len * 4, nullptr, nullptr);
225 retVal.ReleaseBuffer(lengthIncTerminator-1);
226 return retVal;
230 CStringA CUnicodeUtils::GetUTF8(const CStringA& string)
232 WCHAR * buf;
233 int len = string.GetLength();
234 if (len==0)
235 return CStringA();
236 buf = new WCHAR[len*4 + 1];
237 int lengthIncTerminator = MultiByteToWideChar(CP_ACP, 0, string, -1, buf, len * 4);
238 CStringW temp = CStringW(buf, lengthIncTerminator - 1);
239 delete [] buf;
240 return (CUnicodeUtils::GetUTF8(temp));
243 CString CUnicodeUtils::GetUnicode(const CStringA& string, int acp)
245 WCHAR * buf;
246 CString retVal;
247 int len = string.GetLength();
248 if (len==0)
249 return retVal;
250 buf = retVal.GetBuffer(len * 4 + 1);
251 int lengthIncTerminator = MultiByteToWideChar(acp, 0, string, -1, buf, len * 4);
252 retVal.ReleaseBuffer(lengthIncTerminator - 1);
253 return retVal;
256 #endif //_MFC_VER
258 #ifdef UNICODE
259 std::string CUnicodeUtils::StdGetUTF8(const std::wstring& wide)
261 int len = (int)wide.size();
262 if (len==0)
263 return std::string();
264 int size = len*4;
265 char * narrow = new char[size];
266 int ret = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), len, narrow, size - 1, nullptr, nullptr);
267 narrow[ret] = '\0';
268 std::string sRet = std::string(narrow);
269 delete [] narrow;
270 return sRet;
273 std::wstring CUnicodeUtils::StdGetUnicode(const std::string& multibyte)
275 int len = (int)multibyte.size();
276 if (len==0)
277 return std::wstring();
278 int size = len*4;
279 wchar_t * wide = new wchar_t[size];
280 int ret = MultiByteToWideChar(CP_UTF8, 0, multibyte.c_str(), len, wide, size - 1);
281 wide[ret] = L'\0';
282 std::wstring sRet = std::wstring(wide);
283 delete [] wide;
284 return sRet;
286 #endif
288 std::string WideToMultibyte(const std::wstring& wide)
290 char * narrow = new char[wide.length()*3+2];
291 BOOL defaultCharUsed;
292 int ret = (int)WideCharToMultiByte(CP_ACP, 0, wide.c_str(), (int)wide.size(), narrow, (int)wide.length()*3 - 1, ".", &defaultCharUsed);
293 narrow[ret] = '\0';
294 std::string str = narrow;
295 delete[] narrow;
296 return str;
299 std::string WideToUTF8(const std::wstring& wide)
301 char * narrow = new char[wide.length()*3+2];
302 int ret = (int)WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), (int)wide.size(), narrow, (int)wide.length() * 3 - 1, nullptr, nullptr);
303 narrow[ret] = '\0';
304 std::string str = narrow;
305 delete[] narrow;
306 return str;
309 std::wstring MultibyteToWide(const std::string& multibyte)
311 size_t length = multibyte.length();
312 if (length == 0)
313 return std::wstring();
315 wchar_t * wide = new wchar_t[multibyte.length()*2+2];
316 if (!wide)
317 return std::wstring();
318 int ret = (int)MultiByteToWideChar(CP_ACP, 0, multibyte.c_str(), (int)multibyte.size(), wide, (int)length*2 - 1);
319 wide[ret] = L'\0';
320 std::wstring str = wide;
321 delete[] wide;
322 return str;
325 std::wstring UTF8ToWide(const std::string& multibyte)
327 size_t length = multibyte.length();
328 if (length == 0)
329 return std::wstring();
331 wchar_t * wide = new wchar_t[length*2+2];
332 if (!wide)
333 return std::wstring();
334 int ret = (int)MultiByteToWideChar(CP_UTF8, 0, multibyte.c_str(), (int)multibyte.size(), wide, (int)length*2 - 1);
335 wide[ret] = L'\0';
336 std::wstring str = wide;
337 delete[] wide;
338 return str;
340 #ifdef UNICODE
341 std::wstring UTF8ToString(const std::string& string) {return UTF8ToWide(string);}
342 std::string StringToUTF8(const std::wstring& string) {return WideToUTF8(string);}
343 #else
344 std::wstring UTF8ToString(const std::string& string) {return WideToMultibyte(UTF8ToWide(string));}
345 std::string StringToUTF8(const std::wstring& string) {return WideToUTF8(MultibyteToWide(string));}
346 #endif
349 #pragma warning(push)
350 #pragma warning(disable: 4200)
351 struct STRINGRESOURCEIMAGE
353 WORD nLength;
354 WCHAR achString[];
356 #pragma warning(pop) // C4200
358 int LoadStringEx(HINSTANCE hInstance, UINT uID, LPTSTR lpBuffer, int nBufferMax, WORD wLanguage)
360 const STRINGRESOURCEIMAGE* pImage;
361 const STRINGRESOURCEIMAGE* pImageEnd;
362 ULONG nResourceSize;
363 HGLOBAL hGlobal;
364 UINT iIndex;
365 #ifndef UNICODE
366 BOOL defaultCharUsed;
367 #endif
368 int ret;
370 if (!lpBuffer)
371 return 0;
372 lpBuffer[0] = L'\0';
373 HRSRC hResource = FindResourceEx(hInstance, RT_STRING, MAKEINTRESOURCE(((uID>>4)+1)), wLanguage);
374 if (!hResource)
376 //try the default language before giving up!
377 hResource = FindResource(hInstance, MAKEINTRESOURCE(((uID>>4)+1)), RT_STRING);
378 if (!hResource)
379 return 0;
381 hGlobal = LoadResource(hInstance, hResource);
382 if (!hGlobal)
383 return 0;
384 pImage = (const STRINGRESOURCEIMAGE*)::LockResource(hGlobal);
385 if(!pImage)
386 return 0;
388 nResourceSize = ::SizeofResource(hInstance, hResource);
389 pImageEnd = reinterpret_cast<const STRINGRESOURCEIMAGE*>(LPBYTE(pImage) + nResourceSize);
390 iIndex = uID&0x000f;
392 while ((iIndex > 0) && (pImage < pImageEnd))
394 pImage = reinterpret_cast<const STRINGRESOURCEIMAGE*>(LPBYTE(pImage) + (sizeof(STRINGRESOURCEIMAGE) + (pImage->nLength * sizeof(WCHAR))));
395 iIndex--;
397 if (pImage >= pImageEnd)
398 return 0;
399 if (pImage->nLength == 0)
400 return 0;
401 #ifdef UNICODE
402 ret = pImage->nLength;
403 if (ret >= nBufferMax)
404 ret = nBufferMax - 1;
405 wcsncpy_s((wchar_t *)lpBuffer, nBufferMax, pImage->achString, ret);
406 lpBuffer[ret] = L'\0';
407 #else
408 ret = WideCharToMultiByte(CP_ACP, 0, pImage->achString, pImage->nLength, (LPSTR)lpBuffer, nBufferMax-1, ".", &defaultCharUsed);
409 lpBuffer[ret] = L'\0';
410 #endif
411 return ret;