1 // TortoiseGit - a Windows shell extension for easy version control
3 // Copyright (C) 2009-2014, 2016 - TortoiseGit
4 // Copyright (C) 2003-2006, 2008 - TortoiseSVN
6 // This program is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU General Public License
8 // as published by the Free Software Foundation; either version 2
9 // of the License, or (at your option) any later version.
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this program; if not, write to the Free Software Foundation,
18 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 #include "UnicodeUtils.h"
23 #if defined(_MFC_VER) || defined(CSTRING_AVAILABLE)
30 int CUnicodeUtils::GetCPCode(const CString
&codename
)
34 { 37, L
"IBM037"},// IBM EBCDIC US-Canada
35 {437, L
"IBM437"},// OEM United States
36 {500, L
"IBM500"},// IBM EBCDIC International
37 {708, L
"ASMO-708"},// Arabic (ASMO 708)
38 {709, L
"Arabic"},// (ASMO-449+, BCON V4)
39 {710, L
"Arabic"},// - Transparent Arabic
40 {720, L
"DOS-720"},// Arabic (Transparent ASMO); Arabic (DOS)
41 {737, L
"ibm737"},// OEM Greek (formerly 437G); Greek (DOS)
42 {775, L
"ibm775"},// OEM Baltic; Baltic (DOS)
43 {850, L
"ibm850"},// OEM Multilingual Latin 1; Western European (DOS)
44 {852, L
"ibm852"},// OEM Latin 2; Central European (DOS)
45 {855, L
"IBM855"},// OEM Cyrillic (primarily Russian)
46 {857, L
"ibm857"},// OEM Turkish; Turkish (DOS)
47 {858, L
"IBM00858"},// OEM Multilingual Latin 1 + Euro symbol
48 {860, L
"IBM860"},// OEM Portuguese; Portuguese (DOS)
49 {861, L
"ibm861"},// OEM Icelandic; Icelandic (DOS)
50 {862, L
"DOS-862"},// OEM Hebrew; Hebrew (DOS)
51 {863, L
"IBM863"},// OEM French Canadian; French Canadian (DOS)
52 {864, L
"IBM864"},// OEM Arabic; Arabic (864)
53 {865, L
"IBM865"},// OEM Nordic; Nordic (DOS)
54 {866, L
"cp866"},// OEM Russian; Cyrillic (DOS)
55 {869, L
"ibm869"},// OEM Modern Greek; Greek, Modern (DOS)
56 {870, L
"IBM870"},// IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
57 {874, L
"windows-874"},// ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows)
58 {875, L
"cp875"},// IBM EBCDIC Greek Modern
59 {932, L
"shift_jis"},// ANSI/OEM Japanese; Japanese (Shift-JIS)
60 {936, L
"gb2312"},// ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
61 {949, L
"ks_c_5601-1987"},// ANSI/OEM Korean (Unified Hangul Code)
62 {949, L
"cp949"},// ANSI/OEM Korean (Unified Hangul Code)
63 {950, L
"big5"},// ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
64 {1026,L
"IBM1026"},// IBM EBCDIC Turkish (Latin 5)
65 {1047,L
"IBM01047"},// IBM EBCDIC Latin 1/Open System
66 {1140,L
"IBM01140"},// IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
67 {1141, L
"IBM01141"},// IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
68 {1142, L
"IBM01142"},// IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
69 {1143, L
"IBM01143"},// IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
70 {1144, L
"IBM01144"},// IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
71 {1145, L
"IBM01145"},// IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
72 {1146, L
"IBM01146"},// IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
73 {1147, L
"IBM01147"},// IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
74 {1148, L
"IBM01148"},// IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
75 {1149, L
"IBM01149"},// IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
76 {1200, L
"utf-16"},// Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
77 {1201, L
"unicodeFFFE"},// Unicode UTF-16, big endian byte order; available only to managed applications
78 {1250, L
"windows-1250"},// ANSI Central European; Central European (Windows)
79 {1251, L
"windows-1251"},// ANSI Cyrillic; Cyrillic (Windows)
83 {1252, L
"windows-1252"},// ANSI Latin 1; Western European (Windows)
84 {1253, L
"windows-1253"},// ANSI Greek; Greek (Windows)
85 {1254, L
"windows-1254"},// ANSI Turkish; Turkish (Windows)
86 {1255, L
"windows-1255"},// ANSI Hebrew; Hebrew (Windows)
87 {1256, L
"windows-1256"},// ANSI Arabic; Arabic (Windows)
88 {1257, L
"windows-1257"},// ANSI Baltic; Baltic (Windows)
89 {1258, L
"windows-1258"},// ANSI/OEM Vietnamese; Vietnamese (Windows)
90 {1361, L
"Johab"},// Korean (Johab)
91 {10000,L
"macintosh"},// MAC Roman; Western European (Mac)
92 {10001, L
"x-mac-japanese"},// Japanese (Mac)
93 {10002, L
"x-mac-chinesetrad"},// MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
94 {10003, L
"x-mac-korean"},// Korean (Mac)
95 {10004, L
"x-mac-arabic"},// Arabic (Mac)
96 {10005, L
"x-mac-hebrew"},// Hebrew (Mac)
97 {10006, L
"x-mac-greek"},// Greek (Mac)
98 {10007, L
"x-mac-cyrillic"},// Cyrillic (Mac)
99 {10008, L
"x-mac-chinesesimp"},// MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
100 {10010, L
"x-mac-romanian"},// Romanian (Mac)
101 {10017, L
"x-mac-ukrainian"},// Ukrainian (Mac)
102 {10021, L
"x-mac-thai"},// Thai (Mac)
103 {10029, L
"x-mac-ce"},// MAC Latin 2; Central European (Mac)
104 {10079, L
"x-mac-icelandic"},// Icelandic (Mac)
105 {10081, L
"x-mac-turkish"},// Turkish (Mac)
106 {10082, L
"x-mac-croatian"},// Croatian (Mac)
107 {12000, L
"utf-32"},// Unicode UTF-32, little endian byte order; available only to managed applications
108 {12001, L
"utf-32BE"},// Unicode UTF-32, big endian byte order; available only to managed applications
109 {20000, L
"x-Chinese_CNS"},// CNS Taiwan; Chinese Traditional (CNS)
110 {20001, L
"x-cp20001"},// TCA Taiwan
111 {20002, L
"x_Chinese-Eten"},// Eten Taiwan; Chinese Traditional (Eten)
112 {20003, L
"x-cp20003"},// IBM5550 Taiwan
113 {20004, L
"x-cp20004"},// TeleText Taiwan
114 {20005, L
"x-cp20005"},// Wang Taiwan
115 {20105, L
"x-IA5"},// IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
116 {20106, L
"x-IA5-German"},// IA5 German (7-bit)
117 {20107, L
"x-IA5-Swedish"},// IA5 Swedish (7-bit)
118 {20108, L
"x-IA5-Norwegian"},// IA5 Norwegian (7-bit)
119 {20127, L
"us-ascii"},// US-ASCII (7-bit)
120 {20261, L
"x-cp20261"},// T.61
121 {20269, L
"x-cp20269"},// ISO 6937 Non-Spacing Accent
122 {20273, L
"IBM273"},// IBM EBCDIC Germany
123 {20277, L
"IBM277"},//IBM EBCDIC Denmark-Norway
124 {20278, L
"IBM278"},// IBM EBCDIC Finland-Sweden
125 {20280, L
"IBM280"},// IBM EBCDIC Italy
126 {20284, L
"IBM284"},// IBM EBCDIC Latin America-Spain
127 {20285, L
"IBM285"},// IBM EBCDIC United Kingdom
128 {20290, L
"IBM290"},// IBM EBCDIC Japanese Katakana Extended
129 {20297, L
"IBM297"},// IBM EBCDIC France
130 {20420, L
"IBM420"},// IBM EBCDIC Arabic
131 {20423, L
"IBM423"},// IBM EBCDIC Greek
132 {20424, L
"IBM424"},// IBM EBCDIC Hebrew
133 {20833, L
"x-EBCDIC-KoreanExtended"},// IBM EBCDIC Korean Extended
134 {20838, L
"IBM-Thai"},// IBM EBCDIC Thai
135 {20866, L
"koi8-r"},// Russian (KOI8-R); Cyrillic (KOI8-R)
136 {20871, L
"IBM871"},// IBM EBCDIC Icelandic
137 {20880, L
"IBM880"},// IBM EBCDIC Cyrillic Russian
138 {20905, L
"IBM905"},// IBM EBCDIC Turkish
139 {20924, L
"IBM00924"},// IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
140 {20932, L
"EUC-JP"},// Japanese (JIS 0208-1990 and 0121-1990)
141 {20936, L
"x-cp20936"},// Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
142 {20949, L
"x-cp20949"},// Korean Wansung
143 {21025, L
"cp1025"},// IBM EBCDIC Cyrillic Serbian-Bulgarian
144 {21027, L
"21027"},// (deprecated)
145 {21866, L
"koi8-u"},// Ukrainian (KOI8-U); Cyrillic (KOI8-U)
146 {28591, L
"iso-8859-1"},// ISO 8859-1 Latin 1; Western European (ISO)
147 {28592, L
"iso-8859-2"},// ISO 8859-2 Central European; Central European (ISO)
148 {28593, L
"iso-8859-3"},// ISO 8859-3 Latin 3
149 {28594, L
"iso-8859-4"},// ISO 8859-4 Baltic
150 {28595, L
"iso-8859-5"},// ISO 8859-5 Cyrillic
151 {28596, L
"iso-8859-6"},// ISO 8859-6 Arabic
152 {28597, L
"iso-8859-7"},// ISO 8859-7 Greek
153 {28598, L
"iso-8859-8"},// ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
154 {28599, L
"iso-8859-9"},// ISO 8859-9 Turkish
155 {28603, L
"iso-8859-13"},// ISO 8859-13 Estonian
156 {28605, L
"iso-8859-15"},// ISO 8859-15 Latin 9
157 {29001, L
"x-Europa"},// Europa 3
158 {38598, L
"iso-8859-8-i"},// ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
159 {50220, L
"iso-2022-jp"},// ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
160 {50221, L
"csISO2022JP"},// ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
161 {50222, L
"iso-2022-jp"},// ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
162 {50225, L
"iso-2022-kr"},// ISO 2022 Korean
163 {50227, L
"x-cp50227"},// ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
164 {50229, L
"ISO"},// 2022 Traditional Chinese
165 {50930, L
"EBCDIC"},// Japanese (Katakana) Extended
166 {50931, L
"EBCDIC"},// US-Canada and Japanese
167 {50933, L
"EBCDIC"},// Korean Extended and Korean
168 {50935, L
"EBCDIC"},// Simplified Chinese Extended and Simplified Chinese
169 {50936, L
"EBCDIC"},// Simplified Chinese
170 {50937, L
"EBCDIC"},// US-Canada and Traditional Chinese
171 {50939, L
"EBCDIC"},// Japanese (Latin) Extended and Japanese
172 {51932, L
"euc-jp"},// EUC Japanese
173 {51936, L
"EUC-CN"},// EUC Simplified Chinese; Chinese Simplified (EUC)
174 {51949, L
"euc-kr"},// EUC Korean
175 {51950, L
"EUC"},// Traditional Chinese
176 {52936, L
"hz-gb-2312"},// HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
177 {54936, L
"GB18030"},// Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
178 {57002, L
"x-iscii-de"},// ISCII Devanagari
179 {57003, L
"x-iscii-be"},// ISCII Bengali
180 {57004, L
"x-iscii-ta"},// ISCII Tamil
181 {57005, L
"x-iscii-te"},// ISCII Telugu
182 {57006, L
"x-iscii-as"},// ISCII Assamese
183 {57007, L
"x-iscii-or"},// ISCII Oriya
184 {57008, L
"x-iscii-ka"},// ISCII Kannada
185 {57009, L
"x-iscii-ma"},// ISCII Malayalam
186 {57010, L
"x-iscii-gu"},// ISCII Gujarati
187 {57011, L
"x-iscii-pa"},// ISCII Punjabi
188 {65000, L
"utf-7"},// Unicode (UTF-7)
189 {65001, L
"utf-8"},// Unicode (UTF-8)
193 static CodeMap
*p
=map
;
194 if (codename
.IsEmpty())
196 CString
code(codename
);
198 while (p
->m_CodeName
)
200 CString str
= p
->m_CodeName
;
211 CStringA
CUnicodeUtils::GetUTF8(const CStringW
& string
)
213 return GetMulti(string
,CP_UTF8
);
216 CStringA
CUnicodeUtils::GetMulti(const CStringW
& string
,int acp
)
220 int len
= string
.GetLength();
223 buf
= retVal
.GetBuffer(len
*4 + 1);
224 int lengthIncTerminator
= WideCharToMultiByte(acp
, 0, string
, -1, buf
, len
* 4, nullptr, nullptr);
225 retVal
.ReleaseBuffer(lengthIncTerminator
-1);
230 CStringA
CUnicodeUtils::GetUTF8(const CStringA
& string
)
233 int len
= string
.GetLength();
236 buf
= new WCHAR
[len
*4 + 1];
237 int lengthIncTerminator
= MultiByteToWideChar(CP_ACP
, 0, string
, -1, buf
, len
* 4);
238 CStringW temp
= CStringW(buf
, lengthIncTerminator
- 1);
240 return (CUnicodeUtils::GetUTF8(temp
));
243 CString
CUnicodeUtils::GetUnicode(const CStringA
& string
, int acp
)
247 int len
= string
.GetLength();
250 buf
= retVal
.GetBuffer(len
* 4 + 1);
251 int lengthIncTerminator
= MultiByteToWideChar(acp
, 0, string
, -1, buf
, len
* 4);
252 retVal
.ReleaseBuffer(lengthIncTerminator
- 1);
259 std::string
CUnicodeUtils::StdGetUTF8(const std::wstring
& wide
)
261 int len
= (int)wide
.size();
263 return std::string();
265 char * narrow
= new char[size
];
266 int ret
= WideCharToMultiByte(CP_UTF8
, 0, wide
.c_str(), len
, narrow
, size
- 1, nullptr, nullptr);
268 std::string sRet
= std::string(narrow
);
273 std::wstring
CUnicodeUtils::StdGetUnicode(const std::string
& multibyte
)
275 int len
= (int)multibyte
.size();
277 return std::wstring();
279 wchar_t * wide
= new wchar_t[size
];
280 int ret
= MultiByteToWideChar(CP_UTF8
, 0, multibyte
.c_str(), len
, wide
, size
- 1);
282 std::wstring sRet
= std::wstring(wide
);
288 std::string
WideToMultibyte(const std::wstring
& wide
)
290 char * narrow
= new char[wide
.length()*3+2];
291 BOOL defaultCharUsed
;
292 int ret
= (int)WideCharToMultiByte(CP_ACP
, 0, wide
.c_str(), (int)wide
.size(), narrow
, (int)wide
.length()*3 - 1, ".", &defaultCharUsed
);
294 std::string str
= narrow
;
299 std::string
WideToUTF8(const std::wstring
& wide
)
301 char * narrow
= new char[wide
.length()*3+2];
302 int ret
= (int)WideCharToMultiByte(CP_UTF8
, 0, wide
.c_str(), (int)wide
.size(), narrow
, (int)wide
.length() * 3 - 1, nullptr, nullptr);
304 std::string str
= narrow
;
309 std::wstring
MultibyteToWide(const std::string
& multibyte
)
311 size_t length
= multibyte
.length();
313 return std::wstring();
315 wchar_t * wide
= new wchar_t[multibyte
.length()*2+2];
317 return std::wstring();
318 int ret
= (int)MultiByteToWideChar(CP_ACP
, 0, multibyte
.c_str(), (int)multibyte
.size(), wide
, (int)length
*2 - 1);
320 std::wstring str
= wide
;
325 std::wstring
UTF8ToWide(const std::string
& multibyte
)
327 size_t length
= multibyte
.length();
329 return std::wstring();
331 wchar_t * wide
= new wchar_t[length
*2+2];
333 return std::wstring();
334 int ret
= (int)MultiByteToWideChar(CP_UTF8
, 0, multibyte
.c_str(), (int)multibyte
.size(), wide
, (int)length
*2 - 1);
336 std::wstring str
= wide
;
341 std::wstring
UTF8ToString(const std::string
& string
) {return UTF8ToWide(string
);}
342 std::string
StringToUTF8(const std::wstring
& string
) {return WideToUTF8(string
);}
344 std::wstring
UTF8ToString(const std::string
& string
) {return WideToMultibyte(UTF8ToWide(string
));}
345 std::string
StringToUTF8(const std::wstring
& string
) {return WideToUTF8(MultibyteToWide(string
));}
349 #pragma warning(push)
350 #pragma warning(disable: 4200)
351 struct STRINGRESOURCEIMAGE
356 #pragma warning(pop) // C4200
358 int LoadStringEx(HINSTANCE hInstance
, UINT uID
, LPTSTR lpBuffer
, int nBufferMax
, WORD wLanguage
)
360 const STRINGRESOURCEIMAGE
* pImage
;
361 const STRINGRESOURCEIMAGE
* pImageEnd
;
366 BOOL defaultCharUsed
;
373 HRSRC hResource
= FindResourceEx(hInstance
, RT_STRING
, MAKEINTRESOURCE(((uID
>>4)+1)), wLanguage
);
376 //try the default language before giving up!
377 hResource
= FindResource(hInstance
, MAKEINTRESOURCE(((uID
>>4)+1)), RT_STRING
);
381 hGlobal
= LoadResource(hInstance
, hResource
);
384 pImage
= (const STRINGRESOURCEIMAGE
*)::LockResource(hGlobal
);
388 nResourceSize
= ::SizeofResource(hInstance
, hResource
);
389 pImageEnd
= reinterpret_cast<const STRINGRESOURCEIMAGE
*>(LPBYTE(pImage
) + nResourceSize
);
392 while ((iIndex
> 0) && (pImage
< pImageEnd
))
394 pImage
= reinterpret_cast<const STRINGRESOURCEIMAGE
*>(LPBYTE(pImage
) + (sizeof(STRINGRESOURCEIMAGE
) + (pImage
->nLength
* sizeof(WCHAR
))));
397 if (pImage
>= pImageEnd
)
399 if (pImage
->nLength
== 0)
402 ret
= pImage
->nLength
;
403 if (ret
>= nBufferMax
)
404 ret
= nBufferMax
- 1;
405 wcsncpy_s((wchar_t *)lpBuffer
, nBufferMax
, pImage
->achString
, ret
);
406 lpBuffer
[ret
] = L
'\0';
408 ret
= WideCharToMultiByte(CP_ACP
, 0, pImage
->achString
, pImage
->nLength
, (LPSTR
)lpBuffer
, nBufferMax
-1, ".", &defaultCharUsed
);
409 lpBuffer
[ret
] = L
'\0';