1 // TortoiseGit - a Windows shell extension for easy version control
3 // Copyright (C) 2009-2014 - TortoiseGit
4 // Copyright (C) 2003-2006, 2008 - TortoiseSVN
6 // This program is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU General Public License
8 // as published by the Free Software Foundation; either version 2
9 // of the License, or (at your option) any later version.
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this program; if not, write to the Free Software Foundation,
18 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 #include "UnicodeUtils.h"
// Constructor of CUnicodeUtils; takes no arguments.
// NOTE(review): the constructor body is not part of this excerpt.
23 CUnicodeUtils::CUnicodeUtils(void)
// Destructor of CUnicodeUtils.
// NOTE(review): the destructor body is not part of this excerpt.
27 CUnicodeUtils::~CUnicodeUtils(void)
31 #if defined(_MFC_VER) || defined(CSTRING_AVAILABLE)
// Returns an int for the encoding name given in `codename`; judging by the
// table below this is the numeric code page identifier paired with that name.
//
// codename: textual encoding name (for example "utf-8" or "windows-1252").
//
// NOTE(review): the table declaration, the name comparison and every return
// statement are missing from this excerpt, so the values returned for an
// empty or unknown name, and whether matching is case-sensitive, cannot be
// confirmed from here.
38 int CUnicodeUtils::GetCPCode(const CString
&codename
)
// Table entries: { code page identifier, encoding name }.
// Some identifiers are listed under several names (949, 1251) and some names
// are listed for several identifiers ("Arabic", "iso-2022-jp", "EBCDIC").
// NOTE(review): for a name that occurs more than once, a scan from the top
// can only ever reach its first entry; "EUC-JP" (20932) and "euc-jp" (51932)
// differ only by case, so they would collide as well if matching turns out to
// be case-insensitive - confirm.
42 { 37, _T("IBM037")},// IBM EBCDIC US-Canada
43 {437, _T("IBM437")},// OEM United States
44 {500, _T("IBM500")},// IBM EBCDIC International
45 {708, _T("ASMO-708")},// Arabic (ASMO 708)
46 {709, _T("Arabic")},// (ASMO-449+, BCON V4)
47 {710, _T("Arabic")},// - Transparent Arabic
48 {720, _T("DOS-720")},// Arabic (Transparent ASMO); Arabic (DOS)
49 {737, _T("ibm737")},// OEM Greek (formerly 437G); Greek (DOS)
50 {775, _T("ibm775")},// OEM Baltic; Baltic (DOS)
51 {850, _T("ibm850")},// OEM Multilingual Latin 1; Western European (DOS)
52 {852, _T("ibm852")},// OEM Latin 2; Central European (DOS)
53 {855, _T("IBM855")},// OEM Cyrillic (primarily Russian)
54 {857, _T("ibm857")},// OEM Turkish; Turkish (DOS)
55 {858, _T("IBM00858")},// OEM Multilingual Latin 1 + Euro symbol
56 {860, _T("IBM860")},// OEM Portuguese; Portuguese (DOS)
57 {861, _T("ibm861")},// OEM Icelandic; Icelandic (DOS)
58 {862, _T("DOS-862")},// OEM Hebrew; Hebrew (DOS)
59 {863, _T("IBM863")},// OEM French Canadian; French Canadian (DOS)
60 {864, _T("IBM864")},// OEM Arabic; Arabic (864)
61 {865, _T("IBM865")},// OEM Nordic; Nordic (DOS)
62 {866, _T("cp866")},// OEM Russian; Cyrillic (DOS)
63 {869, _T("ibm869")},// OEM Modern Greek; Greek, Modern (DOS)
64 {870, _T("IBM870")},// IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
65 {874, _T("windows-874")},// ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows)
66 {875, _T("cp875")},// IBM EBCDIC Greek Modern
67 {932, _T("shift_jis")},// ANSI/OEM Japanese; Japanese (Shift-JIS)
68 {936, _T("gb2312")},// ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
69 {949, _T("ks_c_5601-1987")},// ANSI/OEM Korean (Unified Hangul Code)
70 {949, _T("cp949")},// ANSI/OEM Korean (Unified Hangul Code)
71 {950, _T("big5")},// ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
72 {1026,_T("IBM1026")},// IBM EBCDIC Turkish (Latin 5)
73 {1047,_T("IBM01047")},// IBM EBCDIC Latin 1/Open System
74 {1140,_T("IBM01140")},// IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
75 {1141, _T("IBM01141")},// IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
76 {1142, _T("IBM01142")},// IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
77 {1143, _T("IBM01143")},// IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
78 {1144, _T("IBM01144")},// IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
79 {1145, _T("IBM01145")},// IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
80 {1146, _T("IBM01146")},// IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
81 {1147, _T("IBM01147")},// IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
82 {1148, _T("IBM01148")},// IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
83 {1149, _T("IBM01149")},// IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
84 {1200, _T("utf-16")},// Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
85 {1201, _T("unicodeFFFE")},// Unicode UTF-16, big endian byte order; available only to managed applications
86 {1250, _T("windows-1250")},// ANSI Central European; Central European (Windows)
87 {1251, _T("windows-1251")},// ANSI Cyrillic; Cyrillic (Windows)
89 {1251, _T("cp-1251")},
90 {1251, _T("cp_1251")},
91 {1252, _T("windows-1252")},// ANSI Latin 1; Western European (Windows)
92 {1253, _T("windows-1253")},// ANSI Greek; Greek (Windows)
93 {1254, _T("windows-1254")},// ANSI Turkish; Turkish (Windows)
94 {1255, _T("windows-1255")},// ANSI Hebrew; Hebrew (Windows)
95 {1256, _T("windows-1256")},// ANSI Arabic; Arabic (Windows)
96 {1257, _T("windows-1257")},// ANSI Baltic; Baltic (Windows)
97 {1258, _T("windows-1258")},// ANSI/OEM Vietnamese; Vietnamese (Windows)
98 {1361, _T("Johab")},// Korean (Johab)
99 {10000,_T("macintosh")},// MAC Roman; Western European (Mac)
100 {10001, _T("x-mac-japanese")},// Japanese (Mac)
101 {10002, _T("x-mac-chinesetrad")},// MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
102 {10003, _T("x-mac-korean")},// Korean (Mac)
103 {10004, _T("x-mac-arabic")},// Arabic (Mac)
104 {10005, _T("x-mac-hebrew")},// Hebrew (Mac)
105 {10006, _T("x-mac-greek")},// Greek (Mac)
106 {10007, _T("x-mac-cyrillic")},// Cyrillic (Mac)
107 {10008, _T("x-mac-chinesesimp")},// MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
108 {10010, _T("x-mac-romanian")},// Romanian (Mac)
109 {10017, _T("x-mac-ukrainian")},// Ukrainian (Mac)
110 {10021, _T("x-mac-thai")},// Thai (Mac)
111 {10029, _T("x-mac-ce")},// MAC Latin 2; Central European (Mac)
112 {10079, _T("x-mac-icelandic")},// Icelandic (Mac)
113 {10081, _T("x-mac-turkish")},// Turkish (Mac)
114 {10082, _T("x-mac-croatian")},// Croatian (Mac)
115 {12000, _T("utf-32")},// Unicode UTF-32, little endian byte order; available only to managed applications
116 {12001, _T("utf-32BE")},// Unicode UTF-32, big endian byte order; available only to managed applications
117 {20000, _T("x-Chinese_CNS")},// CNS Taiwan; Chinese Traditional (CNS)
118 {20001, _T("x-cp20001")},// TCA Taiwan
119 {20002, _T("x_Chinese-Eten")},// Eten Taiwan; Chinese Traditional (Eten)
120 {20003, _T("x-cp20003")},// IBM5550 Taiwan
121 {20004, _T("x-cp20004")},// TeleText Taiwan
122 {20005, _T("x-cp20005")},// Wang Taiwan
123 {20105, _T("x-IA5")},// IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
124 {20106, _T("x-IA5-German")},// IA5 German (7-bit)
125 {20107, _T("x-IA5-Swedish")},// IA5 Swedish (7-bit)
126 {20108, _T("x-IA5-Norwegian")},// IA5 Norwegian (7-bit)
127 {20127, _T("us-ascii")},// US-ASCII (7-bit)
128 {20261, _T("x-cp20261")},// T.61
129 {20269, _T("x-cp20269")},// ISO 6937 Non-Spacing Accent
130 {20273, _T("IBM273")},// IBM EBCDIC Germany
131 {20277, _T("IBM277")},//IBM EBCDIC Denmark-Norway
132 {20278, _T("IBM278")},// IBM EBCDIC Finland-Sweden
133 {20280, _T("IBM280")},// IBM EBCDIC Italy
134 {20284, _T("IBM284")},// IBM EBCDIC Latin America-Spain
135 {20285, _T("IBM285")},// IBM EBCDIC United Kingdom
136 {20290, _T("IBM290")},// IBM EBCDIC Japanese Katakana Extended
137 {20297, _T("IBM297")},// IBM EBCDIC France
138 {20420, _T("IBM420")},// IBM EBCDIC Arabic
139 {20423, _T("IBM423")},// IBM EBCDIC Greek
140 {20424, _T("IBM424")},// IBM EBCDIC Hebrew
141 {20833, _T("x-EBCDIC-KoreanExtended")},// IBM EBCDIC Korean Extended
142 {20838, _T("IBM-Thai")},// IBM EBCDIC Thai
143 {20866, _T("koi8-r")},// Russian (KOI8-R); Cyrillic (KOI8-R)
144 {20871, _T("IBM871")},// IBM EBCDIC Icelandic
145 {20880, _T("IBM880")},// IBM EBCDIC Cyrillic Russian
146 {20905, _T("IBM905")},// IBM EBCDIC Turkish
147 {20924, _T("IBM00924")},// IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
148 {20932, _T("EUC-JP")},// Japanese (JIS 0208-1990 and 0121-1990)
149 {20936, _T("x-cp20936")},// Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
150 {20949, _T("x-cp20949")},// Korean Wansung
151 {21025, _T("cp1025")},// IBM EBCDIC Cyrillic Serbian-Bulgarian
152 {21027, _T("21027")},// (deprecated)
153 {21866, _T("koi8-u")},// Ukrainian (KOI8-U); Cyrillic (KOI8-U)
154 {28591, _T("iso-8859-1")},// ISO 8859-1 Latin 1; Western European (ISO)
155 {28592, _T("iso-8859-2")},// ISO 8859-2 Central European; Central European (ISO)
156 {28593, _T("iso-8859-3")},// ISO 8859-3 Latin 3
157 {28594, _T("iso-8859-4")},// ISO 8859-4 Baltic
158 {28595, _T("iso-8859-5")},// ISO 8859-5 Cyrillic
159 {28596, _T("iso-8859-6")},// ISO 8859-6 Arabic
160 {28597, _T("iso-8859-7")},// ISO 8859-7 Greek
161 {28598, _T("iso-8859-8")},// ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
162 {28599, _T("iso-8859-9")},// ISO 8859-9 Turkish
163 {28603, _T("iso-8859-13")},// ISO 8859-13 Estonian
164 {28605, _T("iso-8859-15")},// ISO 8859-15 Latin 9
165 {29001, _T("x-Europa")},// Europa 3
166 {38598, _T("iso-8859-8-i")},// ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
167 {50220, _T("iso-2022-jp")},// ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
168 {50221, _T("csISO2022JP")},// ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
169 {50222, _T("iso-2022-jp")},// ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
170 {50225, _T("iso-2022-kr")},// ISO 2022 Korean
171 {50227, _T("x-cp50227")},// ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
172 {50229, _T("ISO")},// 2022 Traditional Chinese
173 {50930, _T("EBCDIC")},// Japanese (Katakana) Extended
174 {50931, _T("EBCDIC")},// US-Canada and Japanese
175 {50933, _T("EBCDIC")},// Korean Extended and Korean
176 {50935, _T("EBCDIC")},// Simplified Chinese Extended and Simplified Chinese
177 {50936, _T("EBCDIC")},// Simplified Chinese
178 {50937, _T("EBCDIC")},// US-Canada and Traditional Chinese
179 {50939, _T("EBCDIC")},// Japanese (Latin) Extended and Japanese
180 {51932, _T("euc-jp")},// EUC Japanese
181 {51936, _T("EUC-CN")},// EUC Simplified Chinese; Chinese Simplified (EUC)
182 {51949, _T("euc-kr")},// EUC Korean
183 {51950, _T("EUC")},// Traditional Chinese
184 {52936, _T("hz-gb-2312")},// HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
185 {54936, _T("GB18030")},// Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
186 {57002, _T("x-iscii-de")},// ISCII Devanagari
187 {57003, _T("x-iscii-be")},// ISCII Bengali
188 {57004, _T("x-iscii-ta")},// ISCII Tamil
189 {57005, _T("x-iscii-te")},// ISCII Telugu
190 {57006, _T("x-iscii-as")},// ISCII Assamese
191 {57007, _T("x-iscii-or")},// ISCII Oriya
192 {57008, _T("x-iscii-ka")},// ISCII Kannada
193 {57009, _T("x-iscii-ma")},// ISCII Malayalam
194 {57010, _T("x-iscii-gu")},// ISCII Gujarati
195 {57011, _T("x-iscii-pa")},// ISCII Punjabi
196 {65000, _T("utf-7")},// Unicode (UTF-7)
197 {65001, _T("utf-8")},// Unicode (UTF-8)
// Cursor into the table, starting at its first entry.
// NOTE(review): `p` is declared static, so it is initialised from `map` only
// once. If the loop below advances `p` (the increment is not visible here),
// a later call would resume where an earlier call stopped instead of at the
// first entry - confirm, and consider making the cursor a plain local.
201 static CodeMap
*p
=map
;
// An empty name is handled separately, before the table is consulted.
202 if (codename
.IsEmpty())
// Work on a private copy of the requested name.
204 CString
code(codename
);
// Walk the entries until the one whose name pointer is NULL.
206 while(p
->m_CodeName
!= NULL
)
// Name of the current entry as a CString.
208 CString str
= p
->m_CodeName
;
// Converts the wide string `string` to UTF-8 by forwarding to GetMulti()
// with CP_UTF8 as the target code page, and returns the result.
219 CStringA
CUnicodeUtils::GetUTF8(const CStringW
& string
)
221 return GetMulti(string
,CP_UTF8
);
// Converts the wide string `string` to a multi-byte string using
// WideCharToMultiByte.
//
// string: text to convert.
// acp:    code page handed to WideCharToMultiByte as the target encoding.
//
// NOTE(review): the declarations of `buf` and `retVal`, any early exit and
// the final return are not visible in this excerpt.
224 CStringA
CUnicodeUtils::GetMulti(const CStringW
& string
,int acp
)
// Length of the input as reported by CStringW::GetLength().
228 int len
= string
.GetLength();
// Ask retVal for a writable buffer of four bytes per input unit plus one.
231 buf
= retVal
.GetBuffer(len
*4 + 1);
// A source length of -1 tells WideCharToMultiByte the input is
// null-terminated, so the returned count includes the terminator.
232 int lengthIncTerminator
= WideCharToMultiByte(acp
, 0, string
, -1, buf
, len
*4, NULL
, NULL
);
// Fix the string length to the converted size without the terminator.
// NOTE(review): WideCharToMultiByte returns 0 on failure, which would pass
// -1 here - confirm that this path is acceptable.
233 retVal
.ReleaseBuffer(lengthIncTerminator
-1);
// Converts the narrow string `string` to UTF-8: it is first widened with
// MultiByteToWideChar using CP_ACP, and the wide text is then passed to the
// CStringW overload of GetUTF8().
//
// NOTE(review): the declaration of `buf`, any early exit and the release of
// the new[] allocation are not visible in this excerpt - confirm the buffer
// is freed on every path.
238 CStringA
CUnicodeUtils::GetUTF8(const CStringA
& string
)
// Length of the input as reported by CStringA::GetLength().
241 int len
= string
.GetLength();
// Temporary wide buffer: four WCHARs per input byte plus one.
244 buf
= new WCHAR
[len
*4 + 1];
// A source length of -1 means "null-terminated input", so the returned count
// includes the terminator.
245 int lengthIncTerminator
= MultiByteToWideChar(CP_ACP
, 0, string
, -1, buf
, len
* 4);
// Build the intermediate wide string without the terminator.
// NOTE(review): a failed conversion returns 0 and would yield a length of -1
// here - confirm.
246 CStringW temp
= CStringW(buf
, lengthIncTerminator
- 1);
// Final step: wide text to UTF-8.
248 return (CUnicodeUtils::GetUTF8(temp
));
// Converts the multi-byte string `string` to wide text with
// MultiByteToWideChar and returns it as a CString.
//
// string: text to convert.
// acp:    code page that `string` is interpreted in.
//
// NOTE(review): the declarations of `buf` and `retVal`, any early exit and
// the final return are not visible in this excerpt. The CString buffer is
// used as the wide-character destination, which presumably assumes a UNICODE
// build - confirm.
251 CString
CUnicodeUtils::GetUnicode(const CStringA
& string
, int acp
)
// Length of the input as reported by CStringA::GetLength().
255 int len
= string
.GetLength();
// Ask retVal for a writable buffer of four elements per input byte plus one.
258 buf
= retVal
.GetBuffer(len
* 4 + 1);
// A source length of -1 means "null-terminated input", so the returned count
// includes the terminator.
259 int lengthIncTerminator
= MultiByteToWideChar(acp
, 0, string
, -1, buf
, len
* 4);
// Fix the string length to the converted size without the terminator.
// NOTE(review): MultiByteToWideChar returns 0 on failure, which would pass
// -1 here - confirm that this path is acceptable.
260 retVal
.ReleaseBuffer(lengthIncTerminator
- 1);
// Builds a CStringA from `string` by narrowing each element to char with a
// plain cast; the visible code calls no code-page conversion function.
//
// NOTE(review): despite the name, no encoding conversion is visible. This
// presumably expects `string` to hold UTF-8 bytes stored one per element -
// confirm against the callers. The declarations of `buf`, `i` and `sRet`,
// the terminator write, the buffer release and the return are not visible
// in this excerpt.
264 CStringA
CUnicodeUtils::ConvertWCHARStringToUTF8(const CString
& string
)
// One char per input element plus one extra slot.
268 buf
= new char[string
.GetLength()+1];
// Copy element by element.
272 for ( ; i
<string
.GetLength(); ++i
)
// The cast keeps only what fits into a char.
274 buf
[i
] = (char)string
.GetAt(i
);
// CStringA(buf) reads up to the first zero byte in buf.
277 sRet
= CStringA(buf
);
// Converts the wide string `wide` to a UTF-8 std::string using
// WideCharToMultiByte with CP_UTF8.
//
// NOTE(review): the condition guarding the early return, the declaration of
// `size`, the termination of `narrow`, the release of the new[] buffer and
// the final return are not visible in this excerpt.
286 std::string
CUnicodeUtils::StdGetUTF8(const wide_string
& wide
)
// Number of wide characters in the input.
288 int len
= (int)wide
.size();
// Early exit with an empty result (condition not visible here).
290 return std::string();
// Output buffer of `size` bytes.
292 char * narrow
= new char[size
];
// The source length is passed explicitly, so WideCharToMultiByte does not
// append a terminator on its own; one byte of the buffer is held back.
293 int ret
= WideCharToMultiByte(CP_UTF8
, 0, wide
.c_str(), len
, narrow
, size
-1, NULL
, NULL
);
// Constructing from a char pointer stops at the first zero byte, so this
// relies on `narrow` having been terminated beforehand.
295 std::string sRet
= std::string(narrow
);
// Converts the UTF-8 string `multibyte` to a wide_string using
// MultiByteToWideChar with CP_UTF8.
//
// NOTE(review): the condition guarding the early return, the declaration of
// `size`, the termination of `wide`, the release of the new[] buffer and the
// final return are not visible in this excerpt.
300 wide_string
CUnicodeUtils::StdGetUnicode(const std::string
& multibyte
)
// Number of bytes in the input.
302 int len
= (int)multibyte
.size();
// Early exit with an empty result (condition not visible here).
304 return wide_string();
// Output buffer of `size` wide characters.
306 wchar_t * wide
= new wchar_t[size
];
// The source length is passed explicitly, so MultiByteToWideChar does not
// append a terminator on its own; one element of the buffer is held back.
307 int ret
= MultiByteToWideChar(CP_UTF8
, 0, multibyte
.c_str(), len
, wide
, size
- 1);
// Constructing from a wchar_t pointer stops at the first zero element, so
// this relies on `wide` having been terminated beforehand.
309 wide_string sRet
= wide_string(wide
);
// Converts `wide` to a std::string in the system ANSI code page (CP_ACP).
// "." is passed as the default character for WideCharToMultiByte, i.e. the
// replacement for characters the target code page cannot represent.
//
// NOTE(review): no guard for empty input is visible; with an empty `wide`
// the output-size argument below evaluates to -1 - confirm how that case is
// meant to behave. The termination of `narrow`, the release of the new[]
// buffer and the return are not visible in this excerpt.
315 std::string
WideToMultibyte(const wide_string
& wide
)
// Three bytes per wide character plus two spare bytes.
317 char * narrow
= new char[wide
.length()*3+2];
// Set by WideCharToMultiByte when the default character had to be used.
318 BOOL defaultCharUsed
;
// The source length is passed explicitly, so no terminator is written by the
// API call itself.
319 int ret
= (int)WideCharToMultiByte(CP_ACP
, 0, wide
.c_str(), (int)wide
.size(), narrow
, (int)wide
.length()*3 - 1, ".", &defaultCharUsed
);
// Copies from `narrow` up to the first zero byte.
321 std::string str
= narrow
;
// Converts `wide` to a UTF-8 std::string using WideCharToMultiByte with
// CP_UTF8.
//
// NOTE(review): no guard for empty input is visible; with an empty `wide`
// the output-size argument below evaluates to -1 - confirm how that case is
// meant to behave. The termination of `narrow`, the release of the new[]
// buffer and the return are not visible in this excerpt.
326 std::string
WideToUTF8(const wide_string
& wide
)
// Three bytes per wide character plus two spare bytes.
328 char * narrow
= new char[wide
.length()*3+2];
// The source length is passed explicitly, so no terminator is written by the
// API call itself.
329 int ret
= (int)WideCharToMultiByte(CP_UTF8
, 0, wide
.c_str(), (int)wide
.size(), narrow
, (int)wide
.length()*3 - 1, NULL
, NULL
);
// Copies from `narrow` up to the first zero byte.
331 std::string str
= narrow
;
// Converts `multibyte`, interpreted in the system ANSI code page (CP_ACP),
// to a wide_string using MultiByteToWideChar.
//
// NOTE(review): the conditions guarding both early returns, the termination
// of `wide`, the release of the new[] buffer and the final return are not
// visible in this excerpt.
336 wide_string
MultibyteToWide(const std::string
& multibyte
)
// Number of bytes in the input.
338 size_t length
= multibyte
.length();
// First early exit with an empty result (condition not visible here).
340 return wide_string();
// Two wide characters per input byte plus two spare elements.
342 wchar_t * wide
= new wchar_t[multibyte
.length()*2+2];
// Second early exit, directly after the allocation.
// NOTE(review): presumably a null check on `wide`; a plain new[] reports
// failure by throwing rather than by returning null - confirm.
344 return wide_string();
// The source length is passed explicitly, so no terminator is written by the
// API call itself.
345 int ret
= (int)MultiByteToWideChar(CP_ACP
, 0, multibyte
.c_str(), (int)multibyte
.size(), wide
, (int)length
*2 - 1);
// Copies from `wide` up to the first zero element.
347 wide_string str
= wide
;
// Converts the UTF-8 string `multibyte` to a wide_string using
// MultiByteToWideChar with CP_UTF8.
//
// NOTE(review): the conditions guarding both early returns, the termination
// of `wide`, the release of the new[] buffer and the final return are not
// visible in this excerpt.
352 wide_string
UTF8ToWide(const std::string
& multibyte
)
// Number of bytes in the input.
354 size_t length
= multibyte
.length();
// First early exit with an empty result (condition not visible here).
356 return wide_string();
// Two wide characters per input byte plus two spare elements.
358 wchar_t * wide
= new wchar_t[length
*2+2];
// Second early exit, directly after the allocation.
// NOTE(review): presumably a null check on `wide`; a plain new[] reports
// failure by throwing rather than by returning null - confirm.
360 return wide_string();
// The source length is passed explicitly, so no terminator is written by the
// API call itself.
361 int ret
= (int)MultiByteToWideChar(CP_UTF8
, 0, multibyte
.c_str(), (int)multibyte
.size(), wide
, (int)length
*2 - 1);
// Copies from `wide` up to the first zero element.
363 wide_string str
= wide
;
// Converts UTF-8 text to a stdstring by returning the result of UTF8ToWide()
// directly.
// NOTE(review): a second definition with the same signature follows;
// presumably a preprocessor conditional that is not visible in this excerpt
// selects between them - confirm.
368 stdstring
UTF8ToString(const std::string
& string
) {return UTF8ToWide(string
);}
// Converts a stdstring to UTF-8 by passing it straight to WideToUTF8().
// NOTE(review): a second definition with the same signature follows;
// presumably a preprocessor conditional that is not visible in this excerpt
// selects between them - confirm.
369 std::string
StringToUTF8(const stdstring
& string
) {return WideToUTF8(string
);}
// Converts UTF-8 text to a stdstring in two steps: UTF8ToWide() followed by
// WideToMultibyte().
// NOTE(review): this is the second definition with this signature;
// presumably a preprocessor conditional that is not visible in this excerpt
// selects between them - confirm.
371 stdstring
UTF8ToString(const std::string
& string
) {return WideToMultibyte(UTF8ToWide(string
));}
// Converts a stdstring to UTF-8 in two steps: MultibyteToWide() followed by
// WideToUTF8().
// NOTE(review): this is the second definition with this signature;
// presumably a preprocessor conditional that is not visible in this excerpt
// selects between them - confirm.
372 std::string
StringToUTF8(const stdstring
& string
) {return WideToUTF8(MultibyteToWide(string
));}
// Compiler warning 4200 is disabled only around this declaration and the
// previous warning state is restored right after it.
376 #pragma warning(push)
377 #pragma warning(disable: 4200)
// Record type that LoadStringEx() overlays on string-resource data; that
// function reads its members nLength and achString.
// NOTE(review): the member list is not visible in this excerpt. The
// suppressed warning suggests achString is a zero-sized trailing array -
// confirm.
378 struct STRINGRESOURCEIMAGE
383 #pragma warning(pop) // C4200
// Loads the string resource `uID` from the module `hInstance`, preferring
// the language `wLanguage`, and copies its text into `lpBuffer`.
//
// hInstance:  module whose resources are searched.
// uID:        identifier of the string.
// lpBuffer:   destination buffer; checked against NULL before use.
// nBufferMax: capacity of lpBuffer as handed to the copy routines.
// wLanguage:  language passed to FindResourceEx.
//
// NOTE(review): the declarations of hGlobal, nResourceSize, iIndex and ret,
// the failure branches, any preprocessor conditionals and the end of the
// function (including its return statements) are not visible in this
// excerpt.
385 int LoadStringEx(HINSTANCE hInstance
, UINT uID
, LPTSTR lpBuffer
, int nBufferMax
, WORD wLanguage
)
// Cursor over the entries of the located resource.
387 const STRINGRESOURCEIMAGE
* pImage
;
// Address one past the end of the resource data.
388 const STRINGRESOURCEIMAGE
* pImageEnd
;
// Receives the "default character used" flag from WideCharToMultiByte.
393 BOOL defaultCharUsed
;
// Guard against a missing destination buffer.
397 if (lpBuffer
== NULL
)
// The resource name is derived from the upper bits of the id: (uID >> 4) + 1.
// First attempt: the requested language.
400 HRSRC hResource
= FindResourceEx(hInstance
, RT_STRING
, MAKEINTRESOURCE(((uID
>>4)+1)), wLanguage
);
403 //try the default language before giving up!
404 hResource
= FindResource(hInstance
, MAKEINTRESOURCE(((uID
>>4)+1)), RT_STRING
);
// Load the located resource.
408 hGlobal
= LoadResource(hInstance
, hResource
);
// Obtain a pointer to the resource data, viewed as the first entry.
411 pImage
= (const STRINGRESOURCEIMAGE
*)::LockResource(hGlobal
);
// The resource size in bytes gives the end marker for the walk below.
415 nResourceSize
= ::SizeofResource(hInstance
, hResource
);
416 pImageEnd
= (const STRINGRESOURCEIMAGE
*)(LPBYTE(pImage
)+nResourceSize
);
// Step over entries while iIndex is positive and data remains.
// NOTE(review): the update of iIndex inside the loop is not visible here.
419 while ((iIndex
> 0) && (pImage
< pImageEnd
))
// An entry occupies its header plus nLength wide characters.
421 pImage
= (const STRINGRESOURCEIMAGE
*)(LPBYTE(pImage
)+(sizeof(STRINGRESOURCEIMAGE
)+(pImage
->nLength
*sizeof(WCHAR
))));
// The walk ran past the end of the resource data.
424 if (pImage
>= pImageEnd
)
// The selected entry holds no characters.
426 if (pImage
->nLength
== 0)
// Number of characters to copy, compared against the buffer capacity.
429 ret
= pImage
->nLength
;
430 if (ret
> nBufferMax
)
// Wide-character copy into the caller's buffer.
// NOTE(review): wcsncpy_s needs room for a terminator within nBufferMax. If
// `ret` can equal nBufferMax at this point, the copy would not fit - confirm
// the clamp above leaves one element spare.
432 wcsncpy_s((wchar_t *)lpBuffer
, nBufferMax
, pImage
->achString
, ret
);
// Narrow-character path: convert to the system ANSI code page, passing "."
// as the default character and keeping one byte of the buffer in reserve.
// NOTE(review): presumably compiled instead of the wide copy above in
// non-UNICODE builds - confirm.
435 ret
= WideCharToMultiByte(CP_ACP
, 0, pImage
->achString
, pImage
->nLength
, (LPSTR
)lpBuffer
, nBufferMax
-1, ".", &defaultCharUsed
);