1 // TortoiseSVN - a Windows shell extension for easy version control
\r
3 // Copyright (C) 2003-2006, 2008 - TortoiseSVN
\r
5 // This program is free software; you can redistribute it and/or
\r
6 // modify it under the terms of the GNU General Public License
\r
7 // as published by the Free Software Foundation; either version 2
\r
8 // of the License, or (at your option) any later version.
\r
10 // This program is distributed in the hope that it will be useful,
\r
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
13 // GNU General Public License for more details.
\r
15 // You should have received a copy of the GNU General Public License
\r
16 // along with this program; if not, write to the Free Software Foundation,
\r
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\r
20 #include "unicodeutils.h"
\r
22 CUnicodeUtils::CUnicodeUtils(void)
\r
// Destructor of CUnicodeUtils.
// NOTE(review): the destructor body is not visible in this listing.
26 CUnicodeUtils::~CUnicodeUtils(void)
\r
30 #if defined(_MFC_VER) || defined(CSTRING_AVAILABLE)
\r
// Looks up a Windows code page number by its character-set name.
//
// codename  Character-set name to search for. It is lower-cased in place
//           before the search; because it is passed by non-const reference,
//           the caller's string is modified.
//
// Each table entry pairs a code page number with a name. The entry's name is
// copied and lower-cased before it is compared with codename, so the match is
// case-insensitive.
//
// NOTE(review): the statements that return a result and that advance the table
// cursor are not visible in this listing; confirm the value returned for a
// match and for "not found" against the full file.
// NOTE(review): several names appear more than once in the table (for example
// "Arabic", "EBCDIC", "iso-2022-jp", and "EUC-JP"/"euc-jp", which are equal
// after lower-casing). If the scan stops at the first match, only the first
// code page of each such group can ever be returned - verify.
37 int CUnicodeUtils::GetCPCode(CString &codename)
\r
39 static CodeMap map[]=
\r
// NOTE(review): 037 has a leading zero, so C++ reads it as an octal literal
// (decimal 31), not 37 as the name "IBM037" suggests - confirm and drop the
// leading zero if 37 is intended.
41 {037, _T("IBM037")},// IBM EBCDIC US-Canada
\r
42 {437, _T("IBM437")},// OEM United States
\r
43 {500, _T("IBM500")},// IBM EBCDIC International
\r
44 {708, _T("ASMO-708")},// Arabic (ASMO 708)
\r
45 {709, _T("Arabic")},// (ASMO-449+, BCON V4)
\r
46 {710, _T("Arabic")},// - Transparent Arabic
\r
47 {720, _T("DOS-720")},// Arabic (Transparent ASMO); Arabic (DOS)
\r
48 {737, _T("ibm737")},// OEM Greek (formerly 437G); Greek (DOS)
\r
49 {775, _T("ibm775")},// OEM Baltic; Baltic (DOS)
\r
50 {850, _T("ibm850")},// OEM Multilingual Latin 1; Western European (DOS)
\r
51 {852, _T("ibm852")},// OEM Latin 2; Central European (DOS)
\r
52 {855, _T("IBM855")},// OEM Cyrillic (primarily Russian)
\r
53 {857, _T("ibm857")},// OEM Turkish; Turkish (DOS)
\r
54 {858, _T("IBM00858")},// OEM Multilingual Latin 1 + Euro symbol
\r
55 {860, _T("IBM860")},// OEM Portuguese; Portuguese (DOS)
\r
56 {861, _T("ibm861")},// OEM Icelandic; Icelandic (DOS)
\r
57 {862, _T("DOS-862")},// OEM Hebrew; Hebrew (DOS)
\r
58 {863, _T("IBM863")},// OEM French Canadian; French Canadian (DOS)
\r
59 {864, _T("IBM864")},// OEM Arabic; Arabic (864)
\r
60 {865, _T("IBM865")},// OEM Nordic; Nordic (DOS)
\r
61 {866, _T("cp866")},// OEM Russian; Cyrillic (DOS)
\r
62 {869, _T("ibm869")},// OEM Modern Greek; Greek, Modern (DOS)
\r
63 {870, _T("IBM870")},// IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
\r
64 {874, _T("windows-874")},// ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows)
\r
65 {875, _T("cp875")},// IBM EBCDIC Greek Modern
\r
66 {932, _T("shift_jis")},// ANSI/OEM Japanese; Japanese (Shift-JIS)
\r
67 {936, _T("gb2312")},// ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
\r
68 {949, _T("ks_c_5601-1987")},// ANSI/OEM Korean (Unified Hangul Code)
\r
69 {950, _T("big5")},// ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
\r
70 {1026,_T("IBM1026")},// IBM EBCDIC Turkish (Latin 5)
\r
71 {1047,_T("IBM01047")},// IBM EBCDIC Latin 1/Open System
\r
72 {1140,_T("IBM01140")},// IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
\r
73 {1141, _T("IBM01141")},// IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
\r
74 {1142, _T("IBM01142")},// IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
\r
75 {1143, _T("IBM01143")},// IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
\r
76 {1144, _T("IBM01144")},// IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
\r
77 {1145, _T("IBM01145")},// IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
\r
78 {1146, _T("IBM01146")},// IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
\r
79 {1147, _T("IBM01147")},// IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
\r
80 {1148, _T("IBM01148")},// IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
\r
81 {1149, _T("IBM01149")},// IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
\r
82 {1200, _T("utf-16")},// Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
\r
83 {1201, _T("unicodeFFFE")},// Unicode UTF-16, big endian byte order; available only to managed applications
\r
84 {1250, _T("windows-1250")},// ANSI Central European; Central European (Windows)
\r
85 {1251, _T("windows-1251")},// ANSI Cyrillic; Cyrillic (Windows)
\r
86 {1252, _T("windows-1252")},// ANSI Latin 1; Western European (Windows)
\r
87 {1253, _T("windows-1253")},// ANSI Greek; Greek (Windows)
\r
88 {1254, _T("windows-1254")},// ANSI Turkish; Turkish (Windows)
\r
89 {1255, _T("windows-1255")},// ANSI Hebrew; Hebrew (Windows)
\r
90 {1256, _T("windows-1256")},// ANSI Arabic; Arabic (Windows)
\r
91 {1257, _T("windows-1257")},// ANSI Baltic; Baltic (Windows)
\r
92 {1258, _T("windows-1258")},// ANSI/OEM Vietnamese; Vietnamese (Windows)
\r
93 {1361, _T("Johab")},// Korean (Johab)
\r
94 {10000,_T("macintosh")},// MAC Roman; Western European (Mac)
\r
95 {10001, _T("x-mac-japanese")},// Japanese (Mac)
\r
96 {10002, _T("x-mac-chinesetrad")},// MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
\r
97 {10003, _T("x-mac-korean")},// Korean (Mac)
\r
98 {10004, _T("x-mac-arabic")},// Arabic (Mac)
\r
99 {10005, _T("x-mac-hebrew")},// Hebrew (Mac)
\r
100 {10006, _T("x-mac-greek")},// Greek (Mac)
\r
101 {10007, _T("x-mac-cyrillic")},// Cyrillic (Mac)
\r
102 {10008, _T("x-mac-chinesesimp")},// MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
\r
103 {10010, _T("x-mac-romanian")},// Romanian (Mac)
\r
104 {10017, _T("x-mac-ukrainian")},// Ukrainian (Mac)
\r
105 {10021, _T("x-mac-thai")},// Thai (Mac)
\r
106 {10029, _T("x-mac-ce")},// MAC Latin 2; Central European (Mac)
\r
107 {10079, _T("x-mac-icelandic")},// Icelandic (Mac)
\r
108 {10081, _T("x-mac-turkish")},// Turkish (Mac)
\r
109 {10082, _T("x-mac-croatian")},// Croatian (Mac)
\r
110 {12000, _T("utf-32")},// Unicode UTF-32, little endian byte order; available only to managed applications
\r
111 {12001, _T("utf-32BE")},// Unicode UTF-32, big endian byte order; available only to managed applications
\r
112 {20000, _T("x-Chinese_CNS")},// CNS Taiwan; Chinese Traditional (CNS)
\r
113 {20001, _T("x-cp20001")},// TCA Taiwan
\r
114 {20002, _T("x_Chinese-Eten")},// Eten Taiwan; Chinese Traditional (Eten)
\r
115 {20003, _T("x-cp20003")},// IBM5550 Taiwan
\r
116 {20004, _T("x-cp20004")},// TeleText Taiwan
\r
117 {20005, _T("x-cp20005")},// Wang Taiwan
\r
118 {20105, _T("x-IA5")},// IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
\r
119 {20106, _T("x-IA5-German")},// IA5 German (7-bit)
\r
120 {20107, _T("x-IA5-Swedish")},// IA5 Swedish (7-bit)
\r
121 {20108, _T("x-IA5-Norwegian")},// IA5 Norwegian (7-bit)
\r
122 {20127, _T("us-ascii")},// US-ASCII (7-bit)
\r
123 {20261, _T("x-cp20261")},// T.61
\r
124 {20269, _T("x-cp20269")},// ISO 6937 Non-Spacing Accent
\r
125 {20273, _T("IBM273")},// IBM EBCDIC Germany
\r
126 {20277, _T("IBM277")},//IBM EBCDIC Denmark-Norway
\r
127 {20278, _T("IBM278")},// IBM EBCDIC Finland-Sweden
\r
128 {20280, _T("IBM280")},// IBM EBCDIC Italy
\r
129 {20284, _T("IBM284")},// IBM EBCDIC Latin America-Spain
\r
130 {20285, _T("IBM285")},// IBM EBCDIC United Kingdom
\r
131 {20290, _T("IBM290")},// IBM EBCDIC Japanese Katakana Extended
\r
132 {20297, _T("IBM297")},// IBM EBCDIC France
\r
133 {20420, _T("IBM420")},// IBM EBCDIC Arabic
\r
134 {20423, _T("IBM423")},// IBM EBCDIC Greek
\r
135 {20424, _T("IBM424")},// IBM EBCDIC Hebrew
\r
136 {20833, _T("x-EBCDIC-KoreanExtended")},// IBM EBCDIC Korean Extended
\r
137 {20838, _T("IBM-Thai")},// IBM EBCDIC Thai
\r
138 {20866, _T("koi8-r")},// Russian (KOI8-R); Cyrillic (KOI8-R)
\r
139 {20871, _T("IBM871")},// IBM EBCDIC Icelandic
\r
140 {20880, _T("IBM880")},// IBM EBCDIC Cyrillic Russian
\r
141 {20905, _T("IBM905")},// IBM EBCDIC Turkish
\r
142 {20924, _T("IBM00924")},// IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
\r
143 {20932, _T("EUC-JP")},// Japanese (JIS 0208-1990 and 0121-1990)
\r
144 {20936, _T("x-cp20936")},// Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
\r
145 {20949, _T("x-cp20949")},// Korean Wansung
\r
146 {21025, _T("cp1025")},// IBM EBCDIC Cyrillic Serbian-Bulgarian
\r
147 {21027, _T("21027")},// (deprecated)
\r
148 {21866, _T("koi8-u")},// Ukrainian (KOI8-U); Cyrillic (KOI8-U)
\r
149 {28591, _T("iso-8859-1")},// ISO 8859-1 Latin 1; Western European (ISO)
\r
150 {28592, _T("iso-8859-2")},// ISO 8859-2 Central European; Central European (ISO)
\r
151 {28593, _T("iso-8859-3")},// ISO 8859-3 Latin 3
\r
152 {28594, _T("iso-8859-4")},// ISO 8859-4 Baltic
\r
153 {28595, _T("iso-8859-5")},// ISO 8859-5 Cyrillic
\r
154 {28596, _T("iso-8859-6")},// ISO 8859-6 Arabic
\r
155 {28597, _T("iso-8859-7")},// ISO 8859-7 Greek
\r
156 {28598, _T("iso-8859-8")},// ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
\r
157 {28599, _T("iso-8859-9")},// ISO 8859-9 Turkish
\r
158 {28603, _T("iso-8859-13")},// ISO 8859-13 Estonian
\r
159 {28605, _T("iso-8859-15")},// ISO 8859-15 Latin 9
\r
160 {29001, _T("x-Europa")},// Europa 3
\r
161 {38598, _T("iso-8859-8-i")},// ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
\r
162 {50220, _T("iso-2022-jp")},// ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
\r
163 {50221, _T("csISO2022JP")},// ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
\r
164 {50222, _T("iso-2022-jp")},// ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
\r
165 {50225, _T("iso-2022-kr")},// ISO 2022 Korean
\r
166 {50227, _T("x-cp50227")},// ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
\r
167 {50229, _T("ISO")},// 2022 Traditional Chinese
\r
168 {50930, _T("EBCDIC")},// Japanese (Katakana) Extended
\r
169 {50931, _T("EBCDIC")},// US-Canada and Japanese
\r
170 {50933, _T("EBCDIC")},// Korean Extended and Korean
\r
171 {50935, _T("EBCDIC")},// Simplified Chinese Extended and Simplified Chinese
\r
172 {50936, _T("EBCDIC")},// Simplified Chinese
\r
173 {50937, _T("EBCDIC")},// US-Canada and Traditional Chinese
\r
174 {50939, _T("EBCDIC")},// Japanese (Latin) Extended and Japanese
\r
175 {51932, _T("euc-jp")},// EUC Japanese
\r
176 {51936, _T("EUC-CN")},// EUC Simplified Chinese; Chinese Simplified (EUC)
\r
177 {51949, _T("euc-kr")},// EUC Korean
\r
178 {51950, _T("EUC")},// Traditional Chinese
\r
179 {52936, _T("hz-gb-2312")},// HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
\r
180 {54936, _T("GB18030")},// Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
\r
181 {57002, _T("x-iscii-de")},// ISCII Devanagari
\r
182 {57003, _T("x-iscii-be")},// ISCII Bengali
\r
183 {57004, _T("x-iscii-ta")},// ISCII Tamil
\r
184 {57005, _T("x-iscii-te")},// ISCII Telugu
\r
185 {57006, _T("x-iscii-as")},// ISCII Assamese
\r
186 {57007, _T("x-iscii-or")},// ISCII Oriya
\r
187 {57008, _T("x-iscii-ka")},// ISCII Kannada
\r
188 {57009, _T("x-iscii-ma")},// ISCII Malayalam
\r
189 {57010, _T("x-iscii-gu")},// ISCII Gujarati
\r
190 {57011, _T("x-iscii-pa")},// ISCII Punjabi
\r
191 {65000, _T("utf-7")},// Unicode (UTF-7)
\r
192 {65001, _T("utf-8")},// Unicode (UTF-8)
\r
// NOTE(review): p is a function-local static, so it is set to the start of the
// table only on the first call. If the loop below advances p, a later call
// would resume where the previous one stopped instead of at map[0] - verify
// against the full file and make p a plain local if so.
196 static CodeMap *p=map;
\r
// Lower-cases the caller's string in place (codename is a non-const reference).
197 codename=codename.MakeLower();
\r
// The scan stops at an entry whose m_CodeName is NULL; that terminating entry
// is not visible in this listing.
198 while(p->m_CodeName != NULL)
\r
// Work on a lower-cased copy of the table name so the table itself is untouched.
200 CString str = p->m_CodeName;
\r
201 str=str.MakeLower();
\r
203 if( str == codename)
\r
// Converts a wide (UTF-16) CStringW to a UTF-8 encoded CStringA.
//
// string  Wide string to convert. It is passed to WideCharToMultiByte with a
//         length of -1, i.e. it is read up to its terminating NUL.
//
// NOTE(review): the declarations of buf and retVal, any early-out for an empty
// string, and the return statement are not visible in this listing.
210 CStringA CUnicodeUtils::GetUTF8(const CStringW& string)
\r
214 int len = string.GetLength();
\r
// Reserve 4 bytes per input character plus one for the terminator and convert
// directly into the result string's own buffer.
217 buf = retVal.GetBuffer(len*4 + 1);
\r
218 // SecureZeroMemory(buf, (string.GetLength()*4 + 1)*sizeof(char));
\r
// With an input length of -1 the returned count includes the terminating NUL,
// hence the variable name and the "-1" on the next line.
219 int lengthIncTerminator = WideCharToMultiByte(CP_UTF8, 0, string, -1, buf, len*4, NULL, NULL);
\r
// NOTE(review): WideCharToMultiByte returns 0 on failure, which would pass -1
// here; ReleaseBuffer(-1) asks CString to measure the buffer itself, and the
// buffer is not zeroed (see the commented-out line above) - verify that the
// failure case is handled.
220 retVal.ReleaseBuffer(lengthIncTerminator-1);
\r
// Converts a CStringA in the system ANSI code page (CP_ACP) to UTF-8.
// The text is first widened to UTF-16 in a temporary buffer and then passed to
// the CStringW overload of GetUTF8.
//
// string  ANSI string to convert; read up to its terminating NUL (length -1).
//
// NOTE(review): the declaration of buf and any early-out for an empty string
// are not visible in this listing.
224 CStringA CUnicodeUtils::GetUTF8(const CStringA& string)
\r
227 int len = string.GetLength();
\r
// NOTE(review): buf is allocated with new[]; the matching delete[] is not
// visible in this listing - confirm it is released before the return.
230 buf = new WCHAR[len*4 + 1];
\r
// Zero the whole buffer so it is NUL-terminated whatever the conversion writes;
// the conversion's return value is not checked.
231 SecureZeroMemory(buf, (len*4 + 1)*sizeof(WCHAR));
\r
232 MultiByteToWideChar(CP_ACP, 0, string, -1, buf, len*4);
\r
233 CStringW temp = CStringW(buf);
\r
235 return (CUnicodeUtils::GetUTF8(temp));
\r
// Converts a UTF-8 encoded CStringA to a CString by widening it to UTF-16.
//
// string  UTF-8 text to convert; read up to its terminating NUL (length -1).
//
// NOTE(review): the declaration of buf, any early-out for an empty string and
// the return statement are not visible in this listing.
238 CString CUnicodeUtils::GetUnicode(const CStringA& string)
\r
241 int len = string.GetLength();
\r
// NOTE(review): buf is allocated with new[]; the matching delete[] is not
// visible in this listing - confirm it is released.
244 buf = new WCHAR[len*4 + 1];
\r
// Zero the whole buffer so it is NUL-terminated whatever the conversion writes;
// the conversion's return value is not checked.
245 SecureZeroMemory(buf, (len*4 + 1)*sizeof(WCHAR));
\r
246 MultiByteToWideChar(CP_UTF8, 0, string, -1, buf, len*4);
\r
// NOTE(review): CString follows the build's TCHAR width; in a non-Unicode build
// this construction would narrow the wide text again - confirm intended builds.
247 CString ret = CString(buf);
\r
// Builds a CStringA by copying every character of `string` narrowed with a
// (char) cast.
//
// NOTE(review): despite the name, the visible code performs no encoding
// conversion; it only truncates each element to one byte. Presumably the input
// is expected to hold one UTF-8 byte per character - confirm with callers.
// NOTE(review): the declarations of buf, i and sRet, the write of the
// terminating NUL, the release of buf and the return statement are not visible
// in this listing.
252 CStringA CUnicodeUtils::ConvertWCHARStringToUTF8(const CString& string)
\r
// One byte per input character plus room for a terminator.
256 buf = new char[string.GetLength()+1];
\r
260 for ( ; i<string.GetLength(); ++i)
\r
262 buf[i] = (char)string.GetAt(i);
\r
// CStringA(buf) reads up to the first NUL byte, so buf must be terminated by
// this point (the terminating write is not visible here).
265 sRet = CStringA(buf);
\r
// Converts a wide string to a UTF-8 encoded std::string.
//
// wide  Text to convert; its length is passed explicitly (wide.size()), so the
//       conversion does not rely on a terminating NUL in the input.
//
// NOTE(review): the condition guarding the early return, the definition of
// `size`, the release of `narrow` and the final return are not visible in this
// listing.
274 std::string CUnicodeUtils::StdGetUTF8(const wide_string& wide)
\r
276 int len = (int)wide.size();
\r
278 return std::string();
\r
280 char * narrow = new char[size];
\r
// One byte of the buffer is held back (size-1). With an explicit input length
// the API does not append a NUL, so the terminator has to be written
// separately; that write is not visible in this listing.
281 int ret = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), len, narrow, size-1, NULL, NULL);
\r
// Constructed from a C string: the result ends at the first NUL byte in narrow.
283 std::string sRet = std::string(narrow);
\r
// Converts a UTF-8 encoded std::string to a wide string.
//
// multibyte  UTF-8 text to convert; its length is passed explicitly
//            (multibyte.size()), so the conversion does not rely on a
//            terminating NUL in the input.
//
// NOTE(review): the condition guarding the early return, the definition of
// `size`, the release of `wide` and the final return are not visible in this
// listing.
288 wide_string CUnicodeUtils::StdGetUnicode(const std::string& multibyte)
\r
290 int len = (int)multibyte.size();
\r
292 return wide_string();
\r
294 wchar_t * wide = new wchar_t[size];
\r
// One element of the buffer is held back (size - 1). With an explicit input
// length the API does not append a NUL, so the terminator has to be written
// separately; that write is not visible in this listing.
295 int ret = MultiByteToWideChar(CP_UTF8, 0, multibyte.c_str(), len, wide, size - 1);
\r
// Constructed from a C string: the result ends at the first NUL in wide.
297 wide_string sRet = wide_string(wide);
\r
// Converts a wide string to the system ANSI code page (CP_ACP).
// Characters that cannot be represented are replaced with ".".
//
// wide  Text to convert; its length is passed explicitly (wide.size()).
//
// NOTE(review): the write of the terminating NUL, the release of `narrow` and
// the return statement are not visible in this listing.
303 std::string WideToMultibyte(const wide_string& wide)
\r
// Allocates 3 bytes per input character plus 2 spare bytes.
305 char * narrow = new char[wide.length()*3+2];
\r
// Receives whether the "." replacement was used; no use of it is visible here.
306 BOOL defaultCharUsed;
\r
// NOTE(review): for an empty input the output capacity argument evaluates to
// -1 and the input length to 0; no guard for that case is visible in this
// listing - verify how empty strings are handled.
307 int ret = (int)WideCharToMultiByte(CP_ACP, 0, wide.c_str(), (int)wide.size(), narrow, (int)wide.length()*3 - 1, ".", &defaultCharUsed);
\r
// Assigned from a C string: narrow must be NUL-terminated by this point, and
// the result ends at the first NUL byte.
309 std::string str = narrow;
\r
// Converts a wide string to a UTF-8 encoded std::string.
//
// wide  Text to convert; its length is passed explicitly (wide.size()).
//
// NOTE(review): the write of the terminating NUL, the release of `narrow` and
// the return statement are not visible in this listing.
314 std::string WideToUTF8(const wide_string& wide)
\r
// Allocates 3 bytes per input character plus 2 spare bytes.
316 char * narrow = new char[wide.length()*3+2];
\r
// NOTE(review): for an empty input the output capacity argument evaluates to
// -1 and the input length to 0; no guard for that case is visible in this
// listing - verify how empty strings are handled.
317 int ret = (int)WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), (int)wide.size(), narrow, (int)wide.length()*3 - 1, NULL, NULL);
\r
// Assigned from a C string: narrow must be NUL-terminated by this point, and
// the result ends at the first NUL byte.
319 std::string str = narrow;
\r
// Converts a string in the system ANSI code page (CP_ACP) to a wide string.
//
// multibyte  Text to convert; its length is passed explicitly
//            (multibyte.size()).
//
// NOTE(review): the conditions guarding the two early returns, the write of the
// terminating NUL, the release of `wide` and the final return are not visible
// in this listing.
324 wide_string MultibyteToWide(const std::string& multibyte)
\r
326 size_t length = multibyte.length();
\r
328 return wide_string();
\r
// Allocates 2 wide characters per input byte plus 2 spare elements.
330 wchar_t * wide = new wchar_t[multibyte.length()*2+2];
\r
332 return wide_string();
\r
// The capacity offered to the API (length*2 - 1) is smaller than the
// allocation, which leaves room for a terminator.
333 int ret = (int)MultiByteToWideChar(CP_ACP, 0, multibyte.c_str(), (int)multibyte.size(), wide, (int)length*2 - 1);
\r
// Assigned from a C string: wide must be NUL-terminated by this point, and the
// result ends at the first NUL.
335 wide_string str = wide;
\r
// Converts a UTF-8 encoded std::string to a wide string.
//
// multibyte  UTF-8 text to convert; its length is passed explicitly
//            (multibyte.size()).
//
// NOTE(review): the conditions guarding the two early returns, the write of the
// terminating NUL, the release of `wide` and the final return are not visible
// in this listing.
340 wide_string UTF8ToWide(const std::string& multibyte)
\r
342 size_t length = multibyte.length();
\r
344 return wide_string();
\r
// Allocates 2 wide characters per input byte plus 2 spare elements.
346 wchar_t * wide = new wchar_t[length*2+2];
\r
348 return wide_string();
\r
// The capacity offered to the API (length*2 - 1) is smaller than the
// allocation, which leaves room for a terminator.
349 int ret = (int)MultiByteToWideChar(CP_UTF8, 0, multibyte.c_str(), (int)multibyte.size(), wide, (int)length*2 - 1);
\r
// Assigned from a C string: wide must be NUL-terminated by this point, and the
// result ends at the first NUL.
351 wide_string str = wide;
\r
// UTF-8 <-> stdstring helpers for the case where stdstring is a wide string:
// both simply forward to UTF8ToWide / WideToUTF8.
// NOTE(review): the same two functions are defined a second time directly after
// this pair with different bodies; presumably a preprocessor conditional that
// is not visible in this listing selects one pair - confirm in the full file.
356 stdstring UTF8ToString(const std::string& string) {return UTF8ToWide(string);}
\r
357 std::string StringToUTF8(const stdstring& string) {return WideToUTF8(string);}
\r
// UTF-8 <-> stdstring helpers for the case where stdstring is a multibyte
// (CP_ACP) string: the text is converted through a wide intermediate
// (UTF-8 -> wide -> multibyte, and multibyte -> wide -> UTF-8).
// NOTE(review): the same two functions are also defined directly before this
// pair with different bodies; presumably a preprocessor conditional that is not
// visible in this listing selects one pair - confirm in the full file.
359 stdstring UTF8ToString(const std::string& string) {return WideToMultibyte(UTF8ToWide(string));}
\r
360 std::string StringToUTF8(const stdstring& string) {return WideToUTF8(MultibyteToWide(string));}
\r
// One entry of a string-table resource as walked by LoadStringEx, which reads
// an nLength count and an achString character array from it.
// Warning 4200 is disabled only around this struct; the pop restores the
// previous warning state.
// NOTE(review): the member declarations are not visible in this listing. C4200
// is MSVC's "zero-sized array in struct/union" warning, which suggests
// achString is declared as a zero-length trailing array - confirm.
364 #pragma warning(push)
\r
365 #pragma warning(disable: 4200)
\r
366 struct STRINGRESOURCEIMAGE
\r
371 #pragma warning(pop) // C4200
\r
373 int LoadStringEx(HINSTANCE hInstance, UINT uID, LPTSTR lpBuffer, int nBufferMax, WORD wLanguage)
\r
375 const STRINGRESOURCEIMAGE* pImage;
\r
376 const STRINGRESOURCEIMAGE* pImageEnd;
\r
377 ULONG nResourceSize;
\r
381 BOOL defaultCharUsed;
\r
385 if (lpBuffer == NULL)
\r
388 HRSRC hResource = FindResourceEx(hInstance, RT_STRING, MAKEINTRESOURCE(((uID>>4)+1)), wLanguage);
\r
391 //try the default language before giving up!
\r
392 hResource = FindResource(hInstance, MAKEINTRESOURCE(((uID>>4)+1)), RT_STRING);
\r
396 hGlobal = LoadResource(hInstance, hResource);
\r
399 pImage = (const STRINGRESOURCEIMAGE*)::LockResource(hGlobal);
\r
403 nResourceSize = ::SizeofResource(hInstance, hResource);
\r
404 pImageEnd = (const STRINGRESOURCEIMAGE*)(LPBYTE(pImage)+nResourceSize);
\r
405 iIndex = uID&0x000f;
\r
407 while ((iIndex > 0) && (pImage < pImageEnd))
\r
409 pImage = (const STRINGRESOURCEIMAGE*)(LPBYTE(pImage)+(sizeof(STRINGRESOURCEIMAGE)+(pImage->nLength*sizeof(WCHAR))));
\r
412 if (pImage >= pImageEnd)
\r
414 if (pImage->nLength == 0)
\r
417 ret = pImage->nLength;
\r
418 if (ret > nBufferMax)
\r
420 wcsncpy_s((wchar_t *)lpBuffer, nBufferMax, pImage->achString, ret);
\r
423 ret = WideCharToMultiByte(CP_ACP, 0, pImage->achString, pImage->nLength, (LPSTR)lpBuffer, nBufferMax-1, ".", &defaultCharUsed);
\r