6 #include "../WinPort/WinPort.h"
8 #include "DetectCodepage.h"
11 # include <uchardet.h>
// Helper used by CheckForEncodedInName to test the suffix of a charset name
// (it is called there on cs + 8, cs + 2 and cs + 3).
// NOTE(review): only the per-character condition is visible in this excerpt;
// the surrounding loop and the return statements are not. Presumably the
// function returns false on the first non-digit and true otherwise -- confirm,
// and confirm what it returns for an empty string.
13 static bool IsDecimalNumber(const char *s
)
// True when the current character lies outside the ASCII digit range '0'..'9'.
16 if (*s
< '0' || *s
> '9') {
// Tries to derive a code page number from a charset name `cs` whose name
// embeds the number, recognising the prefixes "windows-", "CP" and "IBM".
// All prefix comparisons are case-insensitive (strncasecmp / strcasecmp).
// NOTE(review): the statements executed inside each branch, the declaration
// of `r` and the final return are not visible in this excerpt -- confirm the
// value returned when no prefix matches.
23 static int CheckForEncodedInName(const char *cs
)
// "windows-" prefix (8 characters): the remainder is either a decimal number...
25 if (strncasecmp(cs
, "windows-", 8) == 0) {
26 if (IsDecimalNumber(cs
+ 8)) {
// ...or the literal suffix "31j", which is handled as a special case.
29 if (strcasecmp(cs
+ 8, "31j") == 0) {
// "CP<digits>" and "IBM<digits>" are accepted only when everything after the
// prefix passes IsDecimalNumber.
34 if (strncasecmp(cs
, "CP", 2) == 0 && IsDecimalNumber(cs
+ 2)) {
36 } else if (strncasecmp(cs
, "IBM", 3) == 0 && IsDecimalNumber(cs
+ 3)) {
// Remap: a result of 878 is replaced by 20866.
43 if (r
== 878) { // IBM KOI8-R
44 return 20866; // MS KOI8-R
// Looks up the charset name `cs` in a fixed name -> code page table.
// Returns the mapped code page, or -1 when the name is not in the table.
// The lookup is case-sensitive: keys are compared with std::strcmp.
50 static int CheckForHardcodedByName(const char *cs
)
// Comparator for the map below: orders C-string keys by their contents
// (strcmp) instead of by pointer value.
// NOTE(review): the enclosing struct header is not visible in this excerpt;
// the map's third template argument names it cmp_str.
54 bool operator()(char const *a
, char const *b
) const
56 return std::strcmp(a
, b
) < 0;
// NOTE(review): the storage class of `encodings` is not visible here. If it
// is a plain local, the whole table is rebuilt on every call -- confirm.
60 std::map
<const char*, int, cmp_str
> encodings
62 {"UTF-16",CP_UTF16LE
},
63 {"UTF-32",CP_UTF32LE
},
65 {"ISO-8859-1",28591}, // Latin 1; Western European
66 {"ISO-8859-2",28592}, // Latin 2; Central European
67 {"ISO-8859-3",28593}, // Latin 3; South European
68 {"ISO-8859-4",28594}, // Latin 4; Baltic
69 {"ISO-8859-5",28595}, // Cyrillic
70 {"ISO-8859-6",28596}, // Arabic
71 {"ISO-8859-7",28597}, // Greek
72 {"ISO-8859-8",28598}, // Hebrew
73 {"ISO-8859-9",28599}, // Latin-5; Turkish
74 {"ISO-8859-10",28600}, // Latin-6; Nordic
75 {"ISO-8859-11",28601}, // Thai
76 {"ISO-8859-13",28603}, // Latin-7; Baltic Rim (Estonian)
77 {"ISO-8859-15",28605}, // Latin-9; Western European
78 {"ISO-8859-16",28606}, // Latin-10; South-Eastern European
79 {"TIS-620",28601}, // Thai
80 {"MAC-CYRILLIC",10007}, // Cyrillic (Mac)
81 {"MAC-CENTRALEUROPE",10029}, // Mac OS Central European
82 {"KOI8-R",20866}, // Cyrillic
83 {"EUC-JP",20932}, // Japanese
84 {"ISO-2022-JP",50220}, // Japanese
85 {"Johab",1361}, // Korean
86 {"SHIFT_JIS",932}, // Japanese
87 {"EUC-KR",51949}, // Korean
88 {"UHC",949}, // Korean
89 {"ISO-2022-KR",50225}, // Korean
90 {"BIG5",950}, // Traditional Chinese
91 {"GB18030",54936} // Chinese Simplified
// NOTE(review): the names listed on the next line have no entry in the table
// above; presumably they are charsets the detector can report that are left
// unmapped -- confirm.
95 // ASCII, EUC-TW, GEORGIAN-ACADEMY, GEORGIAN-PS, HZ-GB-2312, ISO-2022-CN, VISCII
// Exact-match lookup; a miss is reported as -1.
97 auto r
= encodings
.find(cs
);
98 return r
==encodings
.end() ? -1 : r
->second
;
// Translates a charset name `cs` into a code page number by consulting
// CheckForEncodedInName first and CheckForHardcodedByName second.
// NOTE(review): the conditions that guard the second lookup and the
// diagnostic, and the final return, are not visible in this excerpt.
// Presumably the table lookup runs only when the name-based parse produced
// nothing, and the message is printed only when both fail -- confirm.
101 static int TranslateUDCharset(const char *cs
)
103 int r
= CheckForEncodedInName(cs
);
105 r
= CheckForHardcodedByName(cs
);
// Diagnostic on stderr naming the charset that could not be translated.
107 fprintf(stderr
, "TranslateUDCharset: unknown charset '%s'\n", cs
);
// Detects the code page of the buffer `data` (`len` bytes) using uchardet:
// creates a detector, feeds it the whole buffer in one call, finalises the
// detection, and translates the reported charset name via TranslateUDCharset.
// `out` is -1 when uchardet_get_charset returns a null pointer.
// NOTE(review): neither a uchardet_delete(ud) call nor the return statement is
// visible in this excerpt -- confirm the detector handle is released on every
// path and that `out` is what gets returned.
112 int DetectCodePage(const char *data
, size_t len
)
114 uchardet_t ud
= uchardet_new();
115 uchardet_handle_data(ud
, data
, len
);
116 uchardet_data_end(ud
);
// NOTE(review): `cs` comes from the detector; presumably it must not be used
// after the handle is destroyed -- confirm against the uchardet documentation.
117 const char *cs
= uchardet_get_charset(ud
);
// Only a null `cs` is mapped to -1 here; any non-null name (including an
// empty one, if the library can return that) is passed to TranslateUDCharset.
118 int out
= cs
? TranslateUDCharset(cs
) : -1;
119 // fprintf(stderr, "DetectCodePage: '%s' -> %d\n", cs, out);
// Second definition of DetectCodePage with the same signature as the
// uchardet-backed one earlier in this file; its body is not visible in this
// excerpt.
// NOTE(review): two definitions with identical signatures can only coexist
// under a preprocessor conditional, so this is presumably the fallback used
// when the build has no uchardet support -- confirm the guard and what the
// fallback returns.
125 int DetectCodePage(const char *data
, size_t len
)