Bug fixes and improvements in automatic encoding detection
[far2l.git] / far2l / src / locale / DetectCodepage.cpp
blob35a69190c62c20fd964c940acc4b27745a23e7c4
1 #include <strings.h>
2 #include <cstring>
3 #include <map>
5 #include <WinCompat.h>
6 #include "../WinPort/WinPort.h"
8 #include "DetectCodepage.h"
10 #ifdef USEUCD
11 # include <uchardet.h>
// Returns true when `s` is a non-empty, NUL-terminated string that consists
// only of the ASCII digits '0'..'9'.
//
// An empty string is rejected: callers hand the accepted text straight to
// atoi(), and a name such as "CP" or "windows-" with no digits after the
// prefix must not be reported as code page 0.
static bool IsDecimalNumber(const char *s)
{
	if (*s == '\0') {
		return false;
	}

	for (; *s; ++s) {
		if (*s < '0' || *s > '9') {
			return false;
		}
	}

	return true;
}
23 static int CheckForEncodedInName(const char *cs)
25 if (strncasecmp(cs, "windows-", 8) == 0) {
26 if (IsDecimalNumber(cs + 8)) {
27 return atoi(cs + 8);
29 if (strcasecmp(cs + 8, "31j") == 0) {
30 return 932;
34 if (strncasecmp(cs, "CP", 2) == 0 && IsDecimalNumber(cs + 2)) {
35 cs+= 2;
36 } else if (strncasecmp(cs, "IBM", 3) == 0 && IsDecimalNumber(cs + 3)) {
37 cs+= 3;
38 } else {
39 return -1;
42 int r = atoi(cs);
43 if (r == 878) { // IBM KOI8-R
44 return 20866; // MS KOI8-R
47 return r;
50 static int CheckForHardcodedByName(const char *cs)
52 struct cmp_str
54 bool operator()(char const *a, char const *b) const
56 return std::strcmp(a, b) < 0;
60 std::map<const char*, int, cmp_str> encodings
62 {"UTF-16",CP_UTF16LE},
63 {"UTF-32",CP_UTF32LE},
64 {"UTF-8",CP_UTF8},
65 {"ISO-8859-1",28591}, // Latin 1; Western European
66 {"ISO-8859-2",28592}, // Latin 2; Central European
67 {"ISO-8859-3",28593}, // Latin 3; South European
68 {"ISO-8859-4",28594}, // Latin 4; Baltic
69 {"ISO-8859-5",28595}, // Cyrillic
70 {"ISO-8859-6",28596}, // Arabic
71 {"ISO-8859-7",28597}, // Greek
72 {"ISO-8859-8",28598}, // Hebrew
73 {"ISO-8859-9",28599}, // Latin-5; Turkish
74 {"ISO-8859-10",28600}, // Latin-6; Nordic
75 {"ISO-8859-11",28601}, // Thai
76 {"ISO-8859-13",28603}, // Latin-7; Baltic Rim (Estonian)
77 {"ISO-8859-15",28605}, // Latin-9; Western European
78 {"ISO-8859-16",28606}, // Latin-10; South-Eastern European
79 {"TIS-620",28601}, // Thai
80 {"MAC-CYRILLIC",10007}, // Cyrillic (Mac)
81 {"MAC-CENTRALEUROPE",10029}, // Mac OS Central European
82 {"KOI8-R",20866}, // Cyrillic
83 {"EUC-JP",20932}, // Japanese
84 {"ISO-2022-JP",50220}, // Japanese
85 {"Johab",1361}, // Korean
86 {"SHIFT_JIS",932}, // Japanese
87 {"EUC-KR",51949}, // Korean
88 {"UHC",949}, // Korean
89 {"ISO-2022-KR",50225}, // Korean
90 {"BIG5",950}, // Traditional Chinese
91 {"GB18030",54936} // Chinese Simplified
94 // the rest:
95 // ASCII, EUC-TW, GEORGIAN-ACADEMY, GEORGIAN-PS, HZ-GB-2312, ISO-2022-CN, VISCII
97 auto r= encodings.find(cs);
98 return r==encodings.end() ? -1 : r->second;
101 static int TranslateUDCharset(const char *cs)
103 int r = CheckForEncodedInName(cs);
104 if (r == -1)
105 r = CheckForHardcodedByName(cs);
106 if (r == -1)
107 fprintf(stderr, "TranslateUDCharset: unknown charset '%s'\n", cs);
109 return r;
112 int DetectCodePage(const char *data, size_t len)
114 uchardet_t ud = uchardet_new();
115 uchardet_handle_data(ud, data, len);
116 uchardet_data_end(ud);
117 const char *cs = uchardet_get_charset(ud);
118 int out = cs ? TranslateUDCharset(cs) : -1;
119 // fprintf(stderr, "DetectCodePage: '%s' -> %d\n", cs, out);
120 uchardet_delete(ud);
121 return out;
124 #else
// Fallback used when the build has no uchardet support (USEUCD undefined):
// no detection is attempted and -1 is always returned.
int DetectCodePage(const char *data, size_t len)
{
	// Both arguments are intentionally ignored in this configuration.
	(void)data;
	(void)len;

	const int not_detected = -1;
	return not_detected;
}
129 #endif