far2l/src/locale/DetectCodepage.cpp

   1 #include <strings.h>
   2 #include <cstring>
   3 #include <map>
   4
   5 #include <WinCompat.h>
   6 #include "../WinPort/WinPort.h"
   7
   8 #include "DetectCodepage.h"
   9
  10 #ifdef USEUCD
  11 # include <uchardet.h>
  12
  13 static bool IsDecimalNumber(const char *s)
  14 {
  15         for (;*s;++s) {
  16                 if (*s < '0' || *s > '9') {
  17                         return false;
  18                 }
  19         }
  20         return true;
  21 }
  22
  23 static int CheckForEncodedInName(const char *cs)
  24 {
  25         if (strncasecmp(cs, "windows-", 8) == 0) {
  26                 if (IsDecimalNumber(cs + 8)) {
  27                         return atoi(cs + 8);
  28                 }
  29                 if (strcasecmp(cs + 8, "31j") == 0) {
  30                         return 932;
  31                 }
  32         }
  33
  34         if (strncasecmp(cs, "CP", 2) == 0 && IsDecimalNumber(cs + 2)) {
  35                 cs+= 2;
  36         } else if (strncasecmp(cs, "IBM", 3) == 0 && IsDecimalNumber(cs + 3)) {
  37                 cs+= 3;
  38         } else {
  39                 return -1;
  40         }
  41
  42         int r = atoi(cs);
  43         if (r == 878) {   // IBM KOI8-R
  44                 return 20866; // MS KOI8-R
  45         }
  46
  47         return r;
  48 }
  49
  50 static int CheckForHardcodedByName(const char *cs)
  51 {
  52     struct cmp_str
  53     {
  54         bool operator()(char const *a, char const *b) const
  55         {
  56             return std::strcmp(a, b) < 0;
  57         }
  58     };
  59
  60     std::map<const char*, int, cmp_str> encodings
  61         {
  62             {"UTF-16",CP_UTF16LE},
  63             {"UTF-32",CP_UTF32LE},
  64             {"UTF-8",CP_UTF8},
  65             {"ISO-8859-1",28591},          // Latin 1; Western European
  66             {"ISO-8859-2",28592},          // Latin 2; Central European
  67             {"ISO-8859-3",28593},          // Latin 3; South European
  68             {"ISO-8859-4",28594},          // Latin 4; Baltic
  69             {"ISO-8859-5",28595},          // Cyrillic
  70             {"ISO-8859-6",28596},          // Arabic
  71             {"ISO-8859-7",28597},          // Greek
  72             {"ISO-8859-8",28598},          // Hebrew
  73             {"ISO-8859-9",28599},          // Latin-5; Turkish
  74             {"ISO-8859-10",28600},         // Latin-6; Nordic
  75             {"ISO-8859-11",28601},         // Thai
  76             {"ISO-8859-13",28603},         // Latin-7; Baltic Rim (Estonian)
  77             {"ISO-8859-15",28605},         // Latin-9; Western European
  78             {"ISO-8859-16",28606},         // Latin-10; South-Eastern European
  79             {"TIS-620",28601},             // Thai
  80             {"MAC-CYRILLIC",10007},        // Cyrillic (Mac)
  81             {"MAC-CENTRALEUROPE",10029},   // Mac OS Central European
  82             {"KOI8-R",20866},              // Cyrillic
  83             {"EUC-JP",20932},              // Japanese
  84             {"ISO-2022-JP",50220},         // Japanese
  85             {"Johab",1361},                // Korean
  86             {"SHIFT_JIS",932},             // Japanese
  87             {"EUC-KR",51949},              // Korean
  88             {"UHC",949},                   // Korean
  89             {"ISO-2022-KR",50225},         // Korean
  90             {"BIG5",950},                  // Traditional Chinese
  91             {"GB18030",54936}              // Chinese Simplified
  92         };
  93
  94     // the rest:
  95     // ASCII, EUC-TW, GEORGIAN-ACADEMY, GEORGIAN-PS, HZ-GB-2312, ISO-2022-CN, VISCII
  96
  97     auto r= encodings.find(cs);
  98     return r==encodings.end() ? -1 : r->second;
  99 }
 100
 101 static int TranslateUDCharset(const char *cs)
 102 {
 103         int r = CheckForEncodedInName(cs);
 104         if (r == -1)
 105                 r = CheckForHardcodedByName(cs);
 106         if (r == -1)
 107                 fprintf(stderr, "TranslateUDCharset: unknown charset '%s'\n", cs);
 108
 109     return r;
 110 }
 111
 112 int DetectCodePage(const char *data, size_t len)
 113 {
 114         uchardet_t ud = uchardet_new();
 115         uchardet_handle_data(ud, data, len);
 116         uchardet_data_end(ud);
 117         const char *cs = uchardet_get_charset(ud);
 118         int out = cs ? TranslateUDCharset(cs) : -1;
 119 //      fprintf(stderr, "DetectCodePage: '%s' -> %d\n", cs, out);
 120         uchardet_delete(ud);
 121         return out;
 122 }
 123
 124 #else
 125 int DetectCodePage(const char *data, size_t len)
 126 {
 127         return -1;
 128 }
 129 #endif