src/encodings.c

   1 /*
   2  *      encodings.c - this file is part of Geany, a fast and lightweight IDE
   3  *
   4  *      Copyright 2005-2011 Enrico Tröger <enrico(dot)troeger(at)uvena(dot)de>
   5  *      Copyright 2006-2011 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
   6  *
   7  *      This program is free software; you can redistribute it and/or modify
   8  *      it under the terms of the GNU General Public License as published by
   9  *      the Free Software Foundation; either version 2 of the License, or
  10  *      (at your option) any later version.
  11  *
  12  *      This program is distributed in the hope that it will be useful,
  13  *      but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *      GNU General Public License for more details.
  16  *
  17  *      You should have received a copy of the GNU General Public License
  18  *      along with this program; if not, write to the Free Software
  19  *      Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  20  */
  21
  22 /*
  23  * Encoding conversion and Byte Order Mark (BOM) handling.
  24  */
  25
  26 /*
  27  * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
  28  * list of people on the gedit Team.
  29  * See the gedit ChangeLog files for a list of changes.
  30  */
  31  /* Stolen from anjuta */
  32
  33 #include <string.h>
  34
  35 #include "geany.h"
  36 #include "utils.h"
  37 #include "support.h"
  38 #include "document.h"
  39 #include "documentprivate.h"
  40 #include "msgwindow.h"
  41 #include "encodings.h"
  42 #include "callbacks.h"
  43 #include "ui_utils.h"
  44
  45 #ifdef HAVE_REGEX_H
  46 # include <regex.h>
  47 #else
  48 # include "gnuregex.h"
  49 #endif
  50
   51 /* HTML meta tag, e.g. <meta http-equiv="content-type" content="text/html; charset=UTF-8" />; capture group 1 is the charset name */
   52 #define PATTERN_HTMLMETA "<meta[ \t\n\r\f]+http-equiv[ \t\n\r\f]*=[ \t\n\r\f]*\"?content-type\"?[ \t\n\r\f]+content[ \t\n\r\f]*=[ \t\n\r\f]*\"text/x?html;[ \t\n\r\f]*charset=([a-z0-9_-]+)\"[ \t\n\r\f]*/?>"
   53 /* in-file declaration such as " geany_encoding=utf-8 " or " coding: utf-8 "; capture group 1 is the charset name */
   54 #define PATTERN_CODING "coding[\t ]*[:=][\t ]*\"?([a-z0-9-]+)\"?[\t ]*"
   55
   56 /* precompiled regexps: [0] = PATTERN_HTMLMETA, [1] = PATTERN_CODING (compiled in encodings_init()) */
   57 static regex_t pregs[2];
   58 static gboolean pregs_loaded = FALSE; /* set to TRUE by encodings_init() after the regex_compile() calls */
   59
   60
   61 GeanyEncoding encodings[GEANY_ENCODINGS_MAX]; /* table of encodings, one slot per encoding index, filled in by init_encodings() */
  62
  63
  64 #define fill(Order, Group, Idx, Charset, Name) \
  65                 encodings[Idx].idx = Idx; \
  66                 encodings[Idx].order = Order; \
  67                 encodings[Idx].group = Group; \
  68                 encodings[Idx].charset = Charset; \
  69                 encodings[Idx].name = Name;
  70
  71 static void init_encodings(void)
  72 {
  73         fill(0,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_14,             "ISO-8859-14",          _("Celtic"));
  74         fill(1,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_7,              "ISO-8859-7",           _("Greek"));
  75         fill(2,         WESTEUROPEAN,   GEANY_ENCODING_WINDOWS_1253,    "WINDOWS-1253",         _("Greek"));
  76         fill(3,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_10,             "ISO-8859-10",          _("Nordic"));
  77         fill(4,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_3,              "ISO-8859-3",           _("South European"));
  78         fill(5,         WESTEUROPEAN,   GEANY_ENCODING_IBM_850,                 "IBM850",                       _("Western"));
  79         fill(6,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_1,              "ISO-8859-1",           _("Western"));
  80         fill(7,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_15,             "ISO-8859-15",          _("Western"));
  81         fill(8,         WESTEUROPEAN,   GEANY_ENCODING_WINDOWS_1252,    "WINDOWS-1252",         _("Western"));
  82
  83         fill(0,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_4,              "ISO-8859-4",           _("Baltic"));
  84         fill(1,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_13,             "ISO-8859-13",          _("Baltic"));
  85         fill(2,         EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1257,    "WINDOWS-1257",         _("Baltic"));
  86         fill(3,         EASTEUROPEAN,   GEANY_ENCODING_IBM_852,                 "IBM852",                       _("Central European"));
  87         fill(4,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_2,              "ISO-8859-2",           _("Central European"));
  88         fill(5,         EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1250,    "WINDOWS-1250",         _("Central European"));
  89         fill(6,         EASTEUROPEAN,   GEANY_ENCODING_IBM_855,                 "IBM855",                       _("Cyrillic"));
  90         fill(7,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_5,              "ISO-8859-5",           _("Cyrillic"));
  91         /* ISO-IR-111 not available on Windows */
  92         fill(8,         EASTEUROPEAN,   GEANY_ENCODING_ISO_IR_111,              "ISO-IR-111",           _("Cyrillic"));
  93         fill(9,         EASTEUROPEAN,   GEANY_ENCODING_KOI8_R,                  "KOI8-R",                       _("Cyrillic"));
  94         fill(10,        EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1251,    "WINDOWS-1251",         _("Cyrillic"));
  95         fill(11,        EASTEUROPEAN,   GEANY_ENCODING_CP_866,                  "CP866",                        _("Cyrillic/Russian"));
  96         fill(12,        EASTEUROPEAN,   GEANY_ENCODING_KOI8_U,                  "KOI8-U",                       _("Cyrillic/Ukrainian"));
  97         fill(13,        EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_16,             "ISO-8859-16",          _("Romanian"));
  98
  99         fill(0,         MIDDLEEASTERN,  GEANY_ENCODING_IBM_864,                 "IBM864",                       _("Arabic"));
 100         fill(1,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_6,              "ISO-8859-6",           _("Arabic"));
 101         fill(2,         MIDDLEEASTERN,  GEANY_ENCODING_WINDOWS_1256,    "WINDOWS-1256",         _("Arabic"));
 102         fill(3,         MIDDLEEASTERN,  GEANY_ENCODING_IBM_862,                 "IBM862",                       _("Hebrew"));
 103         /* not available at all, ? */
 104         fill(4,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_8_I,    "ISO-8859-8-I",         _("Hebrew"));
 105         fill(5,         MIDDLEEASTERN,  GEANY_ENCODING_WINDOWS_1255,    "WINDOWS-1255",         _("Hebrew"));
 106         fill(6,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_8,              "ISO-8859-8",           _("Hebrew Visual"));
 107
 108         fill(0,         ASIAN,                  GEANY_ENCODING_ARMSCII_8,               "ARMSCII-8",            _("Armenian"));
 109         fill(1,         ASIAN,                  GEANY_ENCODING_GEOSTD8,                 "GEORGIAN-ACADEMY",     _("Georgian"));
 110         fill(2,         ASIAN,                  GEANY_ENCODING_TIS_620,                 "TIS-620",                      _("Thai"));
 111         fill(3,         ASIAN,                  GEANY_ENCODING_IBM_857,                 "IBM857",                       _("Turkish"));
 112         fill(4,         ASIAN,                  GEANY_ENCODING_WINDOWS_1254,    "WINDOWS-1254",         _("Turkish"));
 113         fill(5,         ASIAN,                  GEANY_ENCODING_ISO_8859_9,              "ISO-8859-9",           _("Turkish"));
 114         fill(6,         ASIAN,                  GEANY_ENCODING_TCVN,                    "TCVN",                         _("Vietnamese"));
 115         fill(7,         ASIAN,                  GEANY_ENCODING_VISCII,                  "VISCII",                       _("Vietnamese"));
 116         fill(8,         ASIAN,                  GEANY_ENCODING_WINDOWS_1258,    "WINDOWS-1258",         _("Vietnamese"));
 117
 118         fill(0,         UNICODE,                GEANY_ENCODING_UTF_7,                   "UTF-7",                        _("Unicode"));
 119         fill(1,         UNICODE,                GEANY_ENCODING_UTF_8,                   "UTF-8",                        _("Unicode"));
 120         fill(2,         UNICODE,                GEANY_ENCODING_UTF_16LE,                "UTF-16LE",                     _("Unicode"));
 121         fill(3,         UNICODE,                GEANY_ENCODING_UTF_16BE,                "UTF-16BE",                     _("Unicode"));
 122         fill(4,         UNICODE,                GEANY_ENCODING_UCS_2LE,                 "UCS-2LE",                      _("Unicode"));
 123         fill(5,         UNICODE,                GEANY_ENCODING_UCS_2BE,                 "UCS-2BE",                      _("Unicode"));
 124         fill(6,         UNICODE,                GEANY_ENCODING_UTF_32LE,                "UTF-32LE",                     _("Unicode"));
 125         fill(7,         UNICODE,                GEANY_ENCODING_UTF_32BE,                "UTF-32BE",                     _("Unicode"));
 126
 127         fill(0,         EASTASIAN,              GEANY_ENCODING_GB18030,                 "GB18030",                      _("Chinese Simplified"));
 128         fill(1,         EASTASIAN,              GEANY_ENCODING_GB2312,                  "GB2312",                       _("Chinese Simplified"));
 129         fill(2,         EASTASIAN,              GEANY_ENCODING_GBK,                             "GBK",                          _("Chinese Simplified"));
 130         /* maybe not available on Linux */
 131         fill(3,         EASTASIAN,              GEANY_ENCODING_HZ,                              "HZ",                           _("Chinese Simplified"));
 132         fill(4,         EASTASIAN,              GEANY_ENCODING_BIG5,                    "BIG5",                         _("Chinese Traditional"));
 133         fill(5,         EASTASIAN,              GEANY_ENCODING_BIG5_HKSCS,              "BIG5-HKSCS",           _("Chinese Traditional"));
 134         fill(6,         EASTASIAN,              GEANY_ENCODING_EUC_TW,                  "EUC-TW",                       _("Chinese Traditional"));
 135         fill(7,         EASTASIAN,              GEANY_ENCODING_EUC_JP,                  "EUC-JP",                       _("Japanese"));
 136         fill(8,         EASTASIAN,              GEANY_ENCODING_ISO_2022_JP,             "ISO-2022-JP",          _("Japanese"));
 137         fill(9,         EASTASIAN,              GEANY_ENCODING_SHIFT_JIS,               "SHIFT_JIS",            _("Japanese"));
 138         fill(10,        EASTASIAN,              GEANY_ENCODING_CP_932,                  "CP932",                        _("Japanese"));
 139         fill(11,        EASTASIAN,              GEANY_ENCODING_EUC_KR,                  "EUC-KR",                       _("Korean"));
 140         fill(12,        EASTASIAN,              GEANY_ENCODING_ISO_2022_KR,             "ISO-2022-KR",          _("Korean"));
 141         fill(13,        EASTASIAN,              GEANY_ENCODING_JOHAB,                   "JOHAB",                        _("Korean"));
 142         fill(14,        EASTASIAN,              GEANY_ENCODING_UHC,                             "UHC",                          _("Korean"));
 143
 144         fill(0,         NONE,                   GEANY_ENCODING_NONE,                    "None",                         _("Without encoding"));
 145 }
 146
 147
 148 /* compares two encoding names in a permissive fashion.
 149  * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
 150 static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
 151 {
 152         gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */
 153         gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */
 154
 155         while (*a && *b)
 156         {
 157                 gboolean is_alpha;
 158
 159                 if (g_ascii_toupper(*a) == g_ascii_toupper(*b) &&
 160                         ((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a)))
 161                 {
 162                         /* either there was a real separator, or we need a implicit one (a chage from alpha to
 163                          * numeric or so) */
 164                         if (! need_sep || (was_alpha != is_alpha))
 165                         {
 166                                 a++;
 167                                 b++;
 168                                 was_alpha = is_alpha;
 169                                 need_sep = FALSE;
 170                         }
 171                         else
 172                                 return FALSE;
 173                 }
 174                 else
 175                 {
 176                         guint n_sep = 0;
 177
 178                         if (! g_ascii_isalnum(*a))
 179                         {
 180                                 a++;
 181                                 n_sep++;
 182                         }
 183                         if (! g_ascii_isalnum(*b))
 184                         {
 185                                 b++;
 186                                 n_sep++;
 187                         }
 188                         if (n_sep < 1)
 189                                 return FALSE;
 190                         else if (n_sep < 2)
 191                                 need_sep = TRUE;
 192                 }
 193         }
 194         return *a == *b;
 195 }
 196
 197
 198 GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
 199 {
 200         gint i;
 201
 202         if (charset == NULL)
 203                 return GEANY_ENCODING_UTF_8;
 204
 205         i = 0;
 206         while (i < GEANY_ENCODINGS_MAX)
 207         {
 208                 if (encodings_charset_equals(charset, encodings[i].charset))
 209                         return i;
 210
 211                 ++i;
 212         }
 213         return GEANY_ENCODING_UTF_8;
 214 }
 215
 216
 217 const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
 218 {
 219         gint i;
 220
 221         if (charset == NULL)
 222                 return &encodings[GEANY_ENCODING_UTF_8];
 223
 224         i = 0;
 225         while (i < GEANY_ENCODINGS_MAX)
 226         {
 227                 if (encodings_charset_equals(charset, encodings[i].charset))
 228                         return &encodings[i];
 229
 230                 ++i;
 231         }
 232
 233         return NULL;
 234 }
 235
 236
 237 static const gchar *encodings_normalize_charset(const gchar *charset)
 238 {
 239         const GeanyEncoding *encoding;
 240
 241         encoding = encodings_get_from_charset(charset);
 242         if (encoding != NULL)
 243                 return encoding->charset;
 244
 245         return NULL;
 246 }
 247
 248
 249 const GeanyEncoding *encodings_get_from_index(gint idx)
 250 {
 251         g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
 252
 253         return &encodings[idx];
 254 }
 255
 256
 257 /**
 258  *  Gets the character set name of the specified index e.g. for use with
 259  *  @ref document_set_encoding().
 260  *
 261  *  @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
 262  *
 263  *
 264  *  @return The charset according to idx, or @c NULL if the index is invalid.
 265  *
 266  *  @since 0.13
 267  **/
 268 const gchar* encodings_get_charset_from_index(gint idx)
 269 {
 270         g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
 271
 272         return encodings[idx].charset;
 273 }
 274
 275
 276 gchar *encodings_to_string(const GeanyEncoding* enc)
 277 {
 278         g_return_val_if_fail(enc != NULL, NULL);
 279         g_return_val_if_fail(enc->name != NULL, NULL);
 280         g_return_val_if_fail(enc->charset != NULL, NULL);
 281
 282         return g_strdup_printf("%s (%s)", enc->name, enc->charset);
 283 }
 284
 285
 286 const gchar *encodings_get_charset(const GeanyEncoding* enc)
 287 {
 288         g_return_val_if_fail(enc != NULL, NULL);
 289         g_return_val_if_fail(enc->charset != NULL, NULL);
 290
 291         return enc->charset;
 292 }
 293
 294
 295 static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];
 296
 297
 298 void encodings_select_radio_item(const gchar *charset)
 299 {
 300         gint i;
 301
 302         g_return_if_fail(charset != NULL);
 303
 304         i = 0;
 305         while (i < GEANY_ENCODINGS_MAX)
 306         {
 307                 if (utils_str_equal(charset, encodings[i].charset))
 308                         break;
 309                 i++;
 310         }
 311         if (i == GEANY_ENCODINGS_MAX)
 312                 i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
 313
 314         /* ignore_callback has to be set by the caller */
 315         gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
 316 }
 317
 318
 319 /* Regexp detection of file encoding declared in the file itself.
 320  * Idea and parts of code taken from Bluefish, thanks.
 321  * regex_compile() is used to compile regular expressions on program init and keep it in memory
 322  * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
 323  */
 324 static void regex_compile(regex_t *preg, const gchar *pattern)
 325 {
 326         gint retval = regcomp(preg, pattern, REG_EXTENDED | REG_ICASE);
 327         if (retval != 0)
 328         {
 329                 gchar errmsg[512];
 330                 regerror(retval, preg, errmsg, 512);
 331                 geany_debug("regcomp() failed (%s)", errmsg);
 332                 regfree(preg);
 333                 return;
 334         }
 335 }
 336
 337
 338 static gchar *regex_match(regex_t *preg, const gchar *buffer, gsize size)
 339 {
 340         gint retval;
 341         gchar *tmp_buf = NULL;
 342         gchar *encoding = NULL;
 343         regmatch_t pmatch[10];
 344
 345         if (G_UNLIKELY(! pregs_loaded || buffer == NULL))
 346                 return NULL;
 347
 348         if (size > 512)
 349                 tmp_buf = g_strndup(buffer, 512); /* scan only the first 512 characters in the buffer */
 350
 351         retval = regexec(preg, (tmp_buf != NULL) ? tmp_buf : buffer, 10, pmatch, 0);
 352         if (retval == 0 && pmatch[0].rm_so != -1 && pmatch[1].rm_so != -1)
 353         {
 354                 encoding = g_strndup(&buffer[pmatch[1].rm_so], pmatch[1].rm_eo - pmatch[1].rm_so);
 355                 geany_debug("Detected encoding by regex search: %s", encoding);
 356
 357                 setptr(encoding, g_utf8_strup(encoding, -1));
 358         }
 359         g_free(tmp_buf);
 360         return encoding;
 361 }
 362
 363
 364 static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data)
 365 {
 366         GeanyDocument *doc = document_get_current();
 367         guint i = GPOINTER_TO_INT(user_data);
 368
 369         if (ignore_callback || doc == NULL || encodings[i].charset == NULL ||
 370                 ! gtk_check_menu_item_get_active(menuitem) ||
 371                 utils_str_equal(encodings[i].charset, doc->encoding))
 372                 return;
 373
 374         if (doc->readonly)
 375         {
 376                 utils_beep();
 377                 return;
 378         }
 379         document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding));
 380
 381         document_set_encoding(doc, encodings[i].charset);
 382 }
 383
 384
 385 void encodings_finalize(void)
 386 {
 387         if (pregs_loaded)
 388         {
 389                 guint i, len;
 390                 len = G_N_ELEMENTS(pregs);
 391                 for (i = 0; i < len; i++)
 392                 {
 393                         regfree(&pregs[i]);
 394                 }
 395         }
 396 }
 397
 398
 399 void encodings_init(void)
 400 {
 401         GtkWidget *item, *menu[2], *submenu, *menu_westeuro, *menu_easteuro, *menu_eastasian, *menu_asian,
 402                           *menu_utf8, *menu_middleeast, *item_westeuro, *item_easteuro, *item_eastasian,
 403                           *item_asian, *item_utf8, *item_middleeast;
 404         GCallback cb_func[2];
 405         GSList *group = NULL;
 406         gchar *label;
 407         gint order, group_size;
 408         guint i, j, k;
 409
 410         init_encodings();
 411
 412         if (! pregs_loaded)
 413         {
 414                 regex_compile(&pregs[0], PATTERN_HTMLMETA);
 415                 regex_compile(&pregs[1], PATTERN_CODING);
 416                 pregs_loaded = TRUE;
 417         }
 418
 419         /* create encodings submenu in document menu */
 420         menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
 421         menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
 422         cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
 423         cb_func[1] = G_CALLBACK(on_reload_as_activate);
 424
 425         for (k = 0; k < 2; k++)
 426         {
 427                 menu_westeuro = gtk_menu_new();
 428                 item_westeuro = gtk_menu_item_new_with_mnemonic(_("_West European"));
 429                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_westeuro), menu_westeuro);
 430                 gtk_container_add(GTK_CONTAINER(menu[k]), item_westeuro);
 431                 gtk_widget_show_all(item_westeuro);
 432
 433                 menu_easteuro = gtk_menu_new();
 434                 item_easteuro = gtk_menu_item_new_with_mnemonic(_("_East European"));
 435                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_easteuro), menu_easteuro);
 436                 gtk_container_add(GTK_CONTAINER(menu[k]), item_easteuro);
 437                 gtk_widget_show_all(item_easteuro);
 438
 439                 menu_eastasian = gtk_menu_new();
 440                 item_eastasian = gtk_menu_item_new_with_mnemonic(_("East _Asian"));
 441                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_eastasian), menu_eastasian);
 442                 gtk_container_add(GTK_CONTAINER(menu[k]), item_eastasian);
 443                 gtk_widget_show_all(item_eastasian);
 444
 445                 menu_asian = gtk_menu_new();
 446                 item_asian = gtk_menu_item_new_with_mnemonic(_("_SE & SW Asian"));
 447                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_asian), menu_asian);
 448                 gtk_container_add(GTK_CONTAINER(menu[k]), item_asian);
 449                 gtk_widget_show_all(item_asian);
 450
 451                 menu_middleeast = gtk_menu_new();
 452                 item_middleeast = gtk_menu_item_new_with_mnemonic(_("_Middle Eastern"));
 453                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_middleeast), menu_middleeast);
 454                 gtk_container_add(GTK_CONTAINER(menu[k]), item_middleeast);
 455                 gtk_widget_show_all(item_middleeast);
 456
 457                 menu_utf8 = gtk_menu_new();
 458                 item_utf8 = gtk_menu_item_new_with_mnemonic(_("_Unicode"));
 459                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_utf8), menu_utf8);
 460                 gtk_container_add(GTK_CONTAINER(menu[k]), item_utf8);
 461                 gtk_widget_show_all(item_utf8);
 462
 463                 /** TODO can it be optimized? ATM 3782 runs at line "if (encodings[j].group ...)" */
 464                 for (i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
 465                 {
 466                         order = 0;
 467                         switch (i)
 468                         {
 469                                 case WESTEUROPEAN: submenu = menu_westeuro; group_size = 9; break;
 470                                 case EASTEUROPEAN: submenu = menu_easteuro; group_size = 14; break;
 471                                 case EASTASIAN: submenu = menu_eastasian; group_size = 14; break;
 472                                 case ASIAN: submenu = menu_asian; group_size = 9; break;
 473                                 case MIDDLEEASTERN: submenu = menu_middleeast; group_size = 7; break;
 474                                 case UNICODE: submenu = menu_utf8; group_size = 8; break;
 475                                 default: submenu = menu[k]; group_size = 1;
 476                         }
 477
 478                         while (order < group_size)      /* the biggest group has 13 elements */
 479                         {
 480                                 for (j = 0; j < GEANY_ENCODINGS_MAX; j++)
 481                                 {
 482                                         if (encodings[j].group == i && encodings[j].order == order)
 483                                         {
 484                                                 label = encodings_to_string(&encodings[j]);
 485                                                 if (k == 0)
 486                                                 {
 487                                                         item = gtk_radio_menu_item_new_with_label(group, label);
 488                                                         group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
 489                                                         radio_items[j] = item;
 490                                                 }
 491                                                 else
 492                                                         item = gtk_menu_item_new_with_label(label);
 493                                                 gtk_widget_show(item);
 494                                                 gtk_container_add(GTK_CONTAINER(submenu), item);
 495                                                 g_signal_connect(item, "activate",
 496                                                                                 cb_func[k], GINT_TO_POINTER(encodings[j].idx));
 497                                                 g_free(label);
 498                                                 break;
 499                                         }
 500                                 }
 501                                 order++;
 502                         }
 503                 }
 504         }
 505 }
 506
 507
 508 /**
 509  *  Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
 510  *  If @a fast is not set, additional checks to validate the converted string are performed.
 511  *
 512  *  @param buffer The input string to convert.
 513  *  @param size The length of the string, or -1 if the string is nul-terminated.
 514  *  @param charset The charset to be used for conversion.
 515  *  @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
 516  *
 517  *  @return If the conversion was successful, a newly allocated nul-terminated string,
 518  *    which must be freed with @c g_free(). Otherwise @c NULL.
 519  **/
 520 gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gsize size,
 521                                                                                           const gchar *charset, gboolean fast)
 522 {
 523         gchar *utf8_content = NULL;
 524         GError *conv_error = NULL;
 525         gchar* converted_contents = NULL;
 526         gsize bytes_written;
 527
 528         g_return_val_if_fail(buffer != NULL, NULL);
 529         g_return_val_if_fail(charset != NULL, NULL);
 530
 531         converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
 532                                                                    &bytes_written, &conv_error);
 533
 534         if (fast)
 535         {
 536                 utf8_content = converted_contents;
 537                 if (conv_error != NULL) g_error_free(conv_error);
 538         }
 539         else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
 540         {
 541                 if (conv_error != NULL)
 542                 {
 543                         geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
 544                         g_error_free(conv_error);
 545                         conv_error = NULL;
 546                 }
 547                 else
 548                         geany_debug("Couldn't convert from %s to UTF-8.", charset);
 549
 550                 utf8_content = NULL;
 551                 g_free(converted_contents);
 552         }
 553         else
 554         {
 555                 geany_debug("Converted from %s to UTF-8.", charset);
 556                 utf8_content = converted_contents;
 557         }
 558
 559         return utf8_content;
 560 }
 561
 562
 563 static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
 564 {
 565         guint i;
 566
 567         for (i = 0; i < G_N_ELEMENTS(pregs); i++)
 568         {
 569                 gchar *charset;
 570
 571                 if ((charset = regex_match(&pregs[i], buffer, size)) != NULL)
 572                         return charset;
 573         }
 574         return NULL;
 575 }
 576
 577
 578 static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gsize size,
 579                 const gchar *suggested_charset, gchar **used_encoding)
 580 {
 581         const gchar *locale_charset = NULL;
 582         const gchar *charset;
 583         gchar *utf8_content;
 584         gboolean check_suggestion = suggested_charset != NULL;
 585         gboolean check_locale = FALSE;
 586         gint i, preferred_charset;
 587
 588         if ((gint)size == -1)
 589         {
 590                 size = strlen(buffer);
 591         }
 592
 593         /* current locale is not UTF-8, we have to check this charset */
 594         check_locale = ! g_get_charset(&locale_charset);
 595
 596         /* First check for preferred charset, if specified */
 597         preferred_charset = file_prefs.default_open_encoding;
 598
 599         if (preferred_charset == encodings[GEANY_ENCODING_NONE].idx ||
 600                 preferred_charset < 0 ||
 601                 preferred_charset >= GEANY_ENCODINGS_MAX)
 602         {
 603                 preferred_charset = -1;
 604         }
 605
 606         /* -1 means "Preferred charset" */
 607         for (i = -1; i < GEANY_ENCODINGS_MAX; i++)
 608         {
 609                 if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
 610                         continue;
 611
 612                 if (check_suggestion)
 613                 {
 614                         check_suggestion = FALSE;
 615                         charset = encodings_normalize_charset(suggested_charset);
 616                         if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
 617                                 charset = suggested_charset;
 618                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 619                 }
 620                 else if (check_locale)
 621                 {
 622                         check_locale = FALSE;
 623                         charset = locale_charset;
 624                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 625                 }
 626                 else if (i == -1)
 627                 {
 628                         if (preferred_charset >= 0)
 629                         {
 630                                 charset = encodings[preferred_charset].charset;
 631                                 geany_debug("Using preferred charset: %s", charset);
 632                         }
 633                         else
 634                                 continue;
 635                 }
 636                 else if (i >= 0)
 637                         charset = encodings[i].charset;
 638                 else /* in this case we have i == -2, continue to increase i and go ahead */
 639                         continue;
 640
 641                 if (G_UNLIKELY(charset == NULL))
 642                         continue;
 643
 644                 geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
 645                         size, charset);
 646                 utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);
 647
 648                 if (G_LIKELY(utf8_content != NULL))
 649                 {
 650                         if (used_encoding != NULL)
 651                         {
 652                                 if (G_UNLIKELY(*used_encoding != NULL))
 653                                 {
 654                                         geany_debug("%s:%d", __FILE__, __LINE__);
 655                                         g_free(*used_encoding);
 656                                 }
 657                                 *used_encoding = g_strdup(charset);
 658                         }
 659                         return utf8_content;
 660                 }
 661         }
 662
 663         return NULL;
 664 }
 665
 666
 667 /**
 668  *  Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
 669  *  @a used_encoding.
 670  *
 671  *  @param buffer the input string to convert.
 672  *  @param size the length of the string, or -1 if the string is nul-terminated.
 673  *  @param used_encoding return location of the detected encoding of the input string, or @c NULL.
 674  *
 675  *  @return If the conversion was successful, a newly allocated nul-terminated string,
 676  *    which must be freed with @c g_free(). Otherwise @c NULL.
 677  **/
 678 gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
 679 {
 680         gchar *regex_charset;
 681         gchar *utf8;
 682
 683         /* first try to read the encoding from the file content */
 684         regex_charset = encodings_check_regexes(buffer, size);
 685         utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding);
 686         g_free(regex_charset);
 687
 688         return utf8;
 689 }
 690
 691
 692 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
 693  * otherwise GEANY_ENCODING_NONE.
 694  * */
 695 GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
 696 {
 697         if (len >= 3)
 698         {
 699                 if (bom_len)
 700                         *bom_len = 3;
 701
 702                 if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
 703                         (guchar)string[2] == 0xbf)
 704                 {
 705                         return GEANY_ENCODING_UTF_8;
 706                 }
 707         }
 708         if (len >= 4)
 709         {
 710                 if (bom_len)
 711                         *bom_len = 4;
 712
 713                 if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
 714                                  (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
 715                 {
 716                         return GEANY_ENCODING_UTF_32BE; /* Big endian */
 717                 }
 718                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
 719                                  (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
 720                 {
 721                         return GEANY_ENCODING_UTF_32LE; /* Little endian */
 722                 }
 723                 if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
 724                                  (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
 725                 {
 726                          return GEANY_ENCODING_UTF_7;
 727                 }
 728         }
 729         if (len >= 2)
 730         {
 731                 if (bom_len)
 732                         *bom_len = 2;
 733
 734                 if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff)
 735                 {
 736                         return GEANY_ENCODING_UTF_16BE; /* Big endian */
 737                 }
 738                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
 739                 {
 740                         return GEANY_ENCODING_UTF_16LE; /* Little endian */
 741                 }
 742         }
 743         if (bom_len)
 744                 *bom_len = 0;
 745         return GEANY_ENCODING_NONE;
 746 }
 747
 748
 749 gboolean encodings_is_unicode_charset(const gchar *string)
 750 {
 751         if (string != NULL &&
 752                 (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
 753         {
 754                 return TRUE;
 755         }
 756         return FALSE;
 757 }
 758
 759
 760 typedef struct
 761 {
 762         gchar           *data;  /* null-terminated data */
 763         gsize            size;  /* actual data size */
 764         gsize            len;   /* string length of data */
 765         gchar           *enc;
 766         gboolean         bom;
 767         gboolean         partial;
 768 } BufferData;
 769
 770
 771 /* convert data with the specified encoding */
 772 static gboolean
 773 handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
 774 {
 775         GeanyEncodingIndex enc_idx;
 776
 777         if (utils_str_equal(forced_enc, "UTF-8"))
 778         {
 779                 if (! g_utf8_validate(buffer->data, buffer->len, NULL))
 780                 {
 781                         return FALSE;
 782                 }
 783         }
 784         else
 785         {
 786                 gchar *converted_text = encodings_convert_to_utf8_from_charset(
 787                                                                                 buffer->data, buffer->size, forced_enc, FALSE);
 788                 if (converted_text == NULL)
 789                 {
 790                         return FALSE;
 791                 }
 792                 else
 793                 {
 794                         setptr(buffer->data, converted_text);
 795                         buffer->len = strlen(converted_text);
 796                 }
 797         }
 798         enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
 799         buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);
 800         buffer->enc = g_strdup(forced_enc);
 801         return TRUE;
 802 }
 803
 804
 805 /* detect encoding and convert to UTF-8 if necessary */
 806 static gboolean
 807 handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
 808 {
 809         g_return_val_if_fail(buffer->enc == NULL, FALSE);
 810         g_return_val_if_fail(buffer->bom == FALSE, FALSE);
 811
 812         if (buffer->size == 0)
 813         {
 814                 /* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty
 815                  * e.g. UTF32 file with a BOM(so size is 4, len is 0) */
 816                 buffer->enc = g_strdup("UTF-8");
 817         }
 818         else
 819         {
 820                 /* first check for a BOM */
 821                 if (enc_idx != GEANY_ENCODING_NONE)
 822                 {
 823                         buffer->enc = g_strdup(encodings[enc_idx].charset);
 824                         buffer->bom = TRUE;
 825
 826                         if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
 827                         {
 828                                 gchar *converted_text = encodings_convert_to_utf8_from_charset(
 829                                                                                 buffer->data, buffer->size, buffer->enc, FALSE);
 830                                 if (converted_text != NULL)
 831                                 {
 832                                         setptr(buffer->data, converted_text);
 833                                         buffer->len = strlen(converted_text);
 834                                 }
 835                                 else
 836                                 {
 837                                         /* there was a problem converting data from BOM encoding type */
 838                                         setptr(buffer->enc, NULL);
 839                                         buffer->bom = FALSE;
 840                                 }
 841                         }
 842                 }
 843
 844                 if (buffer->enc == NULL)        /* either there was no BOM or the BOM encoding failed */
 845                 {
 846                         /* first try to read the encoding from the file content */
 847                         gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
 848
 849                         /* try UTF-8 first */
 850                         if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
 851                                 (buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
 852                         {
 853                                 buffer->enc = g_strdup("UTF-8");
 854                         }
 855                         else
 856                         {
 857                                 /* detect the encoding */
 858                                 gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
 859                                         buffer->size, regex_charset, &buffer->enc);
 860
 861                                 if (converted_text == NULL)
 862                                 {
 863                                         g_free(regex_charset);
 864                                         return FALSE;
 865                                 }
 866                                 setptr(buffer->data, converted_text);
 867                                 buffer->len = strlen(converted_text);
 868                         }
 869                         g_free(regex_charset);
 870                 }
 871         }
 872         return TRUE;
 873 }
 874
 875
 876 static void
 877 handle_bom(BufferData *buffer)
 878 {
 879         guint bom_len;
 880
 881         encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
 882         g_return_if_fail(bom_len != 0);
 883
 884         /* use filedata->len here because the contents are already converted into UTF-8 */
 885         buffer->len -= bom_len;
 886         /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
 887         g_memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
 888         buffer->data = g_realloc(buffer->data, buffer->len + 1);
 889 }
 890
 891
 892 /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
 893 static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
 894 {
 895         GeanyEncodingIndex tmp_enc_idx;
 896
 897         /* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
 898          * if we have a BOM */
 899         tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
 900
 901         /* check whether the size of the loaded data is equal to the size of the file in the
 902          * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
 903          * file size of 0 bytes */
 904         if (buffer->len != buffer->size && buffer->size != 0 && (
 905                 tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
 906                 tmp_enc_idx == GEANY_ENCODING_UTF_7))  /* filter UTF-7/8 where no NULL bytes are allowed */
 907         {
 908                 buffer->partial = TRUE;
 909         }
 910
 911         /* Determine character encoding and convert to UTF-8 */
 912         if (forced_enc != NULL)
 913         {
 914                 /* the encoding should be ignored(requested by user), so open the file "as it is" */
 915                 if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
 916                 {
 917                         buffer->bom = FALSE;
 918                         buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
 919                 }
 920                 else if (! handle_forced_encoding(buffer, forced_enc))
 921                 {
 922                         return FALSE;
 923                 }
 924         }
 925         else if (! handle_encoding(buffer, tmp_enc_idx))
 926         {
 927                 return FALSE;
 928         }
 929
 930         if (buffer->bom)
 931                 handle_bom(buffer);
 932         return TRUE;
 933 }
 934
 935
 936 /*
 937  * Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
 938  * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
 939  *
 940  * @param buf a pointer to modifiable null-terminated buffer to convert.
 941  *   It may or may not be modified, and should be freed whatever happens.
 942  * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
 943  *   file size). It will be updated to the new size.
 944  * @param forced_enc forced encoding to use, or @c NULL
 945  * @param used_encoding return location for the actually used encoding, or @c NULL
 946  * @param has_bom return location to store whether the data had a BOM, or @c NULL
 947  * @param partial return location to store whether the conversion may be partial, or @c NULL
 948  *
 949  * @return @C TRUE if the conversion succeeded, @c FALSE otherwise.
 950  */
 951 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
 952                 gchar **used_encoding, gboolean *has_bom, gboolean *partial)
 953 {
 954         BufferData buffer;
 955
 956         buffer.data = *buf;
 957         buffer.size = *size;
 958         /* use strlen to check for null chars */
 959         buffer.len = strlen(buffer.data);
 960         buffer.enc = NULL;
 961         buffer.bom = FALSE;
 962         buffer.partial = FALSE;
 963
 964         if (! handle_buffer(&buffer, forced_enc))
 965                 return FALSE;
 966
 967         *size = buffer.len;
 968         if (used_encoding)
 969                 *used_encoding = buffer.enc;
 970         else
 971                 g_free(buffer.enc);
 972         if (has_bom)
 973                 *has_bom = buffer.bom;
 974         if (partial)
 975                 *partial = buffer.partial;
 976
 977         *buf = buffer.data;
 978         return TRUE;
 979 }