src/encodings.c

   1 /*
   2  *      encodings.c - this file is part of Geany, a fast and lightweight IDE
   3  *
   4  *      Copyright 2005-2012 Enrico Tröger <enrico(dot)troeger(at)uvena(dot)de>
   5  *      Copyright 2006-2012 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
   6  *
   7  *      This program is free software; you can redistribute it and/or modify
   8  *      it under the terms of the GNU General Public License as published by
   9  *      the Free Software Foundation; either version 2 of the License, or
  10  *      (at your option) any later version.
  11  *
  12  *      This program is distributed in the hope that it will be useful,
  13  *      but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *      GNU General Public License for more details.
  16  *
  17  *      You should have received a copy of the GNU General Public License along
  18  *      with this program; if not, write to the Free Software Foundation, Inc.,
  19  *      51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20  */
  21
  22 /*
  23  * Encoding conversion and Byte Order Mark (BOM) handling.
  24  */
  25
  26 /*
  27  * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
  28  * list of people on the gedit Team.
  29  * See the gedit ChangeLog files for a list of changes.
  30  */
  31  /* Stolen from anjuta */
  32
  33 #include <string.h>
  34
  35 #include "geany.h"
  36 #include "utils.h"
  37 #include "support.h"
  38 #include "document.h"
  39 #include "documentprivate.h"
  40 #include "msgwindow.h"
  41 #include "encodings.h"
  42 #include "callbacks.h"
  43 #include "ui_utils.h"
  44
/* Regex for an HTML content-type meta tag; capture group 1 is the charset name.
 * Example of a matching tag:
 * <meta http-equiv="content-type" content="text/html; charset=UTF-8" /> */
#define PATTERN_HTMLMETA "<meta\\s+http-equiv\\s*=\\s*\"?content-type\"?\\s+content\\s*=\\s*\"text/x?html;\\s*charset=([a-z0-9_-]+)\"\\s*/?>"
/* Regex for an in-file "coding" declaration; capture group 1 is the charset name.
 * Examples: " geany_encoding=utf-8 " or " coding: utf-8 " */
#define PATTERN_CODING "coding[\t ]*[:=][\t ]*\"?([a-z0-9-]+)\"?[\t ]*"

/* precompiled regexps: compiled from the two patterns above in encodings_init()
 * and released in encodings_finalize() */
static GRegex *pregs[2];
/* set to TRUE by encodings_init() once compilation of pregs[] has been attempted */
static gboolean pregs_loaded = FALSE;


/* table of all known encodings, indexed by GeanyEncodingIndex and filled
 * by init_encodings() */
GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
  56
  57
  58 #define fill(Order, Group, Idx, Charset, Name) \
  59                 encodings[Idx].idx = Idx; \
  60                 encodings[Idx].order = Order; \
  61                 encodings[Idx].group = Group; \
  62                 encodings[Idx].charset = Charset; \
  63                 encodings[Idx].name = Name;
  64
  65 static void init_encodings(void)
  66 {
  67         fill(0,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_14,             "ISO-8859-14",          _("Celtic"));
  68         fill(1,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_7,              "ISO-8859-7",           _("Greek"));
  69         fill(2,         WESTEUROPEAN,   GEANY_ENCODING_WINDOWS_1253,    "WINDOWS-1253",         _("Greek"));
  70         fill(3,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_10,             "ISO-8859-10",          _("Nordic"));
  71         fill(4,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_3,              "ISO-8859-3",           _("South European"));
  72         fill(5,         WESTEUROPEAN,   GEANY_ENCODING_IBM_850,                 "IBM850",                       _("Western"));
  73         fill(6,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_1,              "ISO-8859-1",           _("Western"));
  74         fill(7,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_15,             "ISO-8859-15",          _("Western"));
  75         fill(8,         WESTEUROPEAN,   GEANY_ENCODING_WINDOWS_1252,    "WINDOWS-1252",         _("Western"));
  76
  77         fill(0,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_4,              "ISO-8859-4",           _("Baltic"));
  78         fill(1,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_13,             "ISO-8859-13",          _("Baltic"));
  79         fill(2,         EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1257,    "WINDOWS-1257",         _("Baltic"));
  80         fill(3,         EASTEUROPEAN,   GEANY_ENCODING_IBM_852,                 "IBM852",                       _("Central European"));
  81         fill(4,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_2,              "ISO-8859-2",           _("Central European"));
  82         fill(5,         EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1250,    "WINDOWS-1250",         _("Central European"));
  83         fill(6,         EASTEUROPEAN,   GEANY_ENCODING_IBM_855,                 "IBM855",                       _("Cyrillic"));
  84         fill(7,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_5,              "ISO-8859-5",           _("Cyrillic"));
  85         /* ISO-IR-111 not available on Windows */
  86         fill(8,         EASTEUROPEAN,   GEANY_ENCODING_ISO_IR_111,              "ISO-IR-111",           _("Cyrillic"));
  87         fill(9,         EASTEUROPEAN,   GEANY_ENCODING_KOI8_R,                  "KOI8-R",                       _("Cyrillic"));
  88         fill(10,        EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1251,    "WINDOWS-1251",         _("Cyrillic"));
  89         fill(11,        EASTEUROPEAN,   GEANY_ENCODING_CP_866,                  "CP866",                        _("Cyrillic/Russian"));
  90         fill(12,        EASTEUROPEAN,   GEANY_ENCODING_KOI8_U,                  "KOI8-U",                       _("Cyrillic/Ukrainian"));
  91         fill(13,        EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_16,             "ISO-8859-16",          _("Romanian"));
  92
  93         fill(0,         MIDDLEEASTERN,  GEANY_ENCODING_IBM_864,                 "IBM864",                       _("Arabic"));
  94         fill(1,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_6,              "ISO-8859-6",           _("Arabic"));
  95         fill(2,         MIDDLEEASTERN,  GEANY_ENCODING_WINDOWS_1256,    "WINDOWS-1256",         _("Arabic"));
  96         fill(3,         MIDDLEEASTERN,  GEANY_ENCODING_IBM_862,                 "IBM862",                       _("Hebrew"));
  97         /* not available at all, ? */
  98         fill(4,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_8_I,    "ISO-8859-8-I",         _("Hebrew"));
  99         fill(5,         MIDDLEEASTERN,  GEANY_ENCODING_WINDOWS_1255,    "WINDOWS-1255",         _("Hebrew"));
 100         fill(6,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_8,              "ISO-8859-8",           _("Hebrew Visual"));
 101
 102         fill(0,         ASIAN,                  GEANY_ENCODING_ARMSCII_8,               "ARMSCII-8",            _("Armenian"));
 103         fill(1,         ASIAN,                  GEANY_ENCODING_GEOSTD8,                 "GEORGIAN-ACADEMY",     _("Georgian"));
 104         fill(2,         ASIAN,                  GEANY_ENCODING_TIS_620,                 "TIS-620",                      _("Thai"));
 105         fill(3,         ASIAN,                  GEANY_ENCODING_IBM_857,                 "IBM857",                       _("Turkish"));
 106         fill(4,         ASIAN,                  GEANY_ENCODING_WINDOWS_1254,    "WINDOWS-1254",         _("Turkish"));
 107         fill(5,         ASIAN,                  GEANY_ENCODING_ISO_8859_9,              "ISO-8859-9",           _("Turkish"));
 108         fill(6,         ASIAN,                  GEANY_ENCODING_TCVN,                    "TCVN",                         _("Vietnamese"));
 109         fill(7,         ASIAN,                  GEANY_ENCODING_VISCII,                  "VISCII",                       _("Vietnamese"));
 110         fill(8,         ASIAN,                  GEANY_ENCODING_WINDOWS_1258,    "WINDOWS-1258",         _("Vietnamese"));
 111
 112         fill(0,         UNICODE,                GEANY_ENCODING_UTF_7,                   "UTF-7",                        _("Unicode"));
 113         fill(1,         UNICODE,                GEANY_ENCODING_UTF_8,                   "UTF-8",                        _("Unicode"));
 114         fill(2,         UNICODE,                GEANY_ENCODING_UTF_16LE,                "UTF-16LE",                     _("Unicode"));
 115         fill(3,         UNICODE,                GEANY_ENCODING_UTF_16BE,                "UTF-16BE",                     _("Unicode"));
 116         fill(4,         UNICODE,                GEANY_ENCODING_UCS_2LE,                 "UCS-2LE",                      _("Unicode"));
 117         fill(5,         UNICODE,                GEANY_ENCODING_UCS_2BE,                 "UCS-2BE",                      _("Unicode"));
 118         fill(6,         UNICODE,                GEANY_ENCODING_UTF_32LE,                "UTF-32LE",                     _("Unicode"));
 119         fill(7,         UNICODE,                GEANY_ENCODING_UTF_32BE,                "UTF-32BE",                     _("Unicode"));
 120
 121         fill(0,         EASTASIAN,              GEANY_ENCODING_GB18030,                 "GB18030",                      _("Chinese Simplified"));
 122         fill(1,         EASTASIAN,              GEANY_ENCODING_GB2312,                  "GB2312",                       _("Chinese Simplified"));
 123         fill(2,         EASTASIAN,              GEANY_ENCODING_GBK,                             "GBK",                          _("Chinese Simplified"));
 124         /* maybe not available on Linux */
 125         fill(3,         EASTASIAN,              GEANY_ENCODING_HZ,                              "HZ",                           _("Chinese Simplified"));
 126         fill(4,         EASTASIAN,              GEANY_ENCODING_BIG5,                    "BIG5",                         _("Chinese Traditional"));
 127         fill(5,         EASTASIAN,              GEANY_ENCODING_BIG5_HKSCS,              "BIG5-HKSCS",           _("Chinese Traditional"));
 128         fill(6,         EASTASIAN,              GEANY_ENCODING_EUC_TW,                  "EUC-TW",                       _("Chinese Traditional"));
 129         fill(7,         EASTASIAN,              GEANY_ENCODING_EUC_JP,                  "EUC-JP",                       _("Japanese"));
 130         fill(8,         EASTASIAN,              GEANY_ENCODING_ISO_2022_JP,             "ISO-2022-JP",          _("Japanese"));
 131         fill(9,         EASTASIAN,              GEANY_ENCODING_SHIFT_JIS,               "SHIFT_JIS",            _("Japanese"));
 132         fill(10,        EASTASIAN,              GEANY_ENCODING_CP_932,                  "CP932",                        _("Japanese"));
 133         fill(11,        EASTASIAN,              GEANY_ENCODING_EUC_KR,                  "EUC-KR",                       _("Korean"));
 134         fill(12,        EASTASIAN,              GEANY_ENCODING_ISO_2022_KR,             "ISO-2022-KR",          _("Korean"));
 135         fill(13,        EASTASIAN,              GEANY_ENCODING_JOHAB,                   "JOHAB",                        _("Korean"));
 136         fill(14,        EASTASIAN,              GEANY_ENCODING_UHC,                             "UHC",                          _("Korean"));
 137
 138         fill(0,         NONE,                   GEANY_ENCODING_NONE,                    "None",                         _("Without encoding"));
 139 }
 140
 141
 142 /* compares two encoding names in a permissive fashion.
 143  * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
 144 static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
 145 {
 146         gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */
 147         gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */
 148
 149         while (*a && *b)
 150         {
 151                 gboolean is_alpha;
 152
 153                 if (g_ascii_toupper(*a) == g_ascii_toupper(*b) &&
 154                         ((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a)))
 155                 {
 156                         /* either there was a real separator, or we need a implicit one (a chage from alpha to
 157                          * numeric or so) */
 158                         if (! need_sep || (was_alpha != is_alpha))
 159                         {
 160                                 a++;
 161                                 b++;
 162                                 was_alpha = is_alpha;
 163                                 need_sep = FALSE;
 164                         }
 165                         else
 166                                 return FALSE;
 167                 }
 168                 else
 169                 {
 170                         guint n_sep = 0;
 171
 172                         if (! g_ascii_isalnum(*a))
 173                         {
 174                                 a++;
 175                                 n_sep++;
 176                         }
 177                         if (! g_ascii_isalnum(*b))
 178                         {
 179                                 b++;
 180                                 n_sep++;
 181                         }
 182                         if (n_sep < 1)
 183                                 return FALSE;
 184                         else if (n_sep < 2)
 185                                 need_sep = TRUE;
 186                 }
 187         }
 188         return *a == *b;
 189 }
 190
 191
 192 GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
 193 {
 194         gint i;
 195
 196         if (charset == NULL)
 197                 return GEANY_ENCODING_UTF_8;
 198
 199         i = 0;
 200         while (i < GEANY_ENCODINGS_MAX)
 201         {
 202                 if (encodings_charset_equals(charset, encodings[i].charset))
 203                         return i;
 204
 205                 ++i;
 206         }
 207         return GEANY_ENCODING_UTF_8;
 208 }
 209
 210
 211 const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
 212 {
 213         gint i;
 214
 215         if (charset == NULL)
 216                 return &encodings[GEANY_ENCODING_UTF_8];
 217
 218         i = 0;
 219         while (i < GEANY_ENCODINGS_MAX)
 220         {
 221                 if (encodings_charset_equals(charset, encodings[i].charset))
 222                         return &encodings[i];
 223
 224                 ++i;
 225         }
 226
 227         return NULL;
 228 }
 229
 230
 231 static const gchar *encodings_normalize_charset(const gchar *charset)
 232 {
 233         const GeanyEncoding *encoding;
 234
 235         encoding = encodings_get_from_charset(charset);
 236         if (encoding != NULL)
 237                 return encoding->charset;
 238
 239         return NULL;
 240 }
 241
 242
 243 const GeanyEncoding *encodings_get_from_index(gint idx)
 244 {
 245         g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
 246
 247         return &encodings[idx];
 248 }
 249
 250
 251 /**
 252  *  Gets the character set name of the specified index e.g. for use with
 253  *  @ref document_set_encoding().
 254  *
 255  *  @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
 256  *
 257  *
 258  *  @return The charset according to idx, or @c NULL if the index is invalid.
 259  *
 260  *  @since 0.13
 261  **/
 262 const gchar* encodings_get_charset_from_index(gint idx)
 263 {
 264         g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
 265
 266         return encodings[idx].charset;
 267 }
 268
 269
 270 gchar *encodings_to_string(const GeanyEncoding* enc)
 271 {
 272         g_return_val_if_fail(enc != NULL, NULL);
 273         g_return_val_if_fail(enc->name != NULL, NULL);
 274         g_return_val_if_fail(enc->charset != NULL, NULL);
 275
 276         return g_strdup_printf("%s (%s)", enc->name, enc->charset);
 277 }
 278
 279
 280 const gchar *encodings_get_charset(const GeanyEncoding* enc)
 281 {
 282         g_return_val_if_fail(enc != NULL, NULL);
 283         g_return_val_if_fail(enc->charset != NULL, NULL);
 284
 285         return enc->charset;
 286 }
 287
 288
 289 static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];
 290
 291
 292 void encodings_select_radio_item(const gchar *charset)
 293 {
 294         gint i;
 295
 296         g_return_if_fail(charset != NULL);
 297
 298         i = 0;
 299         while (i < GEANY_ENCODINGS_MAX)
 300         {
 301                 if (utils_str_equal(charset, encodings[i].charset))
 302                         break;
 303                 i++;
 304         }
 305         if (i == GEANY_ENCODINGS_MAX)
 306                 i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
 307
 308         /* ignore_callback has to be set by the caller */
 309         gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
 310 }
 311
 312
 313 /* Regexp detection of file encoding declared in the file itself.
 314  * Idea and parts of code taken from Bluefish, thanks.
 315  * regex_compile() is used to compile regular expressions on program init and keep it in memory
 316  * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
 317  */
 318 static GRegex *regex_compile(const gchar *pattern)
 319 {
 320         GError *error = NULL;
 321         GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS, 0, &error);
 322
 323         if (!regex)
 324         {
 325                 geany_debug("Failed to compile encoding regex (%s)", error->message);
 326                 g_error_free(error);
 327         }
 328         return regex;
 329 }
 330
 331
 332 static gchar *regex_match(GRegex *preg, const gchar *buffer, gsize size)
 333 {
 334         gchar *encoding = NULL;
 335         GMatchInfo *minfo;
 336
 337         if (G_UNLIKELY(! pregs_loaded || buffer == NULL))
 338                 return NULL;
 339
 340         /* scan only the first 512 characters in the buffer */
 341         size = MIN(size, 512);
 342
 343         if (g_regex_match_full(preg, buffer, size, 0, 0, &minfo, NULL) &&
 344                 g_match_info_get_match_count(minfo) >= 2)
 345         {
 346                 encoding = g_match_info_fetch(minfo, 1);
 347                 geany_debug("Detected encoding by regex search: %s", encoding);
 348
 349                 SETPTR(encoding, g_utf8_strup(encoding, -1));
 350         }
 351         g_match_info_free(minfo);
 352         return encoding;
 353 }
 354
 355
 356 static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data)
 357 {
 358         GeanyDocument *doc = document_get_current();
 359         guint i = GPOINTER_TO_INT(user_data);
 360
 361         if (ignore_callback || doc == NULL || encodings[i].charset == NULL ||
 362                 ! gtk_check_menu_item_get_active(menuitem) ||
 363                 utils_str_equal(encodings[i].charset, doc->encoding))
 364                 return;
 365
 366         if (doc->readonly)
 367         {
 368                 utils_beep();
 369                 return;
 370         }
 371         document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding));
 372
 373         document_set_encoding(doc, encodings[i].charset);
 374 }
 375
 376
 377 void encodings_finalize(void)
 378 {
 379         if (pregs_loaded)
 380         {
 381                 guint i, len;
 382                 len = G_N_ELEMENTS(pregs);
 383                 for (i = 0; i < len; i++)
 384                 {
 385                         g_regex_unref(pregs[i]);
 386                 }
 387         }
 388 }
 389
 390
 391 void encodings_init(void)
 392 {
 393         GtkWidget *item, *menu[2], *submenu, *menu_westeuro, *menu_easteuro, *menu_eastasian, *menu_asian,
 394                           *menu_utf8, *menu_middleeast, *item_westeuro, *item_easteuro, *item_eastasian,
 395                           *item_asian, *item_utf8, *item_middleeast;
 396         GCallback cb_func[2];
 397         GSList *group = NULL;
 398         gchar *label;
 399         gint order, group_size;
 400         guint i, j, k;
 401
 402         init_encodings();
 403
 404         if (! pregs_loaded)
 405         {
 406                 pregs[0] = regex_compile(PATTERN_HTMLMETA);
 407                 pregs[1] = regex_compile(PATTERN_CODING);
 408                 pregs_loaded = TRUE;
 409         }
 410
 411         /* create encodings submenu in document menu */
 412         menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
 413         menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
 414         cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
 415         cb_func[1] = G_CALLBACK(on_reload_as_activate);
 416
 417         for (k = 0; k < 2; k++)
 418         {
 419                 menu_westeuro = gtk_menu_new();
 420                 item_westeuro = gtk_menu_item_new_with_mnemonic(_("_West European"));
 421                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_westeuro), menu_westeuro);
 422                 gtk_container_add(GTK_CONTAINER(menu[k]), item_westeuro);
 423                 gtk_widget_show_all(item_westeuro);
 424
 425                 menu_easteuro = gtk_menu_new();
 426                 item_easteuro = gtk_menu_item_new_with_mnemonic(_("_East European"));
 427                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_easteuro), menu_easteuro);
 428                 gtk_container_add(GTK_CONTAINER(menu[k]), item_easteuro);
 429                 gtk_widget_show_all(item_easteuro);
 430
 431                 menu_eastasian = gtk_menu_new();
 432                 item_eastasian = gtk_menu_item_new_with_mnemonic(_("East _Asian"));
 433                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_eastasian), menu_eastasian);
 434                 gtk_container_add(GTK_CONTAINER(menu[k]), item_eastasian);
 435                 gtk_widget_show_all(item_eastasian);
 436
 437                 menu_asian = gtk_menu_new();
 438                 item_asian = gtk_menu_item_new_with_mnemonic(_("_SE & SW Asian"));
 439                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_asian), menu_asian);
 440                 gtk_container_add(GTK_CONTAINER(menu[k]), item_asian);
 441                 gtk_widget_show_all(item_asian);
 442
 443                 menu_middleeast = gtk_menu_new();
 444                 item_middleeast = gtk_menu_item_new_with_mnemonic(_("_Middle Eastern"));
 445                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_middleeast), menu_middleeast);
 446                 gtk_container_add(GTK_CONTAINER(menu[k]), item_middleeast);
 447                 gtk_widget_show_all(item_middleeast);
 448
 449                 menu_utf8 = gtk_menu_new();
 450                 item_utf8 = gtk_menu_item_new_with_mnemonic(_("_Unicode"));
 451                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_utf8), menu_utf8);
 452                 gtk_container_add(GTK_CONTAINER(menu[k]), item_utf8);
 453                 gtk_widget_show_all(item_utf8);
 454
 455                 /** TODO can it be optimized? ATM 3782 runs at line "if (encodings[j].group ...)" */
 456                 for (i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
 457                 {
 458                         order = 0;
 459                         switch (i)
 460                         {
 461                                 case WESTEUROPEAN: submenu = menu_westeuro; group_size = 9; break;
 462                                 case EASTEUROPEAN: submenu = menu_easteuro; group_size = 14; break;
 463                                 case EASTASIAN: submenu = menu_eastasian; group_size = 14; break;
 464                                 case ASIAN: submenu = menu_asian; group_size = 9; break;
 465                                 case MIDDLEEASTERN: submenu = menu_middleeast; group_size = 7; break;
 466                                 case UNICODE: submenu = menu_utf8; group_size = 8; break;
 467                                 default: submenu = menu[k]; group_size = 1;
 468                         }
 469
 470                         while (order < group_size)      /* the biggest group has 13 elements */
 471                         {
 472                                 for (j = 0; j < GEANY_ENCODINGS_MAX; j++)
 473                                 {
 474                                         if (encodings[j].group == i && encodings[j].order == order)
 475                                         {
 476                                                 label = encodings_to_string(&encodings[j]);
 477                                                 if (k == 0)
 478                                                 {
 479                                                         item = gtk_radio_menu_item_new_with_label(group, label);
 480                                                         group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
 481                                                         radio_items[j] = item;
 482                                                 }
 483                                                 else
 484                                                         item = gtk_menu_item_new_with_label(label);
 485                                                 gtk_widget_show(item);
 486                                                 gtk_container_add(GTK_CONTAINER(submenu), item);
 487                                                 g_signal_connect(item, "activate",
 488                                                                                 cb_func[k], GINT_TO_POINTER(encodings[j].idx));
 489                                                 g_free(label);
 490                                                 break;
 491                                         }
 492                                 }
 493                                 order++;
 494                         }
 495                 }
 496         }
 497 }
 498
 499
 500 /**
 501  *  Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
 502  *  If @a fast is not set, additional checks to validate the converted string are performed.
 503  *
 504  *  @param buffer The input string to convert.
 505  *  @param size The length of the string, or -1 if the string is nul-terminated.
 506  *  @param charset The charset to be used for conversion.
 507  *  @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
 508  *
 509  *  @return If the conversion was successful, a newly allocated nul-terminated string,
 510  *    which must be freed with @c g_free(). Otherwise @c NULL.
 511  **/
 512 gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size,
 513                                                                                           const gchar *charset, gboolean fast)
 514 {
 515         gchar *utf8_content = NULL;
 516         GError *conv_error = NULL;
 517         gchar* converted_contents = NULL;
 518         gsize bytes_written;
 519
 520         g_return_val_if_fail(buffer != NULL, NULL);
 521         g_return_val_if_fail(charset != NULL, NULL);
 522
 523         converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
 524                                                                    &bytes_written, &conv_error);
 525
 526         if (fast)
 527         {
 528                 utf8_content = converted_contents;
 529                 if (conv_error != NULL) g_error_free(conv_error);
 530         }
 531         else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
 532         {
 533                 if (conv_error != NULL)
 534                 {
 535                         geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
 536                         g_error_free(conv_error);
 537                         conv_error = NULL;
 538                 }
 539                 else
 540                         geany_debug("Couldn't convert from %s to UTF-8.", charset);
 541
 542                 utf8_content = NULL;
 543                 g_free(converted_contents);
 544         }
 545         else
 546         {
 547                 geany_debug("Converted from %s to UTF-8.", charset);
 548                 utf8_content = converted_contents;
 549         }
 550
 551         return utf8_content;
 552 }
 553
 554
 555 static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
 556 {
 557         guint i;
 558
 559         for (i = 0; i < G_N_ELEMENTS(pregs); i++)
 560         {
 561                 gchar *charset;
 562
 563                 if ((charset = regex_match(pregs[i], buffer, size)) != NULL)
 564                         return charset;
 565         }
 566         return NULL;
 567 }
 568
 569
 570 static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gssize size,
 571                 const gchar *suggested_charset, gchar **used_encoding)
 572 {
 573         const gchar *locale_charset = NULL;
 574         const gchar *charset;
 575         gchar *utf8_content;
 576         gboolean check_suggestion = suggested_charset != NULL;
 577         gboolean check_locale = FALSE;
 578         gint i, preferred_charset;
 579
 580         if (size == -1)
 581         {
 582                 size = strlen(buffer);
 583         }
 584
 585         /* current locale is not UTF-8, we have to check this charset */
 586         check_locale = ! g_get_charset(&locale_charset);
 587
 588         /* First check for preferred charset, if specified */
 589         preferred_charset = file_prefs.default_open_encoding;
 590
 591         if (preferred_charset == encodings[GEANY_ENCODING_NONE].idx ||
 592                 preferred_charset < 0 ||
 593                 preferred_charset >= GEANY_ENCODINGS_MAX)
 594         {
 595                 preferred_charset = -1;
 596         }
 597
 598         /* -1 means "Preferred charset" */
 599         for (i = -1; i < GEANY_ENCODINGS_MAX; i++)
 600         {
 601                 if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
 602                         continue;
 603
 604                 if (check_suggestion)
 605                 {
 606                         check_suggestion = FALSE;
 607                         charset = encodings_normalize_charset(suggested_charset);
 608                         if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
 609                                 charset = suggested_charset;
 610                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 611                 }
 612                 else if (check_locale)
 613                 {
 614                         check_locale = FALSE;
 615                         charset = locale_charset;
 616                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 617                 }
 618                 else if (i == -1)
 619                 {
 620                         if (preferred_charset >= 0)
 621                         {
 622                                 charset = encodings[preferred_charset].charset;
 623                                 geany_debug("Using preferred charset: %s", charset);
 624                         }
 625                         else
 626                                 continue;
 627                 }
 628                 else if (i >= 0)
 629                         charset = encodings[i].charset;
 630                 else /* in this case we have i == -2, continue to increase i and go ahead */
 631                         continue;
 632
 633                 if (G_UNLIKELY(charset == NULL))
 634                         continue;
 635
 636                 geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
 637                         size, charset);
 638                 utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);
 639
 640                 if (G_LIKELY(utf8_content != NULL))
 641                 {
 642                         if (used_encoding != NULL)
 643                         {
 644                                 if (G_UNLIKELY(*used_encoding != NULL))
 645                                 {
 646                                         geany_debug("%s:%d", __FILE__, __LINE__);
 647                                         g_free(*used_encoding);
 648                                 }
 649                                 *used_encoding = g_strdup(charset);
 650                         }
 651                         return utf8_content;
 652                 }
 653         }
 654
 655         return NULL;
 656 }
 657
 658
 659 /**
 660  *  Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
 661  *  @a used_encoding.
 662  *
 663  *  @param buffer the input string to convert.
 664  *  @param size the length of the string, or -1 if the string is nul-terminated.
 665  *  @param used_encoding return location of the detected encoding of the input string, or @c NULL.
 666  *
 667  *  @return If the conversion was successful, a newly allocated nul-terminated string,
 668  *    which must be freed with @c g_free(). Otherwise @c NULL.
 669  **/
 670 gchar *encodings_convert_to_utf8(const gchar *buffer, gssize size, gchar **used_encoding)
 671 {
 672         gchar *regex_charset;
 673         gchar *utf8;
 674
 675         /* first try to read the encoding from the file content */
 676         regex_charset = encodings_check_regexes(buffer, size);
 677         utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding);
 678         g_free(regex_charset);
 679
 680         return utf8;
 681 }
 682
 683
 684 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
 685  * otherwise GEANY_ENCODING_NONE.
 686  * */
 687 GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
 688 {
 689         if (len >= 3)
 690         {
 691                 if (bom_len)
 692                         *bom_len = 3;
 693
 694                 if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
 695                         (guchar)string[2] == 0xbf)
 696                 {
 697                         return GEANY_ENCODING_UTF_8;
 698                 }
 699         }
 700         if (len >= 4)
 701         {
 702                 if (bom_len)
 703                         *bom_len = 4;
 704
 705                 if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
 706                                  (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
 707                 {
 708                         return GEANY_ENCODING_UTF_32BE; /* Big endian */
 709                 }
 710                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
 711                                  (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
 712                 {
 713                         return GEANY_ENCODING_UTF_32LE; /* Little endian */
 714                 }
 715                 if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
 716                                  (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
 717                 {
 718                          return GEANY_ENCODING_UTF_7;
 719                 }
 720         }
 721         if (len >= 2)
 722         {
 723                 if (bom_len)
 724                         *bom_len = 2;
 725
 726                 if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff)
 727                 {
 728                         return GEANY_ENCODING_UTF_16BE; /* Big endian */
 729                 }
 730                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
 731                 {
 732                         return GEANY_ENCODING_UTF_16LE; /* Little endian */
 733                 }
 734         }
 735         if (bom_len)
 736                 *bom_len = 0;
 737         return GEANY_ENCODING_NONE;
 738 }
 739
 740
 741 gboolean encodings_is_unicode_charset(const gchar *string)
 742 {
 743         if (string != NULL &&
 744                 (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
 745         {
 746                 return TRUE;
 747         }
 748         return FALSE;
 749 }
 750
 751
/* Working state used while converting one loaded buffer into UTF-8 */
typedef struct
{
	gchar		*data;	/* null-terminated data */
	gsize		 size;	/* actual data size */
	gsize		 len;	/* string length of data */
	gchar		*enc;	/* charset name of the detected or forced encoding (allocated with
						 * g_strdup()), NULL as long as it has not been determined */
	gboolean	 bom;	/* whether a BOM was found; handle_buffer() then strips it from data */
	gboolean	 partial;	/* set when len and size differ for data starting with a UTF-8 or
							 * UTF-7 BOM, i.e. the conversion may be partial */
} BufferData;
 761
 762
 763 /* convert data with the specified encoding */
 764 static gboolean
 765 handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
 766 {
 767         GeanyEncodingIndex enc_idx;
 768
 769         if (utils_str_equal(forced_enc, "UTF-8"))
 770         {
 771                 if (! g_utf8_validate(buffer->data, buffer->len, NULL))
 772                 {
 773                         return FALSE;
 774                 }
 775         }
 776         else
 777         {
 778                 gchar *converted_text = encodings_convert_to_utf8_from_charset(
 779                                                                                 buffer->data, buffer->size, forced_enc, FALSE);
 780                 if (converted_text == NULL)
 781                 {
 782                         return FALSE;
 783                 }
 784                 else
 785                 {
 786                         SETPTR(buffer->data, converted_text);
 787                         buffer->len = strlen(converted_text);
 788                 }
 789         }
 790         enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
 791         buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);
 792         buffer->enc = g_strdup(forced_enc);
 793         return TRUE;
 794 }
 795
 796
 797 /* detect encoding and convert to UTF-8 if necessary */
 798 static gboolean
 799 handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
 800 {
 801         g_return_val_if_fail(buffer->enc == NULL, FALSE);
 802         g_return_val_if_fail(buffer->bom == FALSE, FALSE);
 803
 804         if (buffer->size == 0)
 805         {
 806                 /* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty
 807                  * e.g. UTF32 file with a BOM(so size is 4, len is 0) */
 808                 buffer->enc = g_strdup("UTF-8");
 809         }
 810         else
 811         {
 812                 /* first check for a BOM */
 813                 if (enc_idx != GEANY_ENCODING_NONE)
 814                 {
 815                         buffer->enc = g_strdup(encodings[enc_idx].charset);
 816                         buffer->bom = TRUE;
 817
 818                         if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
 819                         {
 820                                 gchar *converted_text = encodings_convert_to_utf8_from_charset(
 821                                                                                 buffer->data, buffer->size, buffer->enc, FALSE);
 822                                 if (converted_text != NULL)
 823                                 {
 824                                         SETPTR(buffer->data, converted_text);
 825                                         buffer->len = strlen(converted_text);
 826                                 }
 827                                 else
 828                                 {
 829                                         /* there was a problem converting data from BOM encoding type */
 830                                         SETPTR(buffer->enc, NULL);
 831                                         buffer->bom = FALSE;
 832                                 }
 833                         }
 834                 }
 835
 836                 if (buffer->enc == NULL)        /* either there was no BOM or the BOM encoding failed */
 837                 {
 838                         /* first try to read the encoding from the file content */
 839                         gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
 840
 841                         /* try UTF-8 first */
 842                         if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
 843                                 (buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
 844                         {
 845                                 buffer->enc = g_strdup("UTF-8");
 846                         }
 847                         else
 848                         {
 849                                 /* detect the encoding */
 850                                 gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
 851                                         buffer->size, regex_charset, &buffer->enc);
 852
 853                                 if (converted_text == NULL)
 854                                 {
 855                                         g_free(regex_charset);
 856                                         return FALSE;
 857                                 }
 858                                 SETPTR(buffer->data, converted_text);
 859                                 buffer->len = strlen(converted_text);
 860                         }
 861                         g_free(regex_charset);
 862                 }
 863         }
 864         return TRUE;
 865 }
 866
 867
 868 static void
 869 handle_bom(BufferData *buffer)
 870 {
 871         guint bom_len;
 872
 873         encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
 874         g_return_if_fail(bom_len != 0);
 875
 876         /* use filedata->len here because the contents are already converted into UTF-8 */
 877         buffer->len -= bom_len;
 878         /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
 879         g_memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
 880         buffer->data = g_realloc(buffer->data, buffer->len + 1);
 881 }
 882
 883
 884 /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
 885 static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
 886 {
 887         GeanyEncodingIndex tmp_enc_idx;
 888
 889         /* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
 890          * if we have a BOM */
 891         tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
 892
 893         /* check whether the size of the loaded data is equal to the size of the file in the
 894          * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
 895          * file size of 0 bytes */
 896         if (buffer->len != buffer->size && buffer->size != 0 && (
 897                 tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
 898                 tmp_enc_idx == GEANY_ENCODING_UTF_7))  /* filter UTF-7/8 where no NULL bytes are allowed */
 899         {
 900                 buffer->partial = TRUE;
 901         }
 902
 903         /* Determine character encoding and convert to UTF-8 */
 904         if (forced_enc != NULL)
 905         {
 906                 /* the encoding should be ignored(requested by user), so open the file "as it is" */
 907                 if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
 908                 {
 909                         buffer->bom = FALSE;
 910                         buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
 911                 }
 912                 else if (! handle_forced_encoding(buffer, forced_enc))
 913                 {
 914                         return FALSE;
 915                 }
 916         }
 917         else if (! handle_encoding(buffer, tmp_enc_idx))
 918         {
 919                 return FALSE;
 920         }
 921
 922         if (buffer->bom)
 923                 handle_bom(buffer);
 924         return TRUE;
 925 }
 926
 927
 928 /*
 929  * Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
 930  * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
 931  *
 932  * @param buf a pointer to modifiable null-terminated buffer to convert.
 933  *   It may or may not be modified, and should be freed whatever happens.
 934  * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
 935  *   file size). It will be updated to the new size.
 936  * @param forced_enc forced encoding to use, or @c NULL
 937  * @param used_encoding return location for the actually used encoding, or @c NULL
 938  * @param has_bom return location to store whether the data had a BOM, or @c NULL
 939  * @param partial return location to store whether the conversion may be partial, or @c NULL
 940  *
 941  * @return @C TRUE if the conversion succeeded, @c FALSE otherwise.
 942  */
 943 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
 944                 gchar **used_encoding, gboolean *has_bom, gboolean *partial)
 945 {
 946         BufferData buffer;
 947
 948         buffer.data = *buf;
 949         buffer.size = *size;
 950         /* use strlen to check for null chars */
 951         buffer.len = strlen(buffer.data);
 952         buffer.enc = NULL;
 953         buffer.bom = FALSE;
 954         buffer.partial = FALSE;
 955
 956         if (! handle_buffer(&buffer, forced_enc))
 957                 return FALSE;
 958
 959         *size = buffer.len;
 960         if (used_encoding)
 961                 *used_encoding = buffer.enc;
 962         else
 963                 g_free(buffer.enc);
 964         if (has_bom)
 965                 *has_bom = buffer.bom;
 966         if (partial)
 967                 *partial = buffer.partial;
 968
 969         *buf = buffer.data;
 970         return TRUE;
 971 }