src/encodings.c

   1 /*
   2  *      encodings.c - this file is part of Geany, a fast and lightweight IDE
   3  *
   4  *      Copyright 2005-2012 Enrico Tröger <enrico(dot)troeger(at)uvena(dot)de>
   5  *      Copyright 2006-2012 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
   6  *
   7  *      This program is free software; you can redistribute it and/or modify
   8  *      it under the terms of the GNU General Public License as published by
   9  *      the Free Software Foundation; either version 2 of the License, or
  10  *      (at your option) any later version.
  11  *
  12  *      This program is distributed in the hope that it will be useful,
  13  *      but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *      GNU General Public License for more details.
  16  *
  17  *      You should have received a copy of the GNU General Public License along
  18  *      with this program; if not, write to the Free Software Foundation, Inc.,
  19  *      51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20  */
  21
  22 /*
  23  * Encoding conversion and Byte Order Mark (BOM) handling.
  24  */
  25
  26 /*
  27  * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
  28  * list of people on the gedit Team.
  29  * See the gedit ChangeLog files for a list of changes.
  30  */
  31  /* Stolen from anjuta */
  32
  33 #ifdef HAVE_CONFIG_H
  34 # include "config.h"
  35 #endif
  36
  37 #include "encodings.h"
  38 #include "encodingsprivate.h"
  39
  40 #include "app.h"
  41 #include "callbacks.h"
  42 #include "documentprivate.h"
  43 #include "support.h"
  44 #include "ui_utils.h"
  45 #include "utils.h"
  46
  47 #include <string.h>
  48
  49
/* Regular expression matching an HTML meta tag which declares the document charset, e.g.:
 * <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
 * The first capture group holds the charset name. */
#define PATTERN_HTMLMETA "<meta\\s+http-equiv\\s*=\\s*\"?content-type\"?\\s+content\\s*=\\s*\"text/x?html;\\s*charset=([a-z0-9_-]+)\"\\s*/?>"
/* Regular expression matching a "coding" declaration, e.g.
 * " geany_encoding=utf-8 " or " coding: utf-8 ".
 * The first capture group holds the charset name. */
#define PATTERN_CODING "coding[\t ]*[:=][\t ]*\"?([a-z0-9-]+)\"?[\t ]*"

/* precompiled regexps: encodings_init() stores the compiled PATTERN_HTMLMETA in pregs[0] and
 * the compiled PATTERN_CODING in pregs[1], encodings_finalize() releases them */
static GRegex *pregs[2];
/* set to TRUE by encodings_init() once pregs has been filled */
static gboolean pregs_loaded = FALSE;


/* table of all known encodings, filled by init_encodings();
 * each entry is stored at the position given by its own encoding index */
GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
  61
  62
  63 #define fill(Order, Group, Idx, Charset, Name) \
  64                 encodings[Idx].idx = Idx; \
  65                 encodings[Idx].order = Order; \
  66                 encodings[Idx].group = Group; \
  67                 encodings[Idx].charset = Charset; \
  68                 encodings[Idx].name = Name;
  69
  70 static void init_encodings(void)
  71 {
  72         fill(0,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_14,             "ISO-8859-14",          _("Celtic"));
  73         fill(1,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_7,              "ISO-8859-7",           _("Greek"));
  74         fill(2,         WESTEUROPEAN,   GEANY_ENCODING_WINDOWS_1253,    "WINDOWS-1253",         _("Greek"));
  75         fill(3,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_10,             "ISO-8859-10",          _("Nordic"));
  76         fill(4,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_3,              "ISO-8859-3",           _("South European"));
  77         fill(5,         WESTEUROPEAN,   GEANY_ENCODING_IBM_850,                 "IBM850",                       _("Western"));
  78         fill(6,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_1,              "ISO-8859-1",           _("Western"));
  79         fill(7,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_15,             "ISO-8859-15",          _("Western"));
  80         fill(8,         WESTEUROPEAN,   GEANY_ENCODING_WINDOWS_1252,    "WINDOWS-1252",         _("Western"));
  81
  82         fill(0,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_4,              "ISO-8859-4",           _("Baltic"));
  83         fill(1,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_13,             "ISO-8859-13",          _("Baltic"));
  84         fill(2,         EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1257,    "WINDOWS-1257",         _("Baltic"));
  85         fill(3,         EASTEUROPEAN,   GEANY_ENCODING_IBM_852,                 "IBM852",                       _("Central European"));
  86         fill(4,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_2,              "ISO-8859-2",           _("Central European"));
  87         fill(5,         EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1250,    "WINDOWS-1250",         _("Central European"));
  88         fill(6,         EASTEUROPEAN,   GEANY_ENCODING_IBM_855,                 "IBM855",                       _("Cyrillic"));
  89         fill(7,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_5,              "ISO-8859-5",           _("Cyrillic"));
  90         /* ISO-IR-111 not available on Windows */
  91         fill(8,         EASTEUROPEAN,   GEANY_ENCODING_ISO_IR_111,              "ISO-IR-111",           _("Cyrillic"));
  92         fill(9,         EASTEUROPEAN,   GEANY_ENCODING_KOI8_R,                  "KOI8-R",                       _("Cyrillic"));
  93         fill(10,        EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1251,    "WINDOWS-1251",         _("Cyrillic"));
  94         fill(11,        EASTEUROPEAN,   GEANY_ENCODING_CP_866,                  "CP866",                        _("Cyrillic/Russian"));
  95         fill(12,        EASTEUROPEAN,   GEANY_ENCODING_KOI8_U,                  "KOI8-U",                       _("Cyrillic/Ukrainian"));
  96         fill(13,        EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_16,             "ISO-8859-16",          _("Romanian"));
  97
  98         fill(0,         MIDDLEEASTERN,  GEANY_ENCODING_IBM_864,                 "IBM864",                       _("Arabic"));
  99         fill(1,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_6,              "ISO-8859-6",           _("Arabic"));
 100         fill(2,         MIDDLEEASTERN,  GEANY_ENCODING_WINDOWS_1256,    "WINDOWS-1256",         _("Arabic"));
 101         fill(3,         MIDDLEEASTERN,  GEANY_ENCODING_IBM_862,                 "IBM862",                       _("Hebrew"));
 102         /* not available at all, ? */
 103         fill(4,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_8_I,    "ISO-8859-8-I",         _("Hebrew"));
 104         fill(5,         MIDDLEEASTERN,  GEANY_ENCODING_WINDOWS_1255,    "WINDOWS-1255",         _("Hebrew"));
 105         fill(6,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_8,              "ISO-8859-8",           _("Hebrew Visual"));
 106
 107         fill(0,         ASIAN,                  GEANY_ENCODING_ARMSCII_8,               "ARMSCII-8",            _("Armenian"));
 108         fill(1,         ASIAN,                  GEANY_ENCODING_GEOSTD8,                 "GEORGIAN-ACADEMY",     _("Georgian"));
 109         fill(2,         ASIAN,                  GEANY_ENCODING_TIS_620,                 "TIS-620",                      _("Thai"));
 110         fill(3,         ASIAN,                  GEANY_ENCODING_IBM_857,                 "IBM857",                       _("Turkish"));
 111         fill(4,         ASIAN,                  GEANY_ENCODING_WINDOWS_1254,    "WINDOWS-1254",         _("Turkish"));
 112         fill(5,         ASIAN,                  GEANY_ENCODING_ISO_8859_9,              "ISO-8859-9",           _("Turkish"));
 113         fill(6,         ASIAN,                  GEANY_ENCODING_TCVN,                    "TCVN",                         _("Vietnamese"));
 114         fill(7,         ASIAN,                  GEANY_ENCODING_VISCII,                  "VISCII",                       _("Vietnamese"));
 115         fill(8,         ASIAN,                  GEANY_ENCODING_WINDOWS_1258,    "WINDOWS-1258",         _("Vietnamese"));
 116
 117         fill(0,         UNICODE,                GEANY_ENCODING_UTF_7,                   "UTF-7",                        _("Unicode"));
 118         fill(1,         UNICODE,                GEANY_ENCODING_UTF_8,                   "UTF-8",                        _("Unicode"));
 119         fill(2,         UNICODE,                GEANY_ENCODING_UTF_16LE,                "UTF-16LE",                     _("Unicode"));
 120         fill(3,         UNICODE,                GEANY_ENCODING_UTF_16BE,                "UTF-16BE",                     _("Unicode"));
 121         fill(4,         UNICODE,                GEANY_ENCODING_UCS_2LE,                 "UCS-2LE",                      _("Unicode"));
 122         fill(5,         UNICODE,                GEANY_ENCODING_UCS_2BE,                 "UCS-2BE",                      _("Unicode"));
 123         fill(6,         UNICODE,                GEANY_ENCODING_UTF_32LE,                "UTF-32LE",                     _("Unicode"));
 124         fill(7,         UNICODE,                GEANY_ENCODING_UTF_32BE,                "UTF-32BE",                     _("Unicode"));
 125
 126         fill(0,         EASTASIAN,              GEANY_ENCODING_GB18030,                 "GB18030",                      _("Chinese Simplified"));
 127         fill(1,         EASTASIAN,              GEANY_ENCODING_GB2312,                  "GB2312",                       _("Chinese Simplified"));
 128         fill(2,         EASTASIAN,              GEANY_ENCODING_GBK,                             "GBK",                          _("Chinese Simplified"));
 129         /* maybe not available on Linux */
 130         fill(3,         EASTASIAN,              GEANY_ENCODING_HZ,                              "HZ",                           _("Chinese Simplified"));
 131         fill(4,         EASTASIAN,              GEANY_ENCODING_BIG5,                    "BIG5",                         _("Chinese Traditional"));
 132         fill(5,         EASTASIAN,              GEANY_ENCODING_BIG5_HKSCS,              "BIG5-HKSCS",           _("Chinese Traditional"));
 133         fill(6,         EASTASIAN,              GEANY_ENCODING_EUC_TW,                  "EUC-TW",                       _("Chinese Traditional"));
 134         fill(7,         EASTASIAN,              GEANY_ENCODING_EUC_JP,                  "EUC-JP",                       _("Japanese"));
 135         fill(8,         EASTASIAN,              GEANY_ENCODING_ISO_2022_JP,             "ISO-2022-JP",          _("Japanese"));
 136         fill(9,         EASTASIAN,              GEANY_ENCODING_SHIFT_JIS,               "SHIFT_JIS",            _("Japanese"));
 137         fill(10,        EASTASIAN,              GEANY_ENCODING_CP_932,                  "CP932",                        _("Japanese"));
 138         fill(11,        EASTASIAN,              GEANY_ENCODING_EUC_KR,                  "EUC-KR",                       _("Korean"));
 139         fill(12,        EASTASIAN,              GEANY_ENCODING_ISO_2022_KR,             "ISO-2022-KR",          _("Korean"));
 140         fill(13,        EASTASIAN,              GEANY_ENCODING_JOHAB,                   "JOHAB",                        _("Korean"));
 141         fill(14,        EASTASIAN,              GEANY_ENCODING_UHC,                             "UHC",                          _("Korean"));
 142
 143         fill(0,         NONE,                   GEANY_ENCODING_NONE,                    "None",                         _("Without encoding"));
 144 }
 145
 146
 147 /* compares two encoding names in a permissive fashion.
 148  * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
 149 static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
 150 {
 151         gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */
 152         gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */
 153
 154         while (*a && *b)
 155         {
 156                 gboolean is_alpha;
 157
 158                 if (g_ascii_toupper(*a) == g_ascii_toupper(*b) &&
 159                         ((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a)))
 160                 {
 161                         /* either there was a real separator, or we need a implicit one (a chage from alpha to
 162                          * numeric or so) */
 163                         if (! need_sep || (was_alpha != is_alpha))
 164                         {
 165                                 a++;
 166                                 b++;
 167                                 was_alpha = is_alpha;
 168                                 need_sep = FALSE;
 169                         }
 170                         else
 171                                 return FALSE;
 172                 }
 173                 else
 174                 {
 175                         guint n_sep = 0;
 176
 177                         if (! g_ascii_isalnum(*a))
 178                         {
 179                                 a++;
 180                                 n_sep++;
 181                         }
 182                         if (! g_ascii_isalnum(*b))
 183                         {
 184                                 b++;
 185                                 n_sep++;
 186                         }
 187                         if (n_sep < 1)
 188                                 return FALSE;
 189                         else if (n_sep < 2)
 190                                 need_sep = TRUE;
 191                 }
 192         }
 193         return *a == *b;
 194 }
 195
 196
 197 GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
 198 {
 199         gint i;
 200
 201         if (charset == NULL)
 202                 return GEANY_ENCODING_UTF_8;
 203
 204         i = 0;
 205         while (i < GEANY_ENCODINGS_MAX)
 206         {
 207                 if (encodings_charset_equals(charset, encodings[i].charset))
 208                         return i;
 209
 210                 ++i;
 211         }
 212         return GEANY_ENCODING_UTF_8;
 213 }
 214
 215
 216 const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
 217 {
 218         gint i;
 219
 220         if (charset == NULL)
 221                 return &encodings[GEANY_ENCODING_UTF_8];
 222
 223         i = 0;
 224         while (i < GEANY_ENCODINGS_MAX)
 225         {
 226                 if (encodings_charset_equals(charset, encodings[i].charset))
 227                         return &encodings[i];
 228
 229                 ++i;
 230         }
 231
 232         return NULL;
 233 }
 234
 235
 236 static const gchar *encodings_normalize_charset(const gchar *charset)
 237 {
 238         const GeanyEncoding *encoding;
 239
 240         encoding = encodings_get_from_charset(charset);
 241         if (encoding != NULL)
 242                 return encoding->charset;
 243
 244         return NULL;
 245 }
 246
 247
/* Returns a pointer to the encodings table entry at position idx.
 * If idx is outside of the table (negative or >= GEANY_ENCODINGS_MAX), the precondition
 * check fails and NULL is returned. */
const GeanyEncoding *encodings_get_from_index(gint idx)
{
        g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);

        return &encodings[idx];
}
 254
 255
/**
 *  Gets the character set name of the specified index e.g. for use with
 *  @ref document_set_encoding().
 *
 *  @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
 *
 *  @return @nullable The charset according to idx, or @c NULL if the index is invalid.
 *    The string is stored in the encodings table and must not be freed.
 *
 *  @since 0.13
 **/
GEANY_API_SYMBOL
const gchar* encodings_get_charset_from_index(gint idx)
{
        /* rejects indexes outside of the encodings table */
        g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);

        return encodings[idx].charset;
}
 274
 275
 276 gchar *encodings_to_string(const GeanyEncoding* enc)
 277 {
 278         g_return_val_if_fail(enc != NULL, NULL);
 279         g_return_val_if_fail(enc->name != NULL, NULL);
 280         g_return_val_if_fail(enc->charset != NULL, NULL);
 281
 282         return g_strdup_printf("%s (%s)", enc->name, enc->charset);
 283 }
 284
 285
/* Returns the charset name of enc.
 * If enc or its charset is NULL, the respective precondition check fails and NULL is
 * returned. The returned pointer is the one stored in enc, no copy is made. */
const gchar *encodings_get_charset(const GeanyEncoding* enc)
{
        g_return_val_if_fail(enc != NULL, NULL);
        g_return_val_if_fail(enc->charset != NULL, NULL);

        return enc->charset;
}
 293
 294
/* radio menu items of the "Set Encoding" menu, created in encodings_init();
 * the item at position i belongs to encodings[i] */
static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];
 296
 297
 298 void encodings_select_radio_item(const gchar *charset)
 299 {
 300         gint i;
 301
 302         g_return_if_fail(charset != NULL);
 303
 304         i = 0;
 305         while (i < GEANY_ENCODINGS_MAX)
 306         {
 307                 if (utils_str_equal(charset, encodings[i].charset))
 308                         break;
 309                 i++;
 310         }
 311         if (i == GEANY_ENCODINGS_MAX)
 312                 i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
 313
 314         /* ignore_callback has to be set by the caller */
 315         gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
 316 }
 317
 318
 319 /* Regexp detection of file encoding declared in the file itself.
 320  * Idea and parts of code taken from Bluefish, thanks.
 321  * regex_compile() is used to compile regular expressions on program init and keep it in memory
 322  * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
 323  */
 324 static GRegex *regex_compile(const gchar *pattern)
 325 {
 326         GError *error = NULL;
 327         GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS, 0, &error);
 328
 329         if (!regex)
 330         {
 331                 geany_debug("Failed to compile encoding regex (%s)", error->message);
 332                 g_error_free(error);
 333         }
 334         return regex;
 335 }
 336
 337
 338 static gchar *regex_match(GRegex *preg, const gchar *buffer, gsize size)
 339 {
 340         gchar *encoding = NULL;
 341         GMatchInfo *minfo;
 342
 343         if (G_UNLIKELY(! pregs_loaded || buffer == NULL))
 344                 return NULL;
 345
 346         /* scan only the first 512 characters in the buffer */
 347         size = MIN(size, 512);
 348
 349         if (g_regex_match_full(preg, buffer, size, 0, 0, &minfo, NULL) &&
 350                 g_match_info_get_match_count(minfo) >= 2)
 351         {
 352                 encoding = g_match_info_fetch(minfo, 1);
 353                 geany_debug("Detected encoding by regex search: %s", encoding);
 354
 355                 SETPTR(encoding, g_utf8_strup(encoding, -1));
 356         }
 357         g_match_info_free(minfo);
 358         return encoding;
 359 }
 360
 361
 362 static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data)
 363 {
 364         GeanyDocument *doc = document_get_current();
 365         const gchar *charset = user_data;
 366
 367         if (ignore_callback || doc == NULL || charset == NULL ||
 368                 ! gtk_check_menu_item_get_active(menuitem) ||
 369                 utils_str_equal(charset, doc->encoding))
 370                 return;
 371
 372         if (doc->readonly)
 373         {
 374                 utils_beep();
 375                 return;
 376         }
 377         document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding));
 378
 379         document_set_encoding(doc, charset);
 380 }
 381
 382 static void encodings_reload_radio_item_change_cb(GtkMenuItem *menuitem, gpointer user_data)
 383 {
 384         GeanyDocument *doc = document_get_current();
 385
 386         g_return_if_fail(doc != NULL);
 387
 388         document_reload_prompt(doc, user_data);
 389 }
 390
 391
 392 void encodings_finalize(void)
 393 {
 394         if (pregs_loaded)
 395         {
 396                 guint i, len;
 397                 len = G_N_ELEMENTS(pregs);
 398                 for (i = 0; i < len; i++)
 399                 {
 400                         g_regex_unref(pregs[i]);
 401                 }
 402         }
 403 }
 404
 405
 406 void encodings_init(void)
 407 {
 408         GtkWidget *menu[2];
 409         GCallback cb_func[2];
 410         gint group_sizes[GEANY_ENCODING_GROUPS_MAX] = { 0 };
 411         const gchar *const groups[GEANY_ENCODING_GROUPS_MAX] =
 412         {
 413                 [NONE]                  = NULL,
 414                 [WESTEUROPEAN]  = N_("_West European"),
 415                 [EASTEUROPEAN]  = N_("_East European"),
 416                 [EASTASIAN]             = N_("East _Asian"),
 417                 [ASIAN]                 = N_("_SE & SW Asian"),
 418                 [MIDDLEEASTERN] = N_("_Middle Eastern"),
 419                 [UNICODE]               = N_("_Unicode"),
 420         };
 421
 422         init_encodings();
 423
 424         if (! pregs_loaded)
 425         {
 426                 pregs[0] = regex_compile(PATTERN_HTMLMETA);
 427                 pregs[1] = regex_compile(PATTERN_CODING);
 428                 pregs_loaded = TRUE;
 429         }
 430
 431         /* create encodings submenu in document menu */
 432         menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
 433         menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
 434         cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
 435         cb_func[1] = G_CALLBACK(encodings_reload_radio_item_change_cb);
 436
 437         for (guint i = 0; i < G_N_ELEMENTS(encodings); i++)
 438                 group_sizes[encodings[i].group]++;
 439
 440         for (guint k = 0; k < 2; k++)
 441         {
 442                 GSList *group = NULL;
 443                 GtkWidget *submenus[GEANY_ENCODING_GROUPS_MAX];
 444                 gint orders[GEANY_ENCODING_GROUPS_MAX] = { 0 };
 445                 guint n_added = 0;
 446
 447                 for (guint i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
 448                 {
 449                         if (! groups[i]) /* NONE */
 450                                 submenus[i] = menu[k];
 451                         else
 452                         {
 453                                 GtkWidget *item = gtk_menu_item_new_with_mnemonic(_(groups[i]));
 454                                 submenus[i] = gtk_menu_new();
 455                                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item), submenus[i]);
 456                                 gtk_container_add(GTK_CONTAINER(menu[k]), item);
 457                                 gtk_widget_show_all(item);
 458                         }
 459                 }
 460
 461                 /** TODO can it be optimized? ATM 882 runs at line "if (encodings[i].order ...)" */
 462                 do
 463                 {
 464                         for (guint i = 0; i < G_N_ELEMENTS(encodings); i++)
 465                         {
 466                                 if (encodings[i].order == orders[encodings[i].group])
 467                                 {
 468                                         GtkWidget *item;
 469                                         gchar *label = encodings_to_string(&encodings[i]);
 470
 471                                         if (k == 0) /* Set Encoding menu */
 472                                         {
 473                                                 item = gtk_radio_menu_item_new_with_label(group, label);
 474                                                 group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
 475                                                 radio_items[i] = item;
 476                                         }
 477                                         else
 478                                                 item = gtk_menu_item_new_with_label(label);
 479                                         gtk_widget_show(item);
 480                                         gtk_container_add(GTK_CONTAINER(submenus[encodings[i].group]), item);
 481                                         g_signal_connect(item, "activate", cb_func[k],
 482                                                         (gpointer) encodings[i].charset);
 483                                         g_free(label);
 484
 485                                         orders[encodings[i].group]++;
 486                                         n_added++;
 487                                 }
 488                         }
 489                 }
 490                 while (n_added < G_N_ELEMENTS(encodings));
 491         }
 492 }
 493
 494
 495 static gint encoding_combo_store_sort_func(GtkTreeModel *model,
 496                                                                                    GtkTreeIter *a,
 497                                                                                    GtkTreeIter *b,
 498                                                                                    gpointer data)
 499 {
 500         gboolean a_has_child = gtk_tree_model_iter_has_child(model, a);
 501         gboolean b_has_child = gtk_tree_model_iter_has_child(model, b);
 502         gchar *a_string;
 503         gchar *b_string;
 504         gint cmp_res;
 505
 506         if (a_has_child != b_has_child)
 507                 return a_has_child ? -1 : 1;
 508
 509         gtk_tree_model_get(model, a, 1, &a_string, -1);
 510         gtk_tree_model_get(model, b, 1, &b_string, -1);
 511         cmp_res = strcmp(a_string, b_string);
 512         g_free(a_string);
 513         g_free(b_string);
 514         return cmp_res;
 515 }
 516
 517
 518 GtkTreeStore *encodings_encoding_store_new(gboolean has_detect)
 519 {
 520         GtkTreeStore *store;
 521         GtkTreeIter iter_current, iter_westeuro, iter_easteuro, iter_eastasian,
 522                                 iter_asian, iter_utf8, iter_middleeast;
 523         GtkTreeIter *iter_parent;
 524         gint i;
 525
 526         store = gtk_tree_store_new(2, G_TYPE_INT, G_TYPE_STRING);
 527
 528         if (has_detect)
 529         {
 530                 gtk_tree_store_append(store, &iter_current, NULL);
 531                 gtk_tree_store_set(store, &iter_current, 0, GEANY_ENCODINGS_MAX, 1, _("Detect from file"), -1);
 532         }
 533
 534         gtk_tree_store_append(store, &iter_westeuro, NULL);
 535         gtk_tree_store_set(store, &iter_westeuro, 0, -1, 1, _("West European"), -1);
 536         gtk_tree_store_append(store, &iter_easteuro, NULL);
 537         gtk_tree_store_set(store, &iter_easteuro, 0, -1, 1, _("East European"), -1);
 538         gtk_tree_store_append(store, &iter_eastasian, NULL);
 539         gtk_tree_store_set(store, &iter_eastasian, 0, -1, 1, _("East Asian"), -1);
 540         gtk_tree_store_append(store, &iter_asian, NULL);
 541         gtk_tree_store_set(store, &iter_asian, 0, -1, 1, _("SE & SW Asian"), -1);
 542         gtk_tree_store_append(store, &iter_middleeast, NULL);
 543         gtk_tree_store_set(store, &iter_middleeast, 0, -1, 1, _("Middle Eastern"), -1);
 544         gtk_tree_store_append(store, &iter_utf8, NULL);
 545         gtk_tree_store_set(store, &iter_utf8, 0, -1, 1, _("Unicode"), -1);
 546
 547         for (i = 0; i < GEANY_ENCODINGS_MAX; i++)
 548         {
 549                 gchar *encoding_string;
 550
 551                 switch (encodings[i].group)
 552                 {
 553                         case WESTEUROPEAN: iter_parent = &iter_westeuro; break;
 554                         case EASTEUROPEAN: iter_parent = &iter_easteuro; break;
 555                         case EASTASIAN: iter_parent = &iter_eastasian; break;
 556                         case ASIAN: iter_parent = &iter_asian; break;
 557                         case MIDDLEEASTERN: iter_parent = &iter_middleeast; break;
 558                         case UNICODE: iter_parent = &iter_utf8; break;
 559                         case NONE:
 560                         default: iter_parent = NULL;
 561                 }
 562                 gtk_tree_store_append(store, &iter_current, iter_parent);
 563                 encoding_string = encodings_to_string(&encodings[i]);
 564                 gtk_tree_store_set(store, &iter_current, 0, i, 1, encoding_string, -1);
 565                 g_free(encoding_string);
 566         }
 567
 568         gtk_tree_sortable_set_sort_column_id(GTK_TREE_SORTABLE(store), 1, GTK_SORT_ASCENDING);
 569         gtk_tree_sortable_set_sort_func(GTK_TREE_SORTABLE(store), 1, encoding_combo_store_sort_func, NULL, NULL);
 570
 571         return store;
 572 }
 573
 574
 575 gint encodings_encoding_store_get_encoding(GtkTreeStore *store, GtkTreeIter *iter)
 576 {
 577         gint enc;
 578         gtk_tree_model_get(GTK_TREE_MODEL(store), iter, 0, &enc, -1);
 579         return enc;
 580 }
 581
 582
 583 gboolean encodings_encoding_store_get_iter(GtkTreeStore *store, GtkTreeIter *iter, gint enc)
 584 {
 585         if (gtk_tree_model_get_iter_first(GTK_TREE_MODEL(store), iter))
 586         {
 587                 do
 588                 {
 589                         if (encodings_encoding_store_get_encoding(store, iter) == enc)
 590                                 return TRUE;
 591                 }
 592                 while (ui_tree_model_iter_any_next(GTK_TREE_MODEL(store), iter, TRUE));
 593         }
 594         return FALSE;
 595 }
 596
 597
 598 void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout,
 599                                                                                          GtkCellRenderer *cell,
 600                                                                                          GtkTreeModel *tree_model,
 601                                                                                          GtkTreeIter *iter,
 602                                                                                          gpointer data)
 603 {
 604         gboolean sensitive = !gtk_tree_model_iter_has_child(tree_model, iter);
 605         gchar *text;
 606
 607         gtk_tree_model_get(tree_model, iter, 1, &text, -1);
 608         g_object_set(cell, "sensitive", sensitive, "text", text, NULL);
 609         g_free(text);
 610 }
 611
 612
 613 /**
 614  *  Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
 615  *  If @a fast is not set, additional checks to validate the converted string are performed.
 616  *
 617  *  @param buffer The input string to convert.
 618  *  @param size The length of the string, or -1 if the string is nul-terminated.
 619  *  @param charset The charset to be used for conversion.
 620  *  @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
 621  *
 622  *  @return If the conversion was successful, a newly allocated nul-terminated string,
 623  *    which must be freed with @c g_free(). Otherwise @c NULL.
 624  **/
 625 GEANY_API_SYMBOL
 626 gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size,
 627                                                                                           const gchar *charset, gboolean fast)
 628 {
 629         gchar *utf8_content = NULL;
 630         GError *conv_error = NULL;
 631         gchar* converted_contents = NULL;
 632         gsize bytes_written;
 633
 634         g_return_val_if_fail(buffer != NULL, NULL);
 635         g_return_val_if_fail(charset != NULL, NULL);
 636
 637         converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
 638                                                                    &bytes_written, &conv_error);
 639
 640         if (fast)
 641         {
 642                 utf8_content = converted_contents;
 643                 if (conv_error != NULL) g_error_free(conv_error);
 644         }
 645         else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
 646         {
 647                 if (conv_error != NULL)
 648                 {
 649                         geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
 650                         g_error_free(conv_error);
 651                         conv_error = NULL;
 652                 }
 653                 else
 654                         geany_debug("Couldn't convert from %s to UTF-8.", charset);
 655
 656                 utf8_content = NULL;
 657                 g_free(converted_contents);
 658         }
 659         else
 660         {
 661                 geany_debug("Converted from %s to UTF-8.", charset);
 662                 utf8_content = converted_contents;
 663         }
 664
 665         return utf8_content;
 666 }
 667
 668
 669 static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
 670 {
 671         guint i;
 672
 673         for (i = 0; i < G_N_ELEMENTS(pregs); i++)
 674         {
 675                 gchar *charset;
 676
 677                 if ((charset = regex_match(pregs[i], buffer, size)) != NULL)
 678                         return charset;
 679         }
 680         return NULL;
 681 }
 682
 683
 684 static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gssize size,
 685                 const gchar *suggested_charset, gchar **used_encoding)
 686 {
 687         const gchar *locale_charset = NULL;
 688         const gchar *charset;
 689         gchar *utf8_content;
 690         gboolean check_suggestion = suggested_charset != NULL;
 691         gboolean check_locale = FALSE;
 692         gint i, preferred_charset;
 693
 694         if (size == -1)
 695         {
 696                 size = strlen(buffer);
 697         }
 698
 699         /* current locale is not UTF-8, we have to check this charset */
 700         check_locale = ! g_get_charset(&locale_charset);
 701
 702         /* First check for preferred charset, if specified */
 703         preferred_charset = file_prefs.default_open_encoding;
 704
 705         if (preferred_charset == encodings[GEANY_ENCODING_NONE].idx ||
 706                 preferred_charset < 0 ||
 707                 preferred_charset >= GEANY_ENCODINGS_MAX)
 708         {
 709                 preferred_charset = -1;
 710         }
 711
 712         /* -1 means "Preferred charset" */
 713         for (i = -1; i < GEANY_ENCODINGS_MAX; i++)
 714         {
 715                 if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
 716                         continue;
 717
 718                 if (check_suggestion)
 719                 {
 720                         check_suggestion = FALSE;
 721                         charset = encodings_normalize_charset(suggested_charset);
 722                         if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
 723                                 charset = suggested_charset;
 724                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 725                 }
 726                 else if (check_locale)
 727                 {
 728                         check_locale = FALSE;
 729                         charset = locale_charset;
 730                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 731                 }
 732                 else if (i == -1)
 733                 {
 734                         if (preferred_charset >= 0)
 735                         {
 736                                 charset = encodings[preferred_charset].charset;
 737                                 geany_debug("Using preferred charset: %s", charset);
 738                         }
 739                         else
 740                                 continue;
 741                 }
 742                 else if (i >= 0)
 743                         charset = encodings[i].charset;
 744                 else /* in this case we have i == -2, continue to increase i and go ahead */
 745                         continue;
 746
 747                 if (G_UNLIKELY(charset == NULL))
 748                         continue;
 749
 750                 geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
 751                         size, charset);
 752                 utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);
 753
 754                 if (G_LIKELY(utf8_content != NULL))
 755                 {
 756                         if (used_encoding != NULL)
 757                         {
 758                                 if (G_UNLIKELY(*used_encoding != NULL))
 759                                 {
 760                                         geany_debug("%s:%d", __FILE__, __LINE__);
 761                                         g_free(*used_encoding);
 762                                 }
 763                                 *used_encoding = g_strdup(charset);
 764                         }
 765                         return utf8_content;
 766                 }
 767         }
 768
 769         return NULL;
 770 }
 771
 772
 773 /**
 774  *  Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
 775  *  @a used_encoding.
 776  *
 777  *  @param buffer the input string to convert.
 778  *  @param size the length of the string, or -1 if the string is nul-terminated.
 779  *  @param used_encoding @out @optional return location of the detected encoding of the input string, or @c NULL.
 780  *
 781  *  @return @nullable If the conversion was successful, a newly allocated nul-terminated string,
 782  *    which must be freed with @c g_free(). Otherwise @c NULL.
 783  **/
 784 GEANY_API_SYMBOL
 785 gchar *encodings_convert_to_utf8(const gchar *buffer, gssize size, gchar **used_encoding)
 786 {
 787         gchar *regex_charset;
 788         gchar *utf8;
 789
 790         /* first try to read the encoding from the file content */
 791         regex_charset = encodings_check_regexes(buffer, size);
 792         utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding);
 793         g_free(regex_charset);
 794
 795         return utf8;
 796 }
 797
 798
 799 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
 800  * otherwise GEANY_ENCODING_NONE.
 801  * */
 802 GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
 803 {
 804         if (len >= 3)
 805         {
 806                 if (bom_len)
 807                         *bom_len = 3;
 808
 809                 if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
 810                         (guchar)string[2] == 0xbf)
 811                 {
 812                         return GEANY_ENCODING_UTF_8;
 813                 }
 814         }
 815         if (len >= 4)
 816         {
 817                 if (bom_len)
 818                         *bom_len = 4;
 819
 820                 if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
 821                                  (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
 822                 {
 823                         return GEANY_ENCODING_UTF_32BE; /* Big endian */
 824                 }
 825                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
 826                                  (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
 827                 {
 828                         return GEANY_ENCODING_UTF_32LE; /* Little endian */
 829                 }
 830                 if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
 831                                  (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
 832                 {
 833                          return GEANY_ENCODING_UTF_7;
 834                 }
 835         }
 836         if (len >= 2)
 837         {
 838                 if (bom_len)
 839                         *bom_len = 2;
 840
 841                 if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff)
 842                 {
 843                         return GEANY_ENCODING_UTF_16BE; /* Big endian */
 844                 }
 845                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
 846                 {
 847                         return GEANY_ENCODING_UTF_16LE; /* Little endian */
 848                 }
 849         }
 850         if (bom_len)
 851                 *bom_len = 0;
 852         return GEANY_ENCODING_NONE;
 853 }
 854
 855
 856 gboolean encodings_is_unicode_charset(const gchar *string)
 857 {
 858         if (string != NULL &&
 859                 (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
 860         {
 861                 return TRUE;
 862         }
 863         return FALSE;
 864 }
 865
 866
/* Working state for converting one text buffer to UTF-8; filled in by
 * encodings_convert_to_utf8_auto() and updated by the handle_*() helpers. */
typedef struct
{
	gchar		*data;	/* null-terminated data */
	gsize		 size;	/* actual data size */
	gsize		 len;	/* string length of data */
	gchar		*enc;	/* name of the detected or forced encoding, allocated with g_strdup(); NULL until determined */
	gboolean	 bom;	/* TRUE if the data is considered to start with a BOM; handle_buffer() then strips it */
	gboolean	 partial;	/* set by handle_buffer() when len != size for data starting with a UTF-8 or UTF-7 BOM */
} BufferData;
 876
 877
 878 /* convert data with the specified encoding */
 879 static gboolean
 880 handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
 881 {
 882         GeanyEncodingIndex enc_idx;
 883
 884         if (utils_str_equal(forced_enc, "UTF-8"))
 885         {
 886                 if (! g_utf8_validate(buffer->data, buffer->len, NULL))
 887                 {
 888                         return FALSE;
 889                 }
 890         }
 891         else
 892         {
 893                 gchar *converted_text = encodings_convert_to_utf8_from_charset(
 894                                                                                 buffer->data, buffer->size, forced_enc, FALSE);
 895                 if (converted_text == NULL)
 896                 {
 897                         return FALSE;
 898                 }
 899                 else
 900                 {
 901                         SETPTR(buffer->data, converted_text);
 902                         buffer->len = strlen(converted_text);
 903                 }
 904         }
 905         enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
 906         buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);
 907         buffer->enc = g_strdup(forced_enc);
 908         return TRUE;
 909 }
 910
 911
 912 /* detect encoding and convert to UTF-8 if necessary */
 913 static gboolean
 914 handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
 915 {
 916         g_return_val_if_fail(buffer->enc == NULL, FALSE);
 917         g_return_val_if_fail(buffer->bom == FALSE, FALSE);
 918
 919         if (buffer->size == 0)
 920         {
 921                 /* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty
 922                  * e.g. UTF32 file with a BOM(so size is 4, len is 0) */
 923                 buffer->enc = g_strdup("UTF-8");
 924         }
 925         else
 926         {
 927                 /* first check for a BOM */
 928                 if (enc_idx != GEANY_ENCODING_NONE)
 929                 {
 930                         buffer->enc = g_strdup(encodings[enc_idx].charset);
 931                         buffer->bom = TRUE;
 932
 933                         if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
 934                         {
 935                                 gchar *converted_text = encodings_convert_to_utf8_from_charset(
 936                                                                                 buffer->data, buffer->size, buffer->enc, FALSE);
 937                                 if (converted_text != NULL)
 938                                 {
 939                                         SETPTR(buffer->data, converted_text);
 940                                         buffer->len = strlen(converted_text);
 941                                 }
 942                                 else
 943                                 {
 944                                         /* there was a problem converting data from BOM encoding type */
 945                                         SETPTR(buffer->enc, NULL);
 946                                         buffer->bom = FALSE;
 947                                 }
 948                         }
 949                 }
 950
 951                 if (buffer->enc == NULL)        /* either there was no BOM or the BOM encoding failed */
 952                 {
 953                         /* first try to read the encoding from the file content */
 954                         gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
 955
 956                         /* try UTF-8 first */
 957                         if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
 958                                 (buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
 959                         {
 960                                 buffer->enc = g_strdup("UTF-8");
 961                         }
 962                         else
 963                         {
 964                                 /* detect the encoding */
 965                                 gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
 966                                         buffer->size, regex_charset, &buffer->enc);
 967
 968                                 if (converted_text == NULL)
 969                                 {
 970                                         g_free(regex_charset);
 971                                         return FALSE;
 972                                 }
 973                                 SETPTR(buffer->data, converted_text);
 974                                 buffer->len = strlen(converted_text);
 975                         }
 976                         g_free(regex_charset);
 977                 }
 978         }
 979         return TRUE;
 980 }
 981
 982
 983 static void
 984 handle_bom(BufferData *buffer)
 985 {
 986         guint bom_len;
 987
 988         encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
 989         g_return_if_fail(bom_len != 0);
 990
 991         /* use filedata->len here because the contents are already converted into UTF-8 */
 992         buffer->len -= bom_len;
 993         /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
 994         memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
 995         buffer->data = g_realloc(buffer->data, buffer->len + 1);
 996 }
 997
 998
 999 /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
1000 static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
1001 {
1002         GeanyEncodingIndex tmp_enc_idx;
1003
1004         /* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
1005          * if we have a BOM */
1006         tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
1007
1008         /* check whether the size of the loaded data is equal to the size of the file in the
1009          * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
1010          * file size of 0 bytes */
1011         if (buffer->len != buffer->size && buffer->size != 0 && (
1012                 tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
1013                 tmp_enc_idx == GEANY_ENCODING_UTF_7))  /* filter UTF-7/8 where no NULL bytes are allowed */
1014         {
1015                 buffer->partial = TRUE;
1016         }
1017
1018         /* Determine character encoding and convert to UTF-8 */
1019         if (forced_enc != NULL)
1020         {
1021                 /* the encoding should be ignored(requested by user), so open the file "as it is" */
1022                 if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
1023                 {
1024                         buffer->bom = FALSE;
1025                         buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
1026                 }
1027                 else if (! handle_forced_encoding(buffer, forced_enc))
1028                 {
1029                         return FALSE;
1030                 }
1031         }
1032         else if (! handle_encoding(buffer, tmp_enc_idx))
1033         {
1034                 return FALSE;
1035         }
1036
1037         if (buffer->bom)
1038                 handle_bom(buffer);
1039         return TRUE;
1040 }
1041
1042
1043 /*
1044  * Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
1045  * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
1046  *
1047  * @param buf a pointer to modifiable null-terminated buffer to convert.
1048  *   It may or may not be modified, and should be freed whatever happens.
1049  * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
1050  *   file size). It will be updated to the new size.
1051  * @param forced_enc forced encoding to use, or @c NULL
1052  * @param used_encoding return location for the actually used encoding, or @c NULL
1053  * @param has_bom return location to store whether the data had a BOM, or @c NULL
1054  * @param partial return location to store whether the conversion may be partial, or @c NULL
1055  *
1056  * @return @C TRUE if the conversion succeeded, @c FALSE otherwise.
1057  */
1058 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
1059                 gchar **used_encoding, gboolean *has_bom, gboolean *partial)
1060 {
1061         BufferData buffer;
1062
1063         buffer.data = *buf;
1064         buffer.size = *size;
1065         /* use strlen to check for null chars */
1066         buffer.len = strlen(buffer.data);
1067         buffer.enc = NULL;
1068         buffer.bom = FALSE;
1069         buffer.partial = FALSE;
1070
1071         if (! handle_buffer(&buffer, forced_enc))
1072                 return FALSE;
1073
1074         *size = buffer.len;
1075         if (used_encoding)
1076                 *used_encoding = buffer.enc;
1077         else
1078                 g_free(buffer.enc);
1079         if (has_bom)
1080                 *has_bom = buffer.bom;
1081         if (partial)
1082                 *partial = buffer.partial;
1083
1084         *buf = buffer.data;
1085         return TRUE;
1086 }