src/encodings.c

   1 /*
   2  *      encodings.c - this file is part of Geany, a fast and lightweight IDE
   3  *
   4  *      Copyright 2005-2010 Enrico Tröger <enrico(dot)troeger(at)uvena(dot)de>
   5  *      Copyright 2006-2010 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
   6  *
   7  *      This program is free software; you can redistribute it and/or modify
   8  *      it under the terms of the GNU General Public License as published by
   9  *      the Free Software Foundation; either version 2 of the License, or
  10  *      (at your option) any later version.
  11  *
  12  *      This program is distributed in the hope that it will be useful,
  13  *      but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *      GNU General Public License for more details.
  16  *
  17  *      You should have received a copy of the GNU General Public License
  18  *      along with this program; if not, write to the Free Software
  19  *      Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  20  *
  21  *  $Id$
  22  */
  23
  24 /*
  25  * Encoding conversion and Byte Order Mark (BOM) handling.
  26  */
  27
  28 /*
  29  * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
  30  * list of people on the gedit Team.
  31  * See the gedit ChangeLog files for a list of changes.
  32  */
  33  /* Stolen from anjuta */
  34
  35 #include <string.h>
  36
  37 #include "geany.h"
  38 #include "utils.h"
  39 #include "support.h"
  40 #include "document.h"
  41 #include "documentprivate.h"
  42 #include "msgwindow.h"
  43 #include "encodings.h"
  44 #include "callbacks.h"
  45 #include "ui_utils.h"
  46
  47
  48 #ifdef HAVE_REGCOMP
  49 # ifdef HAVE_REGEX_H
  50 #  include <regex.h>
  51 # else
  52 #  include "gnuregex.h"
  53 # endif
  54 /* <meta http-equiv="content-type" content="text/html; charset=UTF-8" /> */
  55 # define PATTERN_HTMLMETA "<meta[ \t\n\r\f]http-equiv[ \t\n\r\f]*=[ \t\n\r\f]*\"content-type\"[ \t\n\r\f]+content[ \t\n\r\f]*=[ \t\n\r\f]*\"text/x?html;[ \t\n\r\f]*charset=([a-z0-9_-]+)\"[ \t\n\r\f]*/?>"
  56 /* " geany_encoding=utf-8 " or " coding: utf-8 " */
  57 # define PATTERN_CODING "coding[\t ]*[:=][\t ]*([a-z0-9-]+)[\t ]*"
  58 /* precompiled regexps */
  59 static regex_t pregs[2];
  60 static gboolean pregs_loaded = FALSE;
  61 #endif
  62
  63
/* Global table of all known encodings. Each slot is addressed by its GEANY_ENCODING_*
 * index and is filled in by init_encodings(). */
GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
  65
  66
  67 #define fill(Order, Group, Idx, Charset, Name) \
  68                 encodings[Idx].idx = Idx; \
  69                 encodings[Idx].order = Order; \
  70                 encodings[Idx].group = Group; \
  71                 encodings[Idx].charset = Charset; \
  72                 encodings[Idx].name = Name;
  73
/* Populates the global encodings table.
 * Every fill() call stores, for one GEANY_ENCODING_* index, the position of the entry
 * inside its group (first argument, counted from 0 per group), the group itself, the
 * charset name and a translatable display name. encodings_init() uses group and order
 * to build the encoding menus. */
static void init_encodings(void)
{
	/* West European group */
	fill(0, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_14, "ISO-8859-14", _("Celtic"));
	fill(1, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_7, "ISO-8859-7", _("Greek"));
	fill(2, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1253, "WINDOWS-1253", _("Greek"));
	fill(3, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_10, "ISO-8859-10", _("Nordic"));
	fill(4, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_3, "ISO-8859-3", _("South European"));
	fill(5, WESTEUROPEAN, GEANY_ENCODING_IBM_850, "IBM850", _("Western"));
	fill(6, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_1, "ISO-8859-1", _("Western"));
	fill(7, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_15, "ISO-8859-15", _("Western"));
	fill(8, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1252, "WINDOWS-1252", _("Western"));

	/* East European group */
	fill(0, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_4, "ISO-8859-4", _("Baltic"));
	fill(1, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_13, "ISO-8859-13", _("Baltic"));
	fill(2, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1257, "WINDOWS-1257", _("Baltic"));
	fill(3, EASTEUROPEAN, GEANY_ENCODING_IBM_852, "IBM852", _("Central European"));
	fill(4, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_2, "ISO-8859-2", _("Central European"));
	fill(5, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1250, "WINDOWS-1250", _("Central European"));
	fill(6, EASTEUROPEAN, GEANY_ENCODING_IBM_855, "IBM855", _("Cyrillic"));
	fill(7, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_5, "ISO-8859-5", _("Cyrillic"));
	/* ISO-IR-111 not available on Windows */
	fill(8, EASTEUROPEAN, GEANY_ENCODING_ISO_IR_111, "ISO-IR-111", _("Cyrillic"));
	fill(9, EASTEUROPEAN, GEANY_ENCODING_KOI8_R, "KOI8-R", _("Cyrillic"));
	fill(10, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1251, "WINDOWS-1251", _("Cyrillic"));
	fill(11, EASTEUROPEAN, GEANY_ENCODING_CP_866, "CP866", _("Cyrillic/Russian"));
	fill(12, EASTEUROPEAN, GEANY_ENCODING_KOI8_U, "KOI8-U", _("Cyrillic/Ukrainian"));
	fill(13, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_16, "ISO-8859-16", _("Romanian"));

	/* Middle Eastern group */
	fill(0, MIDDLEEASTERN, GEANY_ENCODING_IBM_864, "IBM864", _("Arabic"));
	fill(1, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_6, "ISO-8859-6", _("Arabic"));
	fill(2, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1256, "WINDOWS-1256", _("Arabic"));
	fill(3, MIDDLEEASTERN, GEANY_ENCODING_IBM_862, "IBM862", _("Hebrew"));
	/* possibly not available on any platform (unconfirmed) */
	fill(4, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8_I, "ISO-8859-8-I", _("Hebrew"));
	fill(5, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1255, "WINDOWS-1255", _("Hebrew"));
	fill(6, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8, "ISO-8859-8", _("Hebrew Visual"));

	/* SE & SW Asian group */
	fill(0, ASIAN, GEANY_ENCODING_ARMSCII_8, "ARMSCII-8", _("Armenian"));
	fill(1, ASIAN, GEANY_ENCODING_GEOSTD8, "GEORGIAN-ACADEMY", _("Georgian"));
	fill(2, ASIAN, GEANY_ENCODING_TIS_620, "TIS-620", _("Thai"));
	fill(3, ASIAN, GEANY_ENCODING_IBM_857, "IBM857", _("Turkish"));
	fill(4, ASIAN, GEANY_ENCODING_WINDOWS_1254, "WINDOWS-1254", _("Turkish"));
	fill(5, ASIAN, GEANY_ENCODING_ISO_8859_9, "ISO-8859-9", _("Turkish"));
	fill(6, ASIAN, GEANY_ENCODING_TCVN, "TCVN", _("Vietnamese"));
	fill(7, ASIAN, GEANY_ENCODING_VISCII, "VISCII", _("Vietnamese"));
	fill(8, ASIAN, GEANY_ENCODING_WINDOWS_1258, "WINDOWS-1258", _("Vietnamese"));

	/* Unicode group */
	fill(0, UNICODE, GEANY_ENCODING_UTF_7, "UTF-7", _("Unicode"));
	fill(1, UNICODE, GEANY_ENCODING_UTF_8, "UTF-8", _("Unicode"));
	fill(2, UNICODE, GEANY_ENCODING_UTF_16LE, "UTF-16LE", _("Unicode"));
	fill(3, UNICODE, GEANY_ENCODING_UTF_16BE, "UTF-16BE", _("Unicode"));
	fill(4, UNICODE, GEANY_ENCODING_UCS_2LE, "UCS-2LE", _("Unicode"));
	fill(5, UNICODE, GEANY_ENCODING_UCS_2BE, "UCS-2BE", _("Unicode"));
	fill(6, UNICODE, GEANY_ENCODING_UTF_32LE, "UTF-32LE", _("Unicode"));
	fill(7, UNICODE, GEANY_ENCODING_UTF_32BE, "UTF-32BE", _("Unicode"));

	/* East Asian group */
	fill(0, EASTASIAN, GEANY_ENCODING_GB18030, "GB18030", _("Chinese Simplified"));
	fill(1, EASTASIAN, GEANY_ENCODING_GB2312, "GB2312", _("Chinese Simplified"));
	fill(2, EASTASIAN, GEANY_ENCODING_GBK, "GBK", _("Chinese Simplified"));
	/* maybe not available on Linux */
	fill(3, EASTASIAN, GEANY_ENCODING_HZ, "HZ", _("Chinese Simplified"));
	fill(4, EASTASIAN, GEANY_ENCODING_BIG5, "BIG5", _("Chinese Traditional"));
	fill(5, EASTASIAN, GEANY_ENCODING_BIG5_HKSCS, "BIG5-HKSCS", _("Chinese Traditional"));
	fill(6, EASTASIAN, GEANY_ENCODING_EUC_TW, "EUC-TW", _("Chinese Traditional"));
	fill(7, EASTASIAN, GEANY_ENCODING_EUC_JP, "EUC-JP", _("Japanese"));
	fill(8, EASTASIAN, GEANY_ENCODING_ISO_2022_JP, "ISO-2022-JP", _("Japanese"));
	fill(9, EASTASIAN, GEANY_ENCODING_SHIFT_JIS, "SHIFT_JIS", _("Japanese"));
	fill(10, EASTASIAN, GEANY_ENCODING_CP_932, "CP932", _("Japanese"));
	fill(11, EASTASIAN, GEANY_ENCODING_EUC_KR, "EUC-KR", _("Korean"));
	fill(12, EASTASIAN, GEANY_ENCODING_ISO_2022_KR, "ISO-2022-KR", _("Korean"));
	fill(13, EASTASIAN, GEANY_ENCODING_JOHAB, "JOHAB", _("Korean"));
	fill(14, EASTASIAN, GEANY_ENCODING_UHC, "UHC", _("Korean"));

	/* pseudo entry for "no encoding", the only member of the NONE group */
	fill(0, NONE, GEANY_ENCODING_NONE, "None", _("Without encoding"));
}
 149
 150
 151 GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
 152 {
 153         gint i;
 154
 155         if (charset == NULL)
 156                 return GEANY_ENCODING_UTF_8;
 157
 158         i = 0;
 159         while (i < GEANY_ENCODINGS_MAX)
 160         {
 161                 if (strcmp(charset, encodings[i].charset) == 0)
 162                         return i;
 163
 164                 ++i;
 165         }
 166         return GEANY_ENCODING_UTF_8;
 167 }
 168
 169
 170 const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
 171 {
 172         gint i;
 173
 174         if (charset == NULL)
 175                 return &encodings[GEANY_ENCODING_UTF_8];
 176
 177         i = 0;
 178         while (i < GEANY_ENCODINGS_MAX)
 179         {
 180                 if (strcmp(charset, encodings[i].charset) == 0)
 181                         return &encodings[i];
 182
 183                 ++i;
 184         }
 185
 186         return NULL;
 187 }
 188
 189
 190 const GeanyEncoding *encodings_get_from_index(gint idx)
 191 {
 192         g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
 193
 194         return &encodings[idx];
 195 }
 196
 197
 198 /**
 199  *  Gets the character set name of the specified index e.g. for use with
 200  *  @ref document_set_encoding().
 201  *
 202  *  @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
 203  *
 204  *
 205  *  @return The charset according to idx, or @c NULL if the index is invalid.
 206  *
 207  *  @since 0.13
 208  **/
 209 const gchar* encodings_get_charset_from_index(gint idx)
 210 {
 211         g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
 212
 213         return encodings[idx].charset;
 214 }
 215
 216
 217 gchar *encodings_to_string(const GeanyEncoding* enc)
 218 {
 219         g_return_val_if_fail(enc != NULL, NULL);
 220         g_return_val_if_fail(enc->name != NULL, NULL);
 221         g_return_val_if_fail(enc->charset != NULL, NULL);
 222
 223     return g_strdup_printf("%s (%s)", enc->name, enc->charset);
 224 }
 225
 226
 227 const gchar *encodings_get_charset(const GeanyEncoding* enc)
 228 {
 229         g_return_val_if_fail(enc != NULL, NULL);
 230         g_return_val_if_fail(enc->charset != NULL, NULL);
 231
 232         return enc->charset;
 233 }
 234
 235
/* Radio menu items of the "set encoding" menu, stored at the same index as the matching
 * entry of the encodings table; assigned in encodings_init(). */
static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];
 237
 238
 239 void encodings_select_radio_item(const gchar *charset)
 240 {
 241         gint i;
 242
 243         g_return_if_fail(charset != NULL);
 244
 245         i = 0;
 246         while (i < GEANY_ENCODINGS_MAX)
 247         {
 248                 if (utils_str_equal(charset, encodings[i].charset))
 249                         break;
 250                 i++;
 251         }
 252         if (i == GEANY_ENCODINGS_MAX)
 253                 i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
 254
 255         /* ignore_callback has to be set by the caller */
 256         gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
 257 }
 258
 259
 260 #ifdef HAVE_REGCOMP
 261 /* Regexp detection of file encoding declared in the file itself.
 262  * Idea and parts of code taken from Bluefish, thanks.
 263  * regex_compile() is used to compile regular expressions on program init and keep it in memory
 264  * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
 265  */
 266 static void regex_compile(regex_t *preg, const gchar *pattern)
 267 {
 268         gint retval = regcomp(preg, pattern, REG_EXTENDED | REG_ICASE);
 269         if (retval != 0)
 270         {
 271                 gchar errmsg[512];
 272                 regerror(retval, preg, errmsg, 512);
 273                 geany_debug("regcomp() failed (%s)", errmsg);
 274                 regfree(preg);
 275                 return;
 276         }
 277 }
 278
 279
 280 static gchar *regex_match(regex_t *preg, const gchar *buffer, gsize size)
 281 {
 282         gint retval;
 283         gchar *tmp_buf = NULL;
 284         gchar *encoding = NULL;
 285         regmatch_t pmatch[10];
 286
 287         if (G_UNLIKELY(! pregs_loaded) || G_UNLIKELY(buffer == NULL))
 288                 return NULL;
 289
 290         if (size > 512)
 291                 tmp_buf = g_strndup(buffer, 512); /* scan only the first 512 characters in the buffer */
 292
 293         retval = regexec(preg, (tmp_buf != NULL) ? tmp_buf : buffer, 10, pmatch, 0);
 294         if (retval == 0 && pmatch[0].rm_so != -1 && pmatch[1].rm_so != -1)
 295         {
 296                 encoding = g_strndup(&buffer[pmatch[1].rm_so], pmatch[1].rm_eo - pmatch[1].rm_so);
 297                 geany_debug("Detected encoding by regex search: %s", encoding);
 298
 299                 setptr(encoding, g_utf8_strup(encoding, -1));
 300         }
 301         g_free(tmp_buf);
 302         return encoding;
 303 }
 304 #endif
 305
 306
 307 static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data)
 308 {
 309         GeanyDocument *doc = document_get_current();
 310         guint i = GPOINTER_TO_INT(user_data);
 311
 312         if (ignore_callback || doc == NULL || encodings[i].charset == NULL ||
 313                 ! gtk_check_menu_item_get_active(menuitem) ||
 314                 utils_str_equal(encodings[i].charset, doc->encoding))
 315                 return;
 316
 317         if (doc->readonly)
 318         {
 319                 utils_beep();
 320                 return;
 321         }
 322         document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding));
 323
 324         document_set_encoding(doc, encodings[i].charset);
 325 }
 326
 327
 328 void encodings_finalize(void)
 329 {
 330 #ifdef HAVE_REGCOMP
 331         if (pregs_loaded)
 332         {
 333                 guint i, len;
 334                 len = G_N_ELEMENTS(pregs);
 335                 for (i = 0; i < len; i++)
 336                 {
 337                         regfree(&pregs[i]);
 338                 }
 339         }
 340 #endif
 341 }
 342
 343
 344 void encodings_init(void)
 345 {
 346         GtkWidget *item, *menu[2], *submenu, *menu_westeuro, *menu_easteuro, *menu_eastasian, *menu_asian,
 347                           *menu_utf8, *menu_middleeast, *item_westeuro, *item_easteuro, *item_eastasian,
 348                           *item_asian, *item_utf8, *item_middleeast;
 349         GCallback cb_func[2];
 350         GSList *group = NULL;
 351         gchar *label;
 352         gint order, group_size;
 353         guint i, j, k;
 354
 355         init_encodings();
 356
 357 #ifdef HAVE_REGCOMP
 358         if (! pregs_loaded)
 359         {
 360                 regex_compile(&pregs[0], PATTERN_HTMLMETA);
 361                 regex_compile(&pregs[1], PATTERN_CODING);
 362                 pregs_loaded = TRUE;
 363         }
 364 #endif
 365
 366         /* create encodings submenu in document menu */
 367         menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
 368         menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
 369         cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
 370         cb_func[1] = G_CALLBACK(on_reload_as_activate);
 371
 372         for (k = 0; k < 2; k++)
 373         {
 374                 menu_westeuro = gtk_menu_new();
 375                 item_westeuro = gtk_menu_item_new_with_mnemonic(_("_West European"));
 376                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_westeuro), menu_westeuro);
 377                 gtk_container_add(GTK_CONTAINER(menu[k]), item_westeuro);
 378                 gtk_widget_show_all(item_westeuro);
 379
 380                 menu_easteuro = gtk_menu_new();
 381                 item_easteuro = gtk_menu_item_new_with_mnemonic(_("_East European"));
 382                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_easteuro), menu_easteuro);
 383                 gtk_container_add(GTK_CONTAINER(menu[k]), item_easteuro);
 384                 gtk_widget_show_all(item_easteuro);
 385
 386                 menu_eastasian = gtk_menu_new();
 387                 item_eastasian = gtk_menu_item_new_with_mnemonic(_("East _Asian"));
 388                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_eastasian), menu_eastasian);
 389                 gtk_container_add(GTK_CONTAINER(menu[k]), item_eastasian);
 390                 gtk_widget_show_all(item_eastasian);
 391
 392                 menu_asian = gtk_menu_new();
 393                 item_asian = gtk_menu_item_new_with_mnemonic(_("_SE & SW Asian"));
 394                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_asian), menu_asian);
 395                 gtk_container_add(GTK_CONTAINER(menu[k]), item_asian);
 396                 gtk_widget_show_all(item_asian);
 397
 398                 menu_middleeast = gtk_menu_new();
 399                 item_middleeast = gtk_menu_item_new_with_mnemonic(_("_Middle Eastern"));
 400                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_middleeast), menu_middleeast);
 401                 gtk_container_add(GTK_CONTAINER(menu[k]), item_middleeast);
 402                 gtk_widget_show_all(item_middleeast);
 403
 404                 menu_utf8 = gtk_menu_new();
 405                 item_utf8 = gtk_menu_item_new_with_mnemonic(_("_Unicode"));
 406                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_utf8), menu_utf8);
 407                 gtk_container_add(GTK_CONTAINER(menu[k]), item_utf8);
 408                 gtk_widget_show_all(item_utf8);
 409
 410                 /** TODO can it be optimized? ATM 3782 runs at line "if (encodings[j].group ...)" */
 411                 for (i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
 412                 {
 413                         order = 0;
 414                         switch (i)
 415                         {
 416                                 case WESTEUROPEAN: submenu = menu_westeuro; group_size = 9; break;
 417                                 case EASTEUROPEAN: submenu = menu_easteuro; group_size = 14; break;
 418                                 case EASTASIAN: submenu = menu_eastasian; group_size = 14; break;
 419                                 case ASIAN: submenu = menu_asian; group_size = 9; break;
 420                                 case MIDDLEEASTERN: submenu = menu_middleeast; group_size = 7; break;
 421                                 case UNICODE: submenu = menu_utf8; group_size = 8; break;
 422                                 default: submenu = menu[k]; group_size = 1;
 423                         }
 424
 425                         while (order < group_size)      /* the biggest group has 13 elements */
 426                         {
 427                                 for (j = 0; j < GEANY_ENCODINGS_MAX; j++)
 428                                 {
 429                                         if (encodings[j].group == i && encodings[j].order == order)
 430                                         {
 431                                                 label = encodings_to_string(&encodings[j]);
 432                                                 if (k == 0)
 433                                                 {
 434                                                         item = gtk_radio_menu_item_new_with_label(group, label);
 435                                                         group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
 436                                                         radio_items[j] = item;
 437                                                 }
 438                                                 else
 439                                                         item = gtk_menu_item_new_with_label(label);
 440                                                 gtk_widget_show(item);
 441                                                 gtk_container_add(GTK_CONTAINER(submenu), item);
 442                                                 g_signal_connect(item, "activate",
 443                                                                                 cb_func[k], GINT_TO_POINTER(encodings[j].idx));
 444                                                 g_free(label);
 445                                                 break;
 446                                         }
 447                                 }
 448                                 order++;
 449                         }
 450                 }
 451         }
 452 }
 453
 454
 455 /**
 456  *  Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
 457  *  If @a fast is not set, additional checks to validate the converted string are performed.
 458  *
 459  *  @param buffer The input string to convert.
 460  *  @param size The length of the string, or -1 if the string is nul-terminated.
 461  *  @param charset The charset to be used for conversion.
 462  *  @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
 463  *
 464  *  @return If the conversion was successful, a newly allocated nul-terminated string,
 465  *    which must be freed with @c g_free(). Otherwise @c NULL.
 466  **/
 467 gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gsize size,
 468                                                                                           const gchar *charset, gboolean fast)
 469 {
 470         gchar *utf8_content = NULL;
 471         GError *conv_error = NULL;
 472         gchar* converted_contents = NULL;
 473         gsize bytes_written;
 474
 475         g_return_val_if_fail(buffer != NULL, NULL);
 476         g_return_val_if_fail(charset != NULL, NULL);
 477
 478         converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
 479                                                                    &bytes_written, &conv_error);
 480
 481         if (fast)
 482         {
 483                 utf8_content = converted_contents;
 484                 if (conv_error != NULL) g_error_free(conv_error);
 485         }
 486         else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
 487         {
 488                 if (conv_error != NULL)
 489                 {
 490                         geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
 491                         g_error_free(conv_error);
 492                         conv_error = NULL;
 493                 }
 494                 else
 495                         geany_debug("Couldn't convert from %s to UTF-8.", charset);
 496
 497                 utf8_content = NULL;
 498                 if (converted_contents != NULL)
 499                         g_free(converted_contents);
 500         }
 501         else
 502         {
 503                 geany_debug("Converted from %s to UTF-8.", charset);
 504                 utf8_content = converted_contents;
 505         }
 506
 507         return utf8_content;
 508 }
 509
 510
 511 /**
 512  *  Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
 513  *  @a used_encoding.
 514  *
 515  *  @param buffer the input string to convert.
 516  *  @param size the length of the string, or -1 if the string is nul-terminated.
 517  *  @param used_encoding return location of the detected encoding of the input string, or @c NULL.
 518  *
 519  *  @return If the conversion was successful, a newly allocated nul-terminated string,
 520  *    which must be freed with @c g_free(). Otherwise @c NULL.
 521  **/
 522 gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
 523 {
 524         gchar *locale_charset = NULL;
 525         gchar *regex_charset = NULL;
 526         const gchar *charset;
 527         gchar *utf8_content;
 528         gboolean check_regex = FALSE;
 529         gboolean check_locale = FALSE;
 530         gint i, len, preferred_charset;
 531
 532         if ((gint)size == -1)
 533         {
 534                 size = strlen(buffer);
 535         }
 536
 537 #ifdef HAVE_REGCOMP
 538         /* first try to read the encoding from the file content */
 539         len = (gint) G_N_ELEMENTS(pregs);
 540         for (i = 0; i < len && ! check_regex; i++)
 541         {
 542                 if ((regex_charset = regex_match(&pregs[i], buffer, size)) != NULL)
 543                         check_regex = TRUE;
 544         }
 545 #endif
 546
 547         /* current locale is not UTF-8, we have to check this charset */
 548         check_locale = ! g_get_charset((const gchar**) &charset);
 549
 550         /* First check for preferred charset, if specified */
 551         preferred_charset = file_prefs.default_open_encoding;
 552
 553         if (preferred_charset == encodings[GEANY_ENCODING_NONE].idx ||
 554                 preferred_charset < 0 ||
 555                 preferred_charset >= GEANY_ENCODINGS_MAX)
 556         {
 557                 preferred_charset = -1;
 558         }
 559
 560         /* -1 means "Preferred charset" */
 561         for (i = -1; i < GEANY_ENCODINGS_MAX; i++)
 562         {
 563                 if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
 564                         continue;
 565
 566                 if (check_regex)
 567                 {
 568                         check_regex = FALSE;
 569                         charset = regex_charset;
 570                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 571                 }
 572                 else if (check_locale)
 573                 {
 574                         check_locale = FALSE;
 575                         charset = locale_charset;
 576                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 577                 }
 578                 else if (i == -1)
 579                 {
 580                         if (preferred_charset >= 0)
 581                         {
 582                                 charset = encodings[preferred_charset].charset;
 583                                 geany_debug("Using preferred charset: %s", charset);
 584                         }
 585                         else
 586                                 continue;
 587                 }
 588                 else if (i >= 0)
 589                         charset = encodings[i].charset;
 590                 else /* in this case we have i == -2, continue to increase i and go ahead */
 591                         continue;
 592
 593                 if (G_UNLIKELY(charset == NULL))
 594                         continue;
 595
 596                 geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
 597                         size, charset);
 598                 utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);
 599
 600                 if (G_LIKELY(utf8_content != NULL))
 601                 {
 602                         if (used_encoding != NULL)
 603                         {
 604                                 if (G_UNLIKELY(*used_encoding != NULL))
 605                                 {
 606                                         geany_debug("%s:%d", __FILE__, __LINE__);
 607                                         g_free(*used_encoding);
 608                                 }
 609                                 *used_encoding = g_strdup(charset);
 610                         }
 611                         g_free(regex_charset);
 612                         return utf8_content;
 613                 }
 614         }
 615         g_free(regex_charset);
 616
 617         return NULL;
 618 }
 619
 620
 621 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
 622  * otherwise GEANY_ENCODING_NONE.
 623  * */
 624 GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
 625 {
 626         if (len >= 3)
 627         {
 628                 if (bom_len)
 629                         *bom_len = 3;
 630
 631                 if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
 632                         (guchar)string[2] == 0xbf)
 633                 {
 634                         return GEANY_ENCODING_UTF_8;
 635                 }
 636         }
 637         if (len >= 4)
 638         {
 639                 if (bom_len)
 640                         *bom_len = 4;
 641
 642                 if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
 643                                  (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
 644                 {
 645                         return GEANY_ENCODING_UTF_32BE; /* Big endian */
 646                 }
 647                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
 648                                  (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
 649                 {
 650                         return GEANY_ENCODING_UTF_32LE; /* Little endian */
 651                 }
 652                 if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
 653                                  (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
 654                 {
 655                          return GEANY_ENCODING_UTF_7;
 656                 }
 657         }
 658         if (len >= 2)
 659         {
 660                 if (bom_len)
 661                         *bom_len = 2;
 662
 663                 if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff)
 664                 {
 665                         return GEANY_ENCODING_UTF_16BE; /* Big endian */
 666                 }
 667                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
 668                 {
 669                         return GEANY_ENCODING_UTF_16LE; /* Little endian */
 670                 }
 671         }
 672         if (bom_len)
 673                 *bom_len = 0;
 674         return GEANY_ENCODING_NONE;
 675 }
 676
 677
 678 gboolean encodings_is_unicode_charset(const gchar *string)
 679 {
 680         if (string != NULL &&
 681                 (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
 682         {
 683                 return TRUE;
 684         }
 685         return FALSE;
 686 }
 687
 688