src/encodings.c

   1 /*
   2  *      encodings.c - this file is part of Geany, a fast and lightweight IDE
   3  *
   4  *      Copyright 2005-2010 Enrico Tröger <enrico(dot)troeger(at)uvena(dot)de>
   5  *      Copyright 2006-2010 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
   6  *
   7  *      This program is free software; you can redistribute it and/or modify
   8  *      it under the terms of the GNU General Public License as published by
   9  *      the Free Software Foundation; either version 2 of the License, or
  10  *      (at your option) any later version.
  11  *
  12  *      This program is distributed in the hope that it will be useful,
  13  *      but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *      GNU General Public License for more details.
  16  *
  17  *      You should have received a copy of the GNU General Public License
  18  *      along with this program; if not, write to the Free Software
  19  *      Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  20  *
  21  *  $Id$
  22  */
  23
  24 /*
  25  * Encoding conversion and Byte Order Mark (BOM) handling.
  26  */
  27
  28 /*
  29  * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
  30  * list of people on the gedit Team.
  31  * See the gedit ChangeLog files for a list of changes.
  32  */
  33  /* Stolen from anjuta */
  34
  35 #include <string.h>
  36
  37 #include "geany.h"
  38 #include "utils.h"
  39 #include "support.h"
  40 #include "document.h"
  41 #include "documentprivate.h"
  42 #include "msgwindow.h"
  43 #include "encodings.h"
  44 #include "callbacks.h"
  45 #include "ui_utils.h"
  46
  47 #ifdef HAVE_REGEX_H
  48 # include <regex.h>
  49 #else
  50 # include "gnuregex.h"
  51 #endif
  52
  53 /* <meta http-equiv="content-type" content="text/html; charset=UTF-8" /> */
  54 #define PATTERN_HTMLMETA "<meta[ \t\n\r\f]http-equiv[ \t\n\r\f]*=[ \t\n\r\f]*\"content-type\"[ \t\n\r\f]+content[ \t\n\r\f]*=[ \t\n\r\f]*\"text/x?html;[ \t\n\r\f]*charset=([a-z0-9_-]+)\"[ \t\n\r\f]*/?>"
  55 /* " geany_encoding=utf-8 " or " coding: utf-8 " */
  56 #define PATTERN_CODING "coding[\t ]*[:=][\t ]*([a-z0-9-]+)[\t ]*"
  57
  58 /* precompiled regexps */
  59 static regex_t pregs[2];
  60 static gboolean pregs_loaded = FALSE;
  61
  62
  63 GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
  64
  65
  66 #define fill(Order, Group, Idx, Charset, Name) \
  67                 encodings[Idx].idx = Idx; \
  68                 encodings[Idx].order = Order; \
  69                 encodings[Idx].group = Group; \
  70                 encodings[Idx].charset = Charset; \
  71                 encodings[Idx].name = Name;
  72
  73 static void init_encodings(void)
  74 {
  75         fill(0, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_14, "ISO-8859-14", _("Celtic"));
  76         fill(1, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_7, "ISO-8859-7", _("Greek"));
  77         fill(2, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1253, "WINDOWS-1253", _("Greek"));
  78         fill(3, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_10, "ISO-8859-10", _("Nordic"));
  79         fill(4, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_3, "ISO-8859-3", _("South European"));
  80         fill(5, WESTEUROPEAN, GEANY_ENCODING_IBM_850, "IBM850", _("Western"));
  81         fill(6, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_1, "ISO-8859-1", _("Western"));
  82         fill(7, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_15, "ISO-8859-15", _("Western"));
  83         fill(8, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1252, "WINDOWS-1252", _("Western"));
  84
  85         fill(0, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_4, "ISO-8859-4", _("Baltic"));
  86         fill(1, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_13, "ISO-8859-13", _("Baltic"));
  87         fill(2, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1257, "WINDOWS-1257", _("Baltic"));
  88         fill(3, EASTEUROPEAN, GEANY_ENCODING_IBM_852, "IBM852", _("Central European"));
  89         fill(4, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_2, "ISO-8859-2", _("Central European"));
  90         fill(5, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1250, "WINDOWS-1250", _("Central European"));
  91         fill(6, EASTEUROPEAN, GEANY_ENCODING_IBM_855, "IBM855", _("Cyrillic"));
  92         fill(7, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_5, "ISO-8859-5", _("Cyrillic"));
  93         /* ISO-IR-111 not available on Windows */
  94         fill(8, EASTEUROPEAN, GEANY_ENCODING_ISO_IR_111, "ISO-IR-111", _("Cyrillic"));
  95         fill(9, EASTEUROPEAN, GEANY_ENCODING_KOI8_R, "KOI8-R", _("Cyrillic"));
  96         fill(10, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1251, "WINDOWS-1251", _("Cyrillic"));
  97         fill(11, EASTEUROPEAN, GEANY_ENCODING_CP_866, "CP866", _("Cyrillic/Russian"));
  98         fill(12, EASTEUROPEAN, GEANY_ENCODING_KOI8_U, "KOI8-U", _("Cyrillic/Ukrainian"));
  99         fill(13, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_16, "ISO-8859-16", _("Romanian"));
 100
 101         fill(0, MIDDLEEASTERN, GEANY_ENCODING_IBM_864, "IBM864", _("Arabic"));
 102         fill(1, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_6, "ISO-8859-6", _("Arabic"));
 103         fill(2, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1256, "WINDOWS-1256", _("Arabic"));
 104         fill(3, MIDDLEEASTERN, GEANY_ENCODING_IBM_862, "IBM862", _("Hebrew"));
 105         /* not available at all, ? */
 106         fill(4, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8_I, "ISO-8859-8-I", _("Hebrew"));
 107         fill(5, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1255, "WINDOWS-1255", _("Hebrew"));
 108         fill(6, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8, "ISO-8859-8", _("Hebrew Visual"));
 109
 110         fill(0, ASIAN, GEANY_ENCODING_ARMSCII_8, "ARMSCII-8", _("Armenian"));
 111         fill(1, ASIAN, GEANY_ENCODING_GEOSTD8, "GEORGIAN-ACADEMY", _("Georgian"));
 112         fill(2, ASIAN, GEANY_ENCODING_TIS_620, "TIS-620", _("Thai"));
 113         fill(3, ASIAN, GEANY_ENCODING_IBM_857, "IBM857", _("Turkish"));
 114         fill(4, ASIAN, GEANY_ENCODING_WINDOWS_1254, "WINDOWS-1254", _("Turkish"));
 115         fill(5, ASIAN, GEANY_ENCODING_ISO_8859_9, "ISO-8859-9", _("Turkish"));
 116         fill(6, ASIAN, GEANY_ENCODING_TCVN, "TCVN", _("Vietnamese"));
 117         fill(7, ASIAN, GEANY_ENCODING_VISCII, "VISCII", _("Vietnamese"));
 118         fill(8, ASIAN, GEANY_ENCODING_WINDOWS_1258, "WINDOWS-1258", _("Vietnamese"));
 119
 120         fill(0, UNICODE, GEANY_ENCODING_UTF_7, "UTF-7", _("Unicode"));
 121         fill(1, UNICODE, GEANY_ENCODING_UTF_8, "UTF-8", _("Unicode"));
 122         fill(2, UNICODE, GEANY_ENCODING_UTF_16LE, "UTF-16LE", _("Unicode"));
 123         fill(3, UNICODE, GEANY_ENCODING_UTF_16BE, "UTF-16BE", _("Unicode"));
 124         fill(4, UNICODE, GEANY_ENCODING_UCS_2LE, "UCS-2LE", _("Unicode"));
 125         fill(5, UNICODE, GEANY_ENCODING_UCS_2BE, "UCS-2BE", _("Unicode"));
 126         fill(6, UNICODE, GEANY_ENCODING_UTF_32LE, "UTF-32LE", _("Unicode"));
 127         fill(7, UNICODE, GEANY_ENCODING_UTF_32BE, "UTF-32BE", _("Unicode"));
 128
 129         fill(0, EASTASIAN, GEANY_ENCODING_GB18030, "GB18030", _("Chinese Simplified"));
 130         fill(1, EASTASIAN, GEANY_ENCODING_GB2312, "GB2312", _("Chinese Simplified"));
 131         fill(2, EASTASIAN, GEANY_ENCODING_GBK, "GBK", _("Chinese Simplified"));
 132         /* maybe not available on Linux */
 133         fill(3, EASTASIAN, GEANY_ENCODING_HZ, "HZ", _("Chinese Simplified"));
 134         fill(4, EASTASIAN, GEANY_ENCODING_BIG5, "BIG5", _("Chinese Traditional"));
 135         fill(5, EASTASIAN, GEANY_ENCODING_BIG5_HKSCS, "BIG5-HKSCS", _("Chinese Traditional"));
 136         fill(6, EASTASIAN, GEANY_ENCODING_EUC_TW, "EUC-TW", _("Chinese Traditional"));
 137         fill(7, EASTASIAN, GEANY_ENCODING_EUC_JP, "EUC-JP", _("Japanese"));
 138         fill(8, EASTASIAN, GEANY_ENCODING_ISO_2022_JP, "ISO-2022-JP", _("Japanese"));
 139         fill(9, EASTASIAN, GEANY_ENCODING_SHIFT_JIS, "SHIFT_JIS", _("Japanese"));
 140         fill(10, EASTASIAN, GEANY_ENCODING_CP_932, "CP932", _("Japanese"));
 141         fill(11, EASTASIAN, GEANY_ENCODING_EUC_KR, "EUC-KR", _("Korean"));
 142         fill(12, EASTASIAN, GEANY_ENCODING_ISO_2022_KR, "ISO-2022-KR", _("Korean"));
 143         fill(13, EASTASIAN, GEANY_ENCODING_JOHAB, "JOHAB", _("Korean"));
 144         fill(14, EASTASIAN, GEANY_ENCODING_UHC, "UHC", _("Korean"));
 145
 146         fill(0, NONE, GEANY_ENCODING_NONE, "None", _("Without encoding"));
 147 }
 148
 149
 150 GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
 151 {
 152         gint i;
 153
 154         if (charset == NULL)
 155                 return GEANY_ENCODING_UTF_8;
 156
 157         i = 0;
 158         while (i < GEANY_ENCODINGS_MAX)
 159         {
 160                 if (strcmp(charset, encodings[i].charset) == 0)
 161                         return i;
 162
 163                 ++i;
 164         }
 165         return GEANY_ENCODING_UTF_8;
 166 }
 167
 168
 169 const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
 170 {
 171         gint i;
 172
 173         if (charset == NULL)
 174                 return &encodings[GEANY_ENCODING_UTF_8];
 175
 176         i = 0;
 177         while (i < GEANY_ENCODINGS_MAX)
 178         {
 179                 if (strcmp(charset, encodings[i].charset) == 0)
 180                         return &encodings[i];
 181
 182                 ++i;
 183         }
 184
 185         return NULL;
 186 }
 187
 188
 189 const GeanyEncoding *encodings_get_from_index(gint idx)
 190 {
 191         g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
 192
 193         return &encodings[idx];
 194 }
 195
 196
 197 /**
 198  *  Gets the character set name of the specified index e.g. for use with
 199  *  @ref document_set_encoding().
 200  *
 201  *  @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
 202  *
 203  *
 204  *  @return The charset according to idx, or @c NULL if the index is invalid.
 205  *
 206  *  @since 0.13
 207  **/
 208 const gchar* encodings_get_charset_from_index(gint idx)
 209 {
 210         g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
 211
 212         return encodings[idx].charset;
 213 }
 214
 215
 216 gchar *encodings_to_string(const GeanyEncoding* enc)
 217 {
 218         g_return_val_if_fail(enc != NULL, NULL);
 219         g_return_val_if_fail(enc->name != NULL, NULL);
 220         g_return_val_if_fail(enc->charset != NULL, NULL);
 221
 222     return g_strdup_printf("%s (%s)", enc->name, enc->charset);
 223 }
 224
 225
 226 const gchar *encodings_get_charset(const GeanyEncoding* enc)
 227 {
 228         g_return_val_if_fail(enc != NULL, NULL);
 229         g_return_val_if_fail(enc->charset != NULL, NULL);
 230
 231         return enc->charset;
 232 }
 233
 234
 235 static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];
 236
 237
 238 void encodings_select_radio_item(const gchar *charset)
 239 {
 240         gint i;
 241
 242         g_return_if_fail(charset != NULL);
 243
 244         i = 0;
 245         while (i < GEANY_ENCODINGS_MAX)
 246         {
 247                 if (utils_str_equal(charset, encodings[i].charset))
 248                         break;
 249                 i++;
 250         }
 251         if (i == GEANY_ENCODINGS_MAX)
 252                 i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
 253
 254         /* ignore_callback has to be set by the caller */
 255         gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
 256 }
 257
 258
 259 /* Regexp detection of file encoding declared in the file itself.
 260  * Idea and parts of code taken from Bluefish, thanks.
 261  * regex_compile() is used to compile regular expressions on program init and keep it in memory
 262  * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
 263  */
 264 static void regex_compile(regex_t *preg, const gchar *pattern)
 265 {
 266         gint retval = regcomp(preg, pattern, REG_EXTENDED | REG_ICASE);
 267         if (retval != 0)
 268         {
 269                 gchar errmsg[512];
 270                 regerror(retval, preg, errmsg, 512);
 271                 geany_debug("regcomp() failed (%s)", errmsg);
 272                 regfree(preg);
 273                 return;
 274         }
 275 }
 276
 277
 278 static gchar *regex_match(regex_t *preg, const gchar *buffer, gsize size)
 279 {
 280         gint retval;
 281         gchar *tmp_buf = NULL;
 282         gchar *encoding = NULL;
 283         regmatch_t pmatch[10];
 284
 285         if (G_UNLIKELY(! pregs_loaded) || G_UNLIKELY(buffer == NULL))
 286                 return NULL;
 287
 288         if (size > 512)
 289                 tmp_buf = g_strndup(buffer, 512); /* scan only the first 512 characters in the buffer */
 290
 291         retval = regexec(preg, (tmp_buf != NULL) ? tmp_buf : buffer, 10, pmatch, 0);
 292         if (retval == 0 && pmatch[0].rm_so != -1 && pmatch[1].rm_so != -1)
 293         {
 294                 encoding = g_strndup(&buffer[pmatch[1].rm_so], pmatch[1].rm_eo - pmatch[1].rm_so);
 295                 geany_debug("Detected encoding by regex search: %s", encoding);
 296
 297                 setptr(encoding, g_utf8_strup(encoding, -1));
 298         }
 299         g_free(tmp_buf);
 300         return encoding;
 301 }
 302
 303
 304 static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data)
 305 {
 306         GeanyDocument *doc = document_get_current();
 307         guint i = GPOINTER_TO_INT(user_data);
 308
 309         if (ignore_callback || doc == NULL || encodings[i].charset == NULL ||
 310                 ! gtk_check_menu_item_get_active(menuitem) ||
 311                 utils_str_equal(encodings[i].charset, doc->encoding))
 312                 return;
 313
 314         if (doc->readonly)
 315         {
 316                 utils_beep();
 317                 return;
 318         }
 319         document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding));
 320
 321         document_set_encoding(doc, encodings[i].charset);
 322 }
 323
 324
 325 void encodings_finalize(void)
 326 {
 327         if (pregs_loaded)
 328         {
 329                 guint i, len;
 330                 len = G_N_ELEMENTS(pregs);
 331                 for (i = 0; i < len; i++)
 332                 {
 333                         regfree(&pregs[i]);
 334                 }
 335         }
 336 }
 337
 338
 339 void encodings_init(void)
 340 {
 341         GtkWidget *item, *menu[2], *submenu, *menu_westeuro, *menu_easteuro, *menu_eastasian, *menu_asian,
 342                           *menu_utf8, *menu_middleeast, *item_westeuro, *item_easteuro, *item_eastasian,
 343                           *item_asian, *item_utf8, *item_middleeast;
 344         GCallback cb_func[2];
 345         GSList *group = NULL;
 346         gchar *label;
 347         gint order, group_size;
 348         guint i, j, k;
 349
 350         init_encodings();
 351
 352         if (! pregs_loaded)
 353         {
 354                 regex_compile(&pregs[0], PATTERN_HTMLMETA);
 355                 regex_compile(&pregs[1], PATTERN_CODING);
 356                 pregs_loaded = TRUE;
 357         }
 358
 359         /* create encodings submenu in document menu */
 360         menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
 361         menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
 362         cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
 363         cb_func[1] = G_CALLBACK(on_reload_as_activate);
 364
 365         for (k = 0; k < 2; k++)
 366         {
 367                 menu_westeuro = gtk_menu_new();
 368                 item_westeuro = gtk_menu_item_new_with_mnemonic(_("_West European"));
 369                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_westeuro), menu_westeuro);
 370                 gtk_container_add(GTK_CONTAINER(menu[k]), item_westeuro);
 371                 gtk_widget_show_all(item_westeuro);
 372
 373                 menu_easteuro = gtk_menu_new();
 374                 item_easteuro = gtk_menu_item_new_with_mnemonic(_("_East European"));
 375                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_easteuro), menu_easteuro);
 376                 gtk_container_add(GTK_CONTAINER(menu[k]), item_easteuro);
 377                 gtk_widget_show_all(item_easteuro);
 378
 379                 menu_eastasian = gtk_menu_new();
 380                 item_eastasian = gtk_menu_item_new_with_mnemonic(_("East _Asian"));
 381                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_eastasian), menu_eastasian);
 382                 gtk_container_add(GTK_CONTAINER(menu[k]), item_eastasian);
 383                 gtk_widget_show_all(item_eastasian);
 384
 385                 menu_asian = gtk_menu_new();
 386                 item_asian = gtk_menu_item_new_with_mnemonic(_("_SE & SW Asian"));
 387                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_asian), menu_asian);
 388                 gtk_container_add(GTK_CONTAINER(menu[k]), item_asian);
 389                 gtk_widget_show_all(item_asian);
 390
 391                 menu_middleeast = gtk_menu_new();
 392                 item_middleeast = gtk_menu_item_new_with_mnemonic(_("_Middle Eastern"));
 393                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_middleeast), menu_middleeast);
 394                 gtk_container_add(GTK_CONTAINER(menu[k]), item_middleeast);
 395                 gtk_widget_show_all(item_middleeast);
 396
 397                 menu_utf8 = gtk_menu_new();
 398                 item_utf8 = gtk_menu_item_new_with_mnemonic(_("_Unicode"));
 399                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_utf8), menu_utf8);
 400                 gtk_container_add(GTK_CONTAINER(menu[k]), item_utf8);
 401                 gtk_widget_show_all(item_utf8);
 402
 403                 /** TODO can it be optimized? ATM 3782 runs at line "if (encodings[j].group ...)" */
 404                 for (i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
 405                 {
 406                         order = 0;
 407                         switch (i)
 408                         {
 409                                 case WESTEUROPEAN: submenu = menu_westeuro; group_size = 9; break;
 410                                 case EASTEUROPEAN: submenu = menu_easteuro; group_size = 14; break;
 411                                 case EASTASIAN: submenu = menu_eastasian; group_size = 14; break;
 412                                 case ASIAN: submenu = menu_asian; group_size = 9; break;
 413                                 case MIDDLEEASTERN: submenu = menu_middleeast; group_size = 7; break;
 414                                 case UNICODE: submenu = menu_utf8; group_size = 8; break;
 415                                 default: submenu = menu[k]; group_size = 1;
 416                         }
 417
 418                         while (order < group_size)      /* the biggest group has 13 elements */
 419                         {
 420                                 for (j = 0; j < GEANY_ENCODINGS_MAX; j++)
 421                                 {
 422                                         if (encodings[j].group == i && encodings[j].order == order)
 423                                         {
 424                                                 label = encodings_to_string(&encodings[j]);
 425                                                 if (k == 0)
 426                                                 {
 427                                                         item = gtk_radio_menu_item_new_with_label(group, label);
 428                                                         group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
 429                                                         radio_items[j] = item;
 430                                                 }
 431                                                 else
 432                                                         item = gtk_menu_item_new_with_label(label);
 433                                                 gtk_widget_show(item);
 434                                                 gtk_container_add(GTK_CONTAINER(submenu), item);
 435                                                 g_signal_connect(item, "activate",
 436                                                                                 cb_func[k], GINT_TO_POINTER(encodings[j].idx));
 437                                                 g_free(label);
 438                                                 break;
 439                                         }
 440                                 }
 441                                 order++;
 442                         }
 443                 }
 444         }
 445 }
 446
 447
 448 /**
 449  *  Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
 450  *  If @a fast is not set, additional checks to validate the converted string are performed.
 451  *
 452  *  @param buffer The input string to convert.
 453  *  @param size The length of the string, or -1 if the string is nul-terminated.
 454  *  @param charset The charset to be used for conversion.
 455  *  @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
 456  *
 457  *  @return If the conversion was successful, a newly allocated nul-terminated string,
 458  *    which must be freed with @c g_free(). Otherwise @c NULL.
 459  **/
 460 gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gsize size,
 461                                                                                           const gchar *charset, gboolean fast)
 462 {
 463         gchar *utf8_content = NULL;
 464         GError *conv_error = NULL;
 465         gchar* converted_contents = NULL;
 466         gsize bytes_written;
 467
 468         g_return_val_if_fail(buffer != NULL, NULL);
 469         g_return_val_if_fail(charset != NULL, NULL);
 470
 471         converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
 472                                                                    &bytes_written, &conv_error);
 473
 474         if (fast)
 475         {
 476                 utf8_content = converted_contents;
 477                 if (conv_error != NULL) g_error_free(conv_error);
 478         }
 479         else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
 480         {
 481                 if (conv_error != NULL)
 482                 {
 483                         geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
 484                         g_error_free(conv_error);
 485                         conv_error = NULL;
 486                 }
 487                 else
 488                         geany_debug("Couldn't convert from %s to UTF-8.", charset);
 489
 490                 utf8_content = NULL;
 491                 g_free(converted_contents);
 492         }
 493         else
 494         {
 495                 geany_debug("Converted from %s to UTF-8.", charset);
 496                 utf8_content = converted_contents;
 497         }
 498
 499         return utf8_content;
 500 }
 501
 502
 503 /**
 504  *  Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
 505  *  @a used_encoding.
 506  *
 507  *  @param buffer the input string to convert.
 508  *  @param size the length of the string, or -1 if the string is nul-terminated.
 509  *  @param used_encoding return location of the detected encoding of the input string, or @c NULL.
 510  *
 511  *  @return If the conversion was successful, a newly allocated nul-terminated string,
 512  *    which must be freed with @c g_free(). Otherwise @c NULL.
 513  **/
 514 gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
 515 {
 516         gchar *locale_charset = NULL;
 517         gchar *regex_charset = NULL;
 518         const gchar *charset;
 519         gchar *utf8_content;
 520         gboolean check_regex = FALSE;
 521         gboolean check_locale = FALSE;
 522         gint i, len, preferred_charset;
 523
 524         if ((gint)size == -1)
 525         {
 526                 size = strlen(buffer);
 527         }
 528
 529         /* first try to read the encoding from the file content */
 530         len = (gint) G_N_ELEMENTS(pregs);
 531         for (i = 0; i < len && ! check_regex; i++)
 532         {
 533                 if ((regex_charset = regex_match(&pregs[i], buffer, size)) != NULL)
 534                         check_regex = TRUE;
 535         }
 536
 537         /* current locale is not UTF-8, we have to check this charset */
 538         check_locale = ! g_get_charset((const gchar**) &charset);
 539
 540         /* First check for preferred charset, if specified */
 541         preferred_charset = file_prefs.default_open_encoding;
 542
 543         if (preferred_charset == encodings[GEANY_ENCODING_NONE].idx ||
 544                 preferred_charset < 0 ||
 545                 preferred_charset >= GEANY_ENCODINGS_MAX)
 546         {
 547                 preferred_charset = -1;
 548         }
 549
 550         /* -1 means "Preferred charset" */
 551         for (i = -1; i < GEANY_ENCODINGS_MAX; i++)
 552         {
 553                 if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
 554                         continue;
 555
 556                 if (check_regex)
 557                 {
 558                         check_regex = FALSE;
 559                         charset = regex_charset;
 560                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 561                 }
 562                 else if (check_locale)
 563                 {
 564                         check_locale = FALSE;
 565                         charset = locale_charset;
 566                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 567                 }
 568                 else if (i == -1)
 569                 {
 570                         if (preferred_charset >= 0)
 571                         {
 572                                 charset = encodings[preferred_charset].charset;
 573                                 geany_debug("Using preferred charset: %s", charset);
 574                         }
 575                         else
 576                                 continue;
 577                 }
 578                 else if (i >= 0)
 579                         charset = encodings[i].charset;
 580                 else /* in this case we have i == -2, continue to increase i and go ahead */
 581                         continue;
 582
 583                 if (G_UNLIKELY(charset == NULL))
 584                         continue;
 585
 586                 geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
 587                         size, charset);
 588                 utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);
 589
 590                 if (G_LIKELY(utf8_content != NULL))
 591                 {
 592                         if (used_encoding != NULL)
 593                         {
 594                                 if (G_UNLIKELY(*used_encoding != NULL))
 595                                 {
 596                                         geany_debug("%s:%d", __FILE__, __LINE__);
 597                                         g_free(*used_encoding);
 598                                 }
 599                                 *used_encoding = g_strdup(charset);
 600                         }
 601                         g_free(regex_charset);
 602                         return utf8_content;
 603                 }
 604         }
 605         g_free(regex_charset);
 606
 607         return NULL;
 608 }
 609
 610
 611 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
 612  * otherwise GEANY_ENCODING_NONE.
 613  * */
 614 GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
 615 {
 616         if (len >= 3)
 617         {
 618                 if (bom_len)
 619                         *bom_len = 3;
 620
 621                 if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
 622                         (guchar)string[2] == 0xbf)
 623                 {
 624                         return GEANY_ENCODING_UTF_8;
 625                 }
 626         }
 627         if (len >= 4)
 628         {
 629                 if (bom_len)
 630                         *bom_len = 4;
 631
 632                 if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
 633                                  (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
 634                 {
 635                         return GEANY_ENCODING_UTF_32BE; /* Big endian */
 636                 }
 637                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
 638                                  (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
 639                 {
 640                         return GEANY_ENCODING_UTF_32LE; /* Little endian */
 641                 }
 642                 if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
 643                                  (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
 644                 {
 645                          return GEANY_ENCODING_UTF_7;
 646                 }
 647         }
 648         if (len >= 2)
 649         {
 650                 if (bom_len)
 651                         *bom_len = 2;
 652
 653                 if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff)
 654                 {
 655                         return GEANY_ENCODING_UTF_16BE; /* Big endian */
 656                 }
 657                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
 658                 {
 659                         return GEANY_ENCODING_UTF_16LE; /* Little endian */
 660                 }
 661         }
 662         if (bom_len)
 663                 *bom_len = 0;
 664         return GEANY_ENCODING_NONE;
 665 }
 666
 667
 668 gboolean encodings_is_unicode_charset(const gchar *string)
 669 {
 670         if (string != NULL &&
 671                 (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
 672         {
 673                 return TRUE;
 674         }
 675         return FALSE;
 676 }
 677
 678