src/encodings.c

   1 /*
   2  *      encodings.c - this file is part of Geany, a fast and lightweight IDE
   3  *
   4  *      Copyright 2005 The Geany contributors
   5  *
   6  *      This program is free software; you can redistribute it and/or modify
   7  *      it under the terms of the GNU General Public License as published by
   8  *      the Free Software Foundation; either version 2 of the License, or
   9  *      (at your option) any later version.
  10  *
  11  *      This program is distributed in the hope that it will be useful,
  12  *      but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  *      GNU General Public License for more details.
  15  *
  16  *      You should have received a copy of the GNU General Public License along
  17  *      with this program; if not, write to the Free Software Foundation, Inc.,
  18  *      51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19  */
  20
  21 /*
  22  * Encoding conversion and Byte Order Mark (BOM) handling.
  23  */
  24
  25 /*
  26  * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
  27  * list of people on the gedit Team.
  28  * See the gedit ChangeLog files for a list of changes.
  29  */
  30  /* Stolen from anjuta */
  31
  32 #ifdef HAVE_CONFIG_H
  33 # include "config.h"
  34 #endif
  35
  36 #include "encodings.h"
  37 #include "encodingsprivate.h"
  38
  39 #include "app.h"
  40 #include "callbacks.h"
  41 #include "documentprivate.h"
  42 #include "support.h"
  43 #include "ui_utils.h"
  44 #include "utils.h"
  45
  46 #include <string.h>
  47
  48
/* Matches an HTML meta content-type declaration, e.g.
 * <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
 * Capture group 1 is the charset name (letters, digits, '_' and '-'). */
#define PATTERN_HTMLMETA "<meta\\s+http-equiv\\s*=\\s*\"?content-type\"?\\s+content\\s*=\\s*\"text/x?html;\\s*charset=([a-z0-9_-]+)\"\\s*/?>"
/* Matches an in-file encoding hint such as " geany_encoding=utf-8 " or " coding: utf-8 ".
 * Capture group 1 is the charset name (letters, digits and '-'), optionally double-quoted. */
#define PATTERN_CODING "coding[\t ]*[:=][\t ]*\"?([a-z0-9-]+)\"?[\t ]*"

/* Precompiled regexps: [0] is PATTERN_HTMLMETA, [1] is PATTERN_CODING.
 * Filled by encodings_init_headless() and released by encodings_finalize(). */
static GRegex *pregs[2];
/* TRUE once encodings_init_headless() has attempted to compile pregs[]; an entry may still
 * be NULL if its pattern failed to compile. */
static gboolean pregs_loaded = FALSE;


/* Table of all known encodings, indexed by GeanyEncodingIndex; filled by init_encodings(). */
GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
  60
  61
  62 static gboolean conversion_supported(const gchar *to, const gchar *from)
  63 {
  64         GIConv conv = g_iconv_open(to, from);
  65         if (conv == (GIConv) -1)
  66                 return FALSE;
  67
  68         g_iconv_close(conv);
  69         return TRUE;
  70 }
  71
  72
  73 #define fill(Order, Group, Idx, Charset, Name) \
  74                 encodings[Idx].idx = Idx; \
  75                 encodings[Idx].order = Order; \
  76                 encodings[Idx].group = Group; \
  77                 encodings[Idx].charset = Charset; \
  78                 encodings[Idx].name = Name; \
  79                 encodings[Idx].supported = FALSE;
  80
  81 static void init_encodings(void)
  82 {
  83         fill(0,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_14,             "ISO-8859-14",          _("Celtic"));
  84         fill(1,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_7,              "ISO-8859-7",           _("Greek"));
  85         fill(2,         WESTEUROPEAN,   GEANY_ENCODING_WINDOWS_1253,    "WINDOWS-1253",         _("Greek"));
  86         fill(3,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_10,             "ISO-8859-10",          _("Nordic"));
  87         fill(4,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_3,              "ISO-8859-3",           _("South European"));
  88         fill(5,         WESTEUROPEAN,   GEANY_ENCODING_IBM_850,                 "IBM850",                       _("Western"));
  89         fill(6,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_1,              "ISO-8859-1",           _("Western"));
  90         fill(7,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_15,             "ISO-8859-15",          _("Western"));
  91         fill(8,         WESTEUROPEAN,   GEANY_ENCODING_WINDOWS_1252,    "WINDOWS-1252",         _("Western"));
  92
  93         fill(0,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_4,              "ISO-8859-4",           _("Baltic"));
  94         fill(1,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_13,             "ISO-8859-13",          _("Baltic"));
  95         fill(2,         EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1257,    "WINDOWS-1257",         _("Baltic"));
  96         fill(3,         EASTEUROPEAN,   GEANY_ENCODING_IBM_852,                 "IBM852",                       _("Central European"));
  97         fill(4,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_2,              "ISO-8859-2",           _("Central European"));
  98         fill(5,         EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1250,    "WINDOWS-1250",         _("Central European"));
  99         fill(6,         EASTEUROPEAN,   GEANY_ENCODING_IBM_855,                 "IBM855",                       _("Cyrillic"));
 100         fill(7,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_5,              "ISO-8859-5",           _("Cyrillic"));
 101         /* ISO-IR-111 not available on Windows */
 102         fill(8,         EASTEUROPEAN,   GEANY_ENCODING_ISO_IR_111,              "ISO-IR-111",           _("Cyrillic"));
 103         fill(9,         EASTEUROPEAN,   GEANY_ENCODING_KOI8_R,                  "KOI8-R",                       _("Cyrillic"));
 104         fill(10,        EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1251,    "WINDOWS-1251",         _("Cyrillic"));
 105         fill(11,        EASTEUROPEAN,   GEANY_ENCODING_CP_866,                  "CP866",                        _("Cyrillic/Russian"));
 106         fill(12,        EASTEUROPEAN,   GEANY_ENCODING_KOI8_U,                  "KOI8-U",                       _("Cyrillic/Ukrainian"));
 107         fill(13,        EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_16,             "ISO-8859-16",          _("Romanian"));
 108
 109         fill(0,         MIDDLEEASTERN,  GEANY_ENCODING_IBM_864,                 "IBM864",                       _("Arabic"));
 110         fill(1,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_6,              "ISO-8859-6",           _("Arabic"));
 111         fill(2,         MIDDLEEASTERN,  GEANY_ENCODING_WINDOWS_1256,    "WINDOWS-1256",         _("Arabic"));
 112         fill(3,         MIDDLEEASTERN,  GEANY_ENCODING_IBM_862,                 "IBM862",                       _("Hebrew"));
 113         /* not available at all, ? */
 114         fill(4,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_8_I,    "ISO-8859-8-I",         _("Hebrew"));
 115         fill(5,         MIDDLEEASTERN,  GEANY_ENCODING_WINDOWS_1255,    "WINDOWS-1255",         _("Hebrew"));
 116         fill(6,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_8,              "ISO-8859-8",           _("Hebrew Visual"));
 117
 118         fill(0,         ASIAN,                  GEANY_ENCODING_ARMSCII_8,               "ARMSCII-8",            _("Armenian"));
 119         fill(1,         ASIAN,                  GEANY_ENCODING_GEOSTD8,                 "GEORGIAN-ACADEMY",     _("Georgian"));
 120         fill(2,         ASIAN,                  GEANY_ENCODING_TIS_620,                 "TIS-620",                      _("Thai"));
 121         fill(3,         ASIAN,                  GEANY_ENCODING_IBM_857,                 "IBM857",                       _("Turkish"));
 122         fill(4,         ASIAN,                  GEANY_ENCODING_WINDOWS_1254,    "WINDOWS-1254",         _("Turkish"));
 123         fill(5,         ASIAN,                  GEANY_ENCODING_ISO_8859_9,              "ISO-8859-9",           _("Turkish"));
 124         fill(6,         ASIAN,                  GEANY_ENCODING_TCVN,                    "TCVN",                         _("Vietnamese"));
 125         fill(7,         ASIAN,                  GEANY_ENCODING_VISCII,                  "VISCII",                       _("Vietnamese"));
 126         fill(8,         ASIAN,                  GEANY_ENCODING_WINDOWS_1258,    "WINDOWS-1258",         _("Vietnamese"));
 127
 128         fill(0,         UNICODE,                GEANY_ENCODING_UTF_7,                   "UTF-7",                        _("Unicode"));
 129         fill(1,         UNICODE,                GEANY_ENCODING_UTF_8,                   "UTF-8",                        _("Unicode"));
 130         fill(2,         UNICODE,                GEANY_ENCODING_UTF_16LE,                "UTF-16LE",                     _("Unicode"));
 131         fill(3,         UNICODE,                GEANY_ENCODING_UTF_16BE,                "UTF-16BE",                     _("Unicode"));
 132         fill(4,         UNICODE,                GEANY_ENCODING_UCS_2LE,                 "UCS-2LE",                      _("Unicode"));
 133         fill(5,         UNICODE,                GEANY_ENCODING_UCS_2BE,                 "UCS-2BE",                      _("Unicode"));
 134         fill(6,         UNICODE,                GEANY_ENCODING_UTF_32LE,                "UTF-32LE",                     _("Unicode"));
 135         fill(7,         UNICODE,                GEANY_ENCODING_UTF_32BE,                "UTF-32BE",                     _("Unicode"));
 136
 137         fill(0,         EASTASIAN,              GEANY_ENCODING_GB18030,                 "GB18030",                      _("Chinese Simplified"));
 138         fill(1,         EASTASIAN,              GEANY_ENCODING_GB2312,                  "GB2312",                       _("Chinese Simplified"));
 139         fill(2,         EASTASIAN,              GEANY_ENCODING_GBK,                             "GBK",                          _("Chinese Simplified"));
 140         /* maybe not available on Linux */
 141         fill(3,         EASTASIAN,              GEANY_ENCODING_HZ,                              "HZ",                           _("Chinese Simplified"));
 142         fill(4,         EASTASIAN,              GEANY_ENCODING_BIG5,                    "BIG5",                         _("Chinese Traditional"));
 143         fill(5,         EASTASIAN,              GEANY_ENCODING_BIG5_HKSCS,              "BIG5-HKSCS",           _("Chinese Traditional"));
 144         fill(6,         EASTASIAN,              GEANY_ENCODING_EUC_TW,                  "EUC-TW",                       _("Chinese Traditional"));
 145         fill(7,         EASTASIAN,              GEANY_ENCODING_EUC_JP,                  "EUC-JP",                       _("Japanese"));
 146         fill(8,         EASTASIAN,              GEANY_ENCODING_ISO_2022_JP,             "ISO-2022-JP",          _("Japanese"));
 147         fill(9,         EASTASIAN,              GEANY_ENCODING_SHIFT_JIS,               "SHIFT_JIS",            _("Japanese"));
 148         fill(10,        EASTASIAN,              GEANY_ENCODING_CP_932,                  "CP932",                        _("Japanese"));
 149         fill(11,        EASTASIAN,              GEANY_ENCODING_EUC_KR,                  "EUC-KR",                       _("Korean"));
 150         fill(12,        EASTASIAN,              GEANY_ENCODING_ISO_2022_KR,             "ISO-2022-KR",          _("Korean"));
 151         fill(13,        EASTASIAN,              GEANY_ENCODING_JOHAB,                   "JOHAB",                        _("Korean"));
 152         fill(14,        EASTASIAN,              GEANY_ENCODING_UHC,                             "UHC",                          _("Korean"));
 153
 154         fill(0,         NONE,                   GEANY_ENCODING_NONE,                    "None",                         _("Without encoding"));
 155
 156         /* fill the flags member */
 157         for (guint i = 0; i < G_N_ELEMENTS(encodings); i++)
 158         {
 159                 if (i == GEANY_ENCODING_NONE || conversion_supported("UTF-8", encodings[i].charset))
 160                         encodings[i].supported = TRUE;
 161                 else
 162                 {
 163                         /* geany_debug() doesn't really work at this point, unless G_MESSAGES_DEBUG
 164                          * is set explicitly by the caller, but that's better than nothing */
 165                         geany_debug("Encoding %s is not supported by the system", encodings[i].charset);
 166                 }
 167         }
 168 }
 169
 170
 171 /* compares two encoding names in a permissive fashion.
 172  * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
 173 static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
 174 {
 175         gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */
 176         gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */
 177
 178         while (*a && *b)
 179         {
 180                 gboolean is_alpha;
 181
 182                 if (g_ascii_toupper(*a) == g_ascii_toupper(*b) &&
 183                         ((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a)))
 184                 {
 185                         /* either there was a real separator, or we need a implicit one (a chage from alpha to
 186                          * numeric or so) */
 187                         if (! need_sep || (was_alpha != is_alpha))
 188                         {
 189                                 a++;
 190                                 b++;
 191                                 was_alpha = is_alpha;
 192                                 need_sep = FALSE;
 193                         }
 194                         else
 195                                 return FALSE;
 196                 }
 197                 else
 198                 {
 199                         guint n_sep = 0;
 200
 201                         if (! g_ascii_isalnum(*a))
 202                         {
 203                                 a++;
 204                                 n_sep++;
 205                         }
 206                         if (! g_ascii_isalnum(*b))
 207                         {
 208                                 b++;
 209                                 n_sep++;
 210                         }
 211                         if (n_sep < 1)
 212                                 return FALSE;
 213                         else if (n_sep < 2)
 214                                 need_sep = TRUE;
 215                 }
 216         }
 217         return *a == *b;
 218 }
 219
 220
 221 GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
 222 {
 223         if (charset == NULL)
 224                 return GEANY_ENCODING_UTF_8;
 225
 226         for (gint i = 0; i < GEANY_ENCODINGS_MAX; i++)
 227         {
 228                 if (encodings_charset_equals(charset, encodings[i].charset))
 229                         return i;
 230         }
 231         return GEANY_ENCODING_UTF_8;
 232 }
 233
 234
 235 const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
 236 {
 237         if (charset == NULL)
 238                 return &encodings[GEANY_ENCODING_UTF_8];
 239
 240         for (gint i = 0; i < GEANY_ENCODINGS_MAX; i++)
 241         {
 242                 if (encodings_charset_equals(charset, encodings[i].charset))
 243                         return &encodings[i];
 244         }
 245
 246         return NULL;
 247 }
 248
 249
 250 static const gchar *encodings_normalize_charset(const gchar *charset)
 251 {
 252         const GeanyEncoding *encoding;
 253
 254         encoding = encodings_get_from_charset(charset);
 255         if (encoding != NULL)
 256                 return encoding->charset;
 257
 258         return NULL;
 259 }
 260
 261
 262 const GeanyEncoding *encodings_get_from_index(gint idx)
 263 {
 264         g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
 265
 266         return &encodings[idx];
 267 }
 268
 269
 270 /**
 271  *  Gets the character set name of the specified index e.g. for use with
 272  *  @ref document_set_encoding().
 273  *
 274  *  @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
 275  *
 276  *
 277  *  @return @nullable The charset according to idx, or @c NULL if the index is invalid.
 278  *
 279  *  @since 0.13
 280  **/
 281 GEANY_API_SYMBOL
 282 const gchar* encodings_get_charset_from_index(gint idx)
 283 {
 284         g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
 285
 286         return encodings[idx].charset;
 287 }
 288
 289
 290 gchar *encodings_to_string(const GeanyEncoding* enc)
 291 {
 292         g_return_val_if_fail(enc != NULL, NULL);
 293         g_return_val_if_fail(enc->name != NULL, NULL);
 294         g_return_val_if_fail(enc->charset != NULL, NULL);
 295
 296         if (enc->idx == GEANY_ENCODING_NONE)
 297                 return g_strdup(enc->name); // enc->charset is "None" and would be useless to display
 298         else
 299                 return g_strdup_printf("%s (%s)", enc->name, enc->charset);
 300 }
 301
 302
 303 const gchar *encodings_get_charset(const GeanyEncoding* enc)
 304 {
 305         g_return_val_if_fail(enc != NULL, NULL);
 306         g_return_val_if_fail(enc->charset != NULL, NULL);
 307
 308         return enc->charset;
 309 }
 310
 311
 312 static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];
 313
 314
 315 void encodings_select_radio_item(const gchar *charset)
 316 {
 317         gint i;
 318
 319         g_return_if_fail(charset != NULL);
 320
 321         for (i = 0; i < GEANY_ENCODINGS_MAX; i++)
 322         {
 323                 if (utils_str_equal(charset, encodings[i].charset))
 324                         break;
 325         }
 326         if (i == GEANY_ENCODINGS_MAX)
 327                 i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
 328
 329         /* ignore_callback has to be set by the caller */
 330         gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
 331 }
 332
 333
 334 /* Regexp detection of file encoding declared in the file itself.
 335  * Idea and parts of code taken from Bluefish, thanks.
 336  * regex_compile() is used to compile regular expressions on program init and keep it in memory
 337  * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
 338  */
 339 static GRegex *regex_compile(const gchar *pattern)
 340 {
 341         GError *error = NULL;
 342         GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS | G_REGEX_RAW, 0, &error);
 343
 344         if (!regex)
 345         {
 346                 geany_debug("Failed to compile encoding regex (%s)", error->message);
 347                 g_error_free(error);
 348         }
 349         return regex;
 350 }
 351
 352
 353 static gchar *regex_match(GRegex *preg, const gchar *buffer, gsize size)
 354 {
 355         gchar *encoding = NULL;
 356         GMatchInfo *minfo;
 357
 358         if (G_UNLIKELY(! pregs_loaded || buffer == NULL))
 359                 return NULL;
 360
 361         /* scan only the first 512 characters in the buffer */
 362         size = MIN(size, 512);
 363
 364         if (g_regex_match_full(preg, buffer, size, 0, 0, &minfo, NULL) &&
 365                 g_match_info_get_match_count(minfo) >= 2)
 366         {
 367                 encoding = g_match_info_fetch(minfo, 1);
 368                 geany_debug("Detected encoding by regex search: %s", encoding);
 369
 370                 SETPTR(encoding, g_utf8_strup(encoding, -1));
 371         }
 372         g_match_info_free(minfo);
 373         return encoding;
 374 }
 375
 376
 377 static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data)
 378 {
 379         GeanyDocument *doc = document_get_current();
 380         const gchar *charset = user_data;
 381
 382         if (ignore_callback || doc == NULL || charset == NULL ||
 383                 ! gtk_check_menu_item_get_active(menuitem) ||
 384                 utils_str_equal(charset, doc->encoding))
 385                 return;
 386
 387         if (doc->readonly)
 388         {
 389                 utils_beep();
 390                 return;
 391         }
 392         document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding));
 393
 394         document_set_encoding(doc, charset);
 395 }
 396
 397 static void encodings_reload_radio_item_change_cb(GtkMenuItem *menuitem, gpointer user_data)
 398 {
 399         GeanyDocument *doc = document_get_current();
 400
 401         g_return_if_fail(doc != NULL);
 402
 403         document_reload_prompt(doc, user_data);
 404 }
 405
 406
 407 void encodings_finalize(void)
 408 {
 409         if (pregs_loaded)
 410         {
 411                 guint i, len;
 412                 len = G_N_ELEMENTS(pregs);
 413                 for (i = 0; i < len; i++)
 414                 {
 415                         g_regex_unref(pregs[i]);
 416                 }
 417         }
 418 }
 419
 420
 421 /* initialization of non-UI parts */
 422 void encodings_init_headless(void)
 423 {
 424         static gboolean initialized = FALSE;
 425
 426         if (initialized)
 427                 return;
 428
 429         init_encodings();
 430
 431         if (! pregs_loaded)
 432         {
 433                 pregs[0] = regex_compile(PATTERN_HTMLMETA);
 434                 pregs[1] = regex_compile(PATTERN_CODING);
 435                 pregs_loaded = TRUE;
 436         }
 437
 438         initialized = TRUE;
 439 }
 440
 441
 442 void encodings_init(void)
 443 {
 444         GtkWidget *menu[2];
 445         GCallback cb_func[2];
 446         const gchar *const groups[GEANY_ENCODING_GROUPS_MAX] =
 447         {
 448                 [NONE]                  = NULL,
 449                 [WESTEUROPEAN]  = N_("_West European"),
 450                 [EASTEUROPEAN]  = N_("_East European"),
 451                 [EASTASIAN]             = N_("East _Asian"),
 452                 [ASIAN]                 = N_("_SE & SW Asian"),
 453                 [MIDDLEEASTERN] = N_("_Middle Eastern"),
 454                 [UNICODE]               = N_("_Unicode"),
 455         };
 456
 457         encodings_init_headless();
 458
 459         /* create encodings submenu in document menu */
 460         menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
 461         menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
 462         cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
 463         cb_func[1] = G_CALLBACK(encodings_reload_radio_item_change_cb);
 464
 465         for (guint k = 0; k < 2; k++)
 466         {
 467                 GSList *group = NULL;
 468                 GtkWidget *submenus[GEANY_ENCODING_GROUPS_MAX];
 469                 gint orders[GEANY_ENCODING_GROUPS_MAX] = { 0 };
 470                 guint n_added = 0;
 471
 472                 for (guint i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
 473                 {
 474                         if (! groups[i]) /* NONE */
 475                                 submenus[i] = menu[k];
 476                         else
 477                         {
 478                                 GtkWidget *item = gtk_menu_item_new_with_mnemonic(_(groups[i]));
 479                                 submenus[i] = gtk_menu_new();
 480                                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item), submenus[i]);
 481                                 gtk_container_add(GTK_CONTAINER(menu[k]), item);
 482                                 gtk_widget_show_all(item);
 483                         }
 484                 }
 485
 486                 /** TODO can it be optimized? ATM 882 runs at line "if (encodings[i].order ...)" */
 487                 do
 488                 {
 489                         for (guint i = 0; i < G_N_ELEMENTS(encodings); i++)
 490                         {
 491                                 if (encodings[i].order == orders[encodings[i].group])
 492                                 {
 493                                         GtkWidget *item;
 494                                         gchar *label = encodings_to_string(&encodings[i]);
 495
 496                                         if (k == 0) /* Set Encoding menu */
 497                                         {
 498                                                 item = gtk_radio_menu_item_new_with_label(group, label);
 499                                                 group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
 500                                                 radio_items[i] = item;
 501                                         }
 502                                         else
 503                                                 item = gtk_menu_item_new_with_label(label);
 504                                         if (encodings[i].supported)
 505                                                 gtk_widget_show(item);
 506                                         gtk_container_add(GTK_CONTAINER(submenus[encodings[i].group]), item);
 507                                         g_signal_connect(item, "activate", cb_func[k],
 508                                                         (gpointer) encodings[i].charset);
 509                                         g_free(label);
 510
 511                                         orders[encodings[i].group]++;
 512                                         n_added++;
 513                                 }
 514                         }
 515                 }
 516                 while (n_added < G_N_ELEMENTS(encodings));
 517         }
 518 }
 519
 520
 521 static gint encoding_combo_store_sort_func(GtkTreeModel *model,
 522                                                                                    GtkTreeIter *a,
 523                                                                                    GtkTreeIter *b,
 524                                                                                    gpointer data)
 525 {
 526         gboolean a_has_child = gtk_tree_model_iter_has_child(model, a);
 527         gboolean b_has_child = gtk_tree_model_iter_has_child(model, b);
 528         gchar *a_string;
 529         gchar *b_string;
 530         gint cmp_res;
 531
 532         if (a_has_child != b_has_child)
 533                 return a_has_child ? -1 : 1;
 534
 535         gtk_tree_model_get(model, a, 1, &a_string, -1);
 536         gtk_tree_model_get(model, b, 1, &b_string, -1);
 537         cmp_res = strcmp(a_string, b_string);
 538         g_free(a_string);
 539         g_free(b_string);
 540         return cmp_res;
 541 }
 542
 543
 544 GtkTreeStore *encodings_encoding_store_new(gboolean has_detect)
 545 {
 546         GtkTreeStore *store;
 547         GtkTreeIter iter_current, iter_westeuro, iter_easteuro, iter_eastasian,
 548                                 iter_asian, iter_utf8, iter_middleeast;
 549         GtkTreeIter *iter_parent;
 550         gint i;
 551
 552         store = gtk_tree_store_new(2, G_TYPE_INT, G_TYPE_STRING);
 553
 554         if (has_detect)
 555         {
 556                 gtk_tree_store_append(store, &iter_current, NULL);
 557                 gtk_tree_store_set(store, &iter_current, 0, GEANY_ENCODINGS_MAX, 1, _("Detect from file"), -1);
 558         }
 559
 560         gtk_tree_store_append(store, &iter_westeuro, NULL);
 561         gtk_tree_store_set(store, &iter_westeuro, 0, -1, 1, _("West European"), -1);
 562         gtk_tree_store_append(store, &iter_easteuro, NULL);
 563         gtk_tree_store_set(store, &iter_easteuro, 0, -1, 1, _("East European"), -1);
 564         gtk_tree_store_append(store, &iter_eastasian, NULL);
 565         gtk_tree_store_set(store, &iter_eastasian, 0, -1, 1, _("East Asian"), -1);
 566         gtk_tree_store_append(store, &iter_asian, NULL);
 567         gtk_tree_store_set(store, &iter_asian, 0, -1, 1, _("SE & SW Asian"), -1);
 568         gtk_tree_store_append(store, &iter_middleeast, NULL);
 569         gtk_tree_store_set(store, &iter_middleeast, 0, -1, 1, _("Middle Eastern"), -1);
 570         gtk_tree_store_append(store, &iter_utf8, NULL);
 571         gtk_tree_store_set(store, &iter_utf8, 0, -1, 1, _("Unicode"), -1);
 572
 573         for (i = 0; i < GEANY_ENCODINGS_MAX; i++)
 574         {
 575                 gchar *encoding_string;
 576
 577                 if (! encodings[i].supported)
 578                         continue;
 579
 580                 switch (encodings[i].group)
 581                 {
 582                         case WESTEUROPEAN: iter_parent = &iter_westeuro; break;
 583                         case EASTEUROPEAN: iter_parent = &iter_easteuro; break;
 584                         case EASTASIAN: iter_parent = &iter_eastasian; break;
 585                         case ASIAN: iter_parent = &iter_asian; break;
 586                         case MIDDLEEASTERN: iter_parent = &iter_middleeast; break;
 587                         case UNICODE: iter_parent = &iter_utf8; break;
 588                         case NONE:
 589                         default: iter_parent = NULL;
 590                 }
 591                 gtk_tree_store_append(store, &iter_current, iter_parent);
 592                 encoding_string = encodings_to_string(&encodings[i]);
 593                 gtk_tree_store_set(store, &iter_current, 0, i, 1, encoding_string, -1);
 594                 g_free(encoding_string);
 595         }
 596
 597         gtk_tree_sortable_set_sort_column_id(GTK_TREE_SORTABLE(store), 1, GTK_SORT_ASCENDING);
 598         gtk_tree_sortable_set_sort_func(GTK_TREE_SORTABLE(store), 1, encoding_combo_store_sort_func, NULL, NULL);
 599
 600         return store;
 601 }
 602
 603
 604 gint encodings_encoding_store_get_encoding(GtkTreeStore *store, GtkTreeIter *iter)
 605 {
 606         gint enc;
 607         gtk_tree_model_get(GTK_TREE_MODEL(store), iter, 0, &enc, -1);
 608         return enc;
 609 }
 610
 611
 612 gboolean encodings_encoding_store_get_iter(GtkTreeStore *store, GtkTreeIter *iter, gint enc)
 613 {
 614         if (gtk_tree_model_get_iter_first(GTK_TREE_MODEL(store), iter))
 615         {
 616                 do
 617                 {
 618                         if (encodings_encoding_store_get_encoding(store, iter) == enc)
 619                                 return TRUE;
 620                 }
 621                 while (ui_tree_model_iter_any_next(GTK_TREE_MODEL(store), iter, TRUE));
 622         }
 623         return FALSE;
 624 }
 625
 626
 627 void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout,
 628                                                                                          GtkCellRenderer *cell,
 629                                                                                          GtkTreeModel *tree_model,
 630                                                                                          GtkTreeIter *iter,
 631                                                                                          gpointer data)
 632 {
 633         gboolean sensitive = !gtk_tree_model_iter_has_child(tree_model, iter);
 634         gchar *text;
 635
 636         gtk_tree_model_get(tree_model, iter, 1, &text, -1);
 637         g_object_set(cell, "sensitive", sensitive, "text", text, NULL);
 638         g_free(text);
 639 }
 640
 641
 642 static gchar *convert_to_utf8_from_charset(const gchar *buffer, gssize size,
 643                                                                                    const gchar *charset, gboolean fast,
 644                                                                                    gsize *utf8_size, GError **error)
 645 {
 646         gchar *utf8_content = NULL;
 647         GError *conv_error = NULL;
 648         gchar* converted_contents = NULL;
 649         gsize bytes_written;
 650
 651         g_return_val_if_fail(buffer != NULL, NULL);
 652         g_return_val_if_fail(charset != NULL, NULL);
 653
 654         converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
 655                                                                    &bytes_written, &conv_error);
 656
 657         if (fast)
 658         {
 659                 utf8_content = converted_contents;
 660                 if (conv_error != NULL) g_propagate_error(error, conv_error);
 661         }
 662         else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
 663         {
 664                 if (conv_error != NULL)
 665                 {
 666                         geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
 667                         g_propagate_error(error, conv_error);
 668                         conv_error = NULL;
 669                 }
 670                 else
 671                 {
 672                         geany_debug("Couldn't convert from %s to UTF-8.", charset);
 673                         g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 674                                         _("Data contains NULs"));
 675                 }
 676
 677                 utf8_content = NULL;
 678                 g_free(converted_contents);
 679         }
 680         else
 681         {
 682                 geany_debug("Converted from %s to UTF-8.", charset);
 683                 utf8_content = converted_contents;
 684         }
 685
 686         if (utf8_content && utf8_size)
 687                 *utf8_size = bytes_written;
 688
 689         return utf8_content;
 690 }
 691
 692
 693 /**
 694  *  Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
 695  *  If @a fast is not set, additional checks to validate the converted string are performed.
 696  *
 697  *  @param buffer The input string to convert.
 698  *  @param size The length of the string, or -1 if the string is nul-terminated.
 699  *  @param charset The charset to be used for conversion.
 700  *  @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
 701  *
 702  *  @return If the conversion was successful, a newly allocated nul-terminated string,
 703  *    which must be freed with @c g_free(). Otherwise @c NULL.
 704  **/
 705 GEANY_API_SYMBOL
 706 gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size,
 707                                                                                           const gchar *charset, gboolean fast)
 708 {
 709         /* If fast=FALSE, we can safely ignore the size as the output cannot contain NULs.
 710          * Otherwise, the caller already agrees on partial data anyway. */
 711         return convert_to_utf8_from_charset(buffer, size, charset, fast, NULL, NULL);
 712 }
 713
 714
 715 static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
 716 {
 717         guint i;
 718
 719         for (i = 0; i < G_N_ELEMENTS(pregs); i++)
 720         {
 721                 gchar *charset;
 722
 723                 if ((charset = regex_match(pregs[i], buffer, size)) != NULL)
 724                         return charset;
 725         }
 726         return NULL;
 727 }
 728
 729
 730 static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gssize size,
 731                 const gchar *suggested_charset, gchar **used_encoding, gsize *utf8_size, GError **error)
 732 {
 733         const gchar *locale_charset = NULL;
 734         const gchar *charset;
 735         gchar *utf8_content;
 736         gboolean check_suggestion = suggested_charset != NULL;
 737         gboolean check_locale = FALSE;
 738         gint i, preferred_charset;
 739
 740         if (size == -1)
 741         {
 742                 size = strlen(buffer);
 743         }
 744
 745         /* current locale is not UTF-8, we have to check this charset */
 746         check_locale = ! g_get_charset(&locale_charset);
 747
 748         /* First check for preferred charset, if specified */
 749         preferred_charset = file_prefs.default_open_encoding;
 750
 751         if (preferred_charset == (gint) encodings[GEANY_ENCODING_NONE].idx ||
 752                 preferred_charset < 0 ||
 753                 preferred_charset >= GEANY_ENCODINGS_MAX)
 754         {
 755                 preferred_charset = -1;
 756         }
 757
 758         /* -1 means "Preferred charset" */
 759         for (i = -1; i < GEANY_ENCODINGS_MAX; i++)
 760         {
 761                 if (G_UNLIKELY(i == (gint) encodings[GEANY_ENCODING_NONE].idx))
 762                         continue;
 763
 764                 if (check_suggestion)
 765                 {
 766                         check_suggestion = FALSE;
 767                         charset = encodings_normalize_charset(suggested_charset);
 768                         if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
 769                                 charset = suggested_charset;
 770                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 771                 }
 772                 else if (check_locale)
 773                 {
 774                         check_locale = FALSE;
 775                         charset = locale_charset;
 776                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 777                 }
 778                 else if (i == -1)
 779                 {
 780                         if (preferred_charset >= 0)
 781                         {
 782                                 charset = encodings[preferred_charset].charset;
 783                                 geany_debug("Using preferred charset: %s", charset);
 784                         }
 785                         else
 786                                 continue;
 787                 }
 788                 else if (i >= 0 && encodings[i].supported)
 789                         charset = encodings[i].charset;
 790                 else /* in this case we have i == -2, continue to increase i and go ahead */
 791                         continue;
 792
 793                 if (G_UNLIKELY(charset == NULL))
 794                         continue;
 795
 796                 geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
 797                         size, charset);
 798                 utf8_content = convert_to_utf8_from_charset(buffer, size, charset, FALSE, utf8_size, NULL);
 799
 800                 if (G_LIKELY(utf8_content != NULL))
 801                 {
 802                         if (used_encoding != NULL)
 803                         {
 804                                 if (G_UNLIKELY(*used_encoding != NULL))
 805                                 {
 806                                         geany_debug("%s:%d", __FILE__, __LINE__);
 807                                         g_free(*used_encoding);
 808                                 }
 809                                 *used_encoding = g_strdup(charset);
 810                         }
 811                         return utf8_content;
 812                 }
 813         }
 814
 815         g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 816                         _("Data contains NULs or the encoding is not supported"));
 817
 818         return NULL;
 819 }
 820
 821
 822 /**
 823  *  Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
 824  *  @a used_encoding.
 825  *
 826  *  @param buffer the input string to convert.
 827  *  @param size the length of the string, or -1 if the string is nul-terminated.
 828  *  @param used_encoding @out @optional return location of the detected encoding of the input string, or @c NULL.
 829  *
 830  *  @return @nullable If the conversion was successful, a newly allocated nul-terminated string,
 831  *    which must be freed with @c g_free(). Otherwise @c NULL.
 832  **/
 833 GEANY_API_SYMBOL
 834 gchar *encodings_convert_to_utf8(const gchar *buffer, gssize size, gchar **used_encoding)
 835 {
 836         gchar *regex_charset;
 837         gchar *utf8;
 838
 839         /* first try to read the encoding from the file content */
 840         regex_charset = encodings_check_regexes(buffer, size);
 841         /* we know this cannot succeed if there are NULs in the output, so ignoring the size is OK */
 842         utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding, NULL, NULL);
 843         g_free(regex_charset);
 844
 845         return utf8;
 846 }
 847
 848
 849 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
 850  * otherwise GEANY_ENCODING_NONE.
 851  * */
 852 GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
 853 {
 854         if (len >= 3)
 855         {
 856                 if (bom_len)
 857                         *bom_len = 3;
 858
 859                 if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
 860                         (guchar)string[2] == 0xbf)
 861                 {
 862                         return GEANY_ENCODING_UTF_8;
 863                 }
 864         }
 865         if (len >= 4)
 866         {
 867                 if (bom_len)
 868                         *bom_len = 4;
 869
 870                 if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
 871                                  (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
 872                 {
 873                         return GEANY_ENCODING_UTF_32BE; /* Big endian */
 874                 }
 875                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
 876                                  (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
 877                 {
 878                         return GEANY_ENCODING_UTF_32LE; /* Little endian */
 879                 }
 880                 if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
 881                                  (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
 882                 {
 883                          return GEANY_ENCODING_UTF_7;
 884                 }
 885         }
 886         if (len >= 2)
 887         {
 888                 if (bom_len)
 889                         *bom_len = 2;
 890
 891                 if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff)
 892                 {
 893                         return GEANY_ENCODING_UTF_16BE; /* Big endian */
 894                 }
 895                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
 896                 {
 897                         return GEANY_ENCODING_UTF_16LE; /* Little endian */
 898                 }
 899         }
 900         if (bom_len)
 901                 *bom_len = 0;
 902         return GEANY_ENCODING_NONE;
 903 }
 904
 905
 906 gboolean encodings_is_unicode_charset(const gchar *string)
 907 {
 908         if (string != NULL &&
 909                 (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
 910         {
 911                 return TRUE;
 912         }
 913         return FALSE;
 914 }
 915
 916
 917 typedef struct
 918 {
 919         gchar           *data;  /* null-terminated data */
 920         gsize            size;  /* actual data size */
 921         gchar           *enc;
 922         gboolean         bom;
 923 } BufferData;
 924
 925
 926 /* convert data with the specified encoding */
 927 static gboolean
 928 handle_forced_encoding(BufferData *buffer, const gchar *forced_enc, GError **error)
 929 {
 930         GeanyEncodingIndex enc_idx;
 931
 932         if (utils_str_equal(forced_enc, "UTF-8"))
 933         {
 934                 if (! g_utf8_validate(buffer->data, buffer->size, NULL))
 935                 {
 936                         g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 937                                         _("Data contains NULs or is not valid UTF-8"));
 938                         return FALSE;
 939                 }
 940         }
 941         else
 942         {
 943                 gchar *converted_text = convert_to_utf8_from_charset(
 944                                                                                 buffer->data, buffer->size, forced_enc, FALSE, &buffer->size, error);
 945                 if (converted_text == NULL)
 946                 {
 947                         return FALSE;
 948                 }
 949                 else
 950                 {
 951                         SETPTR(buffer->data, converted_text);
 952                 }
 953         }
 954         enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
 955         buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);
 956         buffer->enc = g_strdup(forced_enc);
 957         return TRUE;
 958 }
 959
 960
 961 /* detect encoding and convert to UTF-8 if necessary */
 962 static gboolean
 963 handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx, GError **error)
 964 {
 965         g_return_val_if_fail(buffer->enc == NULL, FALSE);
 966         g_return_val_if_fail(buffer->bom == FALSE, FALSE);
 967
 968         if (buffer->size == 0)
 969         {
 970                 /* we have no data so assume UTF-8 */
 971                 buffer->enc = g_strdup("UTF-8");
 972         }
 973         else
 974         {
 975                 /* first check for a BOM */
 976                 if (enc_idx != GEANY_ENCODING_NONE)
 977                 {
 978                         buffer->enc = g_strdup(encodings[enc_idx].charset);
 979                         buffer->bom = TRUE;
 980
 981                         if (enc_idx == GEANY_ENCODING_UTF_8)
 982                         {
 983                                 if (! g_utf8_validate(buffer->data, buffer->size, NULL))
 984                                 {
 985                                         /* this is not actually valid UTF-8 */
 986                                         SETPTR(buffer->enc, NULL);
 987                                         buffer->bom = FALSE;
 988                                 }
 989                         }
 990                         else /* the BOM indicated something else than UTF-8 */
 991                         {
 992                                 gchar *converted_text = convert_to_utf8_from_charset(
 993                                                                                 buffer->data, buffer->size, buffer->enc, FALSE, &buffer->size, NULL);
 994                                 if (converted_text != NULL)
 995                                 {
 996                                         SETPTR(buffer->data, converted_text);
 997                                 }
 998                                 else
 999                                 {
1000                                         /* there was a problem converting data from BOM encoding type */
1001                                         SETPTR(buffer->enc, NULL);
1002                                         buffer->bom = FALSE;
1003                                 }
1004                         }
1005                 }
1006
1007                 if (buffer->enc == NULL)        /* either there was no BOM or the BOM encoding failed */
1008                 {
1009                         /* first try to read the encoding from the file content */
1010                         gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
1011
1012                         /* try UTF-8 first */
1013                         if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
1014                                 g_utf8_validate(buffer->data, buffer->size, NULL))
1015                         {
1016                                 buffer->enc = g_strdup("UTF-8");
1017                         }
1018                         else
1019                         {
1020                                 /* detect the encoding */
1021                                 gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
1022                                         buffer->size, regex_charset, &buffer->enc, &buffer->size, error);
1023
1024                                 if (converted_text == NULL)
1025                                 {
1026                                         g_free(regex_charset);
1027                                         return FALSE;
1028                                 }
1029                                 SETPTR(buffer->data, converted_text);
1030                         }
1031                         g_free(regex_charset);
1032                 }
1033         }
1034         return TRUE;
1035 }
1036
1037
1038 static void
1039 handle_bom(BufferData *buffer)
1040 {
1041         guint bom_len;
1042
1043         encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
1044         g_return_if_fail(bom_len != 0);
1045
1046         /* the contents are already converted into UTF-8 here */
1047         buffer->size -= bom_len;
1048         /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
1049         memmove(buffer->data, buffer->data + bom_len, buffer->size + 1);
1050         buffer->data = g_realloc(buffer->data, buffer->size + 1);
1051 }
1052
1053
1054 /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
1055 static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc, GError **error)
1056 {
1057         GeanyEncodingIndex tmp_enc_idx;
1058
1059         /* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
1060          * if we have a BOM */
1061         tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
1062
1063         /* Determine character encoding and convert to UTF-8 */
1064         if (forced_enc != NULL)
1065         {
1066                 /* the encoding should be ignored(requested by user), so open the file "as it is" */
1067                 if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
1068                 {
1069                         buffer->bom = FALSE;
1070                         buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
1071                 }
1072                 else if (! handle_forced_encoding(buffer, forced_enc, error))
1073                 {
1074                         return FALSE;
1075                 }
1076         }
1077         else if (! handle_encoding(buffer, tmp_enc_idx, error))
1078         {
1079                 return FALSE;
1080         }
1081
1082         if (buffer->bom)
1083                 handle_bom(buffer);
1084         return TRUE;
1085 }
1086
1087
1088 /*
1089  * Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
1090  * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
1091  *
1092  * @param buf a pointer to modifiable null-terminated buffer to convert.
1093  *   It may or may not be modified, and should be freed whatever happens.
1094  * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
1095  *   file size). It will be updated to the new size.
1096  * @param forced_enc forced encoding to use, or @c NULL
1097  * @param used_encoding return location for the actually used encoding, or @c NULL
1098  * @param has_bom return location to store whether the data had a BOM, or @c NULL
1099  * @param has_nuls return location to store whether the converted data contains NULs, or @c NULL
1100  *
1101  * @return @C TRUE if the conversion succeeded, @c FALSE otherwise.
1102  */
1103 GEANY_EXPORT_SYMBOL
1104 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
1105                 gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls, GError **error)
1106 {
1107         BufferData buffer;
1108
1109         buffer.data = *buf;
1110         buffer.size = *size;
1111         buffer.enc = NULL;
1112         buffer.bom = FALSE;
1113
1114         if (! handle_buffer(&buffer, forced_enc, error))
1115                 return FALSE;
1116
1117         *size = buffer.size;
1118         if (used_encoding)
1119                 *used_encoding = buffer.enc;
1120         else
1121                 g_free(buffer.enc);
1122         if (has_bom)
1123                 *has_bom = buffer.bom;
1124         if (has_nuls)
1125                 *has_nuls = strlen(buffer.data) != buffer.size;
1126
1127         *buf = buffer.data;
1128         return TRUE;
1129 }