src/encodings.c

   1 /*
   2  *      encodings.c - this file is part of Geany, a fast and lightweight IDE
   3  *
   4  *      Copyright 2005-2012 Enrico Tröger <enrico(dot)troeger(at)uvena(dot)de>
   5  *      Copyright 2006-2012 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
   6  *
   7  *      This program is free software; you can redistribute it and/or modify
   8  *      it under the terms of the GNU General Public License as published by
   9  *      the Free Software Foundation; either version 2 of the License, or
  10  *      (at your option) any later version.
  11  *
  12  *      This program is distributed in the hope that it will be useful,
  13  *      but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *      GNU General Public License for more details.
  16  *
  17  *      You should have received a copy of the GNU General Public License along
  18  *      with this program; if not, write to the Free Software Foundation, Inc.,
  19  *      51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20  */
  21
  22 /*
  23  * Encoding conversion and Byte Order Mark (BOM) handling.
  24  */
  25
  26 /*
  27  * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
  28  * list of people on the gedit Team.
  29  * See the gedit ChangeLog files for a list of changes.
  30  */
  31  /* Stolen from anjuta */
  32
  33 #ifdef HAVE_CONFIG_H
  34 # include "config.h"
  35 #endif
  36
  37 #include "encodings.h"
  38
  39 #include "app.h"
  40 #include "callbacks.h"
  41 #include "documentprivate.h"
  42 #include "support.h"
  43 #include "ui_utils.h"
  44 #include "utils.h"
  45
  46 #include <string.h>
  47
  48
  49 /* <meta http-equiv="content-type" content="text/html; charset=UTF-8" /> */
  50 #define PATTERN_HTMLMETA "<meta\\s+http-equiv\\s*=\\s*\"?content-type\"?\\s+content\\s*=\\s*\"text/x?html;\\s*charset=([a-z0-9_-]+)\"\\s*/?>"
  51 /* " geany_encoding=utf-8 " or " coding: utf-8 " */
  52 #define PATTERN_CODING "coding[\t ]*[:=][\t ]*\"?([a-z0-9-]+)\"?[\t ]*"
  53
  54 /* precompiled regexps */
  55 static GRegex *pregs[2];
  56 static gboolean pregs_loaded = FALSE;
  57
  58
  59 GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
  60
  61
  62 #define fill(Order, Group, Idx, Charset, Name) \
  63                 encodings[Idx].idx = Idx; \
  64                 encodings[Idx].order = Order; \
  65                 encodings[Idx].group = Group; \
  66                 encodings[Idx].charset = Charset; \
  67                 encodings[Idx].name = Name;
  68
  69 static void init_encodings(void)
  70 {
  71         fill(0,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_14,             "ISO-8859-14",          _("Celtic"));
  72         fill(1,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_7,              "ISO-8859-7",           _("Greek"));
  73         fill(2,         WESTEUROPEAN,   GEANY_ENCODING_WINDOWS_1253,    "WINDOWS-1253",         _("Greek"));
  74         fill(3,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_10,             "ISO-8859-10",          _("Nordic"));
  75         fill(4,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_3,              "ISO-8859-3",           _("South European"));
  76         fill(5,         WESTEUROPEAN,   GEANY_ENCODING_IBM_850,                 "IBM850",                       _("Western"));
  77         fill(6,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_1,              "ISO-8859-1",           _("Western"));
  78         fill(7,         WESTEUROPEAN,   GEANY_ENCODING_ISO_8859_15,             "ISO-8859-15",          _("Western"));
  79         fill(8,         WESTEUROPEAN,   GEANY_ENCODING_WINDOWS_1252,    "WINDOWS-1252",         _("Western"));
  80
  81         fill(0,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_4,              "ISO-8859-4",           _("Baltic"));
  82         fill(1,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_13,             "ISO-8859-13",          _("Baltic"));
  83         fill(2,         EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1257,    "WINDOWS-1257",         _("Baltic"));
  84         fill(3,         EASTEUROPEAN,   GEANY_ENCODING_IBM_852,                 "IBM852",                       _("Central European"));
  85         fill(4,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_2,              "ISO-8859-2",           _("Central European"));
  86         fill(5,         EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1250,    "WINDOWS-1250",         _("Central European"));
  87         fill(6,         EASTEUROPEAN,   GEANY_ENCODING_IBM_855,                 "IBM855",                       _("Cyrillic"));
  88         fill(7,         EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_5,              "ISO-8859-5",           _("Cyrillic"));
  89         /* ISO-IR-111 not available on Windows */
  90         fill(8,         EASTEUROPEAN,   GEANY_ENCODING_ISO_IR_111,              "ISO-IR-111",           _("Cyrillic"));
  91         fill(9,         EASTEUROPEAN,   GEANY_ENCODING_KOI8_R,                  "KOI8-R",                       _("Cyrillic"));
  92         fill(10,        EASTEUROPEAN,   GEANY_ENCODING_WINDOWS_1251,    "WINDOWS-1251",         _("Cyrillic"));
  93         fill(11,        EASTEUROPEAN,   GEANY_ENCODING_CP_866,                  "CP866",                        _("Cyrillic/Russian"));
  94         fill(12,        EASTEUROPEAN,   GEANY_ENCODING_KOI8_U,                  "KOI8-U",                       _("Cyrillic/Ukrainian"));
  95         fill(13,        EASTEUROPEAN,   GEANY_ENCODING_ISO_8859_16,             "ISO-8859-16",          _("Romanian"));
  96
  97         fill(0,         MIDDLEEASTERN,  GEANY_ENCODING_IBM_864,                 "IBM864",                       _("Arabic"));
  98         fill(1,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_6,              "ISO-8859-6",           _("Arabic"));
  99         fill(2,         MIDDLEEASTERN,  GEANY_ENCODING_WINDOWS_1256,    "WINDOWS-1256",         _("Arabic"));
 100         fill(3,         MIDDLEEASTERN,  GEANY_ENCODING_IBM_862,                 "IBM862",                       _("Hebrew"));
 101         /* not available at all, ? */
 102         fill(4,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_8_I,    "ISO-8859-8-I",         _("Hebrew"));
 103         fill(5,         MIDDLEEASTERN,  GEANY_ENCODING_WINDOWS_1255,    "WINDOWS-1255",         _("Hebrew"));
 104         fill(6,         MIDDLEEASTERN,  GEANY_ENCODING_ISO_8859_8,              "ISO-8859-8",           _("Hebrew Visual"));
 105
 106         fill(0,         ASIAN,                  GEANY_ENCODING_ARMSCII_8,               "ARMSCII-8",            _("Armenian"));
 107         fill(1,         ASIAN,                  GEANY_ENCODING_GEOSTD8,                 "GEORGIAN-ACADEMY",     _("Georgian"));
 108         fill(2,         ASIAN,                  GEANY_ENCODING_TIS_620,                 "TIS-620",                      _("Thai"));
 109         fill(3,         ASIAN,                  GEANY_ENCODING_IBM_857,                 "IBM857",                       _("Turkish"));
 110         fill(4,         ASIAN,                  GEANY_ENCODING_WINDOWS_1254,    "WINDOWS-1254",         _("Turkish"));
 111         fill(5,         ASIAN,                  GEANY_ENCODING_ISO_8859_9,              "ISO-8859-9",           _("Turkish"));
 112         fill(6,         ASIAN,                  GEANY_ENCODING_TCVN,                    "TCVN",                         _("Vietnamese"));
 113         fill(7,         ASIAN,                  GEANY_ENCODING_VISCII,                  "VISCII",                       _("Vietnamese"));
 114         fill(8,         ASIAN,                  GEANY_ENCODING_WINDOWS_1258,    "WINDOWS-1258",         _("Vietnamese"));
 115
 116         fill(0,         UNICODE,                GEANY_ENCODING_UTF_7,                   "UTF-7",                        _("Unicode"));
 117         fill(1,         UNICODE,                GEANY_ENCODING_UTF_8,                   "UTF-8",                        _("Unicode"));
 118         fill(2,         UNICODE,                GEANY_ENCODING_UTF_16LE,                "UTF-16LE",                     _("Unicode"));
 119         fill(3,         UNICODE,                GEANY_ENCODING_UTF_16BE,                "UTF-16BE",                     _("Unicode"));
 120         fill(4,         UNICODE,                GEANY_ENCODING_UCS_2LE,                 "UCS-2LE",                      _("Unicode"));
 121         fill(5,         UNICODE,                GEANY_ENCODING_UCS_2BE,                 "UCS-2BE",                      _("Unicode"));
 122         fill(6,         UNICODE,                GEANY_ENCODING_UTF_32LE,                "UTF-32LE",                     _("Unicode"));
 123         fill(7,         UNICODE,                GEANY_ENCODING_UTF_32BE,                "UTF-32BE",                     _("Unicode"));
 124
 125         fill(0,         EASTASIAN,              GEANY_ENCODING_GB18030,                 "GB18030",                      _("Chinese Simplified"));
 126         fill(1,         EASTASIAN,              GEANY_ENCODING_GB2312,                  "GB2312",                       _("Chinese Simplified"));
 127         fill(2,         EASTASIAN,              GEANY_ENCODING_GBK,                             "GBK",                          _("Chinese Simplified"));
 128         /* maybe not available on Linux */
 129         fill(3,         EASTASIAN,              GEANY_ENCODING_HZ,                              "HZ",                           _("Chinese Simplified"));
 130         fill(4,         EASTASIAN,              GEANY_ENCODING_BIG5,                    "BIG5",                         _("Chinese Traditional"));
 131         fill(5,         EASTASIAN,              GEANY_ENCODING_BIG5_HKSCS,              "BIG5-HKSCS",           _("Chinese Traditional"));
 132         fill(6,         EASTASIAN,              GEANY_ENCODING_EUC_TW,                  "EUC-TW",                       _("Chinese Traditional"));
 133         fill(7,         EASTASIAN,              GEANY_ENCODING_EUC_JP,                  "EUC-JP",                       _("Japanese"));
 134         fill(8,         EASTASIAN,              GEANY_ENCODING_ISO_2022_JP,             "ISO-2022-JP",          _("Japanese"));
 135         fill(9,         EASTASIAN,              GEANY_ENCODING_SHIFT_JIS,               "SHIFT_JIS",            _("Japanese"));
 136         fill(10,        EASTASIAN,              GEANY_ENCODING_CP_932,                  "CP932",                        _("Japanese"));
 137         fill(11,        EASTASIAN,              GEANY_ENCODING_EUC_KR,                  "EUC-KR",                       _("Korean"));
 138         fill(12,        EASTASIAN,              GEANY_ENCODING_ISO_2022_KR,             "ISO-2022-KR",          _("Korean"));
 139         fill(13,        EASTASIAN,              GEANY_ENCODING_JOHAB,                   "JOHAB",                        _("Korean"));
 140         fill(14,        EASTASIAN,              GEANY_ENCODING_UHC,                             "UHC",                          _("Korean"));
 141
 142         fill(0,         NONE,                   GEANY_ENCODING_NONE,                    "None",                         _("Without encoding"));
 143 }
 144
 145
 146 /* compares two encoding names in a permissive fashion.
 147  * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
 148 static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
 149 {
 150         gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */
 151         gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */
 152
 153         while (*a && *b)
 154         {
 155                 gboolean is_alpha;
 156
 157                 if (g_ascii_toupper(*a) == g_ascii_toupper(*b) &&
 158                         ((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a)))
 159                 {
 160                         /* either there was a real separator, or we need a implicit one (a chage from alpha to
 161                          * numeric or so) */
 162                         if (! need_sep || (was_alpha != is_alpha))
 163                         {
 164                                 a++;
 165                                 b++;
 166                                 was_alpha = is_alpha;
 167                                 need_sep = FALSE;
 168                         }
 169                         else
 170                                 return FALSE;
 171                 }
 172                 else
 173                 {
 174                         guint n_sep = 0;
 175
 176                         if (! g_ascii_isalnum(*a))
 177                         {
 178                                 a++;
 179                                 n_sep++;
 180                         }
 181                         if (! g_ascii_isalnum(*b))
 182                         {
 183                                 b++;
 184                                 n_sep++;
 185                         }
 186                         if (n_sep < 1)
 187                                 return FALSE;
 188                         else if (n_sep < 2)
 189                                 need_sep = TRUE;
 190                 }
 191         }
 192         return *a == *b;
 193 }
 194
 195
 196 GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
 197 {
 198         gint i;
 199
 200         if (charset == NULL)
 201                 return GEANY_ENCODING_UTF_8;
 202
 203         i = 0;
 204         while (i < GEANY_ENCODINGS_MAX)
 205         {
 206                 if (encodings_charset_equals(charset, encodings[i].charset))
 207                         return i;
 208
 209                 ++i;
 210         }
 211         return GEANY_ENCODING_UTF_8;
 212 }
 213
 214
 215 const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
 216 {
 217         gint i;
 218
 219         if (charset == NULL)
 220                 return &encodings[GEANY_ENCODING_UTF_8];
 221
 222         i = 0;
 223         while (i < GEANY_ENCODINGS_MAX)
 224         {
 225                 if (encodings_charset_equals(charset, encodings[i].charset))
 226                         return &encodings[i];
 227
 228                 ++i;
 229         }
 230
 231         return NULL;
 232 }
 233
 234
 235 static const gchar *encodings_normalize_charset(const gchar *charset)
 236 {
 237         const GeanyEncoding *encoding;
 238
 239         encoding = encodings_get_from_charset(charset);
 240         if (encoding != NULL)
 241                 return encoding->charset;
 242
 243         return NULL;
 244 }
 245
 246
 247 const GeanyEncoding *encodings_get_from_index(gint idx)
 248 {
 249         g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
 250
 251         return &encodings[idx];
 252 }
 253
 254
 255 /**
 256  *  Gets the character set name of the specified index e.g. for use with
 257  *  @ref document_set_encoding().
 258  *
 259  *  @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
 260  *
 261  *
 262  *  @return The charset according to idx, or @c NULL if the index is invalid.
 263  *
 264  *  @since 0.13
 265  **/
 266 GEANY_API_SYMBOL
 267 const gchar* encodings_get_charset_from_index(gint idx)
 268 {
 269         g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
 270
 271         return encodings[idx].charset;
 272 }
 273
 274
 275 gchar *encodings_to_string(const GeanyEncoding* enc)
 276 {
 277         g_return_val_if_fail(enc != NULL, NULL);
 278         g_return_val_if_fail(enc->name != NULL, NULL);
 279         g_return_val_if_fail(enc->charset != NULL, NULL);
 280
 281         return g_strdup_printf("%s (%s)", enc->name, enc->charset);
 282 }
 283
 284
 285 const gchar *encodings_get_charset(const GeanyEncoding* enc)
 286 {
 287         g_return_val_if_fail(enc != NULL, NULL);
 288         g_return_val_if_fail(enc->charset != NULL, NULL);
 289
 290         return enc->charset;
 291 }
 292
 293
 294 static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];
 295
 296
 297 void encodings_select_radio_item(const gchar *charset)
 298 {
 299         gint i;
 300
 301         g_return_if_fail(charset != NULL);
 302
 303         i = 0;
 304         while (i < GEANY_ENCODINGS_MAX)
 305         {
 306                 if (utils_str_equal(charset, encodings[i].charset))
 307                         break;
 308                 i++;
 309         }
 310         if (i == GEANY_ENCODINGS_MAX)
 311                 i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
 312
 313         /* ignore_callback has to be set by the caller */
 314         gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
 315 }
 316
 317
 318 /* Regexp detection of file encoding declared in the file itself.
 319  * Idea and parts of code taken from Bluefish, thanks.
 320  * regex_compile() is used to compile regular expressions on program init and keep it in memory
 321  * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
 322  */
 323 static GRegex *regex_compile(const gchar *pattern)
 324 {
 325         GError *error = NULL;
 326         GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS, 0, &error);
 327
 328         if (!regex)
 329         {
 330                 geany_debug("Failed to compile encoding regex (%s)", error->message);
 331                 g_error_free(error);
 332         }
 333         return regex;
 334 }
 335
 336
 337 static gchar *regex_match(GRegex *preg, const gchar *buffer, gsize size)
 338 {
 339         gchar *encoding = NULL;
 340         GMatchInfo *minfo;
 341
 342         if (G_UNLIKELY(! pregs_loaded || buffer == NULL))
 343                 return NULL;
 344
 345         /* scan only the first 512 characters in the buffer */
 346         size = MIN(size, 512);
 347
 348         if (g_regex_match_full(preg, buffer, size, 0, 0, &minfo, NULL) &&
 349                 g_match_info_get_match_count(minfo) >= 2)
 350         {
 351                 encoding = g_match_info_fetch(minfo, 1);
 352                 geany_debug("Detected encoding by regex search: %s", encoding);
 353
 354                 SETPTR(encoding, g_utf8_strup(encoding, -1));
 355         }
 356         g_match_info_free(minfo);
 357         return encoding;
 358 }
 359
 360
 361 static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data)
 362 {
 363         GeanyDocument *doc = document_get_current();
 364         const gchar *charset = user_data;
 365
 366         if (ignore_callback || doc == NULL || charset == NULL ||
 367                 ! gtk_check_menu_item_get_active(menuitem) ||
 368                 utils_str_equal(charset, doc->encoding))
 369                 return;
 370
 371         if (doc->readonly)
 372         {
 373                 utils_beep();
 374                 return;
 375         }
 376         document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding));
 377
 378         document_set_encoding(doc, charset);
 379 }
 380
 381 static void encodings_reload_radio_item_change_cb(GtkMenuItem *menuitem, gpointer user_data)
 382 {
 383         GeanyDocument *doc = document_get_current();
 384
 385         g_return_if_fail(doc != NULL);
 386
 387         document_reload_prompt(doc, user_data);
 388 }
 389
 390
 391 void encodings_finalize(void)
 392 {
 393         if (pregs_loaded)
 394         {
 395                 guint i, len;
 396                 len = G_N_ELEMENTS(pregs);
 397                 for (i = 0; i < len; i++)
 398                 {
 399                         g_regex_unref(pregs[i]);
 400                 }
 401         }
 402 }
 403
 404
 405 void encodings_init(void)
 406 {
 407         GtkWidget *item, *menu[2], *submenu, *menu_westeuro, *menu_easteuro, *menu_eastasian, *menu_asian,
 408                           *menu_utf8, *menu_middleeast, *item_westeuro, *item_easteuro, *item_eastasian,
 409                           *item_asian, *item_utf8, *item_middleeast;
 410         GCallback cb_func[2];
 411         GSList *group = NULL;
 412         gchar *label;
 413         gint order, group_size;
 414         guint i, j, k;
 415
 416         init_encodings();
 417
 418         if (! pregs_loaded)
 419         {
 420                 pregs[0] = regex_compile(PATTERN_HTMLMETA);
 421                 pregs[1] = regex_compile(PATTERN_CODING);
 422                 pregs_loaded = TRUE;
 423         }
 424
 425         /* create encodings submenu in document menu */
 426         menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
 427         menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
 428         cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
 429         cb_func[1] = G_CALLBACK(encodings_reload_radio_item_change_cb);
 430
 431         for (k = 0; k < 2; k++)
 432         {
 433                 menu_westeuro = gtk_menu_new();
 434                 item_westeuro = gtk_menu_item_new_with_mnemonic(_("_West European"));
 435                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_westeuro), menu_westeuro);
 436                 gtk_container_add(GTK_CONTAINER(menu[k]), item_westeuro);
 437                 gtk_widget_show_all(item_westeuro);
 438
 439                 menu_easteuro = gtk_menu_new();
 440                 item_easteuro = gtk_menu_item_new_with_mnemonic(_("_East European"));
 441                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_easteuro), menu_easteuro);
 442                 gtk_container_add(GTK_CONTAINER(menu[k]), item_easteuro);
 443                 gtk_widget_show_all(item_easteuro);
 444
 445                 menu_eastasian = gtk_menu_new();
 446                 item_eastasian = gtk_menu_item_new_with_mnemonic(_("East _Asian"));
 447                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_eastasian), menu_eastasian);
 448                 gtk_container_add(GTK_CONTAINER(menu[k]), item_eastasian);
 449                 gtk_widget_show_all(item_eastasian);
 450
 451                 menu_asian = gtk_menu_new();
 452                 item_asian = gtk_menu_item_new_with_mnemonic(_("_SE & SW Asian"));
 453                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_asian), menu_asian);
 454                 gtk_container_add(GTK_CONTAINER(menu[k]), item_asian);
 455                 gtk_widget_show_all(item_asian);
 456
 457                 menu_middleeast = gtk_menu_new();
 458                 item_middleeast = gtk_menu_item_new_with_mnemonic(_("_Middle Eastern"));
 459                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_middleeast), menu_middleeast);
 460                 gtk_container_add(GTK_CONTAINER(menu[k]), item_middleeast);
 461                 gtk_widget_show_all(item_middleeast);
 462
 463                 menu_utf8 = gtk_menu_new();
 464                 item_utf8 = gtk_menu_item_new_with_mnemonic(_("_Unicode"));
 465                 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_utf8), menu_utf8);
 466                 gtk_container_add(GTK_CONTAINER(menu[k]), item_utf8);
 467                 gtk_widget_show_all(item_utf8);
 468
 469                 /** TODO can it be optimized? ATM 3782 runs at line "if (encodings[j].group ...)" */
 470                 for (i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
 471                 {
 472                         order = 0;
 473                         switch (i)
 474                         {
 475                                 case WESTEUROPEAN: submenu = menu_westeuro; group_size = 9; break;
 476                                 case EASTEUROPEAN: submenu = menu_easteuro; group_size = 14; break;
 477                                 case EASTASIAN: submenu = menu_eastasian; group_size = 14; break;
 478                                 case ASIAN: submenu = menu_asian; group_size = 9; break;
 479                                 case MIDDLEEASTERN: submenu = menu_middleeast; group_size = 7; break;
 480                                 case UNICODE: submenu = menu_utf8; group_size = 8; break;
 481                                 default: submenu = menu[k]; group_size = 1;
 482                         }
 483
 484                         while (order < group_size)      /* the biggest group has 13 elements */
 485                         {
 486                                 for (j = 0; j < GEANY_ENCODINGS_MAX; j++)
 487                                 {
 488                                         if (encodings[j].group == i && encodings[j].order == order)
 489                                         {
 490                                                 label = encodings_to_string(&encodings[j]);
 491                                                 if (k == 0)
 492                                                 {
 493                                                         item = gtk_radio_menu_item_new_with_label(group, label);
 494                                                         group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
 495                                                         radio_items[j] = item;
 496                                                 }
 497                                                 else
 498                                                         item = gtk_menu_item_new_with_label(label);
 499                                                 gtk_widget_show(item);
 500                                                 gtk_container_add(GTK_CONTAINER(submenu), item);
 501                                                 g_signal_connect(item, "activate", cb_func[k],
 502                                                                 (gpointer) encodings[j].charset);
 503                                                 g_free(label);
 504                                                 break;
 505                                         }
 506                                 }
 507                                 order++;
 508                         }
 509                 }
 510         }
 511 }
 512
 513
 514 static gint encoding_combo_store_sort_func(GtkTreeModel *model,
 515                                                                                    GtkTreeIter *a,
 516                                                                                    GtkTreeIter *b,
 517                                                                                    gpointer data)
 518 {
 519         gboolean a_has_child = gtk_tree_model_iter_has_child(model, a);
 520         gboolean b_has_child = gtk_tree_model_iter_has_child(model, b);
 521         gchar *a_string;
 522         gchar *b_string;
 523         gint cmp_res;
 524
 525         if (a_has_child != b_has_child)
 526                 return a_has_child ? -1 : 1;
 527
 528         gtk_tree_model_get(model, a, 1, &a_string, -1);
 529         gtk_tree_model_get(model, b, 1, &b_string, -1);
 530         cmp_res = strcmp(a_string, b_string);
 531         g_free(a_string);
 532         g_free(b_string);
 533         return cmp_res;
 534 }
 535
 536
 537 GtkTreeStore *encodings_encoding_store_new(gboolean has_detect)
 538 {
 539         GtkTreeStore *store;
 540         GtkTreeIter iter_current, iter_westeuro, iter_easteuro, iter_eastasian,
 541                                 iter_asian, iter_utf8, iter_middleeast;
 542         GtkTreeIter *iter_parent;
 543         gchar *encoding_string;
 544         gint i;
 545
 546         store = gtk_tree_store_new(2, G_TYPE_INT, G_TYPE_STRING);
 547
 548         if (has_detect)
 549         {
 550                 gtk_tree_store_append(store, &iter_current, NULL);
 551                 gtk_tree_store_set(store, &iter_current, 0, GEANY_ENCODINGS_MAX, 1, _("Detect from file"), -1);
 552         }
 553
 554         gtk_tree_store_append(store, &iter_westeuro, NULL);
 555         gtk_tree_store_set(store, &iter_westeuro, 0, -1, 1, _("West European"), -1);
 556         gtk_tree_store_append(store, &iter_easteuro, NULL);
 557         gtk_tree_store_set(store, &iter_easteuro, 0, -1, 1, _("East European"), -1);
 558         gtk_tree_store_append(store, &iter_eastasian, NULL);
 559         gtk_tree_store_set(store, &iter_eastasian, 0, -1, 1, _("East Asian"), -1);
 560         gtk_tree_store_append(store, &iter_asian, NULL);
 561         gtk_tree_store_set(store, &iter_asian, 0, -1, 1, _("SE & SW Asian"), -1);
 562         gtk_tree_store_append(store, &iter_middleeast, NULL);
 563         gtk_tree_store_set(store, &iter_middleeast, 0, -1, 1, _("Middle Eastern"), -1);
 564         gtk_tree_store_append(store, &iter_utf8, NULL);
 565         gtk_tree_store_set(store, &iter_utf8, 0, -1, 1, _("Unicode"), -1);
 566
 567         for (i = 0; i < GEANY_ENCODINGS_MAX; i++)
 568         {
 569                 switch (encodings[i].group)
 570                 {
 571                         case WESTEUROPEAN: iter_parent = &iter_westeuro; break;
 572                         case EASTEUROPEAN: iter_parent = &iter_easteuro; break;
 573                         case EASTASIAN: iter_parent = &iter_eastasian; break;
 574                         case ASIAN: iter_parent = &iter_asian; break;
 575                         case MIDDLEEASTERN: iter_parent = &iter_middleeast; break;
 576                         case UNICODE: iter_parent = &iter_utf8; break;
 577                         case NONE:
 578                         default: iter_parent = NULL;
 579                 }
 580                 gtk_tree_store_append(store, &iter_current, iter_parent);
 581                 encoding_string = encodings_to_string(&encodings[i]);
 582                 gtk_tree_store_set(store, &iter_current, 0, i, 1, encoding_string, -1);
 583                 g_free(encoding_string);
 584         }
 585
 586         gtk_tree_sortable_set_sort_column_id(GTK_TREE_SORTABLE(store), 1, GTK_SORT_ASCENDING);
 587         gtk_tree_sortable_set_sort_func(GTK_TREE_SORTABLE(store), 1, encoding_combo_store_sort_func, NULL, NULL);
 588
 589         return store;
 590 }
 591
 592
 593 gint encodings_encoding_store_get_encoding(GtkTreeStore *store, GtkTreeIter *iter)
 594 {
 595         gint enc;
 596         gtk_tree_model_get(GTK_TREE_MODEL(store), iter, 0, &enc, -1);
 597         return enc;
 598 }
 599
 600
 601 gboolean encodings_encoding_store_get_iter(GtkTreeStore *store, GtkTreeIter *iter, gint enc)
 602 {
 603         if (gtk_tree_model_get_iter_first(GTK_TREE_MODEL(store), iter))
 604         {
 605                 do
 606                 {
 607                         if (encodings_encoding_store_get_encoding(store, iter) == enc)
 608                                 return TRUE;
 609                 }
 610                 while (ui_tree_model_iter_any_next(GTK_TREE_MODEL(store), iter, TRUE));
 611         }
 612         return FALSE;
 613 }
 614
 615
 616 void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout,
 617                                                                                          GtkCellRenderer *cell,
 618                                                                                          GtkTreeModel *tree_model,
 619                                                                                          GtkTreeIter *iter,
 620                                                                                          gpointer data)
 621 {
 622         gboolean sensitive = !gtk_tree_model_iter_has_child(tree_model, iter);
 623         gchar *text;
 624
 625         gtk_tree_model_get(tree_model, iter, 1, &text, -1);
 626         g_object_set(cell, "sensitive", sensitive, "text", text, NULL);
 627         g_free(text);
 628 }
 629
 630
 631 /**
 632  *  Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
 633  *  If @a fast is not set, additional checks to validate the converted string are performed.
 634  *
 635  *  @param buffer The input string to convert.
 636  *  @param size The length of the string, or -1 if the string is nul-terminated.
 637  *  @param charset The charset to be used for conversion.
 638  *  @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
 639  *
 640  *  @return If the conversion was successful, a newly allocated nul-terminated string,
 641  *    which must be freed with @c g_free(). Otherwise @c NULL.
 642  **/
 643 GEANY_API_SYMBOL
 644 gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size,
 645                                                                                           const gchar *charset, gboolean fast)
 646 {
 647         gchar *utf8_content = NULL;
 648         GError *conv_error = NULL;
 649         gchar* converted_contents = NULL;
 650         gsize bytes_written;
 651
 652         g_return_val_if_fail(buffer != NULL, NULL);
 653         g_return_val_if_fail(charset != NULL, NULL);
 654
 655         converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
 656                                                                    &bytes_written, &conv_error);
 657
 658         if (fast)
 659         {
 660                 utf8_content = converted_contents;
 661                 if (conv_error != NULL) g_error_free(conv_error);
 662         }
 663         else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
 664         {
 665                 if (conv_error != NULL)
 666                 {
 667                         geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
 668                         g_error_free(conv_error);
 669                         conv_error = NULL;
 670                 }
 671                 else
 672                         geany_debug("Couldn't convert from %s to UTF-8.", charset);
 673
 674                 utf8_content = NULL;
 675                 g_free(converted_contents);
 676         }
 677         else
 678         {
 679                 geany_debug("Converted from %s to UTF-8.", charset);
 680                 utf8_content = converted_contents;
 681         }
 682
 683         return utf8_content;
 684 }
 685
 686
 687 static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
 688 {
 689         guint i;
 690
 691         for (i = 0; i < G_N_ELEMENTS(pregs); i++)
 692         {
 693                 gchar *charset;
 694
 695                 if ((charset = regex_match(pregs[i], buffer, size)) != NULL)
 696                         return charset;
 697         }
 698         return NULL;
 699 }
 700
 701
 702 static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gssize size,
 703                 const gchar *suggested_charset, gchar **used_encoding)
 704 {
 705         const gchar *locale_charset = NULL;
 706         const gchar *charset;
 707         gchar *utf8_content;
 708         gboolean check_suggestion = suggested_charset != NULL;
 709         gboolean check_locale = FALSE;
 710         gint i, preferred_charset;
 711
 712         if (size == -1)
 713         {
 714                 size = strlen(buffer);
 715         }
 716
 717         /* current locale is not UTF-8, we have to check this charset */
 718         check_locale = ! g_get_charset(&locale_charset);
 719
 720         /* First check for preferred charset, if specified */
 721         preferred_charset = file_prefs.default_open_encoding;
 722
 723         if (preferred_charset == encodings[GEANY_ENCODING_NONE].idx ||
 724                 preferred_charset < 0 ||
 725                 preferred_charset >= GEANY_ENCODINGS_MAX)
 726         {
 727                 preferred_charset = -1;
 728         }
 729
 730         /* -1 means "Preferred charset" */
 731         for (i = -1; i < GEANY_ENCODINGS_MAX; i++)
 732         {
 733                 if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
 734                         continue;
 735
 736                 if (check_suggestion)
 737                 {
 738                         check_suggestion = FALSE;
 739                         charset = encodings_normalize_charset(suggested_charset);
 740                         if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
 741                                 charset = suggested_charset;
 742                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 743                 }
 744                 else if (check_locale)
 745                 {
 746                         check_locale = FALSE;
 747                         charset = locale_charset;
 748                         i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 749                 }
 750                 else if (i == -1)
 751                 {
 752                         if (preferred_charset >= 0)
 753                         {
 754                                 charset = encodings[preferred_charset].charset;
 755                                 geany_debug("Using preferred charset: %s", charset);
 756                         }
 757                         else
 758                                 continue;
 759                 }
 760                 else if (i >= 0)
 761                         charset = encodings[i].charset;
 762                 else /* in this case we have i == -2, continue to increase i and go ahead */
 763                         continue;
 764
 765                 if (G_UNLIKELY(charset == NULL))
 766                         continue;
 767
 768                 geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
 769                         size, charset);
 770                 utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);
 771
 772                 if (G_LIKELY(utf8_content != NULL))
 773                 {
 774                         if (used_encoding != NULL)
 775                         {
 776                                 if (G_UNLIKELY(*used_encoding != NULL))
 777                                 {
 778                                         geany_debug("%s:%d", __FILE__, __LINE__);
 779                                         g_free(*used_encoding);
 780                                 }
 781                                 *used_encoding = g_strdup(charset);
 782                         }
 783                         return utf8_content;
 784                 }
 785         }
 786
 787         return NULL;
 788 }
 789
 790
 791 /**
 792  *  Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
 793  *  @a used_encoding.
 794  *
 795  *  @param buffer the input string to convert.
 796  *  @param size the length of the string, or -1 if the string is nul-terminated.
 797  *  @param used_encoding return location of the detected encoding of the input string, or @c NULL.
 798  *
 799  *  @return If the conversion was successful, a newly allocated nul-terminated string,
 800  *    which must be freed with @c g_free(). Otherwise @c NULL.
 801  **/
 802 GEANY_API_SYMBOL
 803 gchar *encodings_convert_to_utf8(const gchar *buffer, gssize size, gchar **used_encoding)
 804 {
 805         gchar *regex_charset;
 806         gchar *utf8;
 807
 808         /* first try to read the encoding from the file content */
 809         regex_charset = encodings_check_regexes(buffer, size);
 810         utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding);
 811         g_free(regex_charset);
 812
 813         return utf8;
 814 }
 815
 816
 817 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
 818  * otherwise GEANY_ENCODING_NONE.
 819  * */
 820 GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
 821 {
 822         if (len >= 3)
 823         {
 824                 if (bom_len)
 825                         *bom_len = 3;
 826
 827                 if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
 828                         (guchar)string[2] == 0xbf)
 829                 {
 830                         return GEANY_ENCODING_UTF_8;
 831                 }
 832         }
 833         if (len >= 4)
 834         {
 835                 if (bom_len)
 836                         *bom_len = 4;
 837
 838                 if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
 839                                  (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
 840                 {
 841                         return GEANY_ENCODING_UTF_32BE; /* Big endian */
 842                 }
 843                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
 844                                  (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
 845                 {
 846                         return GEANY_ENCODING_UTF_32LE; /* Little endian */
 847                 }
 848                 if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
 849                                  (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
 850                 {
 851                          return GEANY_ENCODING_UTF_7;
 852                 }
 853         }
 854         if (len >= 2)
 855         {
 856                 if (bom_len)
 857                         *bom_len = 2;
 858
 859                 if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff)
 860                 {
 861                         return GEANY_ENCODING_UTF_16BE; /* Big endian */
 862                 }
 863                 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
 864                 {
 865                         return GEANY_ENCODING_UTF_16LE; /* Little endian */
 866                 }
 867         }
 868         if (bom_len)
 869                 *bom_len = 0;
 870         return GEANY_ENCODING_NONE;
 871 }
 872
 873
 874 gboolean encodings_is_unicode_charset(const gchar *string)
 875 {
 876         if (string != NULL &&
 877                 (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
 878         {
 879                 return TRUE;
 880         }
 881         return FALSE;
 882 }
 883
 884
/* Working state shared by the handle_*() helpers while a loaded buffer is converted
 * to UTF-8. */
typedef struct
{
	gchar		*data;	/* null-terminated data; replaced by the converted text on conversion */
	gsize		 size;	/* actual data size as passed in by the caller (not updated on conversion) */
	gsize		 len;	/* string length of data, i.e. up to the first nul byte */
	gchar		*enc;	/* allocated name of the encoding in use, NULL until determined */
	gboolean	 bom;	/* whether the data carries a BOM */
	gboolean	 partial;	/* set when len and size differ for data with a UTF-8 or UTF-7 BOM */
} BufferData;
 894
 895
 896 /* convert data with the specified encoding */
 897 static gboolean
 898 handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
 899 {
 900         GeanyEncodingIndex enc_idx;
 901
 902         if (utils_str_equal(forced_enc, "UTF-8"))
 903         {
 904                 if (! g_utf8_validate(buffer->data, buffer->len, NULL))
 905                 {
 906                         return FALSE;
 907                 }
 908         }
 909         else
 910         {
 911                 gchar *converted_text = encodings_convert_to_utf8_from_charset(
 912                                                                                 buffer->data, buffer->size, forced_enc, FALSE);
 913                 if (converted_text == NULL)
 914                 {
 915                         return FALSE;
 916                 }
 917                 else
 918                 {
 919                         SETPTR(buffer->data, converted_text);
 920                         buffer->len = strlen(converted_text);
 921                 }
 922         }
 923         enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
 924         buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);
 925         buffer->enc = g_strdup(forced_enc);
 926         return TRUE;
 927 }
 928
 929
 930 /* detect encoding and convert to UTF-8 if necessary */
 931 static gboolean
 932 handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
 933 {
 934         g_return_val_if_fail(buffer->enc == NULL, FALSE);
 935         g_return_val_if_fail(buffer->bom == FALSE, FALSE);
 936
 937         if (buffer->size == 0)
 938         {
 939                 /* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty
 940                  * e.g. UTF32 file with a BOM(so size is 4, len is 0) */
 941                 buffer->enc = g_strdup("UTF-8");
 942         }
 943         else
 944         {
 945                 /* first check for a BOM */
 946                 if (enc_idx != GEANY_ENCODING_NONE)
 947                 {
 948                         buffer->enc = g_strdup(encodings[enc_idx].charset);
 949                         buffer->bom = TRUE;
 950
 951                         if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
 952                         {
 953                                 gchar *converted_text = encodings_convert_to_utf8_from_charset(
 954                                                                                 buffer->data, buffer->size, buffer->enc, FALSE);
 955                                 if (converted_text != NULL)
 956                                 {
 957                                         SETPTR(buffer->data, converted_text);
 958                                         buffer->len = strlen(converted_text);
 959                                 }
 960                                 else
 961                                 {
 962                                         /* there was a problem converting data from BOM encoding type */
 963                                         SETPTR(buffer->enc, NULL);
 964                                         buffer->bom = FALSE;
 965                                 }
 966                         }
 967                 }
 968
 969                 if (buffer->enc == NULL)        /* either there was no BOM or the BOM encoding failed */
 970                 {
 971                         /* first try to read the encoding from the file content */
 972                         gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
 973
 974                         /* try UTF-8 first */
 975                         if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
 976                                 (buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
 977                         {
 978                                 buffer->enc = g_strdup("UTF-8");
 979                         }
 980                         else
 981                         {
 982                                 /* detect the encoding */
 983                                 gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
 984                                         buffer->size, regex_charset, &buffer->enc);
 985
 986                                 if (converted_text == NULL)
 987                                 {
 988                                         g_free(regex_charset);
 989                                         return FALSE;
 990                                 }
 991                                 SETPTR(buffer->data, converted_text);
 992                                 buffer->len = strlen(converted_text);
 993                         }
 994                         g_free(regex_charset);
 995                 }
 996         }
 997         return TRUE;
 998 }
 999
1000
1001 static void
1002 handle_bom(BufferData *buffer)
1003 {
1004         guint bom_len;
1005
1006         encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
1007         g_return_if_fail(bom_len != 0);
1008
1009         /* use filedata->len here because the contents are already converted into UTF-8 */
1010         buffer->len -= bom_len;
1011         /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
1012         g_memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
1013         buffer->data = g_realloc(buffer->data, buffer->len + 1);
1014 }
1015
1016
1017 /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
1018 static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
1019 {
1020         GeanyEncodingIndex tmp_enc_idx;
1021
1022         /* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
1023          * if we have a BOM */
1024         tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
1025
1026         /* check whether the size of the loaded data is equal to the size of the file in the
1027          * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
1028          * file size of 0 bytes */
1029         if (buffer->len != buffer->size && buffer->size != 0 && (
1030                 tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
1031                 tmp_enc_idx == GEANY_ENCODING_UTF_7))  /* filter UTF-7/8 where no NULL bytes are allowed */
1032         {
1033                 buffer->partial = TRUE;
1034         }
1035
1036         /* Determine character encoding and convert to UTF-8 */
1037         if (forced_enc != NULL)
1038         {
1039                 /* the encoding should be ignored(requested by user), so open the file "as it is" */
1040                 if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
1041                 {
1042                         buffer->bom = FALSE;
1043                         buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
1044                 }
1045                 else if (! handle_forced_encoding(buffer, forced_enc))
1046                 {
1047                         return FALSE;
1048                 }
1049         }
1050         else if (! handle_encoding(buffer, tmp_enc_idx))
1051         {
1052                 return FALSE;
1053         }
1054
1055         if (buffer->bom)
1056                 handle_bom(buffer);
1057         return TRUE;
1058 }
1059
1060
1061 /*
1062  * Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
1063  * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
1064  *
1065  * @param buf a pointer to modifiable null-terminated buffer to convert.
1066  *   It may or may not be modified, and should be freed whatever happens.
1067  * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
1068  *   file size). It will be updated to the new size.
1069  * @param forced_enc forced encoding to use, or @c NULL
1070  * @param used_encoding return location for the actually used encoding, or @c NULL
1071  * @param has_bom return location to store whether the data had a BOM, or @c NULL
1072  * @param partial return location to store whether the conversion may be partial, or @c NULL
1073  *
1074  * @return @C TRUE if the conversion succeeded, @c FALSE otherwise.
1075  */
1076 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
1077                 gchar **used_encoding, gboolean *has_bom, gboolean *partial)
1078 {
1079         BufferData buffer;
1080
1081         buffer.data = *buf;
1082         buffer.size = *size;
1083         /* use strlen to check for null chars */
1084         buffer.len = strlen(buffer.data);
1085         buffer.enc = NULL;
1086         buffer.bom = FALSE;
1087         buffer.partial = FALSE;
1088
1089         if (! handle_buffer(&buffer, forced_enc))
1090                 return FALSE;
1091
1092         *size = buffer.len;
1093         if (used_encoding)
1094                 *used_encoding = buffer.enc;
1095         else
1096                 g_free(buffer.enc);
1097         if (has_bom)
1098                 *has_bom = buffer.bom;
1099         if (partial)
1100                 *partial = buffer.partial;
1101
1102         *buf = buffer.data;
1103         return TRUE;
1104 }