Merge pull request #3913 from techee/include_fix
[geany-mirror.git] / src / encodings.c
blob5a5f59bbafa682b3162cce0c696a2767d3f81524
1 /*
2 * encodings.c - this file is part of Geany, a fast and lightweight IDE
4 * Copyright 2005 The Geany contributors
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 * Encoding conversion and Byte Order Mark (BOM) handling.
26 * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
27 * list of people on the gedit Team.
28 * See the gedit ChangeLog files for a list of changes.
30 /* Stolen from anjuta */
32 #ifdef HAVE_CONFIG_H
33 # include "config.h"
34 #endif
36 #include "encodings.h"
37 #include "encodingsprivate.h"
39 #include "app.h"
40 #include "callbacks.h"
41 #include "documentprivate.h"
42 #include "support.h"
43 #include "ui_utils.h"
44 #include "utils.h"
46 #include <string.h>
49 /* <meta http-equiv="content-type" content="text/html; charset=UTF-8" /> */
50 #define PATTERN_HTMLMETA "<meta\\s+http-equiv\\s*=\\s*\"?content-type\"?\\s+content\\s*=\\s*\"text/x?html;\\s*charset=([a-z0-9_-]+)\"\\s*/?>"
51 /* " geany_encoding=utf-8 " or " coding: utf-8 " */
52 #define PATTERN_CODING "coding[\t ]*[:=][\t ]*\"?([a-z0-9-]+)\"?[\t ]*"
54 /* precompiled regexps */
55 static GRegex *pregs[2];
56 static gboolean pregs_loaded = FALSE;
59 GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
62 static gboolean conversion_supported(const gchar *to, const gchar *from)
64 GIConv conv = g_iconv_open(to, from);
65 if (conv == (GIConv) -1)
66 return FALSE;
68 g_iconv_close(conv);
69 return TRUE;
/* Initializes the entry @a Idx of the global encodings table.
 *
 * Order   - position of the entry inside its group's menu
 * Group   - encoding group the entry belongs to
 * Idx     - index into encodings[], also stored in the entry itself
 * Charset - charset name as passed to the conversion functions
 * Name    - human readable (translated) name
 *
 * The entry starts out as unsupported; init_encodings() probes the real state afterwards.
 * The body is wrapped in do { } while (0) so that "fill(...);" is a single statement and
 * stays safe inside unbraced if/else bodies; arguments are parenthesized against
 * operator precedence surprises. */
#define fill(Order, Group, Idx, Charset, Name) \
	do { \
		encodings[(Idx)].idx = (Idx); \
		encodings[(Idx)].order = (Order); \
		encodings[(Idx)].group = (Group); \
		encodings[(Idx)].charset = (Charset); \
		encodings[(Idx)].name = (Name); \
		encodings[(Idx)].supported = FALSE; \
	} while (0)
81 static void init_encodings(void)
83 fill(0, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_14, "ISO-8859-14", _("Celtic"));
84 fill(1, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_7, "ISO-8859-7", _("Greek"));
85 fill(2, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1253, "WINDOWS-1253", _("Greek"));
86 fill(3, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_10, "ISO-8859-10", _("Nordic"));
87 fill(4, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_3, "ISO-8859-3", _("South European"));
88 fill(5, WESTEUROPEAN, GEANY_ENCODING_IBM_850, "IBM850", _("Western"));
89 fill(6, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_1, "ISO-8859-1", _("Western"));
90 fill(7, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_15, "ISO-8859-15", _("Western"));
91 fill(8, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1252, "WINDOWS-1252", _("Western"));
93 fill(0, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_4, "ISO-8859-4", _("Baltic"));
94 fill(1, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_13, "ISO-8859-13", _("Baltic"));
95 fill(2, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1257, "WINDOWS-1257", _("Baltic"));
96 fill(3, EASTEUROPEAN, GEANY_ENCODING_IBM_852, "IBM852", _("Central European"));
97 fill(4, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_2, "ISO-8859-2", _("Central European"));
98 fill(5, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1250, "WINDOWS-1250", _("Central European"));
99 fill(6, EASTEUROPEAN, GEANY_ENCODING_IBM_855, "IBM855", _("Cyrillic"));
100 fill(7, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_5, "ISO-8859-5", _("Cyrillic"));
101 /* ISO-IR-111 not available on Windows */
102 fill(8, EASTEUROPEAN, GEANY_ENCODING_ISO_IR_111, "ISO-IR-111", _("Cyrillic"));
103 fill(9, EASTEUROPEAN, GEANY_ENCODING_KOI8_R, "KOI8-R", _("Cyrillic"));
104 fill(10, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1251, "WINDOWS-1251", _("Cyrillic"));
105 fill(11, EASTEUROPEAN, GEANY_ENCODING_CP_866, "CP866", _("Cyrillic/Russian"));
106 fill(12, EASTEUROPEAN, GEANY_ENCODING_KOI8_U, "KOI8-U", _("Cyrillic/Ukrainian"));
107 fill(13, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_16, "ISO-8859-16", _("Romanian"));
109 fill(0, MIDDLEEASTERN, GEANY_ENCODING_IBM_864, "IBM864", _("Arabic"));
110 fill(1, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_6, "ISO-8859-6", _("Arabic"));
111 fill(2, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1256, "WINDOWS-1256", _("Arabic"));
112 fill(3, MIDDLEEASTERN, GEANY_ENCODING_IBM_862, "IBM862", _("Hebrew"));
113 /* not available at all, ? */
114 fill(4, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8_I, "ISO-8859-8-I", _("Hebrew"));
115 fill(5, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1255, "WINDOWS-1255", _("Hebrew"));
116 fill(6, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8, "ISO-8859-8", _("Hebrew Visual"));
118 fill(0, ASIAN, GEANY_ENCODING_ARMSCII_8, "ARMSCII-8", _("Armenian"));
119 fill(1, ASIAN, GEANY_ENCODING_GEOSTD8, "GEORGIAN-ACADEMY", _("Georgian"));
120 fill(2, ASIAN, GEANY_ENCODING_TIS_620, "TIS-620", _("Thai"));
121 fill(3, ASIAN, GEANY_ENCODING_IBM_857, "IBM857", _("Turkish"));
122 fill(4, ASIAN, GEANY_ENCODING_WINDOWS_1254, "WINDOWS-1254", _("Turkish"));
123 fill(5, ASIAN, GEANY_ENCODING_ISO_8859_9, "ISO-8859-9", _("Turkish"));
124 fill(6, ASIAN, GEANY_ENCODING_TCVN, "TCVN", _("Vietnamese"));
125 fill(7, ASIAN, GEANY_ENCODING_VISCII, "VISCII", _("Vietnamese"));
126 fill(8, ASIAN, GEANY_ENCODING_WINDOWS_1258, "WINDOWS-1258", _("Vietnamese"));
128 fill(0, UNICODE, GEANY_ENCODING_UTF_7, "UTF-7", _("Unicode"));
129 fill(1, UNICODE, GEANY_ENCODING_UTF_8, "UTF-8", _("Unicode"));
130 fill(2, UNICODE, GEANY_ENCODING_UTF_16LE, "UTF-16LE", _("Unicode"));
131 fill(3, UNICODE, GEANY_ENCODING_UTF_16BE, "UTF-16BE", _("Unicode"));
132 fill(4, UNICODE, GEANY_ENCODING_UCS_2LE, "UCS-2LE", _("Unicode"));
133 fill(5, UNICODE, GEANY_ENCODING_UCS_2BE, "UCS-2BE", _("Unicode"));
134 fill(6, UNICODE, GEANY_ENCODING_UTF_32LE, "UTF-32LE", _("Unicode"));
135 fill(7, UNICODE, GEANY_ENCODING_UTF_32BE, "UTF-32BE", _("Unicode"));
137 fill(0, EASTASIAN, GEANY_ENCODING_GB18030, "GB18030", _("Chinese Simplified"));
138 fill(1, EASTASIAN, GEANY_ENCODING_GB2312, "GB2312", _("Chinese Simplified"));
139 fill(2, EASTASIAN, GEANY_ENCODING_GBK, "GBK", _("Chinese Simplified"));
140 /* maybe not available on Linux */
141 fill(3, EASTASIAN, GEANY_ENCODING_HZ, "HZ", _("Chinese Simplified"));
142 fill(4, EASTASIAN, GEANY_ENCODING_BIG5, "BIG5", _("Chinese Traditional"));
143 fill(5, EASTASIAN, GEANY_ENCODING_BIG5_HKSCS, "BIG5-HKSCS", _("Chinese Traditional"));
144 fill(6, EASTASIAN, GEANY_ENCODING_EUC_TW, "EUC-TW", _("Chinese Traditional"));
145 fill(7, EASTASIAN, GEANY_ENCODING_EUC_JP, "EUC-JP", _("Japanese"));
146 fill(8, EASTASIAN, GEANY_ENCODING_ISO_2022_JP, "ISO-2022-JP", _("Japanese"));
147 fill(9, EASTASIAN, GEANY_ENCODING_SHIFT_JIS, "SHIFT_JIS", _("Japanese"));
148 fill(10, EASTASIAN, GEANY_ENCODING_CP_932, "CP932", _("Japanese"));
149 fill(11, EASTASIAN, GEANY_ENCODING_EUC_KR, "EUC-KR", _("Korean"));
150 fill(12, EASTASIAN, GEANY_ENCODING_ISO_2022_KR, "ISO-2022-KR", _("Korean"));
151 fill(13, EASTASIAN, GEANY_ENCODING_JOHAB, "JOHAB", _("Korean"));
152 fill(14, EASTASIAN, GEANY_ENCODING_UHC, "UHC", _("Korean"));
154 fill(0, NONE, GEANY_ENCODING_NONE, "None", _("Without encoding"));
156 /* fill the flags member */
157 for (guint i = 0; i < G_N_ELEMENTS(encodings); i++)
159 if (i == GEANY_ENCODING_NONE || conversion_supported("UTF-8", encodings[i].charset))
160 encodings[i].supported = TRUE;
161 else
163 /* geany_debug() doesn't really work at this point, unless G_MESSAGES_DEBUG
164 * is set explicitly by the caller, but that's better than nothing */
165 geany_debug("Encoding %s is not supported by the system", encodings[i].charset);
171 /* compares two encoding names in a permissive fashion.
172 * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
173 static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
175 gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */
176 gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */
178 while (*a && *b)
180 gboolean is_alpha;
182 if (g_ascii_toupper(*a) == g_ascii_toupper(*b) &&
183 ((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a)))
185 /* either there was a real separator, or we need a implicit one (a chage from alpha to
186 * numeric or so) */
187 if (! need_sep || (was_alpha != is_alpha))
189 a++;
190 b++;
191 was_alpha = is_alpha;
192 need_sep = FALSE;
194 else
195 return FALSE;
197 else
199 guint n_sep = 0;
201 if (! g_ascii_isalnum(*a))
203 a++;
204 n_sep++;
206 if (! g_ascii_isalnum(*b))
208 b++;
209 n_sep++;
211 if (n_sep < 1)
212 return FALSE;
213 else if (n_sep < 2)
214 need_sep = TRUE;
217 return *a == *b;
221 GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
223 if (charset == NULL)
224 return GEANY_ENCODING_UTF_8;
226 for (gint i = 0; i < GEANY_ENCODINGS_MAX; i++)
228 if (encodings_charset_equals(charset, encodings[i].charset))
229 return i;
231 return GEANY_ENCODING_UTF_8;
235 const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
237 if (charset == NULL)
238 return &encodings[GEANY_ENCODING_UTF_8];
240 for (gint i = 0; i < GEANY_ENCODINGS_MAX; i++)
242 if (encodings_charset_equals(charset, encodings[i].charset))
243 return &encodings[i];
246 return NULL;
250 static const gchar *encodings_normalize_charset(const gchar *charset)
252 const GeanyEncoding *encoding;
254 encoding = encodings_get_from_charset(charset);
255 if (encoding != NULL)
256 return encoding->charset;
258 return NULL;
262 const GeanyEncoding *encodings_get_from_index(gint idx)
264 g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
266 return &encodings[idx];
271 * Gets the character set name of the specified index e.g. for use with
272 * @ref document_set_encoding().
274 * @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
277 * @return @nullable The charset according to idx, or @c NULL if the index is invalid.
279 * @since 0.13
281 GEANY_API_SYMBOL
282 const gchar* encodings_get_charset_from_index(gint idx)
284 g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
286 return encodings[idx].charset;
290 gchar *encodings_to_string(const GeanyEncoding* enc)
292 g_return_val_if_fail(enc != NULL, NULL);
293 g_return_val_if_fail(enc->name != NULL, NULL);
294 g_return_val_if_fail(enc->charset != NULL, NULL);
296 if (enc->idx == GEANY_ENCODING_NONE)
297 return g_strdup(enc->name); // enc->charset is "None" and would be useless to display
298 else
299 return g_strdup_printf("%s (%s)", enc->name, enc->charset);
303 const gchar *encodings_get_charset(const GeanyEncoding* enc)
305 g_return_val_if_fail(enc != NULL, NULL);
306 g_return_val_if_fail(enc->charset != NULL, NULL);
308 return enc->charset;
312 static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];
315 void encodings_select_radio_item(const gchar *charset)
317 gint i;
319 g_return_if_fail(charset != NULL);
321 for (i = 0; i < GEANY_ENCODINGS_MAX; i++)
323 if (utils_str_equal(charset, encodings[i].charset))
324 break;
326 if (i == GEANY_ENCODINGS_MAX)
327 i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
329 /* ignore_callback has to be set by the caller */
330 gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
334 /* Regexp detection of file encoding declared in the file itself.
335 * Idea and parts of code taken from Bluefish, thanks.
336 * regex_compile() is used to compile regular expressions on program init and keep it in memory
337 * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
339 static GRegex *regex_compile(const gchar *pattern)
341 GError *error = NULL;
342 GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS | G_REGEX_RAW, 0, &error);
344 if (!regex)
346 geany_debug("Failed to compile encoding regex (%s)", error->message);
347 g_error_free(error);
349 return regex;
353 static gchar *regex_match(GRegex *preg, const gchar *buffer, gsize size)
355 gchar *encoding = NULL;
356 GMatchInfo *minfo;
358 if (G_UNLIKELY(! pregs_loaded || buffer == NULL))
359 return NULL;
361 /* scan only the first 512 characters in the buffer */
362 size = MIN(size, 512);
364 if (g_regex_match_full(preg, buffer, size, 0, 0, &minfo, NULL) &&
365 g_match_info_get_match_count(minfo) >= 2)
367 encoding = g_match_info_fetch(minfo, 1);
368 geany_debug("Detected encoding by regex search: %s", encoding);
370 SETPTR(encoding, g_utf8_strup(encoding, -1));
372 g_match_info_free(minfo);
373 return encoding;
377 static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data)
379 GeanyDocument *doc = document_get_current();
380 const gchar *charset = user_data;
382 if (ignore_callback || doc == NULL || charset == NULL ||
383 ! gtk_check_menu_item_get_active(menuitem) ||
384 utils_str_equal(charset, doc->encoding))
385 return;
387 if (doc->readonly)
389 utils_beep();
390 return;
392 document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding));
394 document_set_encoding(doc, charset);
397 static void encodings_reload_radio_item_change_cb(GtkMenuItem *menuitem, gpointer user_data)
399 GeanyDocument *doc = document_get_current();
401 g_return_if_fail(doc != NULL);
403 document_reload_prompt(doc, user_data);
407 void encodings_finalize(void)
409 if (pregs_loaded)
411 guint i, len;
412 len = G_N_ELEMENTS(pregs);
413 for (i = 0; i < len; i++)
415 g_regex_unref(pregs[i]);
421 /* initialization of non-UI parts */
422 void encodings_init_headless(void)
424 static gboolean initialized = FALSE;
426 if (initialized)
427 return;
429 init_encodings();
431 if (! pregs_loaded)
433 pregs[0] = regex_compile(PATTERN_HTMLMETA);
434 pregs[1] = regex_compile(PATTERN_CODING);
435 pregs_loaded = TRUE;
438 initialized = TRUE;
442 void encodings_init(void)
444 GtkWidget *menu[2];
445 GCallback cb_func[2];
446 const gchar *const groups[GEANY_ENCODING_GROUPS_MAX] =
448 [NONE] = NULL,
449 [WESTEUROPEAN] = N_("_West European"),
450 [EASTEUROPEAN] = N_("_East European"),
451 [EASTASIAN] = N_("East _Asian"),
452 [ASIAN] = N_("_SE & SW Asian"),
453 [MIDDLEEASTERN] = N_("_Middle Eastern"),
454 [UNICODE] = N_("_Unicode"),
457 encodings_init_headless();
459 /* create encodings submenu in document menu */
460 menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
461 menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
462 cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
463 cb_func[1] = G_CALLBACK(encodings_reload_radio_item_change_cb);
465 for (guint k = 0; k < 2; k++)
467 GSList *group = NULL;
468 GtkWidget *submenus[GEANY_ENCODING_GROUPS_MAX];
469 gint orders[GEANY_ENCODING_GROUPS_MAX] = { 0 };
470 guint n_added = 0;
472 for (guint i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
474 if (! groups[i]) /* NONE */
475 submenus[i] = menu[k];
476 else
478 GtkWidget *item = gtk_menu_item_new_with_mnemonic(_(groups[i]));
479 submenus[i] = gtk_menu_new();
480 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item), submenus[i]);
481 gtk_container_add(GTK_CONTAINER(menu[k]), item);
482 gtk_widget_show_all(item);
486 /** TODO can it be optimized? ATM 882 runs at line "if (encodings[i].order ...)" */
489 for (guint i = 0; i < G_N_ELEMENTS(encodings); i++)
491 if (encodings[i].order == orders[encodings[i].group])
493 GtkWidget *item;
494 gchar *label = encodings_to_string(&encodings[i]);
496 if (k == 0) /* Set Encoding menu */
498 item = gtk_radio_menu_item_new_with_label(group, label);
499 group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
500 radio_items[i] = item;
502 else
503 item = gtk_menu_item_new_with_label(label);
504 if (encodings[i].supported)
505 gtk_widget_show(item);
506 gtk_container_add(GTK_CONTAINER(submenus[encodings[i].group]), item);
507 g_signal_connect(item, "activate", cb_func[k],
508 (gpointer) encodings[i].charset);
509 g_free(label);
511 orders[encodings[i].group]++;
512 n_added++;
516 while (n_added < G_N_ELEMENTS(encodings));
521 static gint encoding_combo_store_sort_func(GtkTreeModel *model,
522 GtkTreeIter *a,
523 GtkTreeIter *b,
524 gpointer data)
526 gboolean a_has_child = gtk_tree_model_iter_has_child(model, a);
527 gboolean b_has_child = gtk_tree_model_iter_has_child(model, b);
528 gchar *a_string;
529 gchar *b_string;
530 gint cmp_res;
532 if (a_has_child != b_has_child)
533 return a_has_child ? -1 : 1;
535 gtk_tree_model_get(model, a, 1, &a_string, -1);
536 gtk_tree_model_get(model, b, 1, &b_string, -1);
537 cmp_res = strcmp(a_string, b_string);
538 g_free(a_string);
539 g_free(b_string);
540 return cmp_res;
544 GtkTreeStore *encodings_encoding_store_new(gboolean has_detect)
546 GtkTreeStore *store;
547 GtkTreeIter iter_current, iter_westeuro, iter_easteuro, iter_eastasian,
548 iter_asian, iter_utf8, iter_middleeast;
549 GtkTreeIter *iter_parent;
550 gint i;
552 store = gtk_tree_store_new(2, G_TYPE_INT, G_TYPE_STRING);
554 if (has_detect)
556 gtk_tree_store_append(store, &iter_current, NULL);
557 gtk_tree_store_set(store, &iter_current, 0, GEANY_ENCODINGS_MAX, 1, _("Detect from file"), -1);
560 gtk_tree_store_append(store, &iter_westeuro, NULL);
561 gtk_tree_store_set(store, &iter_westeuro, 0, -1, 1, _("West European"), -1);
562 gtk_tree_store_append(store, &iter_easteuro, NULL);
563 gtk_tree_store_set(store, &iter_easteuro, 0, -1, 1, _("East European"), -1);
564 gtk_tree_store_append(store, &iter_eastasian, NULL);
565 gtk_tree_store_set(store, &iter_eastasian, 0, -1, 1, _("East Asian"), -1);
566 gtk_tree_store_append(store, &iter_asian, NULL);
567 gtk_tree_store_set(store, &iter_asian, 0, -1, 1, _("SE & SW Asian"), -1);
568 gtk_tree_store_append(store, &iter_middleeast, NULL);
569 gtk_tree_store_set(store, &iter_middleeast, 0, -1, 1, _("Middle Eastern"), -1);
570 gtk_tree_store_append(store, &iter_utf8, NULL);
571 gtk_tree_store_set(store, &iter_utf8, 0, -1, 1, _("Unicode"), -1);
573 for (i = 0; i < GEANY_ENCODINGS_MAX; i++)
575 gchar *encoding_string;
577 if (! encodings[i].supported)
578 continue;
580 switch (encodings[i].group)
582 case WESTEUROPEAN: iter_parent = &iter_westeuro; break;
583 case EASTEUROPEAN: iter_parent = &iter_easteuro; break;
584 case EASTASIAN: iter_parent = &iter_eastasian; break;
585 case ASIAN: iter_parent = &iter_asian; break;
586 case MIDDLEEASTERN: iter_parent = &iter_middleeast; break;
587 case UNICODE: iter_parent = &iter_utf8; break;
588 case NONE:
589 default: iter_parent = NULL;
591 gtk_tree_store_append(store, &iter_current, iter_parent);
592 encoding_string = encodings_to_string(&encodings[i]);
593 gtk_tree_store_set(store, &iter_current, 0, i, 1, encoding_string, -1);
594 g_free(encoding_string);
597 gtk_tree_sortable_set_sort_column_id(GTK_TREE_SORTABLE(store), 1, GTK_SORT_ASCENDING);
598 gtk_tree_sortable_set_sort_func(GTK_TREE_SORTABLE(store), 1, encoding_combo_store_sort_func, NULL, NULL);
600 return store;
604 gint encodings_encoding_store_get_encoding(GtkTreeStore *store, GtkTreeIter *iter)
606 gint enc;
607 gtk_tree_model_get(GTK_TREE_MODEL(store), iter, 0, &enc, -1);
608 return enc;
612 gboolean encodings_encoding_store_get_iter(GtkTreeStore *store, GtkTreeIter *iter, gint enc)
614 if (gtk_tree_model_get_iter_first(GTK_TREE_MODEL(store), iter))
618 if (encodings_encoding_store_get_encoding(store, iter) == enc)
619 return TRUE;
621 while (ui_tree_model_iter_any_next(GTK_TREE_MODEL(store), iter, TRUE));
623 return FALSE;
627 void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout,
628 GtkCellRenderer *cell,
629 GtkTreeModel *tree_model,
630 GtkTreeIter *iter,
631 gpointer data)
633 gboolean sensitive = !gtk_tree_model_iter_has_child(tree_model, iter);
634 gchar *text;
636 gtk_tree_model_get(tree_model, iter, 1, &text, -1);
637 g_object_set(cell, "sensitive", sensitive, "text", text, NULL);
638 g_free(text);
642 static gchar *convert_to_utf8_from_charset(const gchar *buffer, gssize size,
643 const gchar *charset, gboolean fast,
644 gsize *utf8_size, GError **error)
646 gchar *utf8_content = NULL;
647 GError *conv_error = NULL;
648 gchar* converted_contents = NULL;
649 gsize bytes_written;
651 g_return_val_if_fail(buffer != NULL, NULL);
652 g_return_val_if_fail(charset != NULL, NULL);
654 converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
655 &bytes_written, &conv_error);
657 if (fast)
659 utf8_content = converted_contents;
660 if (conv_error != NULL) g_propagate_error(error, conv_error);
662 else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
664 if (conv_error != NULL)
666 geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
667 g_propagate_error(error, conv_error);
668 conv_error = NULL;
670 else
672 geany_debug("Couldn't convert from %s to UTF-8.", charset);
673 g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
674 _("Data contains NULs"));
677 utf8_content = NULL;
678 g_free(converted_contents);
680 else
682 geany_debug("Converted from %s to UTF-8.", charset);
683 utf8_content = converted_contents;
686 if (utf8_content && utf8_size)
687 *utf8_size = bytes_written;
689 return utf8_content;
694 * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
695 * If @a fast is not set, additional checks to validate the converted string are performed.
697 * @param buffer The input string to convert.
698 * @param size The length of the string, or -1 if the string is nul-terminated.
699 * @param charset The charset to be used for conversion.
700 * @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
702 * @return If the conversion was successful, a newly allocated nul-terminated string,
703 * which must be freed with @c g_free(). Otherwise @c NULL.
705 GEANY_API_SYMBOL
706 gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size,
707 const gchar *charset, gboolean fast)
709 /* If fast=FALSE, we can safely ignore the size as the output cannot contain NULs.
710 * Otherwise, the caller already agrees on partial data anyway. */
711 return convert_to_utf8_from_charset(buffer, size, charset, fast, NULL, NULL);
715 static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
717 guint i;
719 for (i = 0; i < G_N_ELEMENTS(pregs); i++)
721 gchar *charset;
723 if ((charset = regex_match(pregs[i], buffer, size)) != NULL)
724 return charset;
726 return NULL;
730 static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gssize size,
731 const gchar *suggested_charset, gchar **used_encoding, gsize *utf8_size, GError **error)
733 const gchar *locale_charset = NULL;
734 const gchar *charset;
735 gchar *utf8_content;
736 gboolean check_suggestion = suggested_charset != NULL;
737 gboolean check_locale = FALSE;
738 gint i, preferred_charset;
740 if (size == -1)
742 size = strlen(buffer);
745 /* current locale is not UTF-8, we have to check this charset */
746 check_locale = ! g_get_charset(&locale_charset);
748 /* First check for preferred charset, if specified */
749 preferred_charset = file_prefs.default_open_encoding;
751 if (preferred_charset == (gint) encodings[GEANY_ENCODING_NONE].idx ||
752 preferred_charset < 0 ||
753 preferred_charset >= GEANY_ENCODINGS_MAX)
755 preferred_charset = -1;
758 /* -1 means "Preferred charset" */
759 for (i = -1; i < GEANY_ENCODINGS_MAX; i++)
761 if (G_UNLIKELY(i == (gint) encodings[GEANY_ENCODING_NONE].idx))
762 continue;
764 if (check_suggestion)
766 check_suggestion = FALSE;
767 charset = encodings_normalize_charset(suggested_charset);
768 if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
769 charset = suggested_charset;
770 i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
772 else if (check_locale)
774 check_locale = FALSE;
775 charset = locale_charset;
776 i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
778 else if (i == -1)
780 if (preferred_charset >= 0)
782 charset = encodings[preferred_charset].charset;
783 geany_debug("Using preferred charset: %s", charset);
785 else
786 continue;
788 else if (i >= 0 && encodings[i].supported)
789 charset = encodings[i].charset;
790 else /* in this case we have i == -2, continue to increase i and go ahead */
791 continue;
793 if (G_UNLIKELY(charset == NULL))
794 continue;
796 geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
797 size, charset);
798 utf8_content = convert_to_utf8_from_charset(buffer, size, charset, FALSE, utf8_size, NULL);
800 if (G_LIKELY(utf8_content != NULL))
802 if (used_encoding != NULL)
804 if (G_UNLIKELY(*used_encoding != NULL))
806 geany_debug("%s:%d", __FILE__, __LINE__);
807 g_free(*used_encoding);
809 *used_encoding = g_strdup(charset);
811 return utf8_content;
815 g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
816 _("Data contains NULs or the encoding is not supported"));
818 return NULL;
823 * Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
824 * @a used_encoding.
826 * @param buffer the input string to convert.
827 * @param size the length of the string, or -1 if the string is nul-terminated.
828 * @param used_encoding @out @optional return location of the detected encoding of the input string, or @c NULL.
830 * @return @nullable If the conversion was successful, a newly allocated nul-terminated string,
831 * which must be freed with @c g_free(). Otherwise @c NULL.
833 GEANY_API_SYMBOL
834 gchar *encodings_convert_to_utf8(const gchar *buffer, gssize size, gchar **used_encoding)
836 gchar *regex_charset;
837 gchar *utf8;
839 /* first try to read the encoding from the file content */
840 regex_charset = encodings_check_regexes(buffer, size);
841 /* we know this cannot succeed if there are NULs in the output, so ignoring the size is OK */
842 utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding, NULL, NULL);
843 g_free(regex_charset);
845 return utf8;
849 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
850 * otherwise GEANY_ENCODING_NONE.
851 * */
852 GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
854 if (len >= 3)
856 if (bom_len)
857 *bom_len = 3;
859 if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
860 (guchar)string[2] == 0xbf)
862 return GEANY_ENCODING_UTF_8;
865 if (len >= 4)
867 if (bom_len)
868 *bom_len = 4;
870 if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
871 (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
873 return GEANY_ENCODING_UTF_32BE; /* Big endian */
875 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
876 (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
878 return GEANY_ENCODING_UTF_32LE; /* Little endian */
880 if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
881 (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
883 return GEANY_ENCODING_UTF_7;
886 if (len >= 2)
888 if (bom_len)
889 *bom_len = 2;
891 if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff)
893 return GEANY_ENCODING_UTF_16BE; /* Big endian */
895 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
897 return GEANY_ENCODING_UTF_16LE; /* Little endian */
900 if (bom_len)
901 *bom_len = 0;
902 return GEANY_ENCODING_NONE;
906 gboolean encodings_is_unicode_charset(const gchar *string)
908 if (string != NULL &&
909 (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
911 return TRUE;
913 return FALSE;
917 typedef struct
919 gchar *data; /* null-terminated data */
920 gsize size; /* actual data size */
921 gchar *enc;
922 gboolean bom;
923 } BufferData;
926 /* convert data with the specified encoding */
927 static gboolean
928 handle_forced_encoding(BufferData *buffer, const gchar *forced_enc, GError **error)
930 GeanyEncodingIndex enc_idx;
932 if (utils_str_equal(forced_enc, "UTF-8"))
934 if (! g_utf8_validate(buffer->data, buffer->size, NULL))
936 g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
937 _("Data contains NULs or is not valid UTF-8"));
938 return FALSE;
941 else
943 gchar *converted_text = convert_to_utf8_from_charset(
944 buffer->data, buffer->size, forced_enc, FALSE, &buffer->size, error);
945 if (converted_text == NULL)
947 return FALSE;
949 else
951 SETPTR(buffer->data, converted_text);
954 enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
955 buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);
956 buffer->enc = g_strdup(forced_enc);
957 return TRUE;
961 /* detect encoding and convert to UTF-8 if necessary */
962 static gboolean
963 handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx, GError **error)
965 g_return_val_if_fail(buffer->enc == NULL, FALSE);
966 g_return_val_if_fail(buffer->bom == FALSE, FALSE);
968 if (buffer->size == 0)
970 /* we have no data so assume UTF-8 */
971 buffer->enc = g_strdup("UTF-8");
973 else
975 /* first check for a BOM */
976 if (enc_idx != GEANY_ENCODING_NONE)
978 buffer->enc = g_strdup(encodings[enc_idx].charset);
979 buffer->bom = TRUE;
981 if (enc_idx == GEANY_ENCODING_UTF_8)
983 if (! g_utf8_validate(buffer->data, buffer->size, NULL))
985 /* this is not actually valid UTF-8 */
986 SETPTR(buffer->enc, NULL);
987 buffer->bom = FALSE;
990 else /* the BOM indicated something else than UTF-8 */
992 gchar *converted_text = convert_to_utf8_from_charset(
993 buffer->data, buffer->size, buffer->enc, FALSE, &buffer->size, NULL);
994 if (converted_text != NULL)
996 SETPTR(buffer->data, converted_text);
998 else
1000 /* there was a problem converting data from BOM encoding type */
1001 SETPTR(buffer->enc, NULL);
1002 buffer->bom = FALSE;
1007 if (buffer->enc == NULL) /* either there was no BOM or the BOM encoding failed */
1009 /* first try to read the encoding from the file content */
1010 gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
1012 /* try UTF-8 first */
1013 if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
1014 g_utf8_validate(buffer->data, buffer->size, NULL))
1016 buffer->enc = g_strdup("UTF-8");
1018 else
1020 /* detect the encoding */
1021 gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
1022 buffer->size, regex_charset, &buffer->enc, &buffer->size, error);
1024 if (converted_text == NULL)
1026 g_free(regex_charset);
1027 return FALSE;
1029 SETPTR(buffer->data, converted_text);
1031 g_free(regex_charset);
1034 return TRUE;
1038 static void
1039 handle_bom(BufferData *buffer)
1041 guint bom_len;
1043 encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
1044 g_return_if_fail(bom_len != 0);
1046 /* the contents are already converted into UTF-8 here */
1047 buffer->size -= bom_len;
1048 /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
1049 memmove(buffer->data, buffer->data + bom_len, buffer->size + 1);
1050 buffer->data = g_realloc(buffer->data, buffer->size + 1);
1054 /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
1055 static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc, GError **error)
1057 GeanyEncodingIndex tmp_enc_idx;
1059 /* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
1060 * if we have a BOM */
1061 tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
1063 /* Determine character encoding and convert to UTF-8 */
1064 if (forced_enc != NULL)
1066 /* the encoding should be ignored(requested by user), so open the file "as it is" */
1067 if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
1069 buffer->bom = FALSE;
1070 buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
1072 else if (! handle_forced_encoding(buffer, forced_enc, error))
1074 return FALSE;
1077 else if (! handle_encoding(buffer, tmp_enc_idx, error))
1079 return FALSE;
1082 if (buffer->bom)
1083 handle_bom(buffer);
1084 return TRUE;
1089 * Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
1090 * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
1092 * @param buf a pointer to modifiable null-terminated buffer to convert.
1093 * It may or may not be modified, and should be freed whatever happens.
1094 * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
1095 * file size). It will be updated to the new size.
1096 * @param forced_enc forced encoding to use, or @c NULL
1097 * @param used_encoding return location for the actually used encoding, or @c NULL
1098 * @param has_bom return location to store whether the data had a BOM, or @c NULL
1099 * @param has_nuls return location to store whether the converted data contains NULs, or @c NULL
1101 * @return @C TRUE if the conversion succeeded, @c FALSE otherwise.
1103 GEANY_EXPORT_SYMBOL
1104 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
1105 gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls, GError **error)
1107 BufferData buffer;
1109 buffer.data = *buf;
1110 buffer.size = *size;
1111 buffer.enc = NULL;
1112 buffer.bom = FALSE;
1114 if (! handle_buffer(&buffer, forced_enc, error))
1115 return FALSE;
1117 *size = buffer.size;
1118 if (used_encoding)
1119 *used_encoding = buffer.enc;
1120 else
1121 g_free(buffer.enc);
1122 if (has_bom)
1123 *has_bom = buffer.bom;
1124 if (has_nuls)
1125 *has_nuls = strlen(buffer.data) != buffer.size;
1127 *buf = buffer.data;
1128 return TRUE;