Update of German translation
[geany-mirror.git] / src / encodings.c
blob8a15eb7fd30e31630c41e9ef059653abd61617cf
1 /*
2 * encodings.c - this file is part of Geany, a fast and lightweight IDE
4 * Copyright 2005-2012 Enrico Tröger <enrico(dot)troeger(at)uvena(dot)de>
5 * Copyright 2006-2012 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 * Encoding conversion and Byte Order Mark (BOM) handling.
27 * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
28 * list of people on the gedit Team.
29 * See the gedit ChangeLog files for a list of changes.
31 /* Stolen from anjuta */
33 #include <string.h>
35 #include "geany.h"
36 #include "utils.h"
37 #include "support.h"
38 #include "document.h"
39 #include "documentprivate.h"
40 #include "msgwindow.h"
41 #include "encodings.h"
42 #include "callbacks.h"
43 #include "ui_utils.h"
45 /* <meta http-equiv="content-type" content="text/html; charset=UTF-8" /> */
46 #define PATTERN_HTMLMETA "<meta\\s+http-equiv\\s*=\\s*\"?content-type\"?\\s+content\\s*=\\s*\"text/x?html;\\s*charset=([a-z0-9_-]+)\"\\s*/?>"
47 /* " geany_encoding=utf-8 " or " coding: utf-8 " */
48 #define PATTERN_CODING "coding[\t ]*[:=][\t ]*\"?([a-z0-9-]+)\"?[\t ]*"
50 /* precompiled regexps */
51 static GRegex *pregs[2];
52 static gboolean pregs_loaded = FALSE;
55 GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
/* Fills the slot Idx of the global encodings table.
 *  Order   - position of the entry inside its group's submenu
 *  Group   - the encoding group the entry belongs to
 *  Idx     - table index, also stored in the entry itself
 *  Charset - charset name string
 *  Name    - human readable name
 * Wrapped in do { } while (0) so that the expansion is a single statement and is
 * safe inside unbraced if/else bodies; use it as `fill(...);`. */
#define fill(Order, Group, Idx, Charset, Name) \
	do { \
		encodings[Idx].idx = (Idx); \
		encodings[Idx].order = (Order); \
		encodings[Idx].group = (Group); \
		encodings[Idx].charset = (Charset); \
		encodings[Idx].name = (Name); \
	} while (0)
65 static void init_encodings(void)
67 fill(0, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_14, "ISO-8859-14", _("Celtic"));
68 fill(1, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_7, "ISO-8859-7", _("Greek"));
69 fill(2, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1253, "WINDOWS-1253", _("Greek"));
70 fill(3, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_10, "ISO-8859-10", _("Nordic"));
71 fill(4, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_3, "ISO-8859-3", _("South European"));
72 fill(5, WESTEUROPEAN, GEANY_ENCODING_IBM_850, "IBM850", _("Western"));
73 fill(6, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_1, "ISO-8859-1", _("Western"));
74 fill(7, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_15, "ISO-8859-15", _("Western"));
75 fill(8, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1252, "WINDOWS-1252", _("Western"));
77 fill(0, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_4, "ISO-8859-4", _("Baltic"));
78 fill(1, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_13, "ISO-8859-13", _("Baltic"));
79 fill(2, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1257, "WINDOWS-1257", _("Baltic"));
80 fill(3, EASTEUROPEAN, GEANY_ENCODING_IBM_852, "IBM852", _("Central European"));
81 fill(4, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_2, "ISO-8859-2", _("Central European"));
82 fill(5, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1250, "WINDOWS-1250", _("Central European"));
83 fill(6, EASTEUROPEAN, GEANY_ENCODING_IBM_855, "IBM855", _("Cyrillic"));
84 fill(7, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_5, "ISO-8859-5", _("Cyrillic"));
85 /* ISO-IR-111 not available on Windows */
86 fill(8, EASTEUROPEAN, GEANY_ENCODING_ISO_IR_111, "ISO-IR-111", _("Cyrillic"));
87 fill(9, EASTEUROPEAN, GEANY_ENCODING_KOI8_R, "KOI8-R", _("Cyrillic"));
88 fill(10, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1251, "WINDOWS-1251", _("Cyrillic"));
89 fill(11, EASTEUROPEAN, GEANY_ENCODING_CP_866, "CP866", _("Cyrillic/Russian"));
90 fill(12, EASTEUROPEAN, GEANY_ENCODING_KOI8_U, "KOI8-U", _("Cyrillic/Ukrainian"));
91 fill(13, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_16, "ISO-8859-16", _("Romanian"));
93 fill(0, MIDDLEEASTERN, GEANY_ENCODING_IBM_864, "IBM864", _("Arabic"));
94 fill(1, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_6, "ISO-8859-6", _("Arabic"));
95 fill(2, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1256, "WINDOWS-1256", _("Arabic"));
96 fill(3, MIDDLEEASTERN, GEANY_ENCODING_IBM_862, "IBM862", _("Hebrew"));
97 /* not available at all, ? */
98 fill(4, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8_I, "ISO-8859-8-I", _("Hebrew"));
99 fill(5, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1255, "WINDOWS-1255", _("Hebrew"));
100 fill(6, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8, "ISO-8859-8", _("Hebrew Visual"));
102 fill(0, ASIAN, GEANY_ENCODING_ARMSCII_8, "ARMSCII-8", _("Armenian"));
103 fill(1, ASIAN, GEANY_ENCODING_GEOSTD8, "GEORGIAN-ACADEMY", _("Georgian"));
104 fill(2, ASIAN, GEANY_ENCODING_TIS_620, "TIS-620", _("Thai"));
105 fill(3, ASIAN, GEANY_ENCODING_IBM_857, "IBM857", _("Turkish"));
106 fill(4, ASIAN, GEANY_ENCODING_WINDOWS_1254, "WINDOWS-1254", _("Turkish"));
107 fill(5, ASIAN, GEANY_ENCODING_ISO_8859_9, "ISO-8859-9", _("Turkish"));
108 fill(6, ASIAN, GEANY_ENCODING_TCVN, "TCVN", _("Vietnamese"));
109 fill(7, ASIAN, GEANY_ENCODING_VISCII, "VISCII", _("Vietnamese"));
110 fill(8, ASIAN, GEANY_ENCODING_WINDOWS_1258, "WINDOWS-1258", _("Vietnamese"));
112 fill(0, UNICODE, GEANY_ENCODING_UTF_7, "UTF-7", _("Unicode"));
113 fill(1, UNICODE, GEANY_ENCODING_UTF_8, "UTF-8", _("Unicode"));
114 fill(2, UNICODE, GEANY_ENCODING_UTF_16LE, "UTF-16LE", _("Unicode"));
115 fill(3, UNICODE, GEANY_ENCODING_UTF_16BE, "UTF-16BE", _("Unicode"));
116 fill(4, UNICODE, GEANY_ENCODING_UCS_2LE, "UCS-2LE", _("Unicode"));
117 fill(5, UNICODE, GEANY_ENCODING_UCS_2BE, "UCS-2BE", _("Unicode"));
118 fill(6, UNICODE, GEANY_ENCODING_UTF_32LE, "UTF-32LE", _("Unicode"));
119 fill(7, UNICODE, GEANY_ENCODING_UTF_32BE, "UTF-32BE", _("Unicode"));
121 fill(0, EASTASIAN, GEANY_ENCODING_GB18030, "GB18030", _("Chinese Simplified"));
122 fill(1, EASTASIAN, GEANY_ENCODING_GB2312, "GB2312", _("Chinese Simplified"));
123 fill(2, EASTASIAN, GEANY_ENCODING_GBK, "GBK", _("Chinese Simplified"));
124 /* maybe not available on Linux */
125 fill(3, EASTASIAN, GEANY_ENCODING_HZ, "HZ", _("Chinese Simplified"));
126 fill(4, EASTASIAN, GEANY_ENCODING_BIG5, "BIG5", _("Chinese Traditional"));
127 fill(5, EASTASIAN, GEANY_ENCODING_BIG5_HKSCS, "BIG5-HKSCS", _("Chinese Traditional"));
128 fill(6, EASTASIAN, GEANY_ENCODING_EUC_TW, "EUC-TW", _("Chinese Traditional"));
129 fill(7, EASTASIAN, GEANY_ENCODING_EUC_JP, "EUC-JP", _("Japanese"));
130 fill(8, EASTASIAN, GEANY_ENCODING_ISO_2022_JP, "ISO-2022-JP", _("Japanese"));
131 fill(9, EASTASIAN, GEANY_ENCODING_SHIFT_JIS, "SHIFT_JIS", _("Japanese"));
132 fill(10, EASTASIAN, GEANY_ENCODING_CP_932, "CP932", _("Japanese"));
133 fill(11, EASTASIAN, GEANY_ENCODING_EUC_KR, "EUC-KR", _("Korean"));
134 fill(12, EASTASIAN, GEANY_ENCODING_ISO_2022_KR, "ISO-2022-KR", _("Korean"));
135 fill(13, EASTASIAN, GEANY_ENCODING_JOHAB, "JOHAB", _("Korean"));
136 fill(14, EASTASIAN, GEANY_ENCODING_UHC, "UHC", _("Korean"));
138 fill(0, NONE, GEANY_ENCODING_NONE, "None", _("Without encoding"));
142 /* compares two encoding names in a permissive fashion.
143 * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
144 static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
146 gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */
147 gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */
149 while (*a && *b)
151 gboolean is_alpha;
153 if (g_ascii_toupper(*a) == g_ascii_toupper(*b) &&
154 ((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a)))
156 /* either there was a real separator, or we need a implicit one (a chage from alpha to
157 * numeric or so) */
158 if (! need_sep || (was_alpha != is_alpha))
160 a++;
161 b++;
162 was_alpha = is_alpha;
163 need_sep = FALSE;
165 else
166 return FALSE;
168 else
170 guint n_sep = 0;
172 if (! g_ascii_isalnum(*a))
174 a++;
175 n_sep++;
177 if (! g_ascii_isalnum(*b))
179 b++;
180 n_sep++;
182 if (n_sep < 1)
183 return FALSE;
184 else if (n_sep < 2)
185 need_sep = TRUE;
188 return *a == *b;
192 GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
194 gint i;
196 if (charset == NULL)
197 return GEANY_ENCODING_UTF_8;
199 i = 0;
200 while (i < GEANY_ENCODINGS_MAX)
202 if (encodings_charset_equals(charset, encodings[i].charset))
203 return i;
205 ++i;
207 return GEANY_ENCODING_UTF_8;
211 const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
213 gint i;
215 if (charset == NULL)
216 return &encodings[GEANY_ENCODING_UTF_8];
218 i = 0;
219 while (i < GEANY_ENCODINGS_MAX)
221 if (encodings_charset_equals(charset, encodings[i].charset))
222 return &encodings[i];
224 ++i;
227 return NULL;
231 static const gchar *encodings_normalize_charset(const gchar *charset)
233 const GeanyEncoding *encoding;
235 encoding = encodings_get_from_charset(charset);
236 if (encoding != NULL)
237 return encoding->charset;
239 return NULL;
243 const GeanyEncoding *encodings_get_from_index(gint idx)
245 g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
247 return &encodings[idx];
252 * Gets the character set name of the specified index e.g. for use with
253 * @ref document_set_encoding().
255 * @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
258 * @return The charset according to idx, or @c NULL if the index is invalid.
260 * @since 0.13
262 const gchar* encodings_get_charset_from_index(gint idx)
264 g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
266 return encodings[idx].charset;
270 gchar *encodings_to_string(const GeanyEncoding* enc)
272 g_return_val_if_fail(enc != NULL, NULL);
273 g_return_val_if_fail(enc->name != NULL, NULL);
274 g_return_val_if_fail(enc->charset != NULL, NULL);
276 return g_strdup_printf("%s (%s)", enc->name, enc->charset);
280 const gchar *encodings_get_charset(const GeanyEncoding* enc)
282 g_return_val_if_fail(enc != NULL, NULL);
283 g_return_val_if_fail(enc->charset != NULL, NULL);
285 return enc->charset;
289 static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];
292 void encodings_select_radio_item(const gchar *charset)
294 gint i;
296 g_return_if_fail(charset != NULL);
298 i = 0;
299 while (i < GEANY_ENCODINGS_MAX)
301 if (utils_str_equal(charset, encodings[i].charset))
302 break;
303 i++;
305 if (i == GEANY_ENCODINGS_MAX)
306 i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
308 /* ignore_callback has to be set by the caller */
309 gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
313 /* Regexp detection of file encoding declared in the file itself.
314 * Idea and parts of code taken from Bluefish, thanks.
315 * regex_compile() is used to compile regular expressions on program init and keep it in memory
316 * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
318 static GRegex *regex_compile(const gchar *pattern)
320 GError *error = NULL;
321 GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS, 0, &error);
323 if (!regex)
325 geany_debug("Failed to compile encoding regex (%s)", error->message);
326 g_error_free(error);
328 return regex;
332 static gchar *regex_match(GRegex *preg, const gchar *buffer, gsize size)
334 gchar *encoding = NULL;
335 GMatchInfo *minfo;
337 if (G_UNLIKELY(! pregs_loaded || buffer == NULL))
338 return NULL;
340 /* scan only the first 512 characters in the buffer */
341 size = MIN(size, 512);
343 if (g_regex_match_full(preg, buffer, size, 0, 0, &minfo, NULL) &&
344 g_match_info_get_match_count(minfo) >= 2)
346 encoding = g_match_info_fetch(minfo, 1);
347 geany_debug("Detected encoding by regex search: %s", encoding);
349 SETPTR(encoding, g_utf8_strup(encoding, -1));
351 g_match_info_free(minfo);
352 return encoding;
356 static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data)
358 GeanyDocument *doc = document_get_current();
359 guint i = GPOINTER_TO_INT(user_data);
361 if (ignore_callback || doc == NULL || encodings[i].charset == NULL ||
362 ! gtk_check_menu_item_get_active(menuitem) ||
363 utils_str_equal(encodings[i].charset, doc->encoding))
364 return;
366 if (doc->readonly)
368 utils_beep();
369 return;
371 document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding));
373 document_set_encoding(doc, encodings[i].charset);
377 void encodings_finalize(void)
379 if (pregs_loaded)
381 guint i, len;
382 len = G_N_ELEMENTS(pregs);
383 for (i = 0; i < len; i++)
385 g_regex_unref(pregs[i]);
391 void encodings_init(void)
393 GtkWidget *item, *menu[2], *submenu, *menu_westeuro, *menu_easteuro, *menu_eastasian, *menu_asian,
394 *menu_utf8, *menu_middleeast, *item_westeuro, *item_easteuro, *item_eastasian,
395 *item_asian, *item_utf8, *item_middleeast;
396 GCallback cb_func[2];
397 GSList *group = NULL;
398 gchar *label;
399 gint order, group_size;
400 guint i, j, k;
402 init_encodings();
404 if (! pregs_loaded)
406 pregs[0] = regex_compile(PATTERN_HTMLMETA);
407 pregs[1] = regex_compile(PATTERN_CODING);
408 pregs_loaded = TRUE;
411 /* create encodings submenu in document menu */
412 menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
413 menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
414 cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
415 cb_func[1] = G_CALLBACK(on_reload_as_activate);
417 for (k = 0; k < 2; k++)
419 menu_westeuro = gtk_menu_new();
420 item_westeuro = gtk_menu_item_new_with_mnemonic(_("_West European"));
421 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_westeuro), menu_westeuro);
422 gtk_container_add(GTK_CONTAINER(menu[k]), item_westeuro);
423 gtk_widget_show_all(item_westeuro);
425 menu_easteuro = gtk_menu_new();
426 item_easteuro = gtk_menu_item_new_with_mnemonic(_("_East European"));
427 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_easteuro), menu_easteuro);
428 gtk_container_add(GTK_CONTAINER(menu[k]), item_easteuro);
429 gtk_widget_show_all(item_easteuro);
431 menu_eastasian = gtk_menu_new();
432 item_eastasian = gtk_menu_item_new_with_mnemonic(_("East _Asian"));
433 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_eastasian), menu_eastasian);
434 gtk_container_add(GTK_CONTAINER(menu[k]), item_eastasian);
435 gtk_widget_show_all(item_eastasian);
437 menu_asian = gtk_menu_new();
438 item_asian = gtk_menu_item_new_with_mnemonic(_("_SE & SW Asian"));
439 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_asian), menu_asian);
440 gtk_container_add(GTK_CONTAINER(menu[k]), item_asian);
441 gtk_widget_show_all(item_asian);
443 menu_middleeast = gtk_menu_new();
444 item_middleeast = gtk_menu_item_new_with_mnemonic(_("_Middle Eastern"));
445 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_middleeast), menu_middleeast);
446 gtk_container_add(GTK_CONTAINER(menu[k]), item_middleeast);
447 gtk_widget_show_all(item_middleeast);
449 menu_utf8 = gtk_menu_new();
450 item_utf8 = gtk_menu_item_new_with_mnemonic(_("_Unicode"));
451 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_utf8), menu_utf8);
452 gtk_container_add(GTK_CONTAINER(menu[k]), item_utf8);
453 gtk_widget_show_all(item_utf8);
455 /** TODO can it be optimized? ATM 3782 runs at line "if (encodings[j].group ...)" */
456 for (i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
458 order = 0;
459 switch (i)
461 case WESTEUROPEAN: submenu = menu_westeuro; group_size = 9; break;
462 case EASTEUROPEAN: submenu = menu_easteuro; group_size = 14; break;
463 case EASTASIAN: submenu = menu_eastasian; group_size = 14; break;
464 case ASIAN: submenu = menu_asian; group_size = 9; break;
465 case MIDDLEEASTERN: submenu = menu_middleeast; group_size = 7; break;
466 case UNICODE: submenu = menu_utf8; group_size = 8; break;
467 default: submenu = menu[k]; group_size = 1;
470 while (order < group_size) /* the biggest group has 13 elements */
472 for (j = 0; j < GEANY_ENCODINGS_MAX; j++)
474 if (encodings[j].group == i && encodings[j].order == order)
476 label = encodings_to_string(&encodings[j]);
477 if (k == 0)
479 item = gtk_radio_menu_item_new_with_label(group, label);
480 group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
481 radio_items[j] = item;
483 else
484 item = gtk_menu_item_new_with_label(label);
485 gtk_widget_show(item);
486 gtk_container_add(GTK_CONTAINER(submenu), item);
487 g_signal_connect(item, "activate",
488 cb_func[k], GINT_TO_POINTER(encodings[j].idx));
489 g_free(label);
490 break;
493 order++;
501 * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
502 * If @a fast is not set, additional checks to validate the converted string are performed.
504 * @param buffer The input string to convert.
505 * @param size The length of the string, or -1 if the string is nul-terminated.
506 * @param charset The charset to be used for conversion.
507 * @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
509 * @return If the conversion was successful, a newly allocated nul-terminated string,
510 * which must be freed with @c g_free(). Otherwise @c NULL.
512 gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size,
513 const gchar *charset, gboolean fast)
515 gchar *utf8_content = NULL;
516 GError *conv_error = NULL;
517 gchar* converted_contents = NULL;
518 gsize bytes_written;
520 g_return_val_if_fail(buffer != NULL, NULL);
521 g_return_val_if_fail(charset != NULL, NULL);
523 converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
524 &bytes_written, &conv_error);
526 if (fast)
528 utf8_content = converted_contents;
529 if (conv_error != NULL) g_error_free(conv_error);
531 else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
533 if (conv_error != NULL)
535 geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
536 g_error_free(conv_error);
537 conv_error = NULL;
539 else
540 geany_debug("Couldn't convert from %s to UTF-8.", charset);
542 utf8_content = NULL;
543 g_free(converted_contents);
545 else
547 geany_debug("Converted from %s to UTF-8.", charset);
548 utf8_content = converted_contents;
551 return utf8_content;
555 static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
557 guint i;
559 for (i = 0; i < G_N_ELEMENTS(pregs); i++)
561 gchar *charset;
563 if ((charset = regex_match(pregs[i], buffer, size)) != NULL)
564 return charset;
566 return NULL;
570 static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gssize size,
571 const gchar *suggested_charset, gchar **used_encoding)
573 const gchar *locale_charset = NULL;
574 const gchar *charset;
575 gchar *utf8_content;
576 gboolean check_suggestion = suggested_charset != NULL;
577 gboolean check_locale = FALSE;
578 gint i, preferred_charset;
580 if (size == -1)
582 size = strlen(buffer);
585 /* current locale is not UTF-8, we have to check this charset */
586 check_locale = ! g_get_charset(&locale_charset);
588 /* First check for preferred charset, if specified */
589 preferred_charset = file_prefs.default_open_encoding;
591 if (preferred_charset == encodings[GEANY_ENCODING_NONE].idx ||
592 preferred_charset < 0 ||
593 preferred_charset >= GEANY_ENCODINGS_MAX)
595 preferred_charset = -1;
598 /* -1 means "Preferred charset" */
599 for (i = -1; i < GEANY_ENCODINGS_MAX; i++)
601 if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
602 continue;
604 if (check_suggestion)
606 check_suggestion = FALSE;
607 charset = encodings_normalize_charset(suggested_charset);
608 if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
609 charset = suggested_charset;
610 i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
612 else if (check_locale)
614 check_locale = FALSE;
615 charset = locale_charset;
616 i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
618 else if (i == -1)
620 if (preferred_charset >= 0)
622 charset = encodings[preferred_charset].charset;
623 geany_debug("Using preferred charset: %s", charset);
625 else
626 continue;
628 else if (i >= 0)
629 charset = encodings[i].charset;
630 else /* in this case we have i == -2, continue to increase i and go ahead */
631 continue;
633 if (G_UNLIKELY(charset == NULL))
634 continue;
636 geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
637 size, charset);
638 utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);
640 if (G_LIKELY(utf8_content != NULL))
642 if (used_encoding != NULL)
644 if (G_UNLIKELY(*used_encoding != NULL))
646 geany_debug("%s:%d", __FILE__, __LINE__);
647 g_free(*used_encoding);
649 *used_encoding = g_strdup(charset);
651 return utf8_content;
655 return NULL;
660 * Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
661 * @a used_encoding.
663 * @param buffer the input string to convert.
664 * @param size the length of the string, or -1 if the string is nul-terminated.
665 * @param used_encoding return location of the detected encoding of the input string, or @c NULL.
667 * @return If the conversion was successful, a newly allocated nul-terminated string,
668 * which must be freed with @c g_free(). Otherwise @c NULL.
670 gchar *encodings_convert_to_utf8(const gchar *buffer, gssize size, gchar **used_encoding)
672 gchar *regex_charset;
673 gchar *utf8;
675 /* first try to read the encoding from the file content */
676 regex_charset = encodings_check_regexes(buffer, size);
677 utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding);
678 g_free(regex_charset);
680 return utf8;
684 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
685 * otherwise GEANY_ENCODING_NONE.
686 * */
687 GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
689 if (len >= 3)
691 if (bom_len)
692 *bom_len = 3;
694 if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
695 (guchar)string[2] == 0xbf)
697 return GEANY_ENCODING_UTF_8;
700 if (len >= 4)
702 if (bom_len)
703 *bom_len = 4;
705 if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
706 (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
708 return GEANY_ENCODING_UTF_32BE; /* Big endian */
710 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
711 (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
713 return GEANY_ENCODING_UTF_32LE; /* Little endian */
715 if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
716 (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
718 return GEANY_ENCODING_UTF_7;
721 if (len >= 2)
723 if (bom_len)
724 *bom_len = 2;
726 if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff)
728 return GEANY_ENCODING_UTF_16BE; /* Big endian */
730 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
732 return GEANY_ENCODING_UTF_16LE; /* Little endian */
735 if (bom_len)
736 *bom_len = 0;
737 return GEANY_ENCODING_NONE;
741 gboolean encodings_is_unicode_charset(const gchar *string)
743 if (string != NULL &&
744 (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
746 return TRUE;
748 return FALSE;
752 typedef struct
754 gchar *data; /* null-terminated data */
755 gsize size; /* actual data size */
756 gsize len; /* string length of data */
757 gchar *enc;
758 gboolean bom;
759 gboolean partial;
760 } BufferData;
763 /* convert data with the specified encoding */
764 static gboolean
765 handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
767 GeanyEncodingIndex enc_idx;
769 if (utils_str_equal(forced_enc, "UTF-8"))
771 if (! g_utf8_validate(buffer->data, buffer->len, NULL))
773 return FALSE;
776 else
778 gchar *converted_text = encodings_convert_to_utf8_from_charset(
779 buffer->data, buffer->size, forced_enc, FALSE);
780 if (converted_text == NULL)
782 return FALSE;
784 else
786 SETPTR(buffer->data, converted_text);
787 buffer->len = strlen(converted_text);
790 enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
791 buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);
792 buffer->enc = g_strdup(forced_enc);
793 return TRUE;
797 /* detect encoding and convert to UTF-8 if necessary */
798 static gboolean
799 handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
801 g_return_val_if_fail(buffer->enc == NULL, FALSE);
802 g_return_val_if_fail(buffer->bom == FALSE, FALSE);
804 if (buffer->size == 0)
806 /* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty
807 * e.g. UTF32 file with a BOM(so size is 4, len is 0) */
808 buffer->enc = g_strdup("UTF-8");
810 else
812 /* first check for a BOM */
813 if (enc_idx != GEANY_ENCODING_NONE)
815 buffer->enc = g_strdup(encodings[enc_idx].charset);
816 buffer->bom = TRUE;
818 if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
820 gchar *converted_text = encodings_convert_to_utf8_from_charset(
821 buffer->data, buffer->size, buffer->enc, FALSE);
822 if (converted_text != NULL)
824 SETPTR(buffer->data, converted_text);
825 buffer->len = strlen(converted_text);
827 else
829 /* there was a problem converting data from BOM encoding type */
830 SETPTR(buffer->enc, NULL);
831 buffer->bom = FALSE;
836 if (buffer->enc == NULL) /* either there was no BOM or the BOM encoding failed */
838 /* first try to read the encoding from the file content */
839 gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
841 /* try UTF-8 first */
842 if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
843 (buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
845 buffer->enc = g_strdup("UTF-8");
847 else
849 /* detect the encoding */
850 gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
851 buffer->size, regex_charset, &buffer->enc);
853 if (converted_text == NULL)
855 g_free(regex_charset);
856 return FALSE;
858 SETPTR(buffer->data, converted_text);
859 buffer->len = strlen(converted_text);
861 g_free(regex_charset);
864 return TRUE;
868 static void
869 handle_bom(BufferData *buffer)
871 guint bom_len;
873 encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
874 g_return_if_fail(bom_len != 0);
876 /* use filedata->len here because the contents are already converted into UTF-8 */
877 buffer->len -= bom_len;
878 /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
879 g_memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
880 buffer->data = g_realloc(buffer->data, buffer->len + 1);
884 /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
885 static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
887 GeanyEncodingIndex tmp_enc_idx;
889 /* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
890 * if we have a BOM */
891 tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
893 /* check whether the size of the loaded data is equal to the size of the file in the
894 * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
895 * file size of 0 bytes */
896 if (buffer->len != buffer->size && buffer->size != 0 && (
897 tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
898 tmp_enc_idx == GEANY_ENCODING_UTF_7)) /* filter UTF-7/8 where no NULL bytes are allowed */
900 buffer->partial = TRUE;
903 /* Determine character encoding and convert to UTF-8 */
904 if (forced_enc != NULL)
906 /* the encoding should be ignored(requested by user), so open the file "as it is" */
907 if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
909 buffer->bom = FALSE;
910 buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
912 else if (! handle_forced_encoding(buffer, forced_enc))
914 return FALSE;
917 else if (! handle_encoding(buffer, tmp_enc_idx))
919 return FALSE;
922 if (buffer->bom)
923 handle_bom(buffer);
924 return TRUE;
929 * Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
930 * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
932 * @param buf a pointer to modifiable null-terminated buffer to convert.
933 * It may or may not be modified, and should be freed whatever happens.
934 * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
935 * file size). It will be updated to the new size.
936 * @param forced_enc forced encoding to use, or @c NULL
937 * @param used_encoding return location for the actually used encoding, or @c NULL
938 * @param has_bom return location to store whether the data had a BOM, or @c NULL
939 * @param partial return location to store whether the conversion may be partial, or @c NULL
941 * @return @C TRUE if the conversion succeeded, @c FALSE otherwise.
943 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
944 gchar **used_encoding, gboolean *has_bom, gboolean *partial)
946 BufferData buffer;
948 buffer.data = *buf;
949 buffer.size = *size;
950 /* use strlen to check for null chars */
951 buffer.len = strlen(buffer.data);
952 buffer.enc = NULL;
953 buffer.bom = FALSE;
954 buffer.partial = FALSE;
956 if (! handle_buffer(&buffer, forced_enc))
957 return FALSE;
959 *size = buffer.len;
960 if (used_encoding)
961 *used_encoding = buffer.enc;
962 else
963 g_free(buffer.enc);
964 if (has_bom)
965 *has_bom = buffer.bom;
966 if (partial)
967 *partial = buffer.partial;
969 *buf = buffer.data;
970 return TRUE;