Version bump.
[geany-mirror.git] / src / encodings.c
blob29e800dd3e0505b26e4a67ed558bdbb691a300fb
1 /*
2 * encodings.c - this file is part of Geany, a fast and lightweight IDE
4 * Copyright 2005-2010 Enrico Tröger <enrico(dot)troeger(at)uvena(dot)de>
5 * Copyright 2006-2010 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * $Id$
25 * Encoding conversion and Byte Order Mark (BOM) handling.
29 * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
30 * list of people on the gedit Team.
31 * See the gedit ChangeLog files for a list of changes.
33 /* Stolen from anjuta */
35 #include <string.h>
37 #include "geany.h"
38 #include "utils.h"
39 #include "support.h"
40 #include "document.h"
41 #include "documentprivate.h"
42 #include "msgwindow.h"
43 #include "encodings.h"
44 #include "callbacks.h"
45 #include "ui_utils.h"
#ifdef HAVE_REGCOMP
# ifdef HAVE_REGEX_H
#  include <regex.h>
# else
#  include "gnuregex.h"
# endif
/* Matches an (X)HTML meta tag that declares the document charset, e.g.
 * <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
 * Subexpression 1 captures the charset name. One or more whitespace characters
 * are accepted between "<meta" and "http-equiv". */
# define PATTERN_HTMLMETA "<meta[ \t\n\r\f]+http-equiv[ \t\n\r\f]*=[ \t\n\r\f]*\"content-type\"[ \t\n\r\f]+content[ \t\n\r\f]*=[ \t\n\r\f]*\"text/x?html;[ \t\n\r\f]*charset=([a-z0-9_-]+)\"[ \t\n\r\f]*/?>"
/* Matches " geany_encoding=utf-8 " or " coding: utf-8 ".
 * Subexpression 1 captures the charset name; '_' is accepted so that names
 * such as "shift_jis" are captured completely. */
# define PATTERN_CODING "coding[\t ]*[:=][\t ]*([a-z0-9_-]+)[\t ]*"
/* precompiled regexps, one per pattern above */
static regex_t pregs[2];
/* TRUE once the entries of pregs have been passed to regcomp() */
static gboolean pregs_loaded = FALSE;
#endif
64 GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
67 #define fill(Order, Group, Idx, Charset, Name) \
68 encodings[Idx].idx = Idx; \
69 encodings[Idx].order = Order; \
70 encodings[Idx].group = Group; \
71 encodings[Idx].charset = Charset; \
72 encodings[Idx].name = Name;
74 static void init_encodings(void)
76 fill(0, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_14, "ISO-8859-14", _("Celtic"));
77 fill(1, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_7, "ISO-8859-7", _("Greek"));
78 fill(2, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1253, "WINDOWS-1253", _("Greek"));
79 fill(3, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_10, "ISO-8859-10", _("Nordic"));
80 fill(4, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_3, "ISO-8859-3", _("South European"));
81 fill(5, WESTEUROPEAN, GEANY_ENCODING_IBM_850, "IBM850", _("Western"));
82 fill(6, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_1, "ISO-8859-1", _("Western"));
83 fill(7, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_15, "ISO-8859-15", _("Western"));
84 fill(8, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1252, "WINDOWS-1252", _("Western"));
86 fill(0, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_4, "ISO-8859-4", _("Baltic"));
87 fill(1, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_13, "ISO-8859-13", _("Baltic"));
88 fill(2, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1257, "WINDOWS-1257", _("Baltic"));
89 fill(3, EASTEUROPEAN, GEANY_ENCODING_IBM_852, "IBM852", _("Central European"));
90 fill(4, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_2, "ISO-8859-2", _("Central European"));
91 fill(5, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1250, "WINDOWS-1250", _("Central European"));
92 fill(6, EASTEUROPEAN, GEANY_ENCODING_IBM_855, "IBM855", _("Cyrillic"));
93 fill(7, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_5, "ISO-8859-5", _("Cyrillic"));
94 /* ISO-IR-111 not available on Windows */
95 fill(8, EASTEUROPEAN, GEANY_ENCODING_ISO_IR_111, "ISO-IR-111", _("Cyrillic"));
96 fill(9, EASTEUROPEAN, GEANY_ENCODING_KOI8_R, "KOI8-R", _("Cyrillic"));
97 fill(10, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1251, "WINDOWS-1251", _("Cyrillic"));
98 fill(11, EASTEUROPEAN, GEANY_ENCODING_CP_866, "CP866", _("Cyrillic/Russian"));
99 fill(12, EASTEUROPEAN, GEANY_ENCODING_KOI8_U, "KOI8-U", _("Cyrillic/Ukrainian"));
100 fill(13, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_16, "ISO-8859-16", _("Romanian"));
102 fill(0, MIDDLEEASTERN, GEANY_ENCODING_IBM_864, "IBM864", _("Arabic"));
103 fill(1, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_6, "ISO-8859-6", _("Arabic"));
104 fill(2, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1256, "WINDOWS-1256", _("Arabic"));
105 fill(3, MIDDLEEASTERN, GEANY_ENCODING_IBM_862, "IBM862", _("Hebrew"));
106 /* not available at all, ? */
107 fill(4, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8_I, "ISO-8859-8-I", _("Hebrew"));
108 fill(5, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1255, "WINDOWS-1255", _("Hebrew"));
109 fill(6, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8, "ISO-8859-8", _("Hebrew Visual"));
111 fill(0, ASIAN, GEANY_ENCODING_ARMSCII_8, "ARMSCII-8", _("Armenian"));
112 fill(1, ASIAN, GEANY_ENCODING_GEOSTD8, "GEORGIAN-ACADEMY", _("Georgian"));
113 fill(2, ASIAN, GEANY_ENCODING_TIS_620, "TIS-620", _("Thai"));
114 fill(3, ASIAN, GEANY_ENCODING_IBM_857, "IBM857", _("Turkish"));
115 fill(4, ASIAN, GEANY_ENCODING_WINDOWS_1254, "WINDOWS-1254", _("Turkish"));
116 fill(5, ASIAN, GEANY_ENCODING_ISO_8859_9, "ISO-8859-9", _("Turkish"));
117 fill(6, ASIAN, GEANY_ENCODING_TCVN, "TCVN", _("Vietnamese"));
118 fill(7, ASIAN, GEANY_ENCODING_VISCII, "VISCII", _("Vietnamese"));
119 fill(8, ASIAN, GEANY_ENCODING_WINDOWS_1258, "WINDOWS-1258", _("Vietnamese"));
121 fill(0, UNICODE, GEANY_ENCODING_UTF_7, "UTF-7", _("Unicode"));
122 fill(1, UNICODE, GEANY_ENCODING_UTF_8, "UTF-8", _("Unicode"));
123 fill(2, UNICODE, GEANY_ENCODING_UTF_16LE, "UTF-16LE", _("Unicode"));
124 fill(3, UNICODE, GEANY_ENCODING_UTF_16BE, "UTF-16BE", _("Unicode"));
125 fill(4, UNICODE, GEANY_ENCODING_UCS_2LE, "UCS-2LE", _("Unicode"));
126 fill(5, UNICODE, GEANY_ENCODING_UCS_2BE, "UCS-2BE", _("Unicode"));
127 fill(6, UNICODE, GEANY_ENCODING_UTF_32LE, "UTF-32LE", _("Unicode"));
128 fill(7, UNICODE, GEANY_ENCODING_UTF_32BE, "UTF-32BE", _("Unicode"));
130 fill(0, EASTASIAN, GEANY_ENCODING_GB18030, "GB18030", _("Chinese Simplified"));
131 fill(1, EASTASIAN, GEANY_ENCODING_GB2312, "GB2312", _("Chinese Simplified"));
132 fill(2, EASTASIAN, GEANY_ENCODING_GBK, "GBK", _("Chinese Simplified"));
133 /* maybe not available on Linux */
134 fill(3, EASTASIAN, GEANY_ENCODING_HZ, "HZ", _("Chinese Simplified"));
135 fill(4, EASTASIAN, GEANY_ENCODING_BIG5, "BIG5", _("Chinese Traditional"));
136 fill(5, EASTASIAN, GEANY_ENCODING_BIG5_HKSCS, "BIG5-HKSCS", _("Chinese Traditional"));
137 fill(6, EASTASIAN, GEANY_ENCODING_EUC_TW, "EUC-TW", _("Chinese Traditional"));
138 fill(7, EASTASIAN, GEANY_ENCODING_EUC_JP, "EUC-JP", _("Japanese"));
139 fill(8, EASTASIAN, GEANY_ENCODING_ISO_2022_JP, "ISO-2022-JP", _("Japanese"));
140 fill(9, EASTASIAN, GEANY_ENCODING_SHIFT_JIS, "SHIFT_JIS", _("Japanese"));
141 fill(10, EASTASIAN, GEANY_ENCODING_CP_932, "CP932", _("Japanese"));
142 fill(11, EASTASIAN, GEANY_ENCODING_EUC_KR, "EUC-KR", _("Korean"));
143 fill(12, EASTASIAN, GEANY_ENCODING_ISO_2022_KR, "ISO-2022-KR", _("Korean"));
144 fill(13, EASTASIAN, GEANY_ENCODING_JOHAB, "JOHAB", _("Korean"));
145 fill(14, EASTASIAN, GEANY_ENCODING_UHC, "UHC", _("Korean"));
147 fill(0, NONE, GEANY_ENCODING_NONE, "None", _("Without encoding"));
151 GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
153 gint i;
155 if (charset == NULL)
156 return GEANY_ENCODING_UTF_8;
158 i = 0;
159 while (i < GEANY_ENCODINGS_MAX)
161 if (strcmp(charset, encodings[i].charset) == 0)
162 return i;
164 ++i;
166 return GEANY_ENCODING_UTF_8;
170 const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
172 gint i;
174 if (charset == NULL)
175 return &encodings[GEANY_ENCODING_UTF_8];
177 i = 0;
178 while (i < GEANY_ENCODINGS_MAX)
180 if (strcmp(charset, encodings[i].charset) == 0)
181 return &encodings[i];
183 ++i;
186 return NULL;
190 const GeanyEncoding *encodings_get_from_index(gint idx)
192 g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
194 return &encodings[idx];
199 * Gets the character set name of the specified index e.g. for use with
200 * @ref document_set_encoding().
202 * @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
205 * @return The charset according to idx, or @c NULL if the index is invalid.
207 * @since 0.13
209 const gchar* encodings_get_charset_from_index(gint idx)
211 g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
213 return encodings[idx].charset;
217 gchar *encodings_to_string(const GeanyEncoding* enc)
219 g_return_val_if_fail(enc != NULL, NULL);
220 g_return_val_if_fail(enc->name != NULL, NULL);
221 g_return_val_if_fail(enc->charset != NULL, NULL);
223 return g_strdup_printf("%s (%s)", enc->name, enc->charset);
227 const gchar *encodings_get_charset(const GeanyEncoding* enc)
229 g_return_val_if_fail(enc != NULL, NULL);
230 g_return_val_if_fail(enc->charset != NULL, NULL);
232 return enc->charset;
236 static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];
239 void encodings_select_radio_item(const gchar *charset)
241 gint i;
243 g_return_if_fail(charset != NULL);
245 i = 0;
246 while (i < GEANY_ENCODINGS_MAX)
248 if (utils_str_equal(charset, encodings[i].charset))
249 break;
250 i++;
252 if (i == GEANY_ENCODINGS_MAX)
253 i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
255 /* ignore_callback has to be set by the caller */
256 gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
260 #ifdef HAVE_REGCOMP
261 /* Regexp detection of file encoding declared in the file itself.
262 * Idea and parts of code taken from Bluefish, thanks.
263 * regex_compile() is used to compile regular expressions on program init and keep it in memory
264 * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
266 static void regex_compile(regex_t *preg, const gchar *pattern)
268 gint retval = regcomp(preg, pattern, REG_EXTENDED | REG_ICASE);
269 if (retval != 0)
271 gchar errmsg[512];
272 regerror(retval, preg, errmsg, 512);
273 geany_debug("regcomp() failed (%s)", errmsg);
274 regfree(preg);
275 return;
280 static gchar *regex_match(regex_t *preg, const gchar *buffer, gsize size)
282 gint retval;
283 gchar *tmp_buf = NULL;
284 gchar *encoding = NULL;
285 regmatch_t pmatch[10];
287 if (G_UNLIKELY(! pregs_loaded) || G_UNLIKELY(buffer == NULL))
288 return NULL;
290 if (size > 512)
291 tmp_buf = g_strndup(buffer, 512); /* scan only the first 512 characters in the buffer */
293 retval = regexec(preg, (tmp_buf != NULL) ? tmp_buf : buffer, 10, pmatch, 0);
294 if (retval == 0 && pmatch[0].rm_so != -1 && pmatch[1].rm_so != -1)
296 encoding = g_strndup(&buffer[pmatch[1].rm_so], pmatch[1].rm_eo - pmatch[1].rm_so);
297 geany_debug("Detected encoding by regex search: %s", encoding);
299 setptr(encoding, g_utf8_strup(encoding, -1));
301 g_free(tmp_buf);
302 return encoding;
304 #endif
307 static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data)
309 GeanyDocument *doc = document_get_current();
310 guint i = GPOINTER_TO_INT(user_data);
312 if (ignore_callback || doc == NULL || encodings[i].charset == NULL ||
313 ! gtk_check_menu_item_get_active(menuitem) ||
314 utils_str_equal(encodings[i].charset, doc->encoding))
315 return;
317 if (doc->readonly)
319 utils_beep();
320 return;
322 document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding));
324 document_set_encoding(doc, encodings[i].charset);
/* Releases the precompiled regular expressions, if they have been loaded.
 * Does nothing when built without HAVE_REGCOMP. */
void encodings_finalize(void)
{
#ifdef HAVE_REGCOMP
	guint n;

	if (! pregs_loaded)
		return;

	for (n = 0; n < G_N_ELEMENTS(pregs); n++)
		regfree(&pregs[n]);
#endif
}
344 void encodings_init(void)
346 GtkWidget *item, *menu[2], *submenu, *menu_westeuro, *menu_easteuro, *menu_eastasian, *menu_asian,
347 *menu_utf8, *menu_middleeast, *item_westeuro, *item_easteuro, *item_eastasian,
348 *item_asian, *item_utf8, *item_middleeast;
349 GCallback cb_func[2];
350 GSList *group = NULL;
351 gchar *label;
352 gint order, group_size;
353 guint i, j, k;
355 init_encodings();
357 #ifdef HAVE_REGCOMP
358 if (! pregs_loaded)
360 regex_compile(&pregs[0], PATTERN_HTMLMETA);
361 regex_compile(&pregs[1], PATTERN_CODING);
362 pregs_loaded = TRUE;
364 #endif
366 /* create encodings submenu in document menu */
367 menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
368 menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
369 cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
370 cb_func[1] = G_CALLBACK(on_reload_as_activate);
372 for (k = 0; k < 2; k++)
374 menu_westeuro = gtk_menu_new();
375 item_westeuro = gtk_menu_item_new_with_mnemonic(_("_West European"));
376 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_westeuro), menu_westeuro);
377 gtk_container_add(GTK_CONTAINER(menu[k]), item_westeuro);
378 gtk_widget_show_all(item_westeuro);
380 menu_easteuro = gtk_menu_new();
381 item_easteuro = gtk_menu_item_new_with_mnemonic(_("_East European"));
382 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_easteuro), menu_easteuro);
383 gtk_container_add(GTK_CONTAINER(menu[k]), item_easteuro);
384 gtk_widget_show_all(item_easteuro);
386 menu_eastasian = gtk_menu_new();
387 item_eastasian = gtk_menu_item_new_with_mnemonic(_("East _Asian"));
388 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_eastasian), menu_eastasian);
389 gtk_container_add(GTK_CONTAINER(menu[k]), item_eastasian);
390 gtk_widget_show_all(item_eastasian);
392 menu_asian = gtk_menu_new();
393 item_asian = gtk_menu_item_new_with_mnemonic(_("_SE & SW Asian"));
394 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_asian), menu_asian);
395 gtk_container_add(GTK_CONTAINER(menu[k]), item_asian);
396 gtk_widget_show_all(item_asian);
398 menu_middleeast = gtk_menu_new();
399 item_middleeast = gtk_menu_item_new_with_mnemonic(_("_Middle Eastern"));
400 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_middleeast), menu_middleeast);
401 gtk_container_add(GTK_CONTAINER(menu[k]), item_middleeast);
402 gtk_widget_show_all(item_middleeast);
404 menu_utf8 = gtk_menu_new();
405 item_utf8 = gtk_menu_item_new_with_mnemonic(_("_Unicode"));
406 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_utf8), menu_utf8);
407 gtk_container_add(GTK_CONTAINER(menu[k]), item_utf8);
408 gtk_widget_show_all(item_utf8);
410 /** TODO can it be optimized? ATM 3782 runs at line "if (encodings[j].group ...)" */
411 for (i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
413 order = 0;
414 switch (i)
416 case WESTEUROPEAN: submenu = menu_westeuro; group_size = 9; break;
417 case EASTEUROPEAN: submenu = menu_easteuro; group_size = 14; break;
418 case EASTASIAN: submenu = menu_eastasian; group_size = 14; break;
419 case ASIAN: submenu = menu_asian; group_size = 9; break;
420 case MIDDLEEASTERN: submenu = menu_middleeast; group_size = 7; break;
421 case UNICODE: submenu = menu_utf8; group_size = 8; break;
422 default: submenu = menu[k]; group_size = 1;
425 while (order < group_size) /* the biggest group has 13 elements */
427 for (j = 0; j < GEANY_ENCODINGS_MAX; j++)
429 if (encodings[j].group == i && encodings[j].order == order)
431 label = encodings_to_string(&encodings[j]);
432 if (k == 0)
434 item = gtk_radio_menu_item_new_with_label(group, label);
435 group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
436 radio_items[j] = item;
438 else
439 item = gtk_menu_item_new_with_label(label);
440 gtk_widget_show(item);
441 gtk_container_add(GTK_CONTAINER(submenu), item);
442 g_signal_connect(item, "activate",
443 cb_func[k], GINT_TO_POINTER(encodings[j].idx));
444 g_free(label);
445 break;
448 order++;
456 * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
457 * If @a fast is not set, additional checks to validate the converted string are performed.
459 * @param buffer The input string to convert.
460 * @param size The length of the string, or -1 if the string is nul-terminated.
461 * @param charset The charset to be used for conversion.
462 * @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
464 * @return If the conversion was successful, a newly allocated nul-terminated string,
465 * which must be freed with @c g_free(). Otherwise @c NULL.
467 gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gsize size,
468 const gchar *charset, gboolean fast)
470 gchar *utf8_content = NULL;
471 GError *conv_error = NULL;
472 gchar* converted_contents = NULL;
473 gsize bytes_written;
475 g_return_val_if_fail(buffer != NULL, NULL);
476 g_return_val_if_fail(charset != NULL, NULL);
478 converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
479 &bytes_written, &conv_error);
481 if (fast)
483 utf8_content = converted_contents;
484 if (conv_error != NULL) g_error_free(conv_error);
486 else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
488 if (conv_error != NULL)
490 geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
491 g_error_free(conv_error);
492 conv_error = NULL;
494 else
495 geany_debug("Couldn't convert from %s to UTF-8.", charset);
497 utf8_content = NULL;
498 if (converted_contents != NULL)
499 g_free(converted_contents);
501 else
503 geany_debug("Converted from %s to UTF-8.", charset);
504 utf8_content = converted_contents;
507 return utf8_content;
512 * Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
513 * @a used_encoding.
515 * @param buffer the input string to convert.
516 * @param size the length of the string, or -1 if the string is nul-terminated.
517 * @param used_encoding return location of the detected encoding of the input string, or @c NULL.
519 * @return If the conversion was successful, a newly allocated nul-terminated string,
520 * which must be freed with @c g_free(). Otherwise @c NULL.
522 gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
524 gchar *locale_charset = NULL;
525 gchar *regex_charset = NULL;
526 const gchar *charset;
527 gchar *utf8_content;
528 gboolean check_regex = FALSE;
529 gboolean check_locale = FALSE;
530 gint i, len, preferred_charset;
532 if ((gint)size == -1)
534 size = strlen(buffer);
537 #ifdef HAVE_REGCOMP
538 /* first try to read the encoding from the file content */
539 len = (gint) G_N_ELEMENTS(pregs);
540 for (i = 0; i < len && ! check_regex; i++)
542 if ((regex_charset = regex_match(&pregs[i], buffer, size)) != NULL)
543 check_regex = TRUE;
545 #endif
547 /* current locale is not UTF-8, we have to check this charset */
548 check_locale = ! g_get_charset((const gchar**) &charset);
550 /* First check for preferred charset, if specified */
551 preferred_charset = file_prefs.default_open_encoding;
553 if (preferred_charset == encodings[GEANY_ENCODING_NONE].idx ||
554 preferred_charset < 0 ||
555 preferred_charset >= GEANY_ENCODINGS_MAX)
557 preferred_charset = -1;
560 /* -1 means "Preferred charset" */
561 for (i = -1; i < GEANY_ENCODINGS_MAX; i++)
563 if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
564 continue;
566 if (check_regex)
568 check_regex = FALSE;
569 charset = regex_charset;
570 i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
572 else if (check_locale)
574 check_locale = FALSE;
575 charset = locale_charset;
576 i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
578 else if (i == -1)
580 if (preferred_charset >= 0)
582 charset = encodings[preferred_charset].charset;
583 geany_debug("Using preferred charset: %s", charset);
585 else
586 continue;
588 else if (i >= 0)
589 charset = encodings[i].charset;
590 else /* in this case we have i == -2, continue to increase i and go ahead */
591 continue;
593 if (G_UNLIKELY(charset == NULL))
594 continue;
596 geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
597 size, charset);
598 utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);
600 if (G_LIKELY(utf8_content != NULL))
602 if (used_encoding != NULL)
604 if (G_UNLIKELY(*used_encoding != NULL))
606 geany_debug("%s:%d", __FILE__, __LINE__);
607 g_free(*used_encoding);
609 *used_encoding = g_strdup(charset);
611 g_free(regex_charset);
612 return utf8_content;
615 g_free(regex_charset);
617 return NULL;
621 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
622 * otherwise GEANY_ENCODING_NONE.
623 * */
624 GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
626 if (len >= 3)
628 if (bom_len)
629 *bom_len = 3;
631 if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
632 (guchar)string[2] == 0xbf)
634 return GEANY_ENCODING_UTF_8;
637 if (len >= 4)
639 if (bom_len)
640 *bom_len = 4;
642 if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
643 (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
645 return GEANY_ENCODING_UTF_32BE; /* Big endian */
647 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
648 (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
650 return GEANY_ENCODING_UTF_32LE; /* Little endian */
652 if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
653 (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
655 return GEANY_ENCODING_UTF_7;
658 if (len >= 2)
660 if (bom_len)
661 *bom_len = 2;
663 if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff)
665 return GEANY_ENCODING_UTF_16BE; /* Big endian */
667 if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
669 return GEANY_ENCODING_UTF_16LE; /* Little endian */
672 if (bom_len)
673 *bom_len = 0;
674 return GEANY_ENCODING_NONE;
678 gboolean encodings_is_unicode_charset(const gchar *string)
680 if (string != NULL &&
681 (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
683 return TRUE;
685 return FALSE;