2 * encodings.c - this file is part of Geany, a fast and lightweight IDE
4 * Copyright 2005-2010 Enrico Tröger <enrico(dot)troeger(at)uvena(dot)de>
5 * Copyright 2006-2010 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 * Encoding conversion and Byte Order Mark (BOM) handling.
29 * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
30 * list of people on the gedit Team.
31 * See the gedit ChangeLog files for a list of changes.
33 /* Stolen from anjuta */
41 #include "documentprivate.h"
42 #include "msgwindow.h"
43 #include "encodings.h"
44 #include "callbacks.h"
52 # include "gnuregex.h"
54 /* <meta http-equiv="content-type" content="text/html; charset=UTF-8" /> */
55 # define PATTERN_HTMLMETA "<meta[ \t\n\r\f]http-equiv[ \t\n\r\f]*=[ \t\n\r\f]*\"content-type\"[ \t\n\r\f]+content[ \t\n\r\f]*=[ \t\n\r\f]*\"text/x?html;[ \t\n\r\f]*charset=([a-z0-9_-]+)\"[ \t\n\r\f]*/?>"
56 /* " geany_encoding=utf-8 " or " coding: utf-8 " */
57 # define PATTERN_CODING "coding[\t ]*[:=][\t ]*([a-z0-9-]+)[\t ]*"
58 /* precompiled regexps */
59 static regex_t pregs
[2];
60 static gboolean pregs_loaded
= FALSE
;
/* Table of all encodings known to this file. Each slot is written through the
 * fill() macro, which uses the encoding's own index (a GEANY_ENCODING_* value)
 * as the array position. */
64 GeanyEncoding encodings
[GEANY_ENCODINGS_MAX
];
/*
 * Writes one entry of the encodings[] table.
 *
 *  Order   - position of the entry inside its group
 *  Group   - group the entry belongs to
 *  Idx     - slot in encodings[]; also stored in the entry's idx field
 *  Charset - charset name string
 *  Name    - human readable name
 *
 * The body is wrapped in do { } while (0) so that "fill(...);" is a single
 * statement (safe inside an unbraced if/else), and every argument is
 * parenthesised. Idx is evaluated several times, so it must not have side effects.
 */
#define fill(Order, Group, Idx, Charset, Name) \
	do { \
		encodings[(Idx)].idx = (Idx); \
		encodings[(Idx)].order = (Order); \
		encodings[(Idx)].group = (Group); \
		encodings[(Idx)].charset = (Charset); \
		encodings[(Idx)].name = (Name); \
	} while (0)
74 static void init_encodings(void)
76 fill(0, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_14
, "ISO-8859-14", _("Celtic"));
77 fill(1, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_7
, "ISO-8859-7", _("Greek"));
78 fill(2, WESTEUROPEAN
, GEANY_ENCODING_WINDOWS_1253
, "WINDOWS-1253", _("Greek"));
79 fill(3, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_10
, "ISO-8859-10", _("Nordic"));
80 fill(4, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_3
, "ISO-8859-3", _("South European"));
81 fill(5, WESTEUROPEAN
, GEANY_ENCODING_IBM_850
, "IBM850", _("Western"));
82 fill(6, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_1
, "ISO-8859-1", _("Western"));
83 fill(7, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_15
, "ISO-8859-15", _("Western"));
84 fill(8, WESTEUROPEAN
, GEANY_ENCODING_WINDOWS_1252
, "WINDOWS-1252", _("Western"));
86 fill(0, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_4
, "ISO-8859-4", _("Baltic"));
87 fill(1, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_13
, "ISO-8859-13", _("Baltic"));
88 fill(2, EASTEUROPEAN
, GEANY_ENCODING_WINDOWS_1257
, "WINDOWS-1257", _("Baltic"));
89 fill(3, EASTEUROPEAN
, GEANY_ENCODING_IBM_852
, "IBM852", _("Central European"));
90 fill(4, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_2
, "ISO-8859-2", _("Central European"));
91 fill(5, EASTEUROPEAN
, GEANY_ENCODING_WINDOWS_1250
, "WINDOWS-1250", _("Central European"));
92 fill(6, EASTEUROPEAN
, GEANY_ENCODING_IBM_855
, "IBM855", _("Cyrillic"));
93 fill(7, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_5
, "ISO-8859-5", _("Cyrillic"));
94 /* ISO-IR-111 not available on Windows */
95 fill(8, EASTEUROPEAN
, GEANY_ENCODING_ISO_IR_111
, "ISO-IR-111", _("Cyrillic"));
96 fill(9, EASTEUROPEAN
, GEANY_ENCODING_KOI8_R
, "KOI8-R", _("Cyrillic"));
97 fill(10, EASTEUROPEAN
, GEANY_ENCODING_WINDOWS_1251
, "WINDOWS-1251", _("Cyrillic"));
98 fill(11, EASTEUROPEAN
, GEANY_ENCODING_CP_866
, "CP866", _("Cyrillic/Russian"));
99 fill(12, EASTEUROPEAN
, GEANY_ENCODING_KOI8_U
, "KOI8-U", _("Cyrillic/Ukrainian"));
100 fill(13, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_16
, "ISO-8859-16", _("Romanian"));
102 fill(0, MIDDLEEASTERN
, GEANY_ENCODING_IBM_864
, "IBM864", _("Arabic"));
103 fill(1, MIDDLEEASTERN
, GEANY_ENCODING_ISO_8859_6
, "ISO-8859-6", _("Arabic"));
104 fill(2, MIDDLEEASTERN
, GEANY_ENCODING_WINDOWS_1256
, "WINDOWS-1256", _("Arabic"));
105 fill(3, MIDDLEEASTERN
, GEANY_ENCODING_IBM_862
, "IBM862", _("Hebrew"));
106 /* not available at all, ? */
107 fill(4, MIDDLEEASTERN
, GEANY_ENCODING_ISO_8859_8_I
, "ISO-8859-8-I", _("Hebrew"));
108 fill(5, MIDDLEEASTERN
, GEANY_ENCODING_WINDOWS_1255
, "WINDOWS-1255", _("Hebrew"));
109 fill(6, MIDDLEEASTERN
, GEANY_ENCODING_ISO_8859_8
, "ISO-8859-8", _("Hebrew Visual"));
111 fill(0, ASIAN
, GEANY_ENCODING_ARMSCII_8
, "ARMSCII-8", _("Armenian"));
112 fill(1, ASIAN
, GEANY_ENCODING_GEOSTD8
, "GEORGIAN-ACADEMY", _("Georgian"));
113 fill(2, ASIAN
, GEANY_ENCODING_TIS_620
, "TIS-620", _("Thai"));
114 fill(3, ASIAN
, GEANY_ENCODING_IBM_857
, "IBM857", _("Turkish"));
115 fill(4, ASIAN
, GEANY_ENCODING_WINDOWS_1254
, "WINDOWS-1254", _("Turkish"));
116 fill(5, ASIAN
, GEANY_ENCODING_ISO_8859_9
, "ISO-8859-9", _("Turkish"));
117 fill(6, ASIAN
, GEANY_ENCODING_TCVN
, "TCVN", _("Vietnamese"));
118 fill(7, ASIAN
, GEANY_ENCODING_VISCII
, "VISCII", _("Vietnamese"));
119 fill(8, ASIAN
, GEANY_ENCODING_WINDOWS_1258
, "WINDOWS-1258", _("Vietnamese"));
121 fill(0, UNICODE
, GEANY_ENCODING_UTF_7
, "UTF-7", _("Unicode"));
122 fill(1, UNICODE
, GEANY_ENCODING_UTF_8
, "UTF-8", _("Unicode"));
123 fill(2, UNICODE
, GEANY_ENCODING_UTF_16LE
, "UTF-16LE", _("Unicode"));
124 fill(3, UNICODE
, GEANY_ENCODING_UTF_16BE
, "UTF-16BE", _("Unicode"));
125 fill(4, UNICODE
, GEANY_ENCODING_UCS_2LE
, "UCS-2LE", _("Unicode"));
126 fill(5, UNICODE
, GEANY_ENCODING_UCS_2BE
, "UCS-2BE", _("Unicode"));
127 fill(6, UNICODE
, GEANY_ENCODING_UTF_32LE
, "UTF-32LE", _("Unicode"));
128 fill(7, UNICODE
, GEANY_ENCODING_UTF_32BE
, "UTF-32BE", _("Unicode"));
130 fill(0, EASTASIAN
, GEANY_ENCODING_GB18030
, "GB18030", _("Chinese Simplified"));
131 fill(1, EASTASIAN
, GEANY_ENCODING_GB2312
, "GB2312", _("Chinese Simplified"));
132 fill(2, EASTASIAN
, GEANY_ENCODING_GBK
, "GBK", _("Chinese Simplified"));
133 /* maybe not available on Linux */
134 fill(3, EASTASIAN
, GEANY_ENCODING_HZ
, "HZ", _("Chinese Simplified"));
135 fill(4, EASTASIAN
, GEANY_ENCODING_BIG5
, "BIG5", _("Chinese Traditional"));
136 fill(5, EASTASIAN
, GEANY_ENCODING_BIG5_HKSCS
, "BIG5-HKSCS", _("Chinese Traditional"));
137 fill(6, EASTASIAN
, GEANY_ENCODING_EUC_TW
, "EUC-TW", _("Chinese Traditional"));
138 fill(7, EASTASIAN
, GEANY_ENCODING_EUC_JP
, "EUC-JP", _("Japanese"));
139 fill(8, EASTASIAN
, GEANY_ENCODING_ISO_2022_JP
, "ISO-2022-JP", _("Japanese"));
140 fill(9, EASTASIAN
, GEANY_ENCODING_SHIFT_JIS
, "SHIFT_JIS", _("Japanese"));
141 fill(10, EASTASIAN
, GEANY_ENCODING_CP_932
, "CP932", _("Japanese"));
142 fill(11, EASTASIAN
, GEANY_ENCODING_EUC_KR
, "EUC-KR", _("Korean"));
143 fill(12, EASTASIAN
, GEANY_ENCODING_ISO_2022_KR
, "ISO-2022-KR", _("Korean"));
144 fill(13, EASTASIAN
, GEANY_ENCODING_JOHAB
, "JOHAB", _("Korean"));
145 fill(14, EASTASIAN
, GEANY_ENCODING_UHC
, "UHC", _("Korean"));
147 fill(0, NONE
, GEANY_ENCODING_NONE
, "None", _("Without encoding"));
151 GeanyEncodingIndex
encodings_get_idx_from_charset(const gchar
*charset
)
156 return GEANY_ENCODING_UTF_8
;
159 while (i
< GEANY_ENCODINGS_MAX
)
161 if (strcmp(charset
, encodings
[i
].charset
) == 0)
166 return GEANY_ENCODING_UTF_8
;
170 const GeanyEncoding
*encodings_get_from_charset(const gchar
*charset
)
175 return &encodings
[GEANY_ENCODING_UTF_8
];
178 while (i
< GEANY_ENCODINGS_MAX
)
180 if (strcmp(charset
, encodings
[i
].charset
) == 0)
181 return &encodings
[i
];
190 const GeanyEncoding
*encodings_get_from_index(gint idx
)
192 g_return_val_if_fail(idx
>= 0 && idx
< GEANY_ENCODINGS_MAX
, NULL
);
194 return &encodings
[idx
];
199 * Gets the character set name of the specified index e.g. for use with
200 * @ref document_set_encoding().
202 * @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
205 * @return The charset according to idx, or @c NULL if the index is invalid.
209 const gchar
* encodings_get_charset_from_index(gint idx
)
211 g_return_val_if_fail(idx
>= 0 && idx
< GEANY_ENCODINGS_MAX
, NULL
);
213 return encodings
[idx
].charset
;
217 gchar
*encodings_to_string(const GeanyEncoding
* enc
)
219 g_return_val_if_fail(enc
!= NULL
, NULL
);
220 g_return_val_if_fail(enc
->name
!= NULL
, NULL
);
221 g_return_val_if_fail(enc
->charset
!= NULL
, NULL
);
223 return g_strdup_printf("%s (%s)", enc
->name
, enc
->charset
);
227 const gchar
*encodings_get_charset(const GeanyEncoding
* enc
)
229 g_return_val_if_fail(enc
!= NULL
, NULL
);
230 g_return_val_if_fail(enc
->charset
!= NULL
, NULL
);
236 static GtkWidget
*radio_items
[GEANY_ENCODINGS_MAX
];
239 void encodings_select_radio_item(const gchar
*charset
)
243 g_return_if_fail(charset
!= NULL
);
246 while (i
< GEANY_ENCODINGS_MAX
)
248 if (utils_str_equal(charset
, encodings
[i
].charset
))
252 if (i
== GEANY_ENCODINGS_MAX
)
253 i
= GEANY_ENCODING_UTF_8
; /* fallback to UTF-8 */
255 /* ignore_callback has to be set by the caller */
256 gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items
[i
]), TRUE
);
/* Regexp detection of file encoding declared in the file itself.
 * Idea and parts of code taken from Bluefish, thanks.
 * regex_compile() is used to compile regular expressions on program init and keep them in memory
 * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
 */
/* Compiles @a pattern into @a preg with regcomp(), using POSIX extended syntax
 * and case-insensitive matching (REG_EXTENDED | REG_ICASE). */
266 static void regex_compile(regex_t
*preg
, const gchar
*pattern
)
268 gint retval
= regcomp(preg
, pattern
, REG_EXTENDED
| REG_ICASE
);
/* The regerror() text for retval is written to errmsg (512 bytes are passed as
 * its size) and logged below.
 * NOTE(review): the condition selecting this path and whatever follows the log
 * call are not visible in this listing - presumably "retval != 0"; confirm. */
272 regerror(retval
, preg
, errmsg
, 512);
273 geany_debug("regcomp() failed (%s)", errmsg
);
280 static gchar
*regex_match(regex_t
*preg
, const gchar
*buffer
, gsize size
)
283 gchar
*tmp_buf
= NULL
;
284 gchar
*encoding
= NULL
;
285 regmatch_t pmatch
[10];
287 if (G_UNLIKELY(! pregs_loaded
) || G_UNLIKELY(buffer
== NULL
))
291 tmp_buf
= g_strndup(buffer
, 512); /* scan only the first 512 characters in the buffer */
293 retval
= regexec(preg
, (tmp_buf
!= NULL
) ? tmp_buf
: buffer
, 10, pmatch
, 0);
294 if (retval
== 0 && pmatch
[0].rm_so
!= -1 && pmatch
[1].rm_so
!= -1)
296 encoding
= g_strndup(&buffer
[pmatch
[1].rm_so
], pmatch
[1].rm_eo
- pmatch
[1].rm_so
);
297 geany_debug("Detected encoding by regex search: %s", encoding
);
299 setptr(encoding
, g_utf8_strup(encoding
, -1));
/* Signal handler for the encoding radio menu items: applies the encoding
 * selected in the menu to the current document. */
307 static void encodings_radio_item_change_cb(GtkCheckMenuItem
*menuitem
, gpointer user_data
)
309 GeanyDocument
*doc
= document_get_current();
/* user_data carries the index into encodings[]; encodings_init() packs it with
 * GINT_TO_POINTER() when connecting the signal */
310 guint i
= GPOINTER_TO_INT(user_data
);
/* Guard: callbacks are being ignored, there is no current document, the table
 * entry has no charset, the item was not switched on, or the document already
 * uses this charset.
 * NOTE(review): original lines 315-321 are not visible in this listing -
 * presumably the guard returns early; confirm what else happens before the
 * undo record below. */
312 if (ignore_callback
|| doc
== NULL
|| encodings
[i
].charset
== NULL
||
313 ! gtk_check_menu_item_get_active(menuitem
) ||
314 utils_str_equal(encodings
[i
].charset
, doc
->encoding
))
/* pass a copy of the current encoding to the undo stack (UNDO_ENCODING) */
322 document_undo_add(doc
, UNDO_ENCODING
, g_strdup(doc
->encoding
));
/* switch the document to the selected charset */
324 document_set_encoding(doc
, encodings
[i
].charset
);
/* Walks over every element of pregs[].
 * NOTE(review): the loop body is not visible in this listing. The file comment
 * on regexp detection says the pre-compiled regexps are freed on program exit,
 * so presumably each element is released here - confirm. */
328 void encodings_finalize(void)
334 len
= G_N_ELEMENTS(pregs
);
335 for (i
= 0; i
< len
; i
++)
344 void encodings_init(void)
346 GtkWidget
*item
, *menu
[2], *submenu
, *menu_westeuro
, *menu_easteuro
, *menu_eastasian
, *menu_asian
,
347 *menu_utf8
, *menu_middleeast
, *item_westeuro
, *item_easteuro
, *item_eastasian
,
348 *item_asian
, *item_utf8
, *item_middleeast
;
349 GCallback cb_func
[2];
350 GSList
*group
= NULL
;
352 gint order
, group_size
;
360 regex_compile(&pregs
[0], PATTERN_HTMLMETA
);
361 regex_compile(&pregs
[1], PATTERN_CODING
);
366 /* create encodings submenu in document menu */
367 menu
[0] = ui_lookup_widget(main_widgets
.window
, "set_encoding1_menu");
368 menu
[1] = ui_lookup_widget(main_widgets
.window
, "menu_reload_as1_menu");
369 cb_func
[0] = G_CALLBACK(encodings_radio_item_change_cb
);
370 cb_func
[1] = G_CALLBACK(on_reload_as_activate
);
372 for (k
= 0; k
< 2; k
++)
374 menu_westeuro
= gtk_menu_new();
375 item_westeuro
= gtk_menu_item_new_with_mnemonic(_("_West European"));
376 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_westeuro
), menu_westeuro
);
377 gtk_container_add(GTK_CONTAINER(menu
[k
]), item_westeuro
);
378 gtk_widget_show_all(item_westeuro
);
380 menu_easteuro
= gtk_menu_new();
381 item_easteuro
= gtk_menu_item_new_with_mnemonic(_("_East European"));
382 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_easteuro
), menu_easteuro
);
383 gtk_container_add(GTK_CONTAINER(menu
[k
]), item_easteuro
);
384 gtk_widget_show_all(item_easteuro
);
386 menu_eastasian
= gtk_menu_new();
387 item_eastasian
= gtk_menu_item_new_with_mnemonic(_("East _Asian"));
388 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_eastasian
), menu_eastasian
);
389 gtk_container_add(GTK_CONTAINER(menu
[k
]), item_eastasian
);
390 gtk_widget_show_all(item_eastasian
);
392 menu_asian
= gtk_menu_new();
393 item_asian
= gtk_menu_item_new_with_mnemonic(_("_SE & SW Asian"));
394 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_asian
), menu_asian
);
395 gtk_container_add(GTK_CONTAINER(menu
[k
]), item_asian
);
396 gtk_widget_show_all(item_asian
);
398 menu_middleeast
= gtk_menu_new();
399 item_middleeast
= gtk_menu_item_new_with_mnemonic(_("_Middle Eastern"));
400 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_middleeast
), menu_middleeast
);
401 gtk_container_add(GTK_CONTAINER(menu
[k
]), item_middleeast
);
402 gtk_widget_show_all(item_middleeast
);
404 menu_utf8
= gtk_menu_new();
405 item_utf8
= gtk_menu_item_new_with_mnemonic(_("_Unicode"));
406 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_utf8
), menu_utf8
);
407 gtk_container_add(GTK_CONTAINER(menu
[k
]), item_utf8
);
408 gtk_widget_show_all(item_utf8
);
410 /** TODO can it be optimized? ATM 3782 runs at line "if (encodings[j].group ...)" */
411 for (i
= 0; i
< GEANY_ENCODING_GROUPS_MAX
; i
++)
416 case WESTEUROPEAN
: submenu
= menu_westeuro
; group_size
= 9; break;
417 case EASTEUROPEAN
: submenu
= menu_easteuro
; group_size
= 14; break;
418 case EASTASIAN
: submenu
= menu_eastasian
; group_size
= 14; break;
419 case ASIAN
: submenu
= menu_asian
; group_size
= 9; break;
420 case MIDDLEEASTERN
: submenu
= menu_middleeast
; group_size
= 7; break;
421 case UNICODE
: submenu
= menu_utf8
; group_size
= 8; break;
422 default: submenu
= menu
[k
]; group_size
= 1;
425 while (order
< group_size
) /* the biggest group has 13 elements */
427 for (j
= 0; j
< GEANY_ENCODINGS_MAX
; j
++)
429 if (encodings
[j
].group
== i
&& encodings
[j
].order
== order
)
431 label
= encodings_to_string(&encodings
[j
]);
434 item
= gtk_radio_menu_item_new_with_label(group
, label
);
435 group
= gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item
));
436 radio_items
[j
] = item
;
439 item
= gtk_menu_item_new_with_label(label
);
440 gtk_widget_show(item
);
441 gtk_container_add(GTK_CONTAINER(submenu
), item
);
442 g_signal_connect(item
, "activate",
443 cb_func
[k
], GINT_TO_POINTER(encodings
[j
].idx
));
456 * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
457 * If @a fast is not set, additional checks to validate the converted string are performed.
459 * @param buffer The input string to convert.
460 * @param size The length of the string, or -1 if the string is nul-terminated.
461 * @param charset The charset to be used for conversion.
462 * @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
464 * @return If the conversion was successful, a newly allocated nul-terminated string,
465 * which must be freed with @c g_free(). Otherwise @c NULL.
467 gchar
*encodings_convert_to_utf8_from_charset(const gchar
*buffer
, gsize size
,
468 const gchar
*charset
, gboolean fast
)
470 gchar
*utf8_content
= NULL
;
471 GError
*conv_error
= NULL
;
472 gchar
* converted_contents
= NULL
;
475 g_return_val_if_fail(buffer
!= NULL
, NULL
);
476 g_return_val_if_fail(charset
!= NULL
, NULL
);
478 converted_contents
= g_convert(buffer
, size
, "UTF-8", charset
, NULL
,
479 &bytes_written
, &conv_error
);
483 utf8_content
= converted_contents
;
484 if (conv_error
!= NULL
) g_error_free(conv_error
);
486 else if (conv_error
!= NULL
|| ! g_utf8_validate(converted_contents
, bytes_written
, NULL
))
488 if (conv_error
!= NULL
)
490 geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset
, conv_error
->message
);
491 g_error_free(conv_error
);
495 geany_debug("Couldn't convert from %s to UTF-8.", charset
);
498 if (converted_contents
!= NULL
)
499 g_free(converted_contents
);
503 geany_debug("Converted from %s to UTF-8.", charset
);
504 utf8_content
= converted_contents
;
512 * Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
515 * @param buffer the input string to convert.
516 * @param size the length of the string, or -1 if the string is nul-terminated.
517 * @param used_encoding return location of the detected encoding of the input string, or @c NULL.
519 * @return If the conversion was successful, a newly allocated nul-terminated string,
520 * which must be freed with @c g_free(). Otherwise @c NULL.
522 gchar
*encodings_convert_to_utf8(const gchar
*buffer
, gsize size
, gchar
**used_encoding
)
524 gchar
*locale_charset
= NULL
;
525 gchar
*regex_charset
= NULL
;
526 const gchar
*charset
;
528 gboolean check_regex
= FALSE
;
529 gboolean check_locale
= FALSE
;
530 gint i
, len
, preferred_charset
;
532 if ((gint
)size
== -1)
534 size
= strlen(buffer
);
538 /* first try to read the encoding from the file content */
539 len
= (gint
) G_N_ELEMENTS(pregs
);
540 for (i
= 0; i
< len
&& ! check_regex
; i
++)
542 if ((regex_charset
= regex_match(&pregs
[i
], buffer
, size
)) != NULL
)
547 /* current locale is not UTF-8, we have to check this charset */
548 check_locale
= ! g_get_charset((const gchar
**) &charset
);
550 /* First check for preferred charset, if specified */
551 preferred_charset
= file_prefs
.default_open_encoding
;
553 if (preferred_charset
== encodings
[GEANY_ENCODING_NONE
].idx
||
554 preferred_charset
< 0 ||
555 preferred_charset
>= GEANY_ENCODINGS_MAX
)
557 preferred_charset
= -1;
560 /* -1 means "Preferred charset" */
561 for (i
= -1; i
< GEANY_ENCODINGS_MAX
; i
++)
563 if (G_UNLIKELY(i
== encodings
[GEANY_ENCODING_NONE
].idx
))
569 charset
= regex_charset
;
570 i
= -2; /* keep i below the start value to have it again at -1 on the next loop run */
572 else if (check_locale
)
574 check_locale
= FALSE
;
575 charset
= locale_charset
;
576 i
= -2; /* keep i below the start value to have it again at -1 on the next loop run */
580 if (preferred_charset
>= 0)
582 charset
= encodings
[preferred_charset
].charset
;
583 geany_debug("Using preferred charset: %s", charset
);
589 charset
= encodings
[i
].charset
;
590 else /* in this case we have i == -2, continue to increase i and go ahead */
593 if (G_UNLIKELY(charset
== NULL
))
596 geany_debug("Trying to convert %" G_GSIZE_FORMAT
" bytes of data from %s into UTF-8.",
598 utf8_content
= encodings_convert_to_utf8_from_charset(buffer
, size
, charset
, FALSE
);
600 if (G_LIKELY(utf8_content
!= NULL
))
602 if (used_encoding
!= NULL
)
604 if (G_UNLIKELY(*used_encoding
!= NULL
))
606 geany_debug("%s:%d", __FILE__
, __LINE__
);
607 g_free(*used_encoding
);
609 *used_encoding
= g_strdup(charset
);
611 g_free(regex_charset
);
615 g_free(regex_charset
);
/* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
 * otherwise GEANY_ENCODING_NONE.
 */
/* Compares the leading bytes of @a string with the known byte order mark
 * signatures and returns the index of the matching Unicode encoding, or
 * GEANY_ENCODING_NONE when no signature matches.
 * NOTE(review): the lines that use len and write *bom_len are not visible in
 * this listing - presumably each group of tests is guarded by a length check
 * and *bom_len receives the signature length; confirm against the full file. */
624 GeanyEncodingIndex
encodings_scan_unicode_bom(const gchar
*string
, gsize len
, guint
*bom_len
)
/* UTF-8 signature: EF BB BF */
631 if ((guchar
)string
[0] == 0xef && (guchar
)string
[1] == 0xbb &&
632 (guchar
)string
[2] == 0xbf)
634 return GEANY_ENCODING_UTF_8
;
/* UTF-32 big endian signature: 00 00 FE FF */
642 if ((guchar
)string
[0] == 0x00 && (guchar
)string
[1] == 0x00 &&
643 (guchar
)string
[2] == 0xfe && (guchar
)string
[3] == 0xff)
645 return GEANY_ENCODING_UTF_32BE
; /* Big endian */
/* UTF-32 little endian signature: FF FE 00 00 (tested before the two-byte
 * UTF-16LE signature FF FE further down) */
647 if ((guchar
)string
[0] == 0xff && (guchar
)string
[1] == 0xfe &&
648 (guchar
)string
[2] == 0x00 && (guchar
)string
[3] == 0x00)
650 return GEANY_ENCODING_UTF_32LE
; /* Little endian */
/* UTF-7 signature: 2B 2F 76 ("+/v") followed by one of 38, 39, 2B, 2F ('8', '9', '+', '/') */
652 if ((string
[0] == 0x2b && string
[1] == 0x2f && string
[2] == 0x76) &&
653 (string
[3] == 0x38 || string
[3] == 0x39 || string
[3] == 0x2b || string
[3] == 0x2f))
655 return GEANY_ENCODING_UTF_7
;
/* UTF-16 big endian signature: FE FF */
663 if ((guchar
)string
[0] == 0xfe && (guchar
)string
[1] == 0xff)
665 return GEANY_ENCODING_UTF_16BE
; /* Big endian */
/* UTF-16 little endian signature: FF FE */
667 if ((guchar
)string
[0] == 0xff && (guchar
)string
[1] == 0xfe)
669 return GEANY_ENCODING_UTF_16LE
; /* Little endian */
/* no signature matched */
674 return GEANY_ENCODING_NONE
;
678 gboolean
encodings_is_unicode_charset(const gchar
*string
)
680 if (string
!= NULL
&&
681 (strncmp(string
, "UTF", 3) == 0 || strncmp(string
, "UCS", 3) == 0))