2 * encodings.c - this file is part of Geany, a fast and lightweight IDE
4 * Copyright 2005-2012 Enrico Tröger <enrico(dot)troeger(at)uvena(dot)de>
5 * Copyright 2006-2012 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 * Encoding conversion and Byte Order Mark (BOM) handling.
27 * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
28 * list of people on the gedit Team.
29 * See the gedit ChangeLog files for a list of changes.
31 /* Stolen from anjuta */
39 #include "documentprivate.h"
40 #include "msgwindow.h"
41 #include "encodings.h"
42 #include "callbacks.h"
/* Matches an HTML meta tag that declares the content type, e.g.
 * <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
 * Capture group 1 is the charset name. */
#define PATTERN_HTMLMETA "<meta\\s+http-equiv\\s*=\\s*\"?content-type\"?\\s+content\\s*=\\s*\"text/x?html;\\s*charset=([a-z0-9_-]+)\"\\s*/?>"
/* Matches an in-file hint such as " geany_encoding=utf-8 " or " coding: utf-8 ".
 * Capture group 1 is the charset name. */
#define PATTERN_CODING "coding[\t ]*[:=][\t ]*\"?([a-z0-9-]+)\"?[\t ]*"

/* precompiled regexps: [0] is built from PATTERN_HTMLMETA, [1] from PATTERN_CODING */
static GRegex *pregs[2];
/* checked by regex_match() and encodings_finalize() before pregs[] is used */
static gboolean pregs_loaded = FALSE;


/* table of all known encodings; each slot is written by the fill() macro below */
GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
/* Writes one entry of the encodings[] table: slot Idx gets its own index, its sort order
 * inside the group, the group, the charset name and the (translated) display name.
 *
 * The body is wrapped in do { } while (0) so that "fill(...);" is exactly one statement
 * and therefore stays correct inside an unbraced if/else. Arguments are parenthesised so
 * that expression arguments expand safely. */
#define fill(Order, Group, Idx, Charset, Name) \
	do { \
		encodings[(Idx)].idx = (Idx); \
		encodings[(Idx)].order = (Order); \
		encodings[(Idx)].group = (Group); \
		encodings[(Idx)].charset = (Charset); \
		encodings[(Idx)].name = (Name); \
	} while (0)
65 static void init_encodings(void)
67 fill(0, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_14
, "ISO-8859-14", _("Celtic"));
68 fill(1, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_7
, "ISO-8859-7", _("Greek"));
69 fill(2, WESTEUROPEAN
, GEANY_ENCODING_WINDOWS_1253
, "WINDOWS-1253", _("Greek"));
70 fill(3, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_10
, "ISO-8859-10", _("Nordic"));
71 fill(4, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_3
, "ISO-8859-3", _("South European"));
72 fill(5, WESTEUROPEAN
, GEANY_ENCODING_IBM_850
, "IBM850", _("Western"));
73 fill(6, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_1
, "ISO-8859-1", _("Western"));
74 fill(7, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_15
, "ISO-8859-15", _("Western"));
75 fill(8, WESTEUROPEAN
, GEANY_ENCODING_WINDOWS_1252
, "WINDOWS-1252", _("Western"));
77 fill(0, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_4
, "ISO-8859-4", _("Baltic"));
78 fill(1, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_13
, "ISO-8859-13", _("Baltic"));
79 fill(2, EASTEUROPEAN
, GEANY_ENCODING_WINDOWS_1257
, "WINDOWS-1257", _("Baltic"));
80 fill(3, EASTEUROPEAN
, GEANY_ENCODING_IBM_852
, "IBM852", _("Central European"));
81 fill(4, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_2
, "ISO-8859-2", _("Central European"));
82 fill(5, EASTEUROPEAN
, GEANY_ENCODING_WINDOWS_1250
, "WINDOWS-1250", _("Central European"));
83 fill(6, EASTEUROPEAN
, GEANY_ENCODING_IBM_855
, "IBM855", _("Cyrillic"));
84 fill(7, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_5
, "ISO-8859-5", _("Cyrillic"));
85 /* ISO-IR-111 not available on Windows */
86 fill(8, EASTEUROPEAN
, GEANY_ENCODING_ISO_IR_111
, "ISO-IR-111", _("Cyrillic"));
87 fill(9, EASTEUROPEAN
, GEANY_ENCODING_KOI8_R
, "KOI8-R", _("Cyrillic"));
88 fill(10, EASTEUROPEAN
, GEANY_ENCODING_WINDOWS_1251
, "WINDOWS-1251", _("Cyrillic"));
89 fill(11, EASTEUROPEAN
, GEANY_ENCODING_CP_866
, "CP866", _("Cyrillic/Russian"));
90 fill(12, EASTEUROPEAN
, GEANY_ENCODING_KOI8_U
, "KOI8-U", _("Cyrillic/Ukrainian"));
91 fill(13, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_16
, "ISO-8859-16", _("Romanian"));
93 fill(0, MIDDLEEASTERN
, GEANY_ENCODING_IBM_864
, "IBM864", _("Arabic"));
94 fill(1, MIDDLEEASTERN
, GEANY_ENCODING_ISO_8859_6
, "ISO-8859-6", _("Arabic"));
95 fill(2, MIDDLEEASTERN
, GEANY_ENCODING_WINDOWS_1256
, "WINDOWS-1256", _("Arabic"));
96 fill(3, MIDDLEEASTERN
, GEANY_ENCODING_IBM_862
, "IBM862", _("Hebrew"));
97 /* not available at all, ? */
98 fill(4, MIDDLEEASTERN
, GEANY_ENCODING_ISO_8859_8_I
, "ISO-8859-8-I", _("Hebrew"));
99 fill(5, MIDDLEEASTERN
, GEANY_ENCODING_WINDOWS_1255
, "WINDOWS-1255", _("Hebrew"));
100 fill(6, MIDDLEEASTERN
, GEANY_ENCODING_ISO_8859_8
, "ISO-8859-8", _("Hebrew Visual"));
102 fill(0, ASIAN
, GEANY_ENCODING_ARMSCII_8
, "ARMSCII-8", _("Armenian"));
103 fill(1, ASIAN
, GEANY_ENCODING_GEOSTD8
, "GEORGIAN-ACADEMY", _("Georgian"));
104 fill(2, ASIAN
, GEANY_ENCODING_TIS_620
, "TIS-620", _("Thai"));
105 fill(3, ASIAN
, GEANY_ENCODING_IBM_857
, "IBM857", _("Turkish"));
106 fill(4, ASIAN
, GEANY_ENCODING_WINDOWS_1254
, "WINDOWS-1254", _("Turkish"));
107 fill(5, ASIAN
, GEANY_ENCODING_ISO_8859_9
, "ISO-8859-9", _("Turkish"));
108 fill(6, ASIAN
, GEANY_ENCODING_TCVN
, "TCVN", _("Vietnamese"));
109 fill(7, ASIAN
, GEANY_ENCODING_VISCII
, "VISCII", _("Vietnamese"));
110 fill(8, ASIAN
, GEANY_ENCODING_WINDOWS_1258
, "WINDOWS-1258", _("Vietnamese"));
112 fill(0, UNICODE
, GEANY_ENCODING_UTF_7
, "UTF-7", _("Unicode"));
113 fill(1, UNICODE
, GEANY_ENCODING_UTF_8
, "UTF-8", _("Unicode"));
114 fill(2, UNICODE
, GEANY_ENCODING_UTF_16LE
, "UTF-16LE", _("Unicode"));
115 fill(3, UNICODE
, GEANY_ENCODING_UTF_16BE
, "UTF-16BE", _("Unicode"));
116 fill(4, UNICODE
, GEANY_ENCODING_UCS_2LE
, "UCS-2LE", _("Unicode"));
117 fill(5, UNICODE
, GEANY_ENCODING_UCS_2BE
, "UCS-2BE", _("Unicode"));
118 fill(6, UNICODE
, GEANY_ENCODING_UTF_32LE
, "UTF-32LE", _("Unicode"));
119 fill(7, UNICODE
, GEANY_ENCODING_UTF_32BE
, "UTF-32BE", _("Unicode"));
121 fill(0, EASTASIAN
, GEANY_ENCODING_GB18030
, "GB18030", _("Chinese Simplified"));
122 fill(1, EASTASIAN
, GEANY_ENCODING_GB2312
, "GB2312", _("Chinese Simplified"));
123 fill(2, EASTASIAN
, GEANY_ENCODING_GBK
, "GBK", _("Chinese Simplified"));
124 /* maybe not available on Linux */
125 fill(3, EASTASIAN
, GEANY_ENCODING_HZ
, "HZ", _("Chinese Simplified"));
126 fill(4, EASTASIAN
, GEANY_ENCODING_BIG5
, "BIG5", _("Chinese Traditional"));
127 fill(5, EASTASIAN
, GEANY_ENCODING_BIG5_HKSCS
, "BIG5-HKSCS", _("Chinese Traditional"));
128 fill(6, EASTASIAN
, GEANY_ENCODING_EUC_TW
, "EUC-TW", _("Chinese Traditional"));
129 fill(7, EASTASIAN
, GEANY_ENCODING_EUC_JP
, "EUC-JP", _("Japanese"));
130 fill(8, EASTASIAN
, GEANY_ENCODING_ISO_2022_JP
, "ISO-2022-JP", _("Japanese"));
131 fill(9, EASTASIAN
, GEANY_ENCODING_SHIFT_JIS
, "SHIFT_JIS", _("Japanese"));
132 fill(10, EASTASIAN
, GEANY_ENCODING_CP_932
, "CP932", _("Japanese"));
133 fill(11, EASTASIAN
, GEANY_ENCODING_EUC_KR
, "EUC-KR", _("Korean"));
134 fill(12, EASTASIAN
, GEANY_ENCODING_ISO_2022_KR
, "ISO-2022-KR", _("Korean"));
135 fill(13, EASTASIAN
, GEANY_ENCODING_JOHAB
, "JOHAB", _("Korean"));
136 fill(14, EASTASIAN
, GEANY_ENCODING_UHC
, "UHC", _("Korean"));
138 fill(0, NONE
, GEANY_ENCODING_NONE
, "None", _("Without encoding"));
142 /* compares two encoding names in a permissive fashion.
143 * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
144 static gboolean
encodings_charset_equals(const gchar
*a
, const gchar
*b
)
146 gboolean was_alpha
= FALSE
; /* whether last character of previous word was a letter */
147 gboolean need_sep
= FALSE
; /* whether we're expecting an implicit separator */
153 if (g_ascii_toupper(*a
) == g_ascii_toupper(*b
) &&
154 ((is_alpha
= g_ascii_isalpha(*a
)) || g_ascii_isdigit(*a
)))
156 /* either there was a real separator, or we need a implicit one (a chage from alpha to
158 if (! need_sep
|| (was_alpha
!= is_alpha
))
162 was_alpha
= is_alpha
;
172 if (! g_ascii_isalnum(*a
))
177 if (! g_ascii_isalnum(*b
))
192 GeanyEncodingIndex
encodings_get_idx_from_charset(const gchar
*charset
)
197 return GEANY_ENCODING_UTF_8
;
200 while (i
< GEANY_ENCODINGS_MAX
)
202 if (encodings_charset_equals(charset
, encodings
[i
].charset
))
207 return GEANY_ENCODING_UTF_8
;
211 const GeanyEncoding
*encodings_get_from_charset(const gchar
*charset
)
216 return &encodings
[GEANY_ENCODING_UTF_8
];
219 while (i
< GEANY_ENCODINGS_MAX
)
221 if (encodings_charset_equals(charset
, encodings
[i
].charset
))
222 return &encodings
[i
];
231 static const gchar
*encodings_normalize_charset(const gchar
*charset
)
233 const GeanyEncoding
*encoding
;
235 encoding
= encodings_get_from_charset(charset
);
236 if (encoding
!= NULL
)
237 return encoding
->charset
;
243 const GeanyEncoding
*encodings_get_from_index(gint idx
)
245 g_return_val_if_fail(idx
>= 0 && idx
< GEANY_ENCODINGS_MAX
, NULL
);
247 return &encodings
[idx
];
252 * Gets the character set name of the specified index e.g. for use with
253 * @ref document_set_encoding().
255 * @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
258 * @return The charset according to idx, or @c NULL if the index is invalid.
262 const gchar
* encodings_get_charset_from_index(gint idx
)
264 g_return_val_if_fail(idx
>= 0 && idx
< GEANY_ENCODINGS_MAX
, NULL
);
266 return encodings
[idx
].charset
;
270 gchar
*encodings_to_string(const GeanyEncoding
* enc
)
272 g_return_val_if_fail(enc
!= NULL
, NULL
);
273 g_return_val_if_fail(enc
->name
!= NULL
, NULL
);
274 g_return_val_if_fail(enc
->charset
!= NULL
, NULL
);
276 return g_strdup_printf("%s (%s)", enc
->name
, enc
->charset
);
280 const gchar
*encodings_get_charset(const GeanyEncoding
* enc
)
282 g_return_val_if_fail(enc
!= NULL
, NULL
);
283 g_return_val_if_fail(enc
->charset
!= NULL
, NULL
);
289 static GtkWidget
*radio_items
[GEANY_ENCODINGS_MAX
];
292 void encodings_select_radio_item(const gchar
*charset
)
296 g_return_if_fail(charset
!= NULL
);
299 while (i
< GEANY_ENCODINGS_MAX
)
301 if (utils_str_equal(charset
, encodings
[i
].charset
))
305 if (i
== GEANY_ENCODINGS_MAX
)
306 i
= GEANY_ENCODING_UTF_8
; /* fallback to UTF-8 */
308 /* ignore_callback has to be set by the caller */
309 gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items
[i
]), TRUE
);
313 /* Regexp detection of file encoding declared in the file itself.
314 * Idea and parts of code taken from Bluefish, thanks.
315 * regex_compile() is used to compile regular expressions on program init and keep it in memory
316 * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
318 static GRegex
*regex_compile(const gchar
*pattern
)
320 GError
*error
= NULL
;
321 GRegex
*regex
= g_regex_new(pattern
, G_REGEX_CASELESS
, 0, &error
);
325 geany_debug("Failed to compile encoding regex (%s)", error
->message
);
332 static gchar
*regex_match(GRegex
*preg
, const gchar
*buffer
, gsize size
)
334 gchar
*encoding
= NULL
;
337 if (G_UNLIKELY(! pregs_loaded
|| buffer
== NULL
))
340 /* scan only the first 512 characters in the buffer */
341 size
= MIN(size
, 512);
343 if (g_regex_match_full(preg
, buffer
, size
, 0, 0, &minfo
, NULL
) &&
344 g_match_info_get_match_count(minfo
) >= 2)
346 encoding
= g_match_info_fetch(minfo
, 1);
347 geany_debug("Detected encoding by regex search: %s", encoding
);
349 SETPTR(encoding
, g_utf8_strup(encoding
, -1));
351 g_match_info_free(minfo
);
356 static void encodings_radio_item_change_cb(GtkCheckMenuItem
*menuitem
, gpointer user_data
)
358 GeanyDocument
*doc
= document_get_current();
359 guint i
= GPOINTER_TO_INT(user_data
);
361 if (ignore_callback
|| doc
== NULL
|| encodings
[i
].charset
== NULL
||
362 ! gtk_check_menu_item_get_active(menuitem
) ||
363 utils_str_equal(encodings
[i
].charset
, doc
->encoding
))
371 document_undo_add(doc
, UNDO_ENCODING
, g_strdup(doc
->encoding
));
373 document_set_encoding(doc
, encodings
[i
].charset
);
377 void encodings_finalize(void)
382 len
= G_N_ELEMENTS(pregs
);
383 for (i
= 0; i
< len
; i
++)
385 g_regex_unref(pregs
[i
]);
391 void encodings_init(void)
393 GtkWidget
*item
, *menu
[2], *submenu
, *menu_westeuro
, *menu_easteuro
, *menu_eastasian
, *menu_asian
,
394 *menu_utf8
, *menu_middleeast
, *item_westeuro
, *item_easteuro
, *item_eastasian
,
395 *item_asian
, *item_utf8
, *item_middleeast
;
396 GCallback cb_func
[2];
397 GSList
*group
= NULL
;
399 gint order
, group_size
;
406 pregs
[0] = regex_compile(PATTERN_HTMLMETA
);
407 pregs
[1] = regex_compile(PATTERN_CODING
);
411 /* create encodings submenu in document menu */
412 menu
[0] = ui_lookup_widget(main_widgets
.window
, "set_encoding1_menu");
413 menu
[1] = ui_lookup_widget(main_widgets
.window
, "menu_reload_as1_menu");
414 cb_func
[0] = G_CALLBACK(encodings_radio_item_change_cb
);
415 cb_func
[1] = G_CALLBACK(on_reload_as_activate
);
417 for (k
= 0; k
< 2; k
++)
419 menu_westeuro
= gtk_menu_new();
420 item_westeuro
= gtk_menu_item_new_with_mnemonic(_("_West European"));
421 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_westeuro
), menu_westeuro
);
422 gtk_container_add(GTK_CONTAINER(menu
[k
]), item_westeuro
);
423 gtk_widget_show_all(item_westeuro
);
425 menu_easteuro
= gtk_menu_new();
426 item_easteuro
= gtk_menu_item_new_with_mnemonic(_("_East European"));
427 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_easteuro
), menu_easteuro
);
428 gtk_container_add(GTK_CONTAINER(menu
[k
]), item_easteuro
);
429 gtk_widget_show_all(item_easteuro
);
431 menu_eastasian
= gtk_menu_new();
432 item_eastasian
= gtk_menu_item_new_with_mnemonic(_("East _Asian"));
433 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_eastasian
), menu_eastasian
);
434 gtk_container_add(GTK_CONTAINER(menu
[k
]), item_eastasian
);
435 gtk_widget_show_all(item_eastasian
);
437 menu_asian
= gtk_menu_new();
438 item_asian
= gtk_menu_item_new_with_mnemonic(_("_SE & SW Asian"));
439 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_asian
), menu_asian
);
440 gtk_container_add(GTK_CONTAINER(menu
[k
]), item_asian
);
441 gtk_widget_show_all(item_asian
);
443 menu_middleeast
= gtk_menu_new();
444 item_middleeast
= gtk_menu_item_new_with_mnemonic(_("_Middle Eastern"));
445 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_middleeast
), menu_middleeast
);
446 gtk_container_add(GTK_CONTAINER(menu
[k
]), item_middleeast
);
447 gtk_widget_show_all(item_middleeast
);
449 menu_utf8
= gtk_menu_new();
450 item_utf8
= gtk_menu_item_new_with_mnemonic(_("_Unicode"));
451 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_utf8
), menu_utf8
);
452 gtk_container_add(GTK_CONTAINER(menu
[k
]), item_utf8
);
453 gtk_widget_show_all(item_utf8
);
455 /** TODO can it be optimized? ATM 3782 runs at line "if (encodings[j].group ...)" */
456 for (i
= 0; i
< GEANY_ENCODING_GROUPS_MAX
; i
++)
461 case WESTEUROPEAN
: submenu
= menu_westeuro
; group_size
= 9; break;
462 case EASTEUROPEAN
: submenu
= menu_easteuro
; group_size
= 14; break;
463 case EASTASIAN
: submenu
= menu_eastasian
; group_size
= 14; break;
464 case ASIAN
: submenu
= menu_asian
; group_size
= 9; break;
465 case MIDDLEEASTERN
: submenu
= menu_middleeast
; group_size
= 7; break;
466 case UNICODE
: submenu
= menu_utf8
; group_size
= 8; break;
467 default: submenu
= menu
[k
]; group_size
= 1;
470 while (order
< group_size
) /* the biggest group has 13 elements */
472 for (j
= 0; j
< GEANY_ENCODINGS_MAX
; j
++)
474 if (encodings
[j
].group
== i
&& encodings
[j
].order
== order
)
476 label
= encodings_to_string(&encodings
[j
]);
479 item
= gtk_radio_menu_item_new_with_label(group
, label
);
480 group
= gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item
));
481 radio_items
[j
] = item
;
484 item
= gtk_menu_item_new_with_label(label
);
485 gtk_widget_show(item
);
486 gtk_container_add(GTK_CONTAINER(submenu
), item
);
487 g_signal_connect(item
, "activate",
488 cb_func
[k
], GINT_TO_POINTER(encodings
[j
].idx
));
501 * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
502 * If @a fast is not set, additional checks to validate the converted string are performed.
504 * @param buffer The input string to convert.
505 * @param size The length of the string, or -1 if the string is nul-terminated.
506 * @param charset The charset to be used for conversion.
507 * @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
509 * @return If the conversion was successful, a newly allocated nul-terminated string,
510 * which must be freed with @c g_free(). Otherwise @c NULL.
512 gchar
*encodings_convert_to_utf8_from_charset(const gchar
*buffer
, gssize size
,
513 const gchar
*charset
, gboolean fast
)
515 gchar
*utf8_content
= NULL
;
516 GError
*conv_error
= NULL
;
517 gchar
* converted_contents
= NULL
;
520 g_return_val_if_fail(buffer
!= NULL
, NULL
);
521 g_return_val_if_fail(charset
!= NULL
, NULL
);
523 converted_contents
= g_convert(buffer
, size
, "UTF-8", charset
, NULL
,
524 &bytes_written
, &conv_error
);
528 utf8_content
= converted_contents
;
529 if (conv_error
!= NULL
) g_error_free(conv_error
);
531 else if (conv_error
!= NULL
|| ! g_utf8_validate(converted_contents
, bytes_written
, NULL
))
533 if (conv_error
!= NULL
)
535 geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset
, conv_error
->message
);
536 g_error_free(conv_error
);
540 geany_debug("Couldn't convert from %s to UTF-8.", charset
);
543 g_free(converted_contents
);
547 geany_debug("Converted from %s to UTF-8.", charset
);
548 utf8_content
= converted_contents
;
555 static gchar
*encodings_check_regexes(const gchar
*buffer
, gsize size
)
559 for (i
= 0; i
< G_N_ELEMENTS(pregs
); i
++)
563 if ((charset
= regex_match(pregs
[i
], buffer
, size
)) != NULL
)
570 static gchar
*encodings_convert_to_utf8_with_suggestion(const gchar
*buffer
, gssize size
,
571 const gchar
*suggested_charset
, gchar
**used_encoding
)
573 const gchar
*locale_charset
= NULL
;
574 const gchar
*charset
;
576 gboolean check_suggestion
= suggested_charset
!= NULL
;
577 gboolean check_locale
= FALSE
;
578 gint i
, preferred_charset
;
582 size
= strlen(buffer
);
585 /* current locale is not UTF-8, we have to check this charset */
586 check_locale
= ! g_get_charset(&locale_charset
);
588 /* First check for preferred charset, if specified */
589 preferred_charset
= file_prefs
.default_open_encoding
;
591 if (preferred_charset
== encodings
[GEANY_ENCODING_NONE
].idx
||
592 preferred_charset
< 0 ||
593 preferred_charset
>= GEANY_ENCODINGS_MAX
)
595 preferred_charset
= -1;
598 /* -1 means "Preferred charset" */
599 for (i
= -1; i
< GEANY_ENCODINGS_MAX
; i
++)
601 if (G_UNLIKELY(i
== encodings
[GEANY_ENCODING_NONE
].idx
))
604 if (check_suggestion
)
606 check_suggestion
= FALSE
;
607 charset
= encodings_normalize_charset(suggested_charset
);
608 if (charset
== NULL
) /* we failed at normalizing suggested encoding, try it as is */
609 charset
= suggested_charset
;
610 i
= -2; /* keep i below the start value to have it again at -1 on the next loop run */
612 else if (check_locale
)
614 check_locale
= FALSE
;
615 charset
= locale_charset
;
616 i
= -2; /* keep i below the start value to have it again at -1 on the next loop run */
620 if (preferred_charset
>= 0)
622 charset
= encodings
[preferred_charset
].charset
;
623 geany_debug("Using preferred charset: %s", charset
);
629 charset
= encodings
[i
].charset
;
630 else /* in this case we have i == -2, continue to increase i and go ahead */
633 if (G_UNLIKELY(charset
== NULL
))
636 geany_debug("Trying to convert %" G_GSIZE_FORMAT
" bytes of data from %s into UTF-8.",
638 utf8_content
= encodings_convert_to_utf8_from_charset(buffer
, size
, charset
, FALSE
);
640 if (G_LIKELY(utf8_content
!= NULL
))
642 if (used_encoding
!= NULL
)
644 if (G_UNLIKELY(*used_encoding
!= NULL
))
646 geany_debug("%s:%d", __FILE__
, __LINE__
);
647 g_free(*used_encoding
);
649 *used_encoding
= g_strdup(charset
);
660 * Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
663 * @param buffer the input string to convert.
664 * @param size the length of the string, or -1 if the string is nul-terminated.
665 * @param used_encoding return location of the detected encoding of the input string, or @c NULL.
667 * @return If the conversion was successful, a newly allocated nul-terminated string,
668 * which must be freed with @c g_free(). Otherwise @c NULL.
670 gchar
*encodings_convert_to_utf8(const gchar
*buffer
, gssize size
, gchar
**used_encoding
)
672 gchar
*regex_charset
;
675 /* first try to read the encoding from the file content */
676 regex_charset
= encodings_check_regexes(buffer
, size
);
677 utf8
= encodings_convert_to_utf8_with_suggestion(buffer
, size
, regex_charset
, used_encoding
);
678 g_free(regex_charset
);
684 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
685 * otherwise GEANY_ENCODING_NONE.
687 GeanyEncodingIndex
encodings_scan_unicode_bom(const gchar
*string
, gsize len
, guint
*bom_len
)
694 if ((guchar
)string
[0] == 0xef && (guchar
)string
[1] == 0xbb &&
695 (guchar
)string
[2] == 0xbf)
697 return GEANY_ENCODING_UTF_8
;
705 if ((guchar
)string
[0] == 0x00 && (guchar
)string
[1] == 0x00 &&
706 (guchar
)string
[2] == 0xfe && (guchar
)string
[3] == 0xff)
708 return GEANY_ENCODING_UTF_32BE
; /* Big endian */
710 if ((guchar
)string
[0] == 0xff && (guchar
)string
[1] == 0xfe &&
711 (guchar
)string
[2] == 0x00 && (guchar
)string
[3] == 0x00)
713 return GEANY_ENCODING_UTF_32LE
; /* Little endian */
715 if ((string
[0] == 0x2b && string
[1] == 0x2f && string
[2] == 0x76) &&
716 (string
[3] == 0x38 || string
[3] == 0x39 || string
[3] == 0x2b || string
[3] == 0x2f))
718 return GEANY_ENCODING_UTF_7
;
726 if ((guchar
)string
[0] == 0xfe && (guchar
)string
[1] == 0xff)
728 return GEANY_ENCODING_UTF_16BE
; /* Big endian */
730 if ((guchar
)string
[0] == 0xff && (guchar
)string
[1] == 0xfe)
732 return GEANY_ENCODING_UTF_16LE
; /* Little endian */
737 return GEANY_ENCODING_NONE
;
741 gboolean
encodings_is_unicode_charset(const gchar
*string
)
743 if (string
!= NULL
&&
744 (strncmp(string
, "UTF", 3) == 0 || strncmp(string
, "UCS", 3) == 0))
754 gchar
*data
; /* null-terminated data */
755 gsize size
; /* actual data size */
756 gsize len
; /* string length of data */
763 /* convert data with the specified encoding */
765 handle_forced_encoding(BufferData
*buffer
, const gchar
*forced_enc
)
767 GeanyEncodingIndex enc_idx
;
769 if (utils_str_equal(forced_enc
, "UTF-8"))
771 if (! g_utf8_validate(buffer
->data
, buffer
->len
, NULL
))
778 gchar
*converted_text
= encodings_convert_to_utf8_from_charset(
779 buffer
->data
, buffer
->size
, forced_enc
, FALSE
);
780 if (converted_text
== NULL
)
786 SETPTR(buffer
->data
, converted_text
);
787 buffer
->len
= strlen(converted_text
);
790 enc_idx
= encodings_scan_unicode_bom(buffer
->data
, buffer
->size
, NULL
);
791 buffer
->bom
= (enc_idx
== GEANY_ENCODING_UTF_8
);
792 buffer
->enc
= g_strdup(forced_enc
);
797 /* detect encoding and convert to UTF-8 if necessary */
799 handle_encoding(BufferData
*buffer
, GeanyEncodingIndex enc_idx
)
801 g_return_val_if_fail(buffer
->enc
== NULL
, FALSE
);
802 g_return_val_if_fail(buffer
->bom
== FALSE
, FALSE
);
804 if (buffer
->size
== 0)
806 /* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty
807 * e.g. UTF32 file with a BOM(so size is 4, len is 0) */
808 buffer
->enc
= g_strdup("UTF-8");
812 /* first check for a BOM */
813 if (enc_idx
!= GEANY_ENCODING_NONE
)
815 buffer
->enc
= g_strdup(encodings
[enc_idx
].charset
);
818 if (enc_idx
!= GEANY_ENCODING_UTF_8
) /* the BOM indicated something else than UTF-8 */
820 gchar
*converted_text
= encodings_convert_to_utf8_from_charset(
821 buffer
->data
, buffer
->size
, buffer
->enc
, FALSE
);
822 if (converted_text
!= NULL
)
824 SETPTR(buffer
->data
, converted_text
);
825 buffer
->len
= strlen(converted_text
);
829 /* there was a problem converting data from BOM encoding type */
830 SETPTR(buffer
->enc
, NULL
);
836 if (buffer
->enc
== NULL
) /* either there was no BOM or the BOM encoding failed */
838 /* first try to read the encoding from the file content */
839 gchar
*regex_charset
= encodings_check_regexes(buffer
->data
, buffer
->size
);
841 /* try UTF-8 first */
842 if (encodings_get_idx_from_charset(regex_charset
) == GEANY_ENCODING_UTF_8
&&
843 (buffer
->size
== buffer
->len
) && g_utf8_validate(buffer
->data
, buffer
->len
, NULL
))
845 buffer
->enc
= g_strdup("UTF-8");
849 /* detect the encoding */
850 gchar
*converted_text
= encodings_convert_to_utf8_with_suggestion(buffer
->data
,
851 buffer
->size
, regex_charset
, &buffer
->enc
);
853 if (converted_text
== NULL
)
855 g_free(regex_charset
);
858 SETPTR(buffer
->data
, converted_text
);
859 buffer
->len
= strlen(converted_text
);
861 g_free(regex_charset
);
869 handle_bom(BufferData
*buffer
)
873 encodings_scan_unicode_bom(buffer
->data
, buffer
->size
, &bom_len
);
874 g_return_if_fail(bom_len
!= 0);
876 /* use filedata->len here because the contents are already converted into UTF-8 */
877 buffer
->len
-= bom_len
;
878 /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
879 g_memmove(buffer
->data
, buffer
->data
+ bom_len
, buffer
->len
+ 1);
880 buffer
->data
= g_realloc(buffer
->data
, buffer
->len
+ 1);
884 /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
885 static gboolean
handle_buffer(BufferData
*buffer
, const gchar
*forced_enc
)
887 GeanyEncodingIndex tmp_enc_idx
;
889 /* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
890 * if we have a BOM */
891 tmp_enc_idx
= encodings_scan_unicode_bom(buffer
->data
, buffer
->size
, NULL
);
893 /* check whether the size of the loaded data is equal to the size of the file in the
894 * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
895 * file size of 0 bytes */
896 if (buffer
->len
!= buffer
->size
&& buffer
->size
!= 0 && (
897 tmp_enc_idx
== GEANY_ENCODING_UTF_8
|| /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
898 tmp_enc_idx
== GEANY_ENCODING_UTF_7
)) /* filter UTF-7/8 where no NULL bytes are allowed */
900 buffer
->partial
= TRUE
;
903 /* Determine character encoding and convert to UTF-8 */
904 if (forced_enc
!= NULL
)
906 /* the encoding should be ignored(requested by user), so open the file "as it is" */
907 if (utils_str_equal(forced_enc
, encodings
[GEANY_ENCODING_NONE
].charset
))
910 buffer
->enc
= g_strdup(encodings
[GEANY_ENCODING_NONE
].charset
);
912 else if (! handle_forced_encoding(buffer
, forced_enc
))
917 else if (! handle_encoding(buffer
, tmp_enc_idx
))
929 * Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
930 * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
932 * @param buf a pointer to modifiable null-terminated buffer to convert.
933 * It may or may not be modified, and should be freed whatever happens.
934 * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
935 * file size). It will be updated to the new size.
936 * @param forced_enc forced encoding to use, or @c NULL
937 * @param used_encoding return location for the actually used encoding, or @c NULL
938 * @param has_bom return location to store whether the data had a BOM, or @c NULL
939 * @param partial return location to store whether the conversion may be partial, or @c NULL
 *  @return @c TRUE if the conversion succeeded, @c FALSE otherwise.
943 gboolean
encodings_convert_to_utf8_auto(gchar
**buf
, gsize
*size
, const gchar
*forced_enc
,
944 gchar
**used_encoding
, gboolean
*has_bom
, gboolean
*partial
)
950 /* use strlen to check for null chars */
951 buffer
.len
= strlen(buffer
.data
);
954 buffer
.partial
= FALSE
;
956 if (! handle_buffer(&buffer
, forced_enc
))
961 *used_encoding
= buffer
.enc
;
965 *has_bom
= buffer
.bom
;
967 *partial
= buffer
.partial
;