2 * encodings.c - this file is part of Geany, a fast and lightweight IDE
4 * Copyright 2005 The Geany contributors
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 * Encoding conversion and Byte Order Mark (BOM) handling.
26 * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
27 * list of people on the gedit Team.
28 * See the gedit ChangeLog files for a list of changes.
30 /* Stolen from anjuta */
36 #include "encodings.h"
37 #include "encodingsprivate.h"
40 #include "callbacks.h"
41 #include "documentprivate.h"
49 /* <meta http-equiv="content-type" content="text/html; charset=UTF-8" /> */
50 #define PATTERN_HTMLMETA "<meta\\s+http-equiv\\s*=\\s*\"?content-type\"?\\s+content\\s*=\\s*\"text/x?html;\\s*charset=([a-z0-9_-]+)\"\\s*/?>"
51 /* " geany_encoding=utf-8 " or " coding: utf-8 " */
52 #define PATTERN_CODING "coding[\t ]*[:=][\t ]*\"?([a-z0-9-]+)\"?[\t ]*"
54 /* precompiled regexps */
55 static GRegex
*pregs
[2];
56 static gboolean pregs_loaded
= FALSE
;
59 GeanyEncoding encodings
[GEANY_ENCODINGS_MAX
];
62 static gboolean
conversion_supported(const gchar
*to
, const gchar
*from
)
64 GIConv conv
= g_iconv_open(to
, from
);
65 if (conv
== (GIConv
) -1)
/* Initializes the table entry encodings[Idx] with its index, its sort order
 * inside the group, the group itself, the charset name and the display name.
 * Every entry starts out with supported = FALSE.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to exactly
 * one statement and stays correct when used as the body of an unbraced
 * if/else or loop. Invoke it with a trailing semicolon.
 *
 * Note: Idx is expanded several times, so pass an expression without side
 * effects. */
#define fill(Order, Group, Idx, Charset, Name) \
	do { \
		encodings[Idx].idx = (Idx); \
		encodings[Idx].order = (Order); \
		encodings[Idx].group = (Group); \
		encodings[Idx].charset = (Charset); \
		encodings[Idx].name = (Name); \
		encodings[Idx].supported = FALSE; \
	} while (0)
81 static void init_encodings(void)
83 fill(0, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_14
, "ISO-8859-14", _("Celtic"));
84 fill(1, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_7
, "ISO-8859-7", _("Greek"));
85 fill(2, WESTEUROPEAN
, GEANY_ENCODING_WINDOWS_1253
, "WINDOWS-1253", _("Greek"));
86 fill(3, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_10
, "ISO-8859-10", _("Nordic"));
87 fill(4, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_3
, "ISO-8859-3", _("South European"));
88 fill(5, WESTEUROPEAN
, GEANY_ENCODING_IBM_850
, "IBM850", _("Western"));
89 fill(6, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_1
, "ISO-8859-1", _("Western"));
90 fill(7, WESTEUROPEAN
, GEANY_ENCODING_ISO_8859_15
, "ISO-8859-15", _("Western"));
91 fill(8, WESTEUROPEAN
, GEANY_ENCODING_WINDOWS_1252
, "WINDOWS-1252", _("Western"));
93 fill(0, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_4
, "ISO-8859-4", _("Baltic"));
94 fill(1, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_13
, "ISO-8859-13", _("Baltic"));
95 fill(2, EASTEUROPEAN
, GEANY_ENCODING_WINDOWS_1257
, "WINDOWS-1257", _("Baltic"));
96 fill(3, EASTEUROPEAN
, GEANY_ENCODING_IBM_852
, "IBM852", _("Central European"));
97 fill(4, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_2
, "ISO-8859-2", _("Central European"));
98 fill(5, EASTEUROPEAN
, GEANY_ENCODING_WINDOWS_1250
, "WINDOWS-1250", _("Central European"));
99 fill(6, EASTEUROPEAN
, GEANY_ENCODING_IBM_855
, "IBM855", _("Cyrillic"));
100 fill(7, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_5
, "ISO-8859-5", _("Cyrillic"));
101 /* ISO-IR-111 not available on Windows */
102 fill(8, EASTEUROPEAN
, GEANY_ENCODING_ISO_IR_111
, "ISO-IR-111", _("Cyrillic"));
103 fill(9, EASTEUROPEAN
, GEANY_ENCODING_KOI8_R
, "KOI8-R", _("Cyrillic"));
104 fill(10, EASTEUROPEAN
, GEANY_ENCODING_WINDOWS_1251
, "WINDOWS-1251", _("Cyrillic"));
105 fill(11, EASTEUROPEAN
, GEANY_ENCODING_CP_866
, "CP866", _("Cyrillic/Russian"));
106 fill(12, EASTEUROPEAN
, GEANY_ENCODING_KOI8_U
, "KOI8-U", _("Cyrillic/Ukrainian"));
107 fill(13, EASTEUROPEAN
, GEANY_ENCODING_ISO_8859_16
, "ISO-8859-16", _("Romanian"));
109 fill(0, MIDDLEEASTERN
, GEANY_ENCODING_IBM_864
, "IBM864", _("Arabic"));
110 fill(1, MIDDLEEASTERN
, GEANY_ENCODING_ISO_8859_6
, "ISO-8859-6", _("Arabic"));
111 fill(2, MIDDLEEASTERN
, GEANY_ENCODING_WINDOWS_1256
, "WINDOWS-1256", _("Arabic"));
112 fill(3, MIDDLEEASTERN
, GEANY_ENCODING_IBM_862
, "IBM862", _("Hebrew"));
113 /* not available at all, ? */
114 fill(4, MIDDLEEASTERN
, GEANY_ENCODING_ISO_8859_8_I
, "ISO-8859-8-I", _("Hebrew"));
115 fill(5, MIDDLEEASTERN
, GEANY_ENCODING_WINDOWS_1255
, "WINDOWS-1255", _("Hebrew"));
116 fill(6, MIDDLEEASTERN
, GEANY_ENCODING_ISO_8859_8
, "ISO-8859-8", _("Hebrew Visual"));
118 fill(0, ASIAN
, GEANY_ENCODING_ARMSCII_8
, "ARMSCII-8", _("Armenian"));
119 fill(1, ASIAN
, GEANY_ENCODING_GEOSTD8
, "GEORGIAN-ACADEMY", _("Georgian"));
120 fill(2, ASIAN
, GEANY_ENCODING_TIS_620
, "TIS-620", _("Thai"));
121 fill(3, ASIAN
, GEANY_ENCODING_IBM_857
, "IBM857", _("Turkish"));
122 fill(4, ASIAN
, GEANY_ENCODING_WINDOWS_1254
, "WINDOWS-1254", _("Turkish"));
123 fill(5, ASIAN
, GEANY_ENCODING_ISO_8859_9
, "ISO-8859-9", _("Turkish"));
124 fill(6, ASIAN
, GEANY_ENCODING_TCVN
, "TCVN", _("Vietnamese"));
125 fill(7, ASIAN
, GEANY_ENCODING_VISCII
, "VISCII", _("Vietnamese"));
126 fill(8, ASIAN
, GEANY_ENCODING_WINDOWS_1258
, "WINDOWS-1258", _("Vietnamese"));
128 fill(0, UNICODE
, GEANY_ENCODING_UTF_7
, "UTF-7", _("Unicode"));
129 fill(1, UNICODE
, GEANY_ENCODING_UTF_8
, "UTF-8", _("Unicode"));
130 fill(2, UNICODE
, GEANY_ENCODING_UTF_16LE
, "UTF-16LE", _("Unicode"));
131 fill(3, UNICODE
, GEANY_ENCODING_UTF_16BE
, "UTF-16BE", _("Unicode"));
132 fill(4, UNICODE
, GEANY_ENCODING_UCS_2LE
, "UCS-2LE", _("Unicode"));
133 fill(5, UNICODE
, GEANY_ENCODING_UCS_2BE
, "UCS-2BE", _("Unicode"));
134 fill(6, UNICODE
, GEANY_ENCODING_UTF_32LE
, "UTF-32LE", _("Unicode"));
135 fill(7, UNICODE
, GEANY_ENCODING_UTF_32BE
, "UTF-32BE", _("Unicode"));
137 fill(0, EASTASIAN
, GEANY_ENCODING_GB18030
, "GB18030", _("Chinese Simplified"));
138 fill(1, EASTASIAN
, GEANY_ENCODING_GB2312
, "GB2312", _("Chinese Simplified"));
139 fill(2, EASTASIAN
, GEANY_ENCODING_GBK
, "GBK", _("Chinese Simplified"));
140 /* maybe not available on Linux */
141 fill(3, EASTASIAN
, GEANY_ENCODING_HZ
, "HZ", _("Chinese Simplified"));
142 fill(4, EASTASIAN
, GEANY_ENCODING_BIG5
, "BIG5", _("Chinese Traditional"));
143 fill(5, EASTASIAN
, GEANY_ENCODING_BIG5_HKSCS
, "BIG5-HKSCS", _("Chinese Traditional"));
144 fill(6, EASTASIAN
, GEANY_ENCODING_EUC_TW
, "EUC-TW", _("Chinese Traditional"));
145 fill(7, EASTASIAN
, GEANY_ENCODING_EUC_JP
, "EUC-JP", _("Japanese"));
146 fill(8, EASTASIAN
, GEANY_ENCODING_ISO_2022_JP
, "ISO-2022-JP", _("Japanese"));
147 fill(9, EASTASIAN
, GEANY_ENCODING_SHIFT_JIS
, "SHIFT_JIS", _("Japanese"));
148 fill(10, EASTASIAN
, GEANY_ENCODING_CP_932
, "CP932", _("Japanese"));
149 fill(11, EASTASIAN
, GEANY_ENCODING_EUC_KR
, "EUC-KR", _("Korean"));
150 fill(12, EASTASIAN
, GEANY_ENCODING_ISO_2022_KR
, "ISO-2022-KR", _("Korean"));
151 fill(13, EASTASIAN
, GEANY_ENCODING_JOHAB
, "JOHAB", _("Korean"));
152 fill(14, EASTASIAN
, GEANY_ENCODING_UHC
, "UHC", _("Korean"));
154 fill(0, NONE
, GEANY_ENCODING_NONE
, "None", _("Without encoding"));
156 /* fill the flags member */
157 for (guint i
= 0; i
< G_N_ELEMENTS(encodings
); i
++)
159 if (i
== GEANY_ENCODING_NONE
|| conversion_supported("UTF-8", encodings
[i
].charset
))
160 encodings
[i
].supported
= TRUE
;
163 /* geany_debug() doesn't really work at this point, unless G_MESSAGES_DEBUG
164 * is set explicitly by the caller, but that's better than nothing */
165 geany_debug("Encoding %s is not supported by the system", encodings
[i
].charset
);
171 /* compares two encoding names in a permissive fashion.
172 * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
173 static gboolean
encodings_charset_equals(const gchar
*a
, const gchar
*b
)
175 gboolean was_alpha
= FALSE
; /* whether last character of previous word was a letter */
176 gboolean need_sep
= FALSE
; /* whether we're expecting an implicit separator */
182 if (g_ascii_toupper(*a
) == g_ascii_toupper(*b
) &&
183 ((is_alpha
= g_ascii_isalpha(*a
)) || g_ascii_isdigit(*a
)))
185 /* either there was a real separator, or we need an implicit one (a change from alpha to
187 if (! need_sep
|| (was_alpha
!= is_alpha
))
191 was_alpha
= is_alpha
;
201 if (! g_ascii_isalnum(*a
))
206 if (! g_ascii_isalnum(*b
))
221 GeanyEncodingIndex
encodings_get_idx_from_charset(const gchar
*charset
)
224 return GEANY_ENCODING_UTF_8
;
226 for (gint i
= 0; i
< GEANY_ENCODINGS_MAX
; i
++)
228 if (encodings_charset_equals(charset
, encodings
[i
].charset
))
231 return GEANY_ENCODING_UTF_8
;
235 const GeanyEncoding
*encodings_get_from_charset(const gchar
*charset
)
238 return &encodings
[GEANY_ENCODING_UTF_8
];
240 for (gint i
= 0; i
< GEANY_ENCODINGS_MAX
; i
++)
242 if (encodings_charset_equals(charset
, encodings
[i
].charset
))
243 return &encodings
[i
];
250 static const gchar
*encodings_normalize_charset(const gchar
*charset
)
252 const GeanyEncoding
*encoding
;
254 encoding
= encodings_get_from_charset(charset
);
255 if (encoding
!= NULL
)
256 return encoding
->charset
;
262 const GeanyEncoding
*encodings_get_from_index(gint idx
)
264 g_return_val_if_fail(idx
>= 0 && idx
< GEANY_ENCODINGS_MAX
, NULL
);
266 return &encodings
[idx
];
271 * Gets the character set name of the specified index e.g. for use with
272 * @ref document_set_encoding().
274 * @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
277 * @return @nullable The charset according to idx, or @c NULL if the index is invalid.
282 const gchar
* encodings_get_charset_from_index(gint idx
)
284 g_return_val_if_fail(idx
>= 0 && idx
< GEANY_ENCODINGS_MAX
, NULL
);
286 return encodings
[idx
].charset
;
290 gchar
*encodings_to_string(const GeanyEncoding
* enc
)
292 g_return_val_if_fail(enc
!= NULL
, NULL
);
293 g_return_val_if_fail(enc
->name
!= NULL
, NULL
);
294 g_return_val_if_fail(enc
->charset
!= NULL
, NULL
);
296 if (enc
->idx
== GEANY_ENCODING_NONE
)
297 return g_strdup(enc
->name
); // enc->charset is "None" and would be useless to display
299 return g_strdup_printf("%s (%s)", enc
->name
, enc
->charset
);
303 const gchar
*encodings_get_charset(const GeanyEncoding
* enc
)
305 g_return_val_if_fail(enc
!= NULL
, NULL
);
306 g_return_val_if_fail(enc
->charset
!= NULL
, NULL
);
312 static GtkWidget
*radio_items
[GEANY_ENCODINGS_MAX
];
315 void encodings_select_radio_item(const gchar
*charset
)
319 g_return_if_fail(charset
!= NULL
);
321 for (i
= 0; i
< GEANY_ENCODINGS_MAX
; i
++)
323 if (utils_str_equal(charset
, encodings
[i
].charset
))
326 if (i
== GEANY_ENCODINGS_MAX
)
327 i
= GEANY_ENCODING_UTF_8
; /* fallback to UTF-8 */
329 /* ignore_callback has to be set by the caller */
330 gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items
[i
]), TRUE
);
334 /* Regexp detection of file encoding declared in the file itself.
335 * Idea and parts of code taken from Bluefish, thanks.
336 * regex_compile() is used to compile regular expressions on program init and keep it in memory
337 * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
339 static GRegex
*regex_compile(const gchar
*pattern
)
341 GError
*error
= NULL
;
342 GRegex
*regex
= g_regex_new(pattern
, G_REGEX_CASELESS
| G_REGEX_RAW
, 0, &error
);
346 geany_debug("Failed to compile encoding regex (%s)", error
->message
);
353 static gchar
*regex_match(GRegex
*preg
, const gchar
*buffer
, gsize size
)
355 gchar
*encoding
= NULL
;
358 if (G_UNLIKELY(! pregs_loaded
|| buffer
== NULL
))
361 /* scan only the first 512 characters in the buffer */
362 size
= MIN(size
, 512);
364 if (g_regex_match_full(preg
, buffer
, size
, 0, 0, &minfo
, NULL
) &&
365 g_match_info_get_match_count(minfo
) >= 2)
367 encoding
= g_match_info_fetch(minfo
, 1);
368 geany_debug("Detected encoding by regex search: %s", encoding
);
370 SETPTR(encoding
, g_utf8_strup(encoding
, -1));
372 g_match_info_free(minfo
);
377 static void encodings_radio_item_change_cb(GtkCheckMenuItem
*menuitem
, gpointer user_data
)
379 GeanyDocument
*doc
= document_get_current();
380 const gchar
*charset
= user_data
;
382 if (ignore_callback
|| doc
== NULL
|| charset
== NULL
||
383 ! gtk_check_menu_item_get_active(menuitem
) ||
384 utils_str_equal(charset
, doc
->encoding
))
392 document_undo_add(doc
, UNDO_ENCODING
, g_strdup(doc
->encoding
));
394 document_set_encoding(doc
, charset
);
397 static void encodings_reload_radio_item_change_cb(GtkMenuItem
*menuitem
, gpointer user_data
)
399 GeanyDocument
*doc
= document_get_current();
401 g_return_if_fail(doc
!= NULL
);
403 document_reload_prompt(doc
, user_data
);
407 void encodings_finalize(void)
412 len
= G_N_ELEMENTS(pregs
);
413 for (i
= 0; i
< len
; i
++)
415 g_regex_unref(pregs
[i
]);
421 /* initialization of non-UI parts */
422 void encodings_init_headless(void)
424 static gboolean initialized
= FALSE
;
433 pregs
[0] = regex_compile(PATTERN_HTMLMETA
);
434 pregs
[1] = regex_compile(PATTERN_CODING
);
442 void encodings_init(void)
445 GCallback cb_func
[2];
446 const gchar
*const groups
[GEANY_ENCODING_GROUPS_MAX
] =
449 [WESTEUROPEAN
] = N_("_West European"),
450 [EASTEUROPEAN
] = N_("_East European"),
451 [EASTASIAN
] = N_("East _Asian"),
452 [ASIAN
] = N_("_SE & SW Asian"),
453 [MIDDLEEASTERN
] = N_("_Middle Eastern"),
454 [UNICODE
] = N_("_Unicode"),
457 encodings_init_headless();
459 /* create encodings submenu in document menu */
460 menu
[0] = ui_lookup_widget(main_widgets
.window
, "set_encoding1_menu");
461 menu
[1] = ui_lookup_widget(main_widgets
.window
, "menu_reload_as1_menu");
462 cb_func
[0] = G_CALLBACK(encodings_radio_item_change_cb
);
463 cb_func
[1] = G_CALLBACK(encodings_reload_radio_item_change_cb
);
465 for (guint k
= 0; k
< 2; k
++)
467 GSList
*group
= NULL
;
468 GtkWidget
*submenus
[GEANY_ENCODING_GROUPS_MAX
];
469 gint orders
[GEANY_ENCODING_GROUPS_MAX
] = { 0 };
472 for (guint i
= 0; i
< GEANY_ENCODING_GROUPS_MAX
; i
++)
474 if (! groups
[i
]) /* NONE */
475 submenus
[i
] = menu
[k
];
478 GtkWidget
*item
= gtk_menu_item_new_with_mnemonic(_(groups
[i
]));
479 submenus
[i
] = gtk_menu_new();
480 gtk_menu_item_set_submenu(GTK_MENU_ITEM(item
), submenus
[i
]);
481 gtk_container_add(GTK_CONTAINER(menu
[k
]), item
);
482 gtk_widget_show_all(item
);
486 /** TODO can it be optimized? ATM 882 runs at line "if (encodings[i].order ...)" */
489 for (guint i
= 0; i
< G_N_ELEMENTS(encodings
); i
++)
491 if (encodings
[i
].order
== orders
[encodings
[i
].group
])
494 gchar
*label
= encodings_to_string(&encodings
[i
]);
496 if (k
== 0) /* Set Encoding menu */
498 item
= gtk_radio_menu_item_new_with_label(group
, label
);
499 group
= gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item
));
500 radio_items
[i
] = item
;
503 item
= gtk_menu_item_new_with_label(label
);
504 if (encodings
[i
].supported
)
505 gtk_widget_show(item
);
506 gtk_container_add(GTK_CONTAINER(submenus
[encodings
[i
].group
]), item
);
507 g_signal_connect(item
, "activate", cb_func
[k
],
508 (gpointer
) encodings
[i
].charset
);
511 orders
[encodings
[i
].group
]++;
516 while (n_added
< G_N_ELEMENTS(encodings
));
521 static gint
encoding_combo_store_sort_func(GtkTreeModel
*model
,
526 gboolean a_has_child
= gtk_tree_model_iter_has_child(model
, a
);
527 gboolean b_has_child
= gtk_tree_model_iter_has_child(model
, b
);
532 if (a_has_child
!= b_has_child
)
533 return a_has_child
? -1 : 1;
535 gtk_tree_model_get(model
, a
, 1, &a_string
, -1);
536 gtk_tree_model_get(model
, b
, 1, &b_string
, -1);
537 cmp_res
= strcmp(a_string
, b_string
);
544 GtkTreeStore
*encodings_encoding_store_new(gboolean has_detect
)
547 GtkTreeIter iter_current
, iter_westeuro
, iter_easteuro
, iter_eastasian
,
548 iter_asian
, iter_utf8
, iter_middleeast
;
549 GtkTreeIter
*iter_parent
;
552 store
= gtk_tree_store_new(2, G_TYPE_INT
, G_TYPE_STRING
);
556 gtk_tree_store_append(store
, &iter_current
, NULL
);
557 gtk_tree_store_set(store
, &iter_current
, 0, GEANY_ENCODINGS_MAX
, 1, _("Detect from file"), -1);
560 gtk_tree_store_append(store
, &iter_westeuro
, NULL
);
561 gtk_tree_store_set(store
, &iter_westeuro
, 0, -1, 1, _("West European"), -1);
562 gtk_tree_store_append(store
, &iter_easteuro
, NULL
);
563 gtk_tree_store_set(store
, &iter_easteuro
, 0, -1, 1, _("East European"), -1);
564 gtk_tree_store_append(store
, &iter_eastasian
, NULL
);
565 gtk_tree_store_set(store
, &iter_eastasian
, 0, -1, 1, _("East Asian"), -1);
566 gtk_tree_store_append(store
, &iter_asian
, NULL
);
567 gtk_tree_store_set(store
, &iter_asian
, 0, -1, 1, _("SE & SW Asian"), -1);
568 gtk_tree_store_append(store
, &iter_middleeast
, NULL
);
569 gtk_tree_store_set(store
, &iter_middleeast
, 0, -1, 1, _("Middle Eastern"), -1);
570 gtk_tree_store_append(store
, &iter_utf8
, NULL
);
571 gtk_tree_store_set(store
, &iter_utf8
, 0, -1, 1, _("Unicode"), -1);
573 for (i
= 0; i
< GEANY_ENCODINGS_MAX
; i
++)
575 gchar
*encoding_string
;
577 if (! encodings
[i
].supported
)
580 switch (encodings
[i
].group
)
582 case WESTEUROPEAN
: iter_parent
= &iter_westeuro
; break;
583 case EASTEUROPEAN
: iter_parent
= &iter_easteuro
; break;
584 case EASTASIAN
: iter_parent
= &iter_eastasian
; break;
585 case ASIAN
: iter_parent
= &iter_asian
; break;
586 case MIDDLEEASTERN
: iter_parent
= &iter_middleeast
; break;
587 case UNICODE
: iter_parent
= &iter_utf8
; break;
589 default: iter_parent
= NULL
;
591 gtk_tree_store_append(store
, &iter_current
, iter_parent
);
592 encoding_string
= encodings_to_string(&encodings
[i
]);
593 gtk_tree_store_set(store
, &iter_current
, 0, i
, 1, encoding_string
, -1);
594 g_free(encoding_string
);
597 gtk_tree_sortable_set_sort_column_id(GTK_TREE_SORTABLE(store
), 1, GTK_SORT_ASCENDING
);
598 gtk_tree_sortable_set_sort_func(GTK_TREE_SORTABLE(store
), 1, encoding_combo_store_sort_func
, NULL
, NULL
);
604 gint
encodings_encoding_store_get_encoding(GtkTreeStore
*store
, GtkTreeIter
*iter
)
607 gtk_tree_model_get(GTK_TREE_MODEL(store
), iter
, 0, &enc
, -1);
612 gboolean
encodings_encoding_store_get_iter(GtkTreeStore
*store
, GtkTreeIter
*iter
, gint enc
)
614 if (gtk_tree_model_get_iter_first(GTK_TREE_MODEL(store
), iter
))
618 if (encodings_encoding_store_get_encoding(store
, iter
) == enc
)
621 while (ui_tree_model_iter_any_next(GTK_TREE_MODEL(store
), iter
, TRUE
));
627 void encodings_encoding_store_cell_data_func(GtkCellLayout
*cell_layout
,
628 GtkCellRenderer
*cell
,
629 GtkTreeModel
*tree_model
,
633 gboolean sensitive
= !gtk_tree_model_iter_has_child(tree_model
, iter
);
636 gtk_tree_model_get(tree_model
, iter
, 1, &text
, -1);
637 g_object_set(cell
, "sensitive", sensitive
, "text", text
, NULL
);
642 static gchar
*convert_to_utf8_from_charset(const gchar
*buffer
, gssize size
,
643 const gchar
*charset
, gboolean fast
,
644 gsize
*utf8_size
, GError
**error
)
646 gchar
*utf8_content
= NULL
;
647 GError
*conv_error
= NULL
;
648 gchar
* converted_contents
= NULL
;
651 g_return_val_if_fail(buffer
!= NULL
, NULL
);
652 g_return_val_if_fail(charset
!= NULL
, NULL
);
654 converted_contents
= g_convert(buffer
, size
, "UTF-8", charset
, NULL
,
655 &bytes_written
, &conv_error
);
659 utf8_content
= converted_contents
;
660 if (conv_error
!= NULL
) g_propagate_error(error
, conv_error
);
662 else if (conv_error
!= NULL
|| ! g_utf8_validate(converted_contents
, bytes_written
, NULL
))
664 if (conv_error
!= NULL
)
666 geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset
, conv_error
->message
);
667 g_propagate_error(error
, conv_error
);
672 geany_debug("Couldn't convert from %s to UTF-8.", charset
);
673 g_set_error(error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
,
674 _("Data contains NULs"));
678 g_free(converted_contents
);
682 geany_debug("Converted from %s to UTF-8.", charset
);
683 utf8_content
= converted_contents
;
686 if (utf8_content
&& utf8_size
)
687 *utf8_size
= bytes_written
;
694 * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
695 * If @a fast is not set, additional checks to validate the converted string are performed.
697 * @param buffer The input string to convert.
698 * @param size The length of the string, or -1 if the string is nul-terminated.
699 * @param charset The charset to be used for conversion.
700 * @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
702 * @return If the conversion was successful, a newly allocated nul-terminated string,
703 * which must be freed with @c g_free(). Otherwise @c NULL.
706 gchar
*encodings_convert_to_utf8_from_charset(const gchar
*buffer
, gssize size
,
707 const gchar
*charset
, gboolean fast
)
709 /* If fast=FALSE, we can safely ignore the size as the output cannot contain NULs.
710 * Otherwise, the caller already agrees on partial data anyway. */
711 return convert_to_utf8_from_charset(buffer
, size
, charset
, fast
, NULL
, NULL
);
715 static gchar
*encodings_check_regexes(const gchar
*buffer
, gsize size
)
719 for (i
= 0; i
< G_N_ELEMENTS(pregs
); i
++)
723 if ((charset
= regex_match(pregs
[i
], buffer
, size
)) != NULL
)
730 static gchar
*encodings_convert_to_utf8_with_suggestion(const gchar
*buffer
, gssize size
,
731 const gchar
*suggested_charset
, gchar
**used_encoding
, gsize
*utf8_size
, GError
**error
)
733 const gchar
*locale_charset
= NULL
;
734 const gchar
*charset
;
736 gboolean check_suggestion
= suggested_charset
!= NULL
;
737 gboolean check_locale
= FALSE
;
738 gint i
, preferred_charset
;
742 size
= strlen(buffer
);
745 /* current locale is not UTF-8, we have to check this charset */
746 check_locale
= ! g_get_charset(&locale_charset
);
748 /* First check for preferred charset, if specified */
749 preferred_charset
= file_prefs
.default_open_encoding
;
751 if (preferred_charset
== (gint
) encodings
[GEANY_ENCODING_NONE
].idx
||
752 preferred_charset
< 0 ||
753 preferred_charset
>= GEANY_ENCODINGS_MAX
)
755 preferred_charset
= -1;
758 /* -1 means "Preferred charset" */
759 for (i
= -1; i
< GEANY_ENCODINGS_MAX
; i
++)
761 if (G_UNLIKELY(i
== (gint
) encodings
[GEANY_ENCODING_NONE
].idx
))
764 if (check_suggestion
)
766 check_suggestion
= FALSE
;
767 charset
= encodings_normalize_charset(suggested_charset
);
768 if (charset
== NULL
) /* we failed at normalizing suggested encoding, try it as is */
769 charset
= suggested_charset
;
770 i
= -2; /* keep i below the start value to have it again at -1 on the next loop run */
772 else if (check_locale
)
774 check_locale
= FALSE
;
775 charset
= locale_charset
;
776 i
= -2; /* keep i below the start value to have it again at -1 on the next loop run */
780 if (preferred_charset
>= 0)
782 charset
= encodings
[preferred_charset
].charset
;
783 geany_debug("Using preferred charset: %s", charset
);
788 else if (i
>= 0 && encodings
[i
].supported
)
789 charset
= encodings
[i
].charset
;
790 else /* in this case we have i == -2, continue to increase i and go ahead */
793 if (G_UNLIKELY(charset
== NULL
))
796 geany_debug("Trying to convert %" G_GSIZE_FORMAT
" bytes of data from %s into UTF-8.",
798 utf8_content
= convert_to_utf8_from_charset(buffer
, size
, charset
, FALSE
, utf8_size
, NULL
);
800 if (G_LIKELY(utf8_content
!= NULL
))
802 if (used_encoding
!= NULL
)
804 if (G_UNLIKELY(*used_encoding
!= NULL
))
806 geany_debug("%s:%d", __FILE__
, __LINE__
);
807 g_free(*used_encoding
);
809 *used_encoding
= g_strdup(charset
);
815 g_set_error(error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_FAILED
,
816 _("Data contains NULs or the encoding is not supported"));
823 * Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
826 * @param buffer the input string to convert.
827 * @param size the length of the string, or -1 if the string is nul-terminated.
828 * @param used_encoding @out @optional return location of the detected encoding of the input string, or @c NULL.
830 * @return @nullable If the conversion was successful, a newly allocated nul-terminated string,
831 * which must be freed with @c g_free(). Otherwise @c NULL.
834 gchar
*encodings_convert_to_utf8(const gchar
*buffer
, gssize size
, gchar
**used_encoding
)
836 gchar
*regex_charset
;
839 /* first try to read the encoding from the file content */
840 regex_charset
= encodings_check_regexes(buffer
, size
);
841 /* we know this cannot succeed if there are NULs in the output, so ignoring the size is OK */
842 utf8
= encodings_convert_to_utf8_with_suggestion(buffer
, size
, regex_charset
, used_encoding
, NULL
, NULL
);
843 g_free(regex_charset
);
849 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
850 * otherwise GEANY_ENCODING_NONE.
852 GeanyEncodingIndex
encodings_scan_unicode_bom(const gchar
*string
, gsize len
, guint
*bom_len
)
859 if ((guchar
)string
[0] == 0xef && (guchar
)string
[1] == 0xbb &&
860 (guchar
)string
[2] == 0xbf)
862 return GEANY_ENCODING_UTF_8
;
870 if ((guchar
)string
[0] == 0x00 && (guchar
)string
[1] == 0x00 &&
871 (guchar
)string
[2] == 0xfe && (guchar
)string
[3] == 0xff)
873 return GEANY_ENCODING_UTF_32BE
; /* Big endian */
875 if ((guchar
)string
[0] == 0xff && (guchar
)string
[1] == 0xfe &&
876 (guchar
)string
[2] == 0x00 && (guchar
)string
[3] == 0x00)
878 return GEANY_ENCODING_UTF_32LE
; /* Little endian */
880 if ((string
[0] == 0x2b && string
[1] == 0x2f && string
[2] == 0x76) &&
881 (string
[3] == 0x38 || string
[3] == 0x39 || string
[3] == 0x2b || string
[3] == 0x2f))
883 return GEANY_ENCODING_UTF_7
;
891 if ((guchar
)string
[0] == 0xfe && (guchar
)string
[1] == 0xff)
893 return GEANY_ENCODING_UTF_16BE
; /* Big endian */
895 if ((guchar
)string
[0] == 0xff && (guchar
)string
[1] == 0xfe)
897 return GEANY_ENCODING_UTF_16LE
; /* Little endian */
902 return GEANY_ENCODING_NONE
;
906 gboolean
encodings_is_unicode_charset(const gchar
*string
)
908 if (string
!= NULL
&&
909 (strncmp(string
, "UTF", 3) == 0 || strncmp(string
, "UCS", 3) == 0))
919 gchar
*data
; /* null-terminated data */
920 gsize size
; /* actual data size */
926 /* convert data with the specified encoding */
928 handle_forced_encoding(BufferData
*buffer
, const gchar
*forced_enc
, GError
**error
)
930 GeanyEncodingIndex enc_idx
;
932 if (utils_str_equal(forced_enc
, "UTF-8"))
934 if (! g_utf8_validate(buffer
->data
, buffer
->size
, NULL
))
936 g_set_error(error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
,
937 _("Data contains NULs or is not valid UTF-8"));
943 gchar
*converted_text
= convert_to_utf8_from_charset(
944 buffer
->data
, buffer
->size
, forced_enc
, FALSE
, &buffer
->size
, error
);
945 if (converted_text
== NULL
)
951 SETPTR(buffer
->data
, converted_text
);
954 enc_idx
= encodings_scan_unicode_bom(buffer
->data
, buffer
->size
, NULL
);
955 buffer
->bom
= (enc_idx
== GEANY_ENCODING_UTF_8
);
956 buffer
->enc
= g_strdup(forced_enc
);
961 /* detect encoding and convert to UTF-8 if necessary */
963 handle_encoding(BufferData
*buffer
, GeanyEncodingIndex enc_idx
, GError
**error
)
965 g_return_val_if_fail(buffer
->enc
== NULL
, FALSE
);
966 g_return_val_if_fail(buffer
->bom
== FALSE
, FALSE
);
968 if (buffer
->size
== 0)
970 /* we have no data so assume UTF-8 */
971 buffer
->enc
= g_strdup("UTF-8");
975 /* first check for a BOM */
976 if (enc_idx
!= GEANY_ENCODING_NONE
)
978 buffer
->enc
= g_strdup(encodings
[enc_idx
].charset
);
981 if (enc_idx
== GEANY_ENCODING_UTF_8
)
983 if (! g_utf8_validate(buffer
->data
, buffer
->size
, NULL
))
985 /* this is not actually valid UTF-8 */
986 SETPTR(buffer
->enc
, NULL
);
990 else /* the BOM indicated something else than UTF-8 */
992 gchar
*converted_text
= convert_to_utf8_from_charset(
993 buffer
->data
, buffer
->size
, buffer
->enc
, FALSE
, &buffer
->size
, NULL
);
994 if (converted_text
!= NULL
)
996 SETPTR(buffer
->data
, converted_text
);
1000 /* there was a problem converting data from BOM encoding type */
1001 SETPTR(buffer
->enc
, NULL
);
1002 buffer
->bom
= FALSE
;
1007 if (buffer
->enc
== NULL
) /* either there was no BOM or the BOM encoding failed */
1009 /* first try to read the encoding from the file content */
1010 gchar
*regex_charset
= encodings_check_regexes(buffer
->data
, buffer
->size
);
1012 /* try UTF-8 first */
1013 if (encodings_get_idx_from_charset(regex_charset
) == GEANY_ENCODING_UTF_8
&&
1014 g_utf8_validate(buffer
->data
, buffer
->size
, NULL
))
1016 buffer
->enc
= g_strdup("UTF-8");
1020 /* detect the encoding */
1021 gchar
*converted_text
= encodings_convert_to_utf8_with_suggestion(buffer
->data
,
1022 buffer
->size
, regex_charset
, &buffer
->enc
, &buffer
->size
, error
);
1024 if (converted_text
== NULL
)
1026 g_free(regex_charset
);
1029 SETPTR(buffer
->data
, converted_text
);
1031 g_free(regex_charset
);
1039 handle_bom(BufferData
*buffer
)
1043 encodings_scan_unicode_bom(buffer
->data
, buffer
->size
, &bom_len
);
1044 g_return_if_fail(bom_len
!= 0);
1046 /* the contents are already converted into UTF-8 here */
1047 buffer
->size
-= bom_len
;
1048 /* overwrite the BOM with the remainder of the file contents, plus the NUL terminator. */
1049 memmove(buffer
->data
, buffer
->data
+ bom_len
, buffer
->size
+ 1);
1050 buffer
->data
= g_realloc(buffer
->data
, buffer
->size
+ 1);
1054 /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
1055 static gboolean
handle_buffer(BufferData
*buffer
, const gchar
*forced_enc
, GError
**error
)
1057 GeanyEncodingIndex tmp_enc_idx
;
1059 /* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
1060 * if we have a BOM */
1061 tmp_enc_idx
= encodings_scan_unicode_bom(buffer
->data
, buffer
->size
, NULL
);
1063 /* Determine character encoding and convert to UTF-8 */
1064 if (forced_enc
!= NULL
)
1066 /* the encoding should be ignored(requested by user), so open the file "as it is" */
1067 if (utils_str_equal(forced_enc
, encodings
[GEANY_ENCODING_NONE
].charset
))
1069 buffer
->bom
= FALSE
;
1070 buffer
->enc
= g_strdup(encodings
[GEANY_ENCODING_NONE
].charset
);
1072 else if (! handle_forced_encoding(buffer
, forced_enc
, error
))
1077 else if (! handle_encoding(buffer
, tmp_enc_idx
, error
))
1089 * Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
1090 * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
1092 * @param buf a pointer to modifiable null-terminated buffer to convert.
1093 * It may or may not be modified, and should be freed whatever happens.
1094 * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
1095 * file size). It will be updated to the new size.
1096 * @param forced_enc forced encoding to use, or @c NULL
1097 * @param used_encoding return location for the actually used encoding, or @c NULL
1098 * @param has_bom return location to store whether the data had a BOM, or @c NULL
1099 * @param has_nuls return location to store whether the converted data contains NULs, or @c NULL
1101 * @return @c TRUE if the conversion succeeded, @c FALSE otherwise.
1104 gboolean
encodings_convert_to_utf8_auto(gchar
**buf
, gsize
*size
, const gchar
*forced_enc
,
1105 gchar
**used_encoding
, gboolean
*has_bom
, gboolean
*has_nuls
, GError
**error
)
1110 buffer
.size
= *size
;
1114 if (! handle_buffer(&buffer
, forced_enc
, error
))
1117 *size
= buffer
.size
;
1119 *used_encoding
= buffer
.enc
;
1123 *has_bom
= buffer
.bom
;
1125 *has_nuls
= strlen(buffer
.data
) != buffer
.size
;