Ticket #3616: speed up of utf-8 normalization.
[midnight-commander.git] / lib / strutil / strutilutf8.c
blobc7376beb20eaa9a97774c15225e551afacdfcb47
1 /*
2 UTF-8 strings utilities
4 Copyright (C) 2007-2017
5 Free Software Foundation, Inc.
7 Written by:
8 Rostislav Benes, 2007
10 This file is part of the Midnight Commander.
12 The Midnight Commander is free software: you can redistribute it
13 and/or modify it under the terms of the GNU General Public License as
14 published by the Free Software Foundation, either version 3 of the License,
15 or (at your option) any later version.
17 The Midnight Commander is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program. If not, see <http://www.gnu.org/licenses/>.
26 #include <config.h>
28 #include <stdlib.h>
29 #include <langinfo.h>
30 #include <string.h>
32 #include "lib/global.h"
33 #include "lib/strutil.h"
/* This module relies on the UTF-8 functions provided by GLib */
37 /*** global variables ****************************************************************************/
39 /*** file scope macro definitions ****************************************************************/
41 /*** file scope type declarations ****************************************************************/
43 struct utf8_tool
45 char *actual;
46 size_t remain;
47 const char *checked;
48 int ident;
49 gboolean compose;
52 struct term_form
54 char text[BUF_MEDIUM * 6];
55 size_t width;
56 gboolean compose;
59 /*** file scope variables ************************************************************************/
/* UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER), used in place of undecodable input */
static const char replch[] = "\xEF\xBF\xBD";
63 /* --------------------------------------------------------------------------------------------- */
64 /*** file scope functions ************************************************************************/
65 /* --------------------------------------------------------------------------------------------- */
67 static gboolean
68 str_unichar_iscombiningmark (gunichar uni)
70 GUnicodeType type;
72 type = g_unichar_type (uni);
73 return (type == G_UNICODE_COMBINING_MARK)
74 || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
77 /* --------------------------------------------------------------------------------------------- */
79 static void
80 str_utf8_insert_replace_char (GString * buffer)
82 g_string_append (buffer, replch);
85 /* --------------------------------------------------------------------------------------------- */
87 static int
88 str_utf8_is_valid_string (const char *text)
90 return g_utf8_validate (text, -1, NULL);
93 /* --------------------------------------------------------------------------------------------- */
95 static int
96 str_utf8_is_valid_char (const char *ch, size_t size)
98 switch (g_utf8_get_char_validated (ch, size))
100 case (gunichar) (-2):
101 return (-2);
102 case (gunichar) (-1):
103 return (-1);
104 default:
105 return 1;
109 /* --------------------------------------------------------------------------------------------- */
/**
 * Advance *text to the next character; the text is not validated.
 *
 * @param text in/out position inside a UTF-8 string
 */
static void
str_utf8_cnext_char (const char **text)
{
    const char *here = *text;

    *text = g_utf8_next_char (here);
}
117 /* --------------------------------------------------------------------------------------------- */
/**
 * Move *text back to the previous character; the text is not validated.
 *
 * @param text in/out position inside a UTF-8 string
 */
static void
str_utf8_cprev_char (const char **text)
{
    const char *here = *text;

    *text = g_utf8_prev_char (here);
}
125 /* --------------------------------------------------------------------------------------------- */
/**
 * Advance *text by one character, or by a single byte when the bytes at
 * *text do not form a valid character.
 *
 * @param text in/out position inside a possibly invalid UTF-8 string
 */
static void
str_utf8_cnext_char_safe (const char **text)
{
    const char *here = *text;

    if (str_utf8_is_valid_char (here, -1) != 1)
    {
        *text = here + 1;
        return;
    }

    *text = g_utf8_next_char (here);
}
136 /* --------------------------------------------------------------------------------------------- */
/**
 * Move *text back by one character, or by a single byte when stepping
 * forward from the candidate position does not lead back to *text.
 *
 * @param text in/out position inside a possibly invalid UTF-8 string
 */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *candidate;
    const char *probe;

    candidate = g_utf8_prev_char (*text);
    probe = candidate;
    str_utf8_cnext_char_safe (&probe);

    /* accept the candidate only if it is a character that ends exactly at *text */
    *text = (probe == *text) ? candidate : (*text) - 1;
}
152 /* --------------------------------------------------------------------------------------------- */
154 static void
155 str_utf8_fix_string (char *text)
157 while (text[0] != '\0')
159 gunichar uni;
161 uni = g_utf8_get_char_validated (text, -1);
162 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
163 text = g_utf8_next_char (text);
164 else
166 text[0] = '?';
167 text++;
172 /* --------------------------------------------------------------------------------------------- */
/**
 * Apply g_unichar_isspace() to the first character of @text.
 *
 * @param text UTF-8 text; the first character is decoded with validation
 * @return result of g_unichar_isspace() for the decoded value
 */
static int
str_utf8_isspace (const char *text)
{
    return g_unichar_isspace (g_utf8_get_char_validated (text, -1));
}
183 /* --------------------------------------------------------------------------------------------- */
/**
 * Apply g_unichar_ispunct() to the first character of @text.
 *
 * @param text UTF-8 text; the first character is decoded with validation
 * @return result of g_unichar_ispunct() for the decoded value
 */
static int
str_utf8_ispunct (const char *text)
{
    return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
}
194 /* --------------------------------------------------------------------------------------------- */
/**
 * Apply g_unichar_isalnum() to the first character of @text.
 *
 * @param text UTF-8 text; the first character is decoded with validation
 * @return result of g_unichar_isalnum() for the decoded value
 */
static int
str_utf8_isalnum (const char *text)
{
    return g_unichar_isalnum (g_utf8_get_char_validated (text, -1));
}
205 /* --------------------------------------------------------------------------------------------- */
/**
 * Apply g_unichar_isdigit() to the first character of @text.
 *
 * @param text UTF-8 text; the first character is decoded with validation
 * @return result of g_unichar_isdigit() for the decoded value
 */
static int
str_utf8_isdigit (const char *text)
{
    return g_unichar_isdigit (g_utf8_get_char_validated (text, -1));
}
216 /* --------------------------------------------------------------------------------------------- */
/**
 * Apply g_unichar_isprint() to the first character of @ch.
 *
 * @param ch UTF-8 text; the first character is decoded with validation
 * @return result of g_unichar_isprint() for the decoded value
 */
static int
str_utf8_isprint (const char *ch)
{
    return g_unichar_isprint (g_utf8_get_char_validated (ch, -1));
}
227 /* --------------------------------------------------------------------------------------------- */
229 static gboolean
230 str_utf8_iscombiningmark (const char *ch)
232 gunichar uni;
234 uni = g_utf8_get_char_validated (ch, -1);
235 return str_unichar_iscombiningmark (uni);
238 /* --------------------------------------------------------------------------------------------- */
/**
 * Advance *text past one character and any combining marks that follow it.
 *
 * @param text in/out position; left unchanged when already at the terminator
 * @return number of single-character steps taken
 */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps = 0;

    if ((*text)[0] == '\0')
        return 0;

    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    while ((*text)[0] != '\0' && str_utf8_iscombiningmark (*text));

    return steps;
}
256 /* --------------------------------------------------------------------------------------------- */
/**
 * Move *text backwards until it rests on a character that is not a
 * combining mark, never going before @begin.
 *
 * @param text  in/out position
 * @param begin start of the string; the walk stops here
 * @return number of single-character steps taken
 */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps = 0;

    if (*text == begin)
        return 0;

    do
    {
        str_utf8_cprev_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && *text != begin);

    return steps;
}
274 /* --------------------------------------------------------------------------------------------- */
276 static int
277 str_utf8_toupper (const char *text, char **out, size_t * remain)
279 gunichar uni;
280 size_t left;
282 uni = g_utf8_get_char_validated (text, -1);
283 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
284 return 0;
286 uni = g_unichar_toupper (uni);
287 left = g_unichar_to_utf8 (uni, NULL);
288 if (left >= *remain)
289 return 0;
291 left = g_unichar_to_utf8 (uni, *out);
292 (*out) += left;
293 (*remain) -= left;
294 return 1;
297 /* --------------------------------------------------------------------------------------------- */
299 static int
300 str_utf8_tolower (const char *text, char **out, size_t * remain)
302 gunichar uni;
303 size_t left;
305 uni = g_utf8_get_char_validated (text, -1);
306 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
307 return 0;
309 uni = g_unichar_tolower (uni);
310 left = g_unichar_to_utf8 (uni, NULL);
311 if (left >= *remain)
312 return 0;
314 left = g_unichar_to_utf8 (uni, *out);
315 (*out) += left;
316 (*remain) -= left;
317 return 1;
320 /* --------------------------------------------------------------------------------------------- */
/**
 * Count the characters of a possibly invalid UTF-8 string; every byte at
 * which validation stops is counted as one character.
 *
 * @param text NUL-terminated string
 * @return number of characters
 */
static int
str_utf8_length (const char *text)
{
    const char *chunk = text;
    const char *bad;
    int total = 0;

    for (;;)
    {
        if (g_utf8_validate (chunk, -1, &bad) || chunk[0] == '\0')
            break;

        /* valid run in front of the offending byte */
        if (chunk != bad)
            total += g_utf8_strlen (chunk, bad - chunk);

        /* the offending byte itself */
        total++;
        chunk = bad + 1;
    }

    /* no invalid byte was found at all: count the whole string directly */
    if (chunk == text)
        return g_utf8_strlen (text, -1);

    if (chunk[0] != '\0' && chunk != bad)
        total += g_utf8_strlen (chunk, bad - chunk);

    return total;
}
347 /* --------------------------------------------------------------------------------------------- */
/**
 * Count the characters found in the first @size bytes of a possibly
 * invalid UTF-8 string; every byte at which validation stops is counted
 * as one character.
 *
 * @param text NUL-terminated string
 * @param size byte budget; passed unchanged to g_utf8_strlen() when the
 *             whole string is valid
 * @return number of characters
 */
static int
str_utf8_length2 (const char *text, int size)
{
    const char *chunk = text;
    const char *bad;
    int total = 0;

    for (;;)
    {
        if (g_utf8_validate (chunk, -1, &bad) || chunk[0] == '\0' || size <= 0)
            break;

        /* valid run in front of the offending byte */
        if (chunk != bad)
        {
            total += g_utf8_strlen (chunk, MIN (bad - chunk, size));
            size -= bad - chunk;
        }

        /* the offending byte counts only while budget is left */
        if (size > 0)
            total++;
        size--;
        chunk = bad + 1;
    }

    /* no invalid byte was found at all: count the whole string directly */
    if (chunk == text)
        return g_utf8_strlen (text, size);

    if (chunk[0] != '\0' && chunk != bad && size > 0)
        total += g_utf8_strlen (chunk, MIN (bad - chunk, size));

    return total;
}
377 /* --------------------------------------------------------------------------------------------- */
/**
 * Count characters, treating a character together with the combining
 * marks that follow it as one unit.
 *
 * @param text NUL-terminated string
 * @return number of such units
 */
static int
str_utf8_length_noncomb (const char *text)
{
    const char *pos;
    int units = 0;

    for (pos = text; pos[0] != '\0'; units++)
        str_utf8_cnext_noncomb_char (&pos);

    return units;
}
394 /* --------------------------------------------------------------------------------------------- */
#if 0
/* Disabled: skip one character of *string, reduce *left by its size and append '?' to buffer */
static void
str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
{
    char *after = g_utf8_next_char (*string);

    (*left) -= after - (*string);
    (*string) = after;
    g_string_append_c (buffer, '?');
}
#endif
409 /* --------------------------------------------------------------------------------------------- */
411 static gchar *
412 str_utf8_conv_gerror_message (GError * mcerror, const char *def_msg)
414 if (mcerror != NULL)
415 return g_strdup (mcerror->message);
417 return g_strdup (def_msg != NULL ? def_msg : "");
420 /* --------------------------------------------------------------------------------------------- */
422 static estr_t
423 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
425 estr_t result = ESTR_SUCCESS;
427 if (coder == str_cnv_not_convert)
428 g_string_append_len (buffer, string, size);
429 else
430 result = str_nconvert (coder, string, size, buffer);
432 return result;
435 /* --------------------------------------------------------------------------------------------- */
436 /* utility function, that makes string valid in utf8 and all characters printable
437 * return width of string too */
439 static const struct term_form *
440 str_utf8_make_make_term_form (const char *text, size_t length)
442 static struct term_form result;
443 gunichar uni;
444 size_t left;
445 char *actual;
447 result.text[0] = '\0';
448 result.width = 0;
449 result.compose = FALSE;
450 actual = result.text;
452 /* check if text start with combining character,
453 * add space at begin in this case */
454 if (length != 0 && text[0] != '\0')
456 uni = g_utf8_get_char_validated (text, -1);
457 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
458 && str_unichar_iscombiningmark (uni))
460 actual[0] = ' ';
461 actual++;
462 result.width++;
463 result.compose = TRUE;
467 while (length != 0 && text[0] != '\0')
469 uni = g_utf8_get_char_validated (text, -1);
470 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
472 if (g_unichar_isprint (uni))
474 left = g_unichar_to_utf8 (uni, actual);
475 actual += left;
476 if (str_unichar_iscombiningmark (uni))
477 result.compose = TRUE;
478 else
480 result.width++;
481 if (g_unichar_iswide (uni))
482 result.width++;
485 else
487 actual[0] = '.';
488 actual++;
489 result.width++;
491 text = g_utf8_next_char (text);
493 else
495 text++;
496 /*actual[0] = '?'; */
497 memcpy (actual, replch, strlen (replch));
498 actual += strlen (replch);
499 result.width++;
502 if (length != (size_t) (-1))
503 length--;
505 actual[0] = '\0';
507 return &result;
510 /* --------------------------------------------------------------------------------------------- */
512 static const char *
513 str_utf8_term_form (const char *text)
515 static char result[BUF_MEDIUM * 6];
516 const struct term_form *pre_form;
518 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
519 if (pre_form->compose)
521 char *composed;
523 composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
524 g_strlcpy (result, composed, sizeof (result));
525 g_free (composed);
527 else
528 g_strlcpy (result, pre_form->text, sizeof (result));
530 return result;
533 /* --------------------------------------------------------------------------------------------- */
534 /* utility function, that copies all characters from checked to actual */
536 static gboolean
537 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
539 tool->compose = FALSE;
541 while (tool->checked[0] != '\0')
543 gunichar uni;
544 size_t left;
546 uni = g_utf8_get_char (tool->checked);
547 tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
548 left = g_unichar_to_utf8 (uni, NULL);
549 if (tool->remain <= left)
550 return FALSE;
551 left = g_unichar_to_utf8 (uni, tool->actual);
552 tool->actual += left;
553 tool->remain -= left;
554 tool->checked = g_utf8_next_char (tool->checked);
557 return TRUE;
560 /* --------------------------------------------------------------------------------------------- */
561 /* utility function, that copies characters from checked to actual until ident is
562 * smaller than to_ident */
564 static gboolean
565 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
567 tool->compose = FALSE;
569 while (tool->checked[0] != '\0')
571 gunichar uni;
572 size_t left;
573 int w = 0;
575 uni = g_utf8_get_char (tool->checked);
576 if (str_unichar_iscombiningmark (uni))
577 tool->compose = TRUE;
578 else
580 w = 1;
581 if (g_unichar_iswide (uni))
582 w++;
583 if (tool->ident + w > to_ident)
584 return TRUE;
587 left = g_unichar_to_utf8 (uni, NULL);
588 if (tool->remain <= left)
589 return FALSE;
590 left = g_unichar_to_utf8 (uni, tool->actual);
591 tool->actual += left;
592 tool->remain -= left;
593 tool->checked = g_utf8_next_char (tool->checked);
594 tool->ident += w;
597 return TRUE;
600 /* --------------------------------------------------------------------------------------------- */
601 /* utility function, adds count spaces to actual */
603 static int
604 utf8_tool_insert_space (struct utf8_tool *tool, int count)
606 if (count <= 0)
607 return 1;
608 if (tool->remain <= (gsize) count)
609 return 0;
611 memset (tool->actual, ' ', count);
612 tool->actual += count;
613 tool->remain -= count;
614 return 1;
617 /* --------------------------------------------------------------------------------------------- */
618 /* utility function, adds one characters to actual */
620 static int
621 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
623 if (tool->remain <= 1)
624 return 0;
626 tool->actual[0] = ch;
627 tool->actual++;
628 tool->remain--;
629 return 1;
632 /* --------------------------------------------------------------------------------------------- */
633 /* utility function, thah skips characters from checked until ident is greater or
634 * equal to to_ident */
636 static gboolean
637 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
639 gunichar uni;
641 while (to_ident > tool->ident && tool->checked[0] != '\0')
643 uni = g_utf8_get_char (tool->checked);
644 if (!str_unichar_iscombiningmark (uni))
646 tool->ident++;
647 if (g_unichar_iswide (uni))
648 tool->ident++;
650 tool->checked = g_utf8_next_char (tool->checked);
653 uni = g_utf8_get_char (tool->checked);
654 while (str_unichar_iscombiningmark (uni))
656 tool->checked = g_utf8_next_char (tool->checked);
657 uni = g_utf8_get_char (tool->checked);
660 return TRUE;
663 /* --------------------------------------------------------------------------------------------- */
665 static void
666 utf8_tool_compose (char *buffer, size_t size)
668 char *composed;
670 composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
671 g_strlcpy (buffer, composed, size);
672 g_free (composed);
675 /* --------------------------------------------------------------------------------------------- */
677 static const char *
678 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
680 static char result[BUF_MEDIUM * 6];
681 const struct term_form *pre_form;
682 struct utf8_tool tool;
684 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
685 tool.checked = pre_form->text;
686 tool.actual = result;
687 tool.remain = sizeof (result);
688 tool.compose = FALSE;
690 if (pre_form->width <= (gsize) width)
692 switch (HIDE_FIT (just_mode))
694 case J_CENTER_LEFT:
695 case J_CENTER:
696 tool.ident = (width - pre_form->width) / 2;
697 break;
698 case J_RIGHT:
699 tool.ident = width - pre_form->width;
700 break;
701 default:
702 tool.ident = 0;
703 break;
706 utf8_tool_insert_space (&tool, tool.ident);
707 utf8_tool_copy_chars_to_end (&tool);
708 utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
710 else if (IS_FIT (just_mode))
712 tool.ident = 0;
713 utf8_tool_copy_chars_to (&tool, width / 2);
714 utf8_tool_insert_char (&tool, '~');
716 tool.ident = 0;
717 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
718 utf8_tool_copy_chars_to_end (&tool);
719 utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
721 else
723 switch (HIDE_FIT (just_mode))
725 case J_CENTER:
726 tool.ident = (width - pre_form->width) / 2;
727 break;
728 case J_RIGHT:
729 tool.ident = width - pre_form->width;
730 break;
731 default:
732 tool.ident = 0;
733 break;
736 utf8_tool_skip_chars_to (&tool, 0);
737 utf8_tool_insert_space (&tool, tool.ident);
738 utf8_tool_copy_chars_to (&tool, width);
739 utf8_tool_insert_space (&tool, width - tool.ident);
742 tool.actual[0] = '\0';
743 if (tool.compose)
744 utf8_tool_compose (result, sizeof (result));
745 return result;
748 /* --------------------------------------------------------------------------------------------- */
750 static const char *
751 str_utf8_term_trim (const char *text, int width)
753 static char result[BUF_MEDIUM * 6];
754 const struct term_form *pre_form;
755 struct utf8_tool tool;
757 if (width < 1)
759 result[0] = '\0';
760 return result;
763 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
765 tool.checked = pre_form->text;
766 tool.actual = result;
767 tool.remain = sizeof (result);
768 tool.compose = FALSE;
770 if ((gsize) width >= pre_form->width)
771 utf8_tool_copy_chars_to_end (&tool);
772 else if (width <= 3)
774 memset (tool.actual, '.', width);
775 tool.actual += width;
776 tool.remain -= width;
778 else
780 memset (tool.actual, '.', 3);
781 tool.actual += 3;
782 tool.remain -= 3;
784 tool.ident = 0;
785 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
786 utf8_tool_copy_chars_to_end (&tool);
789 tool.actual[0] = '\0';
790 if (tool.compose)
791 utf8_tool_compose (result, sizeof (result));
792 return result;
795 /* --------------------------------------------------------------------------------------------- */
797 static int
798 str_utf8_term_width2 (const char *text, size_t length)
800 const struct term_form *result;
802 result = str_utf8_make_make_term_form (text, length);
803 return result->width;
806 /* --------------------------------------------------------------------------------------------- */
/**
 * Count the columns of the printable form of the whole of @text.
 *
 * @param text NUL-terminated input
 * @return result of str_utf8_term_width2() with no character limit
 */
static int
str_utf8_term_width1 (const char *text)
{
    const size_t no_limit = (size_t) (-1);

    return str_utf8_term_width2 (text, no_limit);
}
814 /* --------------------------------------------------------------------------------------------- */
816 static int
817 str_utf8_term_char_width (const char *text)
819 gunichar uni;
821 uni = g_utf8_get_char_validated (text, -1);
822 return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
825 /* --------------------------------------------------------------------------------------------- */
827 static const char *
828 str_utf8_term_substring (const char *text, int start, int width)
830 static char result[BUF_MEDIUM * 6];
831 const struct term_form *pre_form;
832 struct utf8_tool tool;
834 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
836 tool.checked = pre_form->text;
837 tool.actual = result;
838 tool.remain = sizeof (result);
839 tool.compose = FALSE;
841 tool.ident = -start;
842 utf8_tool_skip_chars_to (&tool, 0);
843 if (tool.ident < 0)
844 tool.ident = 0;
845 utf8_tool_insert_space (&tool, tool.ident);
847 utf8_tool_copy_chars_to (&tool, width);
848 utf8_tool_insert_space (&tool, width - tool.ident);
850 tool.actual[0] = '\0';
851 if (tool.compose)
852 utf8_tool_compose (result, sizeof (result));
853 return result;
856 /* --------------------------------------------------------------------------------------------- */
858 static const char *
859 str_utf8_trunc (const char *text, int width)
861 static char result[MC_MAXPATHLEN * 6 * 2];
862 const struct term_form *pre_form;
863 struct utf8_tool tool;
865 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
867 tool.checked = pre_form->text;
868 tool.actual = result;
869 tool.remain = sizeof (result);
870 tool.compose = FALSE;
872 if (pre_form->width <= (gsize) width)
873 utf8_tool_copy_chars_to_end (&tool);
874 else
876 tool.ident = 0;
877 utf8_tool_copy_chars_to (&tool, width / 2);
878 utf8_tool_insert_char (&tool, '~');
880 tool.ident = 0;
881 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
882 utf8_tool_copy_chars_to_end (&tool);
885 tool.actual[0] = '\0';
886 if (tool.compose)
887 utf8_tool_compose (result, sizeof (result));
888 return result;
891 /* --------------------------------------------------------------------------------------------- */
893 static int
894 str_utf8_offset_to_pos (const char *text, size_t length)
896 if (str_utf8_is_valid_string (text))
897 return g_utf8_offset_to_pointer (text, length) - text;
898 else
900 int result;
901 GString *buffer;
903 buffer = g_string_new (text);
904 str_utf8_fix_string (buffer->str);
905 result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
906 g_string_free (buffer, TRUE);
907 return result;
911 /* --------------------------------------------------------------------------------------------- */
913 static int
914 str_utf8_column_to_pos (const char *text, size_t pos)
916 int result = 0;
917 int width = 0;
919 while (text[0] != '\0')
921 gunichar uni;
923 uni = g_utf8_get_char_validated (text, 6);
924 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
926 if (g_unichar_isprint (uni))
928 if (!str_unichar_iscombiningmark (uni))
930 width++;
931 if (g_unichar_iswide (uni))
932 width++;
935 else
937 width++;
939 text = g_utf8_next_char (text);
941 else
943 text++;
944 width++;
947 if ((gsize) width > pos)
948 return result;
950 result++;
953 return result;
956 /* --------------------------------------------------------------------------------------------- */
958 static char *
959 str_utf8_create_search_needle (const char *needle, int case_sen)
961 char *fold, *result;
963 if (needle == NULL)
964 return NULL;
966 if (case_sen)
967 return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
970 fold = g_utf8_casefold (needle, -1);
971 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
972 g_free (fold);
973 return result;
976 /* --------------------------------------------------------------------------------------------- */
/**
 * Free a pattern created by str_utf8_create_search_needle().
 *
 * @param needle   pattern to free
 * @param case_sen unused
 */
static void
str_utf8_release_search_needle (char *needle, int case_sen)
{
    g_free (needle);
    (void) case_sen;
}
985 /* --------------------------------------------------------------------------------------------- */
987 static const char *
988 str_utf8_search_first (const char *text, const char *search, int case_sen)
990 char *fold_text;
991 char *deco_text;
992 const char *match;
993 const char *result = NULL;
994 const char *m;
996 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
997 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
999 match = deco_text;
1002 match = g_strstr_len (match, -1, search);
1003 if (match != NULL)
1005 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1006 !str_utf8_iscombiningmark (match + strlen (search)))
1008 result = text;
1009 m = deco_text;
1010 while (m < match)
1012 str_utf8_cnext_noncomb_char (&m);
1013 str_utf8_cnext_noncomb_char (&result);
1016 else
1017 str_utf8_cnext_char (&match);
1020 while (match != NULL && result == NULL);
1022 g_free (deco_text);
1023 if (!case_sen)
1024 g_free (fold_text);
1026 return result;
1029 /* --------------------------------------------------------------------------------------------- */
1031 static const char *
1032 str_utf8_search_last (const char *text, const char *search, int case_sen)
1034 char *fold_text;
1035 char *deco_text;
1036 char *match;
1037 const char *result = NULL;
1038 const char *m;
1040 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
1041 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1045 match = g_strrstr_len (deco_text, -1, search);
1046 if (match != NULL)
1048 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1049 !str_utf8_iscombiningmark (match + strlen (search)))
1051 result = text;
1052 m = deco_text;
1053 while (m < match)
1055 str_utf8_cnext_noncomb_char (&m);
1056 str_utf8_cnext_noncomb_char (&result);
1059 else
1060 match[0] = '\0';
1063 while (match != NULL && result == NULL);
1065 g_free (deco_text);
1066 if (!case_sen)
1067 g_free (fold_text);
1069 return result;
1072 /* --------------------------------------------------------------------------------------------- */
1074 static char *
1075 str_utf8_normalize (const char *text)
1077 GString *fixed;
1078 char *tmp;
1079 char *result;
1080 const char *start;
1081 const char *end;
1083 /* g_utf8_normalize() is a heavyweight function, that converts UTF-8 into UCS-4,
1084 * does the normalization and then converts UCS-4 back into UTF-8.
1085 * Since file names are composed of ASCII characters in most cases, we can speed up
1086 * utf8 normalization by checking if the heavyweight Unicode normalization is actually
1087 * needed. Normalization of ASCII string is no-op.
1090 /* find out whether text is ASCII only */
1091 for (end = text; *end != '\0'; end++)
1092 if ((*end & 0x80) != 0)
1094 /* found 2nd byte of utf8-encoded symbol */
1095 break;
1098 /* if text is ASCII-only, return copy, normalize otherwise */
1099 if (*end == '\0')
1100 return g_strndup (text, end - text);
1102 fixed = g_string_sized_new (4);
1104 start = text;
1105 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1107 if (start != end)
1109 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1110 g_string_append (fixed, tmp);
1111 g_free (tmp);
1113 g_string_append_c (fixed, end[0]);
1114 start = end + 1;
1117 if (start == text)
1119 result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1120 g_string_free (fixed, TRUE);
1122 else
1124 if (start[0] != '\0' && start != end)
1126 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1127 g_string_append (fixed, tmp);
1128 g_free (tmp);
1130 result = g_string_free (fixed, FALSE);
1133 return result;
1136 /* --------------------------------------------------------------------------------------------- */
1138 static char *
1139 str_utf8_casefold_normalize (const char *text)
1141 GString *fixed;
1142 char *tmp, *fold;
1143 char *result;
1144 const char *start;
1145 const char *end;
1147 fixed = g_string_sized_new (4);
1149 start = text;
1150 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1152 if (start != end)
1154 fold = g_utf8_casefold (start, end - start);
1155 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1156 g_string_append (fixed, tmp);
1157 g_free (tmp);
1158 g_free (fold);
1160 g_string_append_c (fixed, end[0]);
1161 start = end + 1;
1164 if (start == text)
1166 fold = g_utf8_casefold (text, -1);
1167 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1168 g_free (fold);
1169 g_string_free (fixed, TRUE);
1171 else
1173 if (start[0] != '\0' && start != end)
1175 fold = g_utf8_casefold (start, end - start);
1176 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1177 g_string_append (fixed, tmp);
1178 g_free (tmp);
1179 g_free (fold);
1181 result = g_string_free (fixed, FALSE);
1184 return result;
1187 /* --------------------------------------------------------------------------------------------- */
/**
 * Compare two strings byte-wise after normalizing both.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strcmp() result for the normalized strings
 */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1206 /* --------------------------------------------------------------------------------------------- */
/**
 * Compare two strings after normalizing both, looking only at as many
 * bytes as the shorter normalized string has.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strncmp() result for the normalized strings
 */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, MIN (left_len, right_len));

    g_free (left);
    g_free (right);

    return order;
}
1228 /* --------------------------------------------------------------------------------------------- */
/**
 * Compare two strings byte-wise after case-folding and normalizing both.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strcmp() result for the processed strings
 */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1247 /* --------------------------------------------------------------------------------------------- */
/**
 * Compare two strings after case-folding and normalizing both, looking
 * only at as many bytes as the shorter processed string has.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strncmp() result for the processed strings
 */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, MIN (left_len, right_len));

    g_free (left);
    g_free (right);

    return order;
}
1269 /* --------------------------------------------------------------------------------------------- */
/**
 * Measure how much of @prefix matches the start of @text, comparing the
 * normalized strings character by character.
 *
 * @param text   string to test
 * @param prefix expected beginning
 * @return number of bytes of the normalized prefix that matched
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *t_cur = norm_text, *p_cur = norm_prefix;
    const char *t_next = norm_text, *p_next = norm_prefix;
    int matched;

    while (t_cur[0] != '\0' && p_cur[0] != '\0')
    {
        str_utf8_cnext_char_safe (&t_next);
        str_utf8_cnext_char_safe (&p_next);

        /* the characters differ when their sizes or their bytes differ */
        if (t_next - t_cur != p_next - p_cur || strncmp (t_cur, p_cur, t_next - t_cur) != 0)
            break;

        t_cur = t_next;
        p_cur = p_next;
    }

    matched = p_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1306 /* --------------------------------------------------------------------------------------------- */
/**
 * Measure how much of @prefix matches the start of @text, comparing the
 * case-folded and normalized strings character by character.
 *
 * @param text   string to test
 * @param prefix expected beginning
 * @return number of bytes of the processed prefix that matched
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_casefold_normalize (text);
    char *norm_prefix = str_utf8_casefold_normalize (prefix);
    const char *t_cur = norm_text, *p_cur = norm_prefix;
    const char *t_next = norm_text, *p_next = norm_prefix;
    int matched;

    while (t_cur[0] != '\0' && p_cur[0] != '\0')
    {
        str_utf8_cnext_char_safe (&t_next);
        str_utf8_cnext_char_safe (&p_next);

        /* the characters differ when their sizes or their bytes differ */
        if (t_next - t_cur != p_next - p_cur || strncmp (t_cur, p_cur, t_next - t_cur) != 0)
            break;

        t_cur = t_next;
        p_cur = p_next;
    }

    matched = p_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1343 /* --------------------------------------------------------------------------------------------- */
1345 static char *
1346 str_utf8_create_key_gen (const char *text, int case_sen,
1347 gchar * (*keygen) (const gchar * text, gssize size))
1349 char *result;
1351 if (case_sen)
1352 result = str_utf8_normalize (text);
1353 else
1355 gboolean dot;
1356 GString *fixed;
1357 const char *start, *end;
1358 char *fold, *key;
1360 dot = text[0] == '.';
1361 fixed = g_string_sized_new (16);
1363 if (!dot)
1364 start = text;
1365 else
1367 start = text + 1;
1368 g_string_append_c (fixed, '.');
1371 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1373 if (start != end)
1375 fold = g_utf8_casefold (start, end - start);
1376 key = keygen (fold, -1);
1377 g_string_append (fixed, key);
1378 g_free (key);
1379 g_free (fold);
1381 g_string_append_c (fixed, end[0]);
1382 start = end + 1;
1385 if (start == text)
1387 fold = g_utf8_casefold (start, -1);
1388 result = keygen (fold, -1);
1389 g_free (fold);
1390 g_string_free (fixed, TRUE);
1392 else if (dot && (start == text + 1))
1394 fold = g_utf8_casefold (start, -1);
1395 key = keygen (fold, -1);
1396 g_string_append (fixed, key);
1397 g_free (key);
1398 g_free (fold);
1399 result = g_string_free (fixed, FALSE);
1401 else
1403 if (start[0] != '\0' && start != end)
1405 fold = g_utf8_casefold (start, end - start);
1406 key = keygen (fold, -1);
1407 g_string_append (fixed, key);
1408 g_free (key);
1409 g_free (fold);
1411 result = g_string_free (fixed, FALSE);
1414 return result;
1417 /* --------------------------------------------------------------------------------------------- */
1419 static char *
1420 str_utf8_create_key (const char *text, int case_sen)
1422 return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1425 /* --------------------------------------------------------------------------------------------- */
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/*
 * Create a comparison key for the file name TEXT, using
 * g_utf8_collate_key_for_filename () as the key generator.
 * CASE_SEN is forwarded to str_utf8_create_key_gen ().
 */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
1435 /* --------------------------------------------------------------------------------------------- */
/*
 * Compare two keys byte-wise with strcmp ().
 *
 * case_sen is accepted but not used.
 * Returns the strcmp () result for t1 and t2.
 */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    int order;

    (void) case_sen;

    order = strcmp (t1, t2);

    return order;
}
1444 /* --------------------------------------------------------------------------------------------- */
/*
 * Release KEY with g_free ().
 *
 * case_sen is accepted but not used.
 */
static void
str_utf8_release_key (char *key, int case_sen)
{
    (void) case_sen;

    g_free (key);
}
1453 /* --------------------------------------------------------------------------------------------- */
1454 /*** public functions ****************************************************************************/
1455 /* --------------------------------------------------------------------------------------------- */
1457 struct str_class
1458 str_utf8_init (void)
1460 struct str_class result;
1462 result.conv_gerror_message = str_utf8_conv_gerror_message;
1463 result.vfs_convert_to = str_utf8_vfs_convert_to;
1464 result.insert_replace_char = str_utf8_insert_replace_char;
1465 result.is_valid_string = str_utf8_is_valid_string;
1466 result.is_valid_char = str_utf8_is_valid_char;
1467 result.cnext_char = str_utf8_cnext_char;
1468 result.cprev_char = str_utf8_cprev_char;
1469 result.cnext_char_safe = str_utf8_cnext_char_safe;
1470 result.cprev_char_safe = str_utf8_cprev_char_safe;
1471 result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1472 result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1473 result.char_isspace = str_utf8_isspace;
1474 result.char_ispunct = str_utf8_ispunct;
1475 result.char_isalnum = str_utf8_isalnum;
1476 result.char_isdigit = str_utf8_isdigit;
1477 result.char_isprint = str_utf8_isprint;
1478 result.char_iscombiningmark = str_utf8_iscombiningmark;
1479 result.char_toupper = str_utf8_toupper;
1480 result.char_tolower = str_utf8_tolower;
1481 result.length = str_utf8_length;
1482 result.length2 = str_utf8_length2;
1483 result.length_noncomb = str_utf8_length_noncomb;
1484 result.fix_string = str_utf8_fix_string;
1485 result.term_form = str_utf8_term_form;
1486 result.fit_to_term = str_utf8_fit_to_term;
1487 result.term_trim = str_utf8_term_trim;
1488 result.term_width2 = str_utf8_term_width2;
1489 result.term_width1 = str_utf8_term_width1;
1490 result.term_char_width = str_utf8_term_char_width;
1491 result.term_substring = str_utf8_term_substring;
1492 result.trunc = str_utf8_trunc;
1493 result.offset_to_pos = str_utf8_offset_to_pos;
1494 result.column_to_pos = str_utf8_column_to_pos;
1495 result.create_search_needle = str_utf8_create_search_needle;
1496 result.release_search_needle = str_utf8_release_search_needle;
1497 result.search_first = str_utf8_search_first;
1498 result.search_last = str_utf8_search_last;
1499 result.compare = str_utf8_compare;
1500 result.ncompare = str_utf8_ncompare;
1501 result.casecmp = str_utf8_casecmp;
1502 result.ncasecmp = str_utf8_ncasecmp;
1503 result.prefix = str_utf8_prefix;
1504 result.caseprefix = str_utf8_caseprefix;
1505 result.create_key = str_utf8_create_key;
1506 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1507 /* case insensitive sort files in "a1 a2 a10" order */
1508 result.create_key_for_filename = str_utf8_create_key_for_filename;
1509 #else
1510 /* case insensitive sort files in "a1 a10 a2" order */
1511 result.create_key_for_filename = str_utf8_create_key;
1512 #endif
1513 result.key_collate = str_utf8_key_collate;
1514 result.release_key = str_utf8_release_key;
1516 return result;
1519 /* --------------------------------------------------------------------------------------------- */