2 UTF-8 strings utilities
4 Copyright (C) 2007-2024
5 Free Software Foundation, Inc.
10 This file is part of the Midnight Commander.
12 The Midnight Commander is free software: you can redistribute it
13 and/or modify it under the terms of the GNU General Public License as
14 published by the Free Software Foundation, either version 3 of the License,
15 or (at your option) any later version.
17 The Midnight Commander is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program. If not, see <http://www.gnu.org/licenses/>.
30 #include <limits.h> /* MB_LEN_MAX */
33 #include "lib/global.h"
34 #include "lib/strutil.h"
36 /* using functions for UTF-8 from glib */
38 /*** global variables ****************************************************************************/
40 /*** file scope macro definitions ****************************************************************/
42 /*** file scope type declarations ****************************************************************/
55 char text
[BUF_MEDIUM
* MB_LEN_MAX
];
60 /*** forward declarations (file scope functions) *************************************************/
62 /*** file scope variables ************************************************************************/
64 static const char replch
[] = "\xEF\xBF\xBD";
66 /* --------------------------------------------------------------------------------------------- */
67 /*** file scope functions ************************************************************************/
68 /* --------------------------------------------------------------------------------------------- */
71 str_unichar_iscombiningmark (gunichar uni
)
75 type
= g_unichar_type (uni
);
76 return (type
== G_UNICODE_SPACING_MARK
)
77 || (type
== G_UNICODE_ENCLOSING_MARK
) || (type
== G_UNICODE_NON_SPACING_MARK
);
80 /* --------------------------------------------------------------------------------------------- */
83 str_utf8_insert_replace_char (GString
*buffer
)
85 g_string_append (buffer
, replch
);
88 /* --------------------------------------------------------------------------------------------- */
91 str_utf8_is_valid_string (const char *text
)
93 return g_utf8_validate (text
, -1, NULL
);
96 /* --------------------------------------------------------------------------------------------- */
99 str_utf8_is_valid_char (const char *ch
, size_t size
)
101 switch (g_utf8_get_char_validated (ch
, size
))
103 case (gunichar
) (-2):
105 case (gunichar
) (-1):
112 /* --------------------------------------------------------------------------------------------- */
115 str_utf8_cnext_char (const char **text
)
117 (*text
) = g_utf8_next_char (*text
);
120 /* --------------------------------------------------------------------------------------------- */
123 str_utf8_cprev_char (const char **text
)
125 (*text
) = g_utf8_prev_char (*text
);
128 /* --------------------------------------------------------------------------------------------- */
131 str_utf8_cnext_char_safe (const char **text
)
133 if (str_utf8_is_valid_char (*text
, -1) == 1)
134 (*text
) = g_utf8_next_char (*text
);
139 /* --------------------------------------------------------------------------------------------- */
142 str_utf8_cprev_char_safe (const char **text
)
144 const char *result
, *t
;
146 result
= g_utf8_prev_char (*text
);
148 str_utf8_cnext_char_safe (&t
);
155 /* --------------------------------------------------------------------------------------------- */
158 str_utf8_fix_string (char *text
)
160 while (text
[0] != '\0')
164 uni
= g_utf8_get_char_validated (text
, -1);
165 if ((uni
!= (gunichar
) (-1)) && (uni
!= (gunichar
) (-2)))
166 text
= g_utf8_next_char (text
);
175 /* --------------------------------------------------------------------------------------------- */
178 str_utf8_isspace (const char *text
)
182 uni
= g_utf8_get_char_validated (text
, -1);
183 return g_unichar_isspace (uni
);
186 /* --------------------------------------------------------------------------------------------- */
189 str_utf8_ispunct (const char *text
)
193 uni
= g_utf8_get_char_validated (text
, -1);
194 return g_unichar_ispunct (uni
);
197 /* --------------------------------------------------------------------------------------------- */
200 str_utf8_isalnum (const char *text
)
204 uni
= g_utf8_get_char_validated (text
, -1);
205 return g_unichar_isalnum (uni
);
208 /* --------------------------------------------------------------------------------------------- */
211 str_utf8_isdigit (const char *text
)
215 uni
= g_utf8_get_char_validated (text
, -1);
216 return g_unichar_isdigit (uni
);
219 /* --------------------------------------------------------------------------------------------- */
222 str_utf8_isprint (const char *ch
)
226 uni
= g_utf8_get_char_validated (ch
, -1);
227 return g_unichar_isprint (uni
);
230 /* --------------------------------------------------------------------------------------------- */
233 str_utf8_iscombiningmark (const char *ch
)
237 uni
= g_utf8_get_char_validated (ch
, -1);
238 return str_unichar_iscombiningmark (uni
);
241 /* --------------------------------------------------------------------------------------------- */
244 str_utf8_cnext_noncomb_char (const char **text
)
248 while ((*text
)[0] != '\0')
250 str_utf8_cnext_char_safe (text
);
252 if (!str_utf8_iscombiningmark (*text
))
259 /* --------------------------------------------------------------------------------------------- */
262 str_utf8_cprev_noncomb_char (const char **text
, const char *begin
)
266 while ((*text
) != begin
)
268 str_utf8_cprev_char_safe (text
);
270 if (!str_utf8_iscombiningmark (*text
))
277 /* --------------------------------------------------------------------------------------------- */
280 str_utf8_toupper (const char *text
, char **out
, size_t *remain
)
285 uni
= g_utf8_get_char_validated (text
, -1);
286 if (uni
== (gunichar
) (-1) || uni
== (gunichar
) (-2))
289 uni
= g_unichar_toupper (uni
);
290 left
= g_unichar_to_utf8 (uni
, NULL
);
294 left
= g_unichar_to_utf8 (uni
, *out
);
300 /* --------------------------------------------------------------------------------------------- */
303 str_utf8_tolower (const char *text
, char **out
, size_t *remain
)
308 uni
= g_utf8_get_char_validated (text
, -1);
309 if (uni
== (gunichar
) (-1) || uni
== (gunichar
) (-2))
312 uni
= g_unichar_tolower (uni
);
313 left
= g_unichar_to_utf8 (uni
, NULL
);
317 left
= g_unichar_to_utf8 (uni
, *out
);
323 /* --------------------------------------------------------------------------------------------- */
326 str_utf8_length (const char *text
)
333 while (!g_utf8_validate (start
, -1, &end
) && start
[0] != '\0')
336 result
+= g_utf8_strlen (start
, end
- start
);
343 result
= g_utf8_strlen (text
, -1);
344 else if (start
[0] != '\0' && start
!= end
)
345 result
+= g_utf8_strlen (start
, end
- start
);
350 /* --------------------------------------------------------------------------------------------- */
353 str_utf8_length2 (const char *text
, int size
)
360 while (!g_utf8_validate (start
, -1, &end
) && start
[0] != '\0' && size
> 0)
364 result
+= g_utf8_strlen (start
, MIN (end
- start
, size
));
367 result
+= (size
> 0);
373 result
= g_utf8_strlen (text
, size
);
374 else if (start
[0] != '\0' && start
!= end
&& size
> 0)
375 result
+= g_utf8_strlen (start
, MIN (end
- start
, size
));
380 /* --------------------------------------------------------------------------------------------- */
383 str_utf8_length_noncomb (const char *text
)
386 const char *t
= text
;
390 str_utf8_cnext_noncomb_char (&t
);
397 /* --------------------------------------------------------------------------------------------- */
401 str_utf8_questmark_sustb (char **string
, size_t *left
, GString
*buffer
)
405 next
= g_utf8_next_char (*string
);
406 (*left
) -= next
- (*string
);
408 g_string_append_c (buffer
, '?');
412 /* --------------------------------------------------------------------------------------------- */
415 str_utf8_conv_gerror_message (GError
*mcerror
, const char *def_msg
)
418 return g_strdup (mcerror
->message
);
420 return g_strdup (def_msg
!= NULL
? def_msg
: "");
423 /* --------------------------------------------------------------------------------------------- */
426 str_utf8_vfs_convert_to (GIConv coder
, const char *string
, int size
, GString
*buffer
)
428 estr_t result
= ESTR_SUCCESS
;
430 if (coder
== str_cnv_not_convert
)
431 g_string_append_len (buffer
, string
, size
);
433 result
= str_nconvert (coder
, string
, size
, buffer
);
438 /* --------------------------------------------------------------------------------------------- */
439 /* utility function that makes the string valid UTF-8 with all characters printable;
440 * the returned term form also carries the width of the string */
442 static const struct term_form
*
443 str_utf8_make_make_term_form (const char *text
, size_t length
)
445 static struct term_form result
;
450 result
.text
[0] = '\0';
452 result
.compose
= FALSE
;
453 actual
= result
.text
;
455 /* check if text starts with a combining character,
456 * add a space at the beginning in this case */
457 if (length
!= 0 && text
[0] != '\0')
459 uni
= g_utf8_get_char_validated (text
, -1);
460 if ((uni
!= (gunichar
) (-1)) && (uni
!= (gunichar
) (-2))
461 && str_unichar_iscombiningmark (uni
))
466 result
.compose
= TRUE
;
470 while (length
!= 0 && text
[0] != '\0')
472 uni
= g_utf8_get_char_validated (text
, -1);
473 if ((uni
!= (gunichar
) (-1)) && (uni
!= (gunichar
) (-2)))
475 if (g_unichar_isprint (uni
))
477 left
= g_unichar_to_utf8 (uni
, actual
);
479 if (str_unichar_iscombiningmark (uni
))
480 result
.compose
= TRUE
;
484 if (g_unichar_iswide (uni
))
494 text
= g_utf8_next_char (text
);
499 /*actual[0] = '?'; */
500 memcpy (actual
, replch
, strlen (replch
));
501 actual
+= strlen (replch
);
505 if (length
!= (size_t) (-1))
513 /* --------------------------------------------------------------------------------------------- */
516 str_utf8_term_form (const char *text
)
518 static char result
[BUF_MEDIUM
* MB_LEN_MAX
];
519 const struct term_form
*pre_form
;
521 pre_form
= str_utf8_make_make_term_form (text
, (size_t) (-1));
522 if (pre_form
->compose
)
526 composed
= g_utf8_normalize (pre_form
->text
, -1, G_NORMALIZE_DEFAULT_COMPOSE
);
527 g_strlcpy (result
, composed
, sizeof (result
));
531 g_strlcpy (result
, pre_form
->text
, sizeof (result
));
536 /* --------------------------------------------------------------------------------------------- */
537 /* utility function, that copies all characters from checked to actual */
540 utf8_tool_copy_chars_to_end (struct utf8_tool
*tool
)
542 tool
->compose
= FALSE
;
544 while (tool
->checked
[0] != '\0')
549 uni
= g_utf8_get_char (tool
->checked
);
550 tool
->compose
= tool
->compose
|| str_unichar_iscombiningmark (uni
);
551 left
= g_unichar_to_utf8 (uni
, NULL
);
552 if (tool
->remain
<= left
)
554 left
= g_unichar_to_utf8 (uni
, tool
->actual
);
555 tool
->actual
+= left
;
556 tool
->remain
-= left
;
557 tool
->checked
= g_utf8_next_char (tool
->checked
);
563 /* --------------------------------------------------------------------------------------------- */
564 /* utility function, that copies characters from checked to actual until ident is
565 * smaller than to_ident */
568 utf8_tool_copy_chars_to (struct utf8_tool
*tool
, int to_ident
)
570 tool
->compose
= FALSE
;
572 while (tool
->checked
[0] != '\0')
578 uni
= g_utf8_get_char (tool
->checked
);
579 if (str_unichar_iscombiningmark (uni
))
580 tool
->compose
= TRUE
;
584 if (g_unichar_iswide (uni
))
586 if (tool
->ident
+ w
> to_ident
)
590 left
= g_unichar_to_utf8 (uni
, NULL
);
591 if (tool
->remain
<= left
)
593 left
= g_unichar_to_utf8 (uni
, tool
->actual
);
594 tool
->actual
+= left
;
595 tool
->remain
-= left
;
596 tool
->checked
= g_utf8_next_char (tool
->checked
);
603 /* --------------------------------------------------------------------------------------------- */
604 /* utility function, adds count spaces to actual */
607 utf8_tool_insert_space (struct utf8_tool
*tool
, int count
)
611 if (tool
->remain
<= (gsize
) count
)
614 memset (tool
->actual
, ' ', count
);
615 tool
->actual
+= count
;
616 tool
->remain
-= count
;
620 /* --------------------------------------------------------------------------------------------- */
621 /* utility function, adds one character to actual */
624 utf8_tool_insert_char (struct utf8_tool
*tool
, char ch
)
626 if (tool
->remain
<= 1)
629 tool
->actual
[0] = ch
;
635 /* --------------------------------------------------------------------------------------------- */
636 /* utility function that skips characters from checked until ident is greater than or
637 * equal to to_ident */
640 utf8_tool_skip_chars_to (struct utf8_tool
*tool
, int to_ident
)
644 while (to_ident
> tool
->ident
&& tool
->checked
[0] != '\0')
646 uni
= g_utf8_get_char (tool
->checked
);
647 if (!str_unichar_iscombiningmark (uni
))
650 if (g_unichar_iswide (uni
))
653 tool
->checked
= g_utf8_next_char (tool
->checked
);
656 uni
= g_utf8_get_char (tool
->checked
);
657 while (str_unichar_iscombiningmark (uni
))
659 tool
->checked
= g_utf8_next_char (tool
->checked
);
660 uni
= g_utf8_get_char (tool
->checked
);
666 /* --------------------------------------------------------------------------------------------- */
669 utf8_tool_compose (char *buffer
, size_t size
)
673 composed
= g_utf8_normalize (buffer
, -1, G_NORMALIZE_DEFAULT_COMPOSE
);
674 g_strlcpy (buffer
, composed
, size
);
678 /* --------------------------------------------------------------------------------------------- */
681 str_utf8_fit_to_term (const char *text
, int width
, align_crt_t just_mode
)
683 static char result
[BUF_MEDIUM
* MB_LEN_MAX
];
684 const struct term_form
*pre_form
;
685 struct utf8_tool tool
;
687 pre_form
= str_utf8_make_make_term_form (text
, (size_t) (-1));
688 tool
.checked
= pre_form
->text
;
689 tool
.actual
= result
;
690 tool
.remain
= sizeof (result
);
691 tool
.compose
= FALSE
;
693 if (pre_form
->width
<= (gsize
) width
)
695 switch (HIDE_FIT (just_mode
))
699 tool
.ident
= (width
- pre_form
->width
) / 2;
702 tool
.ident
= width
- pre_form
->width
;
709 utf8_tool_insert_space (&tool
, tool
.ident
);
710 utf8_tool_copy_chars_to_end (&tool
);
711 utf8_tool_insert_space (&tool
, width
- pre_form
->width
- tool
.ident
);
713 else if (IS_FIT (just_mode
))
716 utf8_tool_copy_chars_to (&tool
, width
/ 2);
717 utf8_tool_insert_char (&tool
, '~');
720 utf8_tool_skip_chars_to (&tool
, pre_form
->width
- width
+ 1);
721 utf8_tool_copy_chars_to_end (&tool
);
722 utf8_tool_insert_space (&tool
, width
- (pre_form
->width
- tool
.ident
+ 1));
726 switch (HIDE_FIT (just_mode
))
729 tool
.ident
= (width
- pre_form
->width
) / 2;
732 tool
.ident
= width
- pre_form
->width
;
739 utf8_tool_skip_chars_to (&tool
, 0);
740 utf8_tool_insert_space (&tool
, tool
.ident
);
741 utf8_tool_copy_chars_to (&tool
, width
);
742 utf8_tool_insert_space (&tool
, width
- tool
.ident
);
745 tool
.actual
[0] = '\0';
747 utf8_tool_compose (result
, sizeof (result
));
751 /* --------------------------------------------------------------------------------------------- */
754 str_utf8_term_trim (const char *text
, int width
)
756 static char result
[BUF_MEDIUM
* MB_LEN_MAX
];
757 const struct term_form
*pre_form
;
758 struct utf8_tool tool
;
766 pre_form
= str_utf8_make_make_term_form (text
, (size_t) (-1));
768 tool
.checked
= pre_form
->text
;
769 tool
.actual
= result
;
770 tool
.remain
= sizeof (result
);
771 tool
.compose
= FALSE
;
773 if ((gsize
) width
>= pre_form
->width
)
774 utf8_tool_copy_chars_to_end (&tool
);
777 memset (tool
.actual
, '.', width
);
778 tool
.actual
+= width
;
779 tool
.remain
-= width
;
783 memset (tool
.actual
, '.', 3);
788 utf8_tool_skip_chars_to (&tool
, pre_form
->width
- width
+ 3);
789 utf8_tool_copy_chars_to_end (&tool
);
792 tool
.actual
[0] = '\0';
794 utf8_tool_compose (result
, sizeof (result
));
798 /* --------------------------------------------------------------------------------------------- */
801 str_utf8_term_width2 (const char *text
, size_t length
)
803 const struct term_form
*result
;
805 result
= str_utf8_make_make_term_form (text
, length
);
806 return result
->width
;
809 /* --------------------------------------------------------------------------------------------- */
812 str_utf8_term_width1 (const char *text
)
814 return str_utf8_term_width2 (text
, (size_t) (-1));
817 /* --------------------------------------------------------------------------------------------- */
820 str_utf8_term_char_width (const char *text
)
824 uni
= g_utf8_get_char_validated (text
, -1);
825 return (str_unichar_iscombiningmark (uni
)) ? 0 : ((g_unichar_iswide (uni
)) ? 2 : 1);
828 /* --------------------------------------------------------------------------------------------- */
831 str_utf8_term_substring (const char *text
, int start
, int width
)
833 static char result
[BUF_MEDIUM
* MB_LEN_MAX
];
834 const struct term_form
*pre_form
;
835 struct utf8_tool tool
;
837 pre_form
= str_utf8_make_make_term_form (text
, (size_t) (-1));
839 tool
.checked
= pre_form
->text
;
840 tool
.actual
= result
;
841 tool
.remain
= sizeof (result
);
842 tool
.compose
= FALSE
;
845 utf8_tool_skip_chars_to (&tool
, 0);
848 utf8_tool_insert_space (&tool
, tool
.ident
);
850 utf8_tool_copy_chars_to (&tool
, width
);
851 utf8_tool_insert_space (&tool
, width
- tool
.ident
);
853 tool
.actual
[0] = '\0';
855 utf8_tool_compose (result
, sizeof (result
));
859 /* --------------------------------------------------------------------------------------------- */
862 str_utf8_trunc (const char *text
, int width
)
864 static char result
[MC_MAXPATHLEN
* MB_LEN_MAX
* 2];
865 const struct term_form
*pre_form
;
866 struct utf8_tool tool
;
868 pre_form
= str_utf8_make_make_term_form (text
, (size_t) (-1));
870 tool
.checked
= pre_form
->text
;
871 tool
.actual
= result
;
872 tool
.remain
= sizeof (result
);
873 tool
.compose
= FALSE
;
875 if (pre_form
->width
<= (gsize
) width
)
876 utf8_tool_copy_chars_to_end (&tool
);
880 utf8_tool_copy_chars_to (&tool
, width
/ 2);
881 utf8_tool_insert_char (&tool
, '~');
884 utf8_tool_skip_chars_to (&tool
, pre_form
->width
- width
+ 1);
885 utf8_tool_copy_chars_to_end (&tool
);
888 tool
.actual
[0] = '\0';
890 utf8_tool_compose (result
, sizeof (result
));
894 /* --------------------------------------------------------------------------------------------- */
897 str_utf8_offset_to_pos (const char *text
, size_t length
)
899 if (str_utf8_is_valid_string (text
))
900 return g_utf8_offset_to_pointer (text
, length
) - text
;
906 buffer
= g_strdup (text
);
907 str_utf8_fix_string (buffer
);
908 result
= g_utf8_offset_to_pointer (buffer
, length
) - buffer
;
914 /* --------------------------------------------------------------------------------------------- */
917 str_utf8_column_to_pos (const char *text
, size_t pos
)
922 while (text
[0] != '\0')
926 uni
= g_utf8_get_char_validated (text
, MB_LEN_MAX
);
927 if ((uni
!= (gunichar
) (-1)) && (uni
!= (gunichar
) (-2)))
929 if (g_unichar_isprint (uni
))
931 if (!str_unichar_iscombiningmark (uni
))
934 if (g_unichar_iswide (uni
))
942 text
= g_utf8_next_char (text
);
950 if ((gsize
) width
> pos
)
959 /* --------------------------------------------------------------------------------------------- */
962 str_utf8_create_search_needle (const char *needle
, gboolean case_sen
)
970 return g_utf8_normalize (needle
, -1, G_NORMALIZE_ALL
);
972 fold
= g_utf8_casefold (needle
, -1);
973 result
= g_utf8_normalize (fold
, -1, G_NORMALIZE_ALL
);
978 /* --------------------------------------------------------------------------------------------- */
981 str_utf8_release_search_needle (char *needle
, gboolean case_sen
)
987 /* --------------------------------------------------------------------------------------------- */
990 str_utf8_search_first (const char *text
, const char *search
, gboolean case_sen
)
995 const char *result
= NULL
;
998 fold_text
= case_sen
? (char *) text
: g_utf8_casefold (text
, -1);
999 deco_text
= g_utf8_normalize (fold_text
, -1, G_NORMALIZE_ALL
);
1004 match
= g_strstr_len (match
, -1, search
);
1007 if ((!str_utf8_iscombiningmark (match
) || (match
== deco_text
)) &&
1008 !str_utf8_iscombiningmark (match
+ strlen (search
)))
1014 str_utf8_cnext_noncomb_char (&m
);
1015 str_utf8_cnext_noncomb_char (&result
);
1019 str_utf8_cnext_char (&match
);
1022 while (match
!= NULL
&& result
== NULL
);
1031 /* --------------------------------------------------------------------------------------------- */
1034 str_utf8_search_last (const char *text
, const char *search
, gboolean case_sen
)
1039 const char *result
= NULL
;
1042 fold_text
= case_sen
? (char *) text
: g_utf8_casefold (text
, -1);
1043 deco_text
= g_utf8_normalize (fold_text
, -1, G_NORMALIZE_ALL
);
1047 match
= g_strrstr_len (deco_text
, -1, search
);
1050 if ((!str_utf8_iscombiningmark (match
) || (match
== deco_text
)) &&
1051 !str_utf8_iscombiningmark (match
+ strlen (search
)))
1057 str_utf8_cnext_noncomb_char (&m
);
1058 str_utf8_cnext_noncomb_char (&result
);
1065 while (match
!= NULL
&& result
== NULL
);
1074 /* --------------------------------------------------------------------------------------------- */
1077 str_utf8_normalize (const char *text
)
1085 /* g_utf8_normalize() is a heavyweight function, that converts UTF-8 into UCS-4,
1086 * does the normalization and then converts UCS-4 back into UTF-8.
1087 * Since file names are composed of ASCII characters in most cases, we can speed up
1088 * utf8 normalization by checking if the heavyweight Unicode normalization is actually
1089 * needed. Normalization of an ASCII-only string is a no-op.
1092 /* find out whether text is ASCII only */
1093 for (end
= text
; *end
!= '\0'; end
++)
1094 if ((*end
& 0x80) != 0)
1096 /* found a non-ASCII byte (high bit set), i.e. part of a multi-byte UTF-8 sequence */
1100 /* if text is ASCII-only, return copy, normalize otherwise */
1102 return g_strndup (text
, end
- text
);
1104 fixed
= g_string_sized_new (4);
1107 while (!g_utf8_validate (start
, -1, &end
) && start
[0] != '\0')
1111 tmp
= g_utf8_normalize (start
, end
- start
, G_NORMALIZE_ALL
);
1112 g_string_append (fixed
, tmp
);
1115 g_string_append_c (fixed
, end
[0]);
1121 result
= g_utf8_normalize (text
, -1, G_NORMALIZE_ALL
);
1122 g_string_free (fixed
, TRUE
);
1126 if (start
[0] != '\0' && start
!= end
)
1128 tmp
= g_utf8_normalize (start
, end
- start
, G_NORMALIZE_ALL
);
1129 g_string_append (fixed
, tmp
);
1132 result
= g_string_free (fixed
, FALSE
);
1138 /* --------------------------------------------------------------------------------------------- */
1141 str_utf8_casefold_normalize (const char *text
)
1149 fixed
= g_string_sized_new (4);
1152 while (!g_utf8_validate (start
, -1, &end
) && start
[0] != '\0')
1156 fold
= g_utf8_casefold (start
, end
- start
);
1157 tmp
= g_utf8_normalize (fold
, -1, G_NORMALIZE_ALL
);
1158 g_string_append (fixed
, tmp
);
1162 g_string_append_c (fixed
, end
[0]);
1168 fold
= g_utf8_casefold (text
, -1);
1169 result
= g_utf8_normalize (fold
, -1, G_NORMALIZE_ALL
);
1171 g_string_free (fixed
, TRUE
);
1175 if (start
[0] != '\0' && start
!= end
)
1177 fold
= g_utf8_casefold (start
, end
- start
);
1178 tmp
= g_utf8_normalize (fold
, -1, G_NORMALIZE_ALL
);
1179 g_string_append (fixed
, tmp
);
1183 result
= g_string_free (fixed
, FALSE
);
1189 /* --------------------------------------------------------------------------------------------- */
1192 str_utf8_compare (const char *t1
, const char *t2
)
1197 n1
= str_utf8_normalize (t1
);
1198 n2
= str_utf8_normalize (t2
);
1200 result
= strcmp (n1
, n2
);
1208 /* --------------------------------------------------------------------------------------------- */
1211 str_utf8_ncompare (const char *t1
, const char *t2
)
1217 n1
= str_utf8_normalize (t1
);
1218 n2
= str_utf8_normalize (t2
);
1222 result
= strncmp (n1
, n2
, MIN (l1
, l2
));
1230 /* --------------------------------------------------------------------------------------------- */
1233 str_utf8_casecmp (const char *t1
, const char *t2
)
1238 n1
= str_utf8_casefold_normalize (t1
);
1239 n2
= str_utf8_casefold_normalize (t2
);
1241 result
= strcmp (n1
, n2
);
1249 /* --------------------------------------------------------------------------------------------- */
1252 str_utf8_ncasecmp (const char *t1
, const char *t2
)
1258 n1
= str_utf8_casefold_normalize (t1
);
1259 n2
= str_utf8_casefold_normalize (t2
);
1263 result
= strncmp (n1
, n2
, MIN (l1
, l2
));
1271 /* --------------------------------------------------------------------------------------------- */
1274 str_utf8_prefix (const char *text
, const char *prefix
)
1277 const char *nt
, *np
;
1278 const char *nnt
, *nnp
;
1281 t
= str_utf8_normalize (text
);
1282 p
= str_utf8_normalize (prefix
);
1288 while (nt
[0] != '\0' && np
[0] != '\0')
1290 str_utf8_cnext_char_safe (&nnt
);
1291 str_utf8_cnext_char_safe (&nnp
);
1292 if (nnt
- nt
!= nnp
- np
)
1294 if (strncmp (nt
, np
, nnt
- nt
) != 0)
1308 /* --------------------------------------------------------------------------------------------- */
1311 str_utf8_caseprefix (const char *text
, const char *prefix
)
1314 const char *nt
, *np
;
1315 const char *nnt
, *nnp
;
1318 t
= str_utf8_casefold_normalize (text
);
1319 p
= str_utf8_casefold_normalize (prefix
);
1325 while (nt
[0] != '\0' && np
[0] != '\0')
1327 str_utf8_cnext_char_safe (&nnt
);
1328 str_utf8_cnext_char_safe (&nnp
);
1329 if (nnt
- nt
!= nnp
- np
)
1331 if (strncmp (nt
, np
, nnt
- nt
) != 0)
1345 /* --------------------------------------------------------------------------------------------- */
1348 str_utf8_create_key_gen (const char *text
, gboolean case_sen
,
1349 gchar
*(*keygen
) (const gchar
*text
, gssize size
))
1354 result
= str_utf8_normalize (text
);
1359 const char *start
, *end
;
1362 dot
= text
[0] == '.';
1363 fixed
= g_string_sized_new (16);
1370 g_string_append_c (fixed
, '.');
1373 while (!g_utf8_validate (start
, -1, &end
) && start
[0] != '\0')
1377 fold
= g_utf8_casefold (start
, end
- start
);
1378 key
= keygen (fold
, -1);
1379 g_string_append (fixed
, key
);
1383 g_string_append_c (fixed
, end
[0]);
1389 fold
= g_utf8_casefold (start
, -1);
1390 result
= keygen (fold
, -1);
1392 g_string_free (fixed
, TRUE
);
1394 else if (dot
&& (start
== text
+ 1))
1396 fold
= g_utf8_casefold (start
, -1);
1397 key
= keygen (fold
, -1);
1398 g_string_append (fixed
, key
);
1401 result
= g_string_free (fixed
, FALSE
);
1405 if (start
[0] != '\0' && start
!= end
)
1407 fold
= g_utf8_casefold (start
, end
- start
);
1408 key
= keygen (fold
, -1);
1409 g_string_append (fixed
, key
);
1413 result
= g_string_free (fixed
, FALSE
);
1419 /* --------------------------------------------------------------------------------------------- */
1422 str_utf8_create_key (const char *text
, gboolean case_sen
)
1424 return str_utf8_create_key_gen (text
, case_sen
, g_utf8_collate_key
);
1427 /* --------------------------------------------------------------------------------------------- */
1429 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1431 str_utf8_create_key_for_filename (const char *text
, gboolean case_sen
)
1433 return str_utf8_create_key_gen (text
, case_sen
, g_utf8_collate_key_for_filename
);
1437 /* --------------------------------------------------------------------------------------------- */
1440 str_utf8_key_collate (const char *t1
, const char *t2
, gboolean case_sen
)
1443 return strcmp (t1
, t2
);
1446 /* --------------------------------------------------------------------------------------------- */
1449 str_utf8_release_key (char *key
, gboolean case_sen
)
1455 /* --------------------------------------------------------------------------------------------- */
1456 /*** public functions ****************************************************************************/
1457 /* --------------------------------------------------------------------------------------------- */
1460 str_utf8_init (void)
1462 struct str_class result
;
1464 result
.conv_gerror_message
= str_utf8_conv_gerror_message
;
1465 result
.vfs_convert_to
= str_utf8_vfs_convert_to
;
1466 result
.insert_replace_char
= str_utf8_insert_replace_char
;
1467 result
.is_valid_string
= str_utf8_is_valid_string
;
1468 result
.is_valid_char
= str_utf8_is_valid_char
;
1469 result
.cnext_char
= str_utf8_cnext_char
;
1470 result
.cprev_char
= str_utf8_cprev_char
;
1471 result
.cnext_char_safe
= str_utf8_cnext_char_safe
;
1472 result
.cprev_char_safe
= str_utf8_cprev_char_safe
;
1473 result
.cnext_noncomb_char
= str_utf8_cnext_noncomb_char
;
1474 result
.cprev_noncomb_char
= str_utf8_cprev_noncomb_char
;
1475 result
.char_isspace
= str_utf8_isspace
;
1476 result
.char_ispunct
= str_utf8_ispunct
;
1477 result
.char_isalnum
= str_utf8_isalnum
;
1478 result
.char_isdigit
= str_utf8_isdigit
;
1479 result
.char_isprint
= str_utf8_isprint
;
1480 result
.char_iscombiningmark
= str_utf8_iscombiningmark
;
1481 result
.char_toupper
= str_utf8_toupper
;
1482 result
.char_tolower
= str_utf8_tolower
;
1483 result
.length
= str_utf8_length
;
1484 result
.length2
= str_utf8_length2
;
1485 result
.length_noncomb
= str_utf8_length_noncomb
;
1486 result
.fix_string
= str_utf8_fix_string
;
1487 result
.term_form
= str_utf8_term_form
;
1488 result
.fit_to_term
= str_utf8_fit_to_term
;
1489 result
.term_trim
= str_utf8_term_trim
;
1490 result
.term_width2
= str_utf8_term_width2
;
1491 result
.term_width1
= str_utf8_term_width1
;
1492 result
.term_char_width
= str_utf8_term_char_width
;
1493 result
.term_substring
= str_utf8_term_substring
;
1494 result
.trunc
= str_utf8_trunc
;
1495 result
.offset_to_pos
= str_utf8_offset_to_pos
;
1496 result
.column_to_pos
= str_utf8_column_to_pos
;
1497 result
.create_search_needle
= str_utf8_create_search_needle
;
1498 result
.release_search_needle
= str_utf8_release_search_needle
;
1499 result
.search_first
= str_utf8_search_first
;
1500 result
.search_last
= str_utf8_search_last
;
1501 result
.compare
= str_utf8_compare
;
1502 result
.ncompare
= str_utf8_ncompare
;
1503 result
.casecmp
= str_utf8_casecmp
;
1504 result
.ncasecmp
= str_utf8_ncasecmp
;
1505 result
.prefix
= str_utf8_prefix
;
1506 result
.caseprefix
= str_utf8_caseprefix
;
1507 result
.create_key
= str_utf8_create_key
;
1508 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1509 /* case insensitive sort files in "a1 a2 a10" order */
1510 result
.create_key_for_filename
= str_utf8_create_key_for_filename
;
1512 /* case insensitive sort files in "a1 a10 a2" order */
1513 result
.create_key_for_filename
= str_utf8_create_key
;
1515 result
.key_collate
= str_utf8_key_collate
;
1516 result
.release_key
= str_utf8_release_key
;
1521 /* --------------------------------------------------------------------------------------------- */