Code indentation.
[midnight-commander.git] / lib / strutil / strutilutf8.c
blob4d9f9733d4a1011b6ad8a5491bc7c897cfa4bad0
1 /*
2 UTF-8 strings utilities
4 Copyright (C) 2007, 2011
5 The Free Software Foundation, Inc.
7 Written by:
8 Rostislav Benes, 2007
10 The file_date routine is mostly from GNU's fileutils package,
11 written by Richard Stallman and David MacKenzie.
13 This file is part of the Midnight Commander.
15 The Midnight Commander is free software: you can redistribute it
16 and/or modify it under the terms of the GNU General Public License as
17 published by the Free Software Foundation, either version 3 of the License,
18 or (at your option) any later version.
20 The Midnight Commander is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 GNU General Public License for more details.
25 You should have received a copy of the GNU General Public License
26 along with this program. If not, see <http://www.gnu.org/licenses/>.
29 #include <config.h>
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <glib.h>
34 #include <langinfo.h>
35 #include <string.h>
37 #include "lib/global.h"
38 #include "lib/strutil.h"
40 /* using function for utf-8 from glib */
/* Bytes EF BF BD: the UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER). */
42 static const char replch[] = "\xEF\xBF\xBD";
44 static int
45 str_unichar_iscombiningmark (gunichar uni)
47 int type = g_unichar_type (uni);
48 return (type == G_UNICODE_COMBINING_MARK)
49 || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
52 static void
53 str_utf8_insert_replace_char (GString * buffer)
55 g_string_append (buffer, replch);
58 static int
59 str_utf8_is_valid_string (const char *text)
61 return g_utf8_validate (text, -1, NULL);
64 static int
65 str_utf8_is_valid_char (const char *ch, size_t size)
67 switch (g_utf8_get_char_validated (ch, size))
69 case (gunichar) (-2):
70 return -2;
71 case (gunichar) (-1):
72 return -1;
73 default:
74 return 1;
/* Advance *text to the next UTF-8 character (no validation is done). */
static void
str_utf8_cnext_char (const char **text)
{
    const char *current = *text;

    *text = g_utf8_next_char (current);
}
/* Move *text back to the previous UTF-8 character (no validation is done). */
static void
str_utf8_cprev_char (const char **text)
{
    const char *current = *text;

    *text = g_utf8_prev_char (current);
}
/* Advance *text by one character if a valid UTF-8 character starts there,
 * otherwise by a single byte. */
static void
str_utf8_cnext_char_safe (const char **text)
{
    const char *current = *text;

    *text = (str_utf8_is_valid_char (current, -1) == 1)
        ? g_utf8_next_char (current) : current + 1;
}
/* Move *text back by one character.  The candidate found by
 * g_utf8_prev_char () is accepted only if stepping forward from it with
 * str_utf8_cnext_char_safe () lands exactly on the old position;
 * otherwise *text moves back by a single byte. */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *here = *text;
    const char *candidate = g_utf8_prev_char (here);
    const char *probe = candidate;

    str_utf8_cnext_char_safe (&probe);
    *text = (probe == here) ? candidate : here - 1;
}
111 static void
112 str_utf8_fix_string (char *text)
114 gunichar uni;
116 while (text[0] != '\0')
118 uni = g_utf8_get_char_validated (text, -1);
119 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
121 text = g_utf8_next_char (text);
123 else
125 text[0] = '?';
126 text++;
/* Return the result of g_unichar_isspace () for the first character of text. */
static int
str_utf8_isspace (const char *text)
{
    return g_unichar_isspace (g_utf8_get_char_validated (text, -1));
}
/* Return the result of g_unichar_ispunct () for the first character of text. */
static int
str_utf8_ispunct (const char *text)
{
    return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
}
/* Return the result of g_unichar_isalnum () for the first character of text. */
static int
str_utf8_isalnum (const char *text)
{
    return g_unichar_isalnum (g_utf8_get_char_validated (text, -1));
}
/* Return the result of g_unichar_isdigit () for the first character of text. */
static int
str_utf8_isdigit (const char *text)
{
    return g_unichar_isdigit (g_utf8_get_char_validated (text, -1));
}
/* Return the result of g_unichar_isprint () for the character at ch. */
static int
str_utf8_isprint (const char *ch)
{
    return g_unichar_isprint (g_utf8_get_char_validated (ch, -1));
}
/* Return non-zero if the character at ch is a combining mark
 * (see str_unichar_iscombiningmark ()). */
static int
str_utf8_iscombiningmark (const char *ch)
{
    return str_unichar_iscombiningmark (g_utf8_get_char_validated (ch, -1));
}
/* Advance *text past the current character and any combining marks that
 * follow it, stopping at the terminating NUL at the latest.
 * Returns the number of characters stepped over (0 at end of string). */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps = 0;

    if ((*text)[0] == '\0')
        return 0;

    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

    return steps;
}
/* Move *text backwards until it points at a character that is not a
 * combining mark, never going past begin.
 * Returns the number of characters stepped over (0 if already at begin). */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps = 0;

    if (*text == begin)
        return 0;

    do
    {
        str_utf8_cprev_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && *text != begin);

    return steps;
}
201 static int
202 str_utf8_toupper (const char *text, char **out, size_t * remain)
204 gunichar uni;
205 size_t left;
207 uni = g_utf8_get_char_validated (text, -1);
208 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
209 return 0;
211 uni = g_unichar_toupper (uni);
212 left = g_unichar_to_utf8 (uni, NULL);
213 if (left >= *remain)
214 return 0;
216 left = g_unichar_to_utf8 (uni, *out);
217 (*out) += left;
218 (*remain) -= left;
219 return 1;
222 static int
223 str_utf8_tolower (const char *text, char **out, size_t * remain)
225 gunichar uni;
226 size_t left;
228 uni = g_utf8_get_char_validated (text, -1);
229 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
230 return 0;
232 uni = g_unichar_tolower (uni);
233 left = g_unichar_to_utf8 (uni, NULL);
234 if (left >= *remain)
235 return 0;
237 left = g_unichar_to_utf8 (uni, *out);
238 (*out) += left;
239 (*remain) -= left;
240 return 1;
/* Count the characters of text.  Valid runs are counted with
 * g_utf8_strlen (); every byte at which validation fails counts as one
 * character. */
static int
str_utf8_length (const char *text)
{
    const char *start = text;
    const char *end;
    int chars = 0;

    /* each iteration consumes one valid run plus the invalid byte after it */
    while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
    {
        if (start != end)
            chars += g_utf8_strlen (start, end - start);
        chars++;
        start = end + 1;
    }

    /* start never moved: the whole string is valid */
    if (start == text)
        return g_utf8_strlen (text, -1);

    /* valid tail after the last invalid byte */
    if (start[0] != '\0' && start != end)
        chars += g_utf8_strlen (start, end - start);

    return chars;
}
/* Like str_utf8_length (), but with a byte budget: size is passed to
 * g_utf8_strlen () as the maximum number of bytes to examine and is
 * reduced as valid runs and invalid bytes are consumed. */
static int
str_utf8_length2 (const char *text, int size)
{
    const char *start = text;
    const char *end;
    int chars = 0;

    while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
    {
        if (start != end)
        {
            chars += g_utf8_strlen (start, min (end - start, size));
            size -= end - start;
        }
        /* the invalid byte counts only if the budget is not used up yet */
        if (size > 0)
            chars++;
        size--;
        start = end + 1;
    }

    /* the loop body never ran */
    if (start == text)
        return g_utf8_strlen (text, size);

    if (start[0] != '\0' && start != end && size > 0)
        chars += g_utf8_strlen (start, min (end - start, size));

    return chars;
}
/* Count the characters of text, treating a character together with the
 * combining marks that follow it as one unit. */
static int
str_utf8_length_noncomb (const char *text)
{
    const char *cursor;
    int units = 0;

    for (cursor = text; cursor[0] != '\0'; units++)
        str_utf8_cnext_noncomb_char (&cursor);

    return units;
}
327 static void
328 str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
330 char *next = g_utf8_next_char (*string);
331 (*left) -= next - (*string);
332 (*string) = next;
333 g_string_append_c (buffer, '?');
337 static gchar *
338 str_utf8_conv_gerror_message (GError * error, const char *def_msg)
340 if ((error != NULL) && (error->message != NULL))
341 return g_strdup (error->message);
343 return g_strdup (def_msg != NULL ? def_msg : "");
346 static estr_t
347 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
349 estr_t result;
351 if (coder == str_cnv_not_convert)
353 g_string_append_len (buffer, string, size);
354 result = ESTR_SUCCESS;
356 else
357 result = str_nconvert (coder, (char *) string, size, buffer);
359 return result;
362 struct term_form
364 char text[BUF_MEDIUM * 6];
365 size_t width;
366 int compose;
369 /* utiliti function, that make string valid in utf8 and all characters printable
370 * return width of string too*/
371 static const struct term_form *
372 str_utf8_make_make_term_form (const char *text, size_t length)
374 static struct term_form result;
375 gunichar uni;
376 size_t left;
377 char *actual;
379 result.text[0] = '\0';
380 result.width = 0;
381 result.compose = 0;
382 actual = result.text;
384 /* check if text start with combining character,
385 * add space at begin in this case */
386 if (length != 0 && text[0] != '\0')
388 uni = g_utf8_get_char_validated (text, -1);
389 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
391 if (str_unichar_iscombiningmark (uni))
393 actual[0] = ' ';
394 actual++;
395 result.width++;
396 result.compose = 1;
401 while (length != 0 && text[0] != '\0')
403 uni = g_utf8_get_char_validated (text, -1);
404 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
406 if (g_unichar_isprint (uni))
408 left = g_unichar_to_utf8 (uni, actual);
409 actual += left;
410 if (!str_unichar_iscombiningmark (uni))
412 result.width++;
413 if (g_unichar_iswide (uni))
414 result.width++;
416 else
417 result.compose = 1;
419 else
421 actual[0] = '.';
422 actual++;
423 result.width++;
425 text = g_utf8_next_char (text);
427 else
429 text++;
430 /*actual[0] = '?'; */
431 memcpy (actual, replch, strlen (replch));
432 actual += strlen (replch);
433 result.width++;
435 if (length != (size_t) (-1))
436 length--;
438 actual[0] = '\0';
440 return &result;
443 static const char *
444 str_utf8_term_form (const char *text)
446 static char result[BUF_MEDIUM * 6];
447 const struct term_form *pre_form;
448 char *composed;
450 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
451 if (pre_form->compose)
453 composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
454 g_strlcpy (result, composed, sizeof (result));
455 g_free (composed);
457 else
459 g_strlcpy (result, pre_form->text, sizeof (result));
461 return result;
/* Working state shared by the utf8_tool_* helpers. */
struct utf8_tool
{
    /* current write position in the output buffer */
    char *actual;
    /* bytes still free at actual */
    size_t remain;
    /* current read position in the source text */
    const char *cheked;
    /* width counter compared against the to_ident arguments */
    int ident;
    /* set to non-zero when a combining mark has been copied */
    int compose;
};
473 /* utiliti function, that copy all characters from cheked to actual */
474 static int
475 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
477 size_t left;
478 gunichar uni;
480 tool->compose = 0;
482 while (tool->cheked[0] != '\0')
484 uni = g_utf8_get_char (tool->cheked);
485 tool->compose |= str_unichar_iscombiningmark (uni);
486 left = g_unichar_to_utf8 (uni, NULL);
487 if (tool->remain <= left)
488 return 0;
489 left = g_unichar_to_utf8 (uni, tool->actual);
490 tool->actual += left;
491 tool->remain -= left;
492 tool->cheked = g_utf8_next_char (tool->cheked);
494 return 1;
497 /* utiliti function, that copy characters from cheked to actual until ident is
498 * smaller than to_ident */
499 static int
500 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
502 size_t left;
503 gunichar uni;
504 int w;
506 tool->compose = 0;
508 while (tool->cheked[0] != '\0')
510 uni = g_utf8_get_char (tool->cheked);
511 if (!str_unichar_iscombiningmark (uni))
513 w = 1;
514 if (g_unichar_iswide (uni))
515 w++;
516 if (tool->ident + w > to_ident)
517 return 1;
519 else
521 w = 0;
522 tool->compose = 1;
525 left = g_unichar_to_utf8 (uni, NULL);
526 if (tool->remain <= left)
527 return 0;
528 left = g_unichar_to_utf8 (uni, tool->actual);
529 tool->actual += left;
530 tool->remain -= left;
531 tool->cheked = g_utf8_next_char (tool->cheked);
532 tool->ident += w;
534 return 1;
537 /* utiliti function, add count spaces to actual */
538 static int
539 utf8_tool_insert_space (struct utf8_tool *tool, int count)
541 if (count <= 0)
542 return 1;
543 if (tool->remain <= (gsize) count)
544 return 0;
545 memset (tool->actual, ' ', count);
546 tool->actual += count;
547 tool->remain -= count;
548 return 1;
551 /* utiliti function, add one characters to actual */
552 static int
553 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
555 if (tool->remain <= 1)
556 return 0;
557 tool->actual[0] = ch;
558 tool->actual++;
559 tool->remain--;
560 return 1;
563 /* utiliti function, thah skip characters from cheked until ident is greater or
564 * equal to to_ident */
565 static int
566 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
568 gunichar uni;
570 while (to_ident > tool->ident && tool->cheked[0] != '\0')
572 uni = g_utf8_get_char (tool->cheked);
573 if (!str_unichar_iscombiningmark (uni))
575 tool->ident++;
576 if (g_unichar_iswide (uni))
577 tool->ident++;
579 tool->cheked = g_utf8_next_char (tool->cheked);
581 uni = g_utf8_get_char (tool->cheked);
582 while (str_unichar_iscombiningmark (uni))
584 tool->cheked = g_utf8_next_char (tool->cheked);
585 uni = g_utf8_get_char (tool->cheked);
587 return 1;
590 static void
591 utf8_tool_compose (char *buffer, size_t size)
593 char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
594 g_strlcpy (buffer, composed, size);
595 g_free (composed);
599 static const char *
600 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
602 static char result[BUF_MEDIUM * 6];
603 const struct term_form *pre_form;
604 struct utf8_tool tool;
606 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
607 tool.cheked = pre_form->text;
608 tool.actual = result;
609 tool.remain = sizeof (result);
610 tool.compose = 0;
612 if (pre_form->width <= (gsize) width)
614 tool.ident = 0;
615 switch (HIDE_FIT (just_mode))
617 case J_CENTER_LEFT:
618 case J_CENTER:
619 tool.ident = (width - pre_form->width) / 2;
620 break;
621 case J_RIGHT:
622 tool.ident = width - pre_form->width;
623 break;
626 utf8_tool_insert_space (&tool, tool.ident);
627 utf8_tool_copy_chars_to_end (&tool);
628 utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
630 else
632 if (IS_FIT (just_mode))
634 tool.ident = 0;
635 utf8_tool_copy_chars_to (&tool, width / 2);
636 utf8_tool_insert_char (&tool, '~');
638 tool.ident = 0;
639 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
640 utf8_tool_copy_chars_to_end (&tool);
641 utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
643 else
645 tool.ident = 0;
646 switch (HIDE_FIT (just_mode))
648 case J_CENTER:
649 tool.ident = (width - pre_form->width) / 2;
650 break;
651 case J_RIGHT:
652 tool.ident = width - pre_form->width;
653 break;
656 utf8_tool_skip_chars_to (&tool, 0);
657 utf8_tool_insert_space (&tool, tool.ident);
658 utf8_tool_copy_chars_to (&tool, width);
659 utf8_tool_insert_space (&tool, width - tool.ident);
663 tool.actual[0] = '\0';
664 if (tool.compose)
665 utf8_tool_compose (result, sizeof (result));
666 return result;
669 static const char *
670 str_utf8_term_trim (const char *text, int width)
672 static char result[BUF_MEDIUM * 6];
673 const struct term_form *pre_form;
674 struct utf8_tool tool;
676 if (width < 1)
678 result[0] = '\0';
679 return result;
682 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
684 tool.cheked = pre_form->text;
685 tool.actual = result;
686 tool.remain = sizeof (result);
687 tool.compose = 0;
689 if ((gsize) width < pre_form->width)
691 if (width <= 3)
693 memset (tool.actual, '.', width);
694 tool.actual += width;
695 tool.remain -= width;
697 else
699 memset (tool.actual, '.', 3);
700 tool.actual += 3;
701 tool.remain -= 3;
703 tool.ident = 0;
704 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
705 utf8_tool_copy_chars_to_end (&tool);
708 else
710 utf8_tool_copy_chars_to_end (&tool);
713 tool.actual[0] = '\0';
714 if (tool.compose)
715 utf8_tool_compose (result, sizeof (result));
716 return result;
719 static int
720 str_utf8_term_width2 (const char *text, size_t length)
722 const struct term_form *result;
724 result = str_utf8_make_make_term_form (text, length);
725 return result->width;
/* Return the width of the whole text (see str_utf8_term_width2 ()). */
static int
str_utf8_term_width1 (const char *text)
{
    const size_t no_limit = (size_t) (-1);

    return str_utf8_term_width2 (text, no_limit);
}
734 static int
735 str_utf8_term_char_width (const char *text)
737 gunichar uni = g_utf8_get_char_validated (text, -1);
738 return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
741 static const char *
742 str_utf8_term_substring (const char *text, int start, int width)
744 static char result[BUF_MEDIUM * 6];
745 const struct term_form *pre_form;
746 struct utf8_tool tool;
748 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
750 tool.cheked = pre_form->text;
751 tool.actual = result;
752 tool.remain = sizeof (result);
753 tool.compose = 0;
755 tool.ident = -start;
756 utf8_tool_skip_chars_to (&tool, 0);
757 if (tool.ident < 0)
758 tool.ident = 0;
759 utf8_tool_insert_space (&tool, tool.ident);
761 utf8_tool_copy_chars_to (&tool, width);
762 utf8_tool_insert_space (&tool, width - tool.ident);
764 tool.actual[0] = '\0';
765 if (tool.compose)
766 utf8_tool_compose (result, sizeof (result));
767 return result;
770 static const char *
771 str_utf8_trunc (const char *text, int width)
773 static char result[MC_MAXPATHLEN * 6 * 2];
774 const struct term_form *pre_form;
775 struct utf8_tool tool;
777 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
779 tool.cheked = pre_form->text;
780 tool.actual = result;
781 tool.remain = sizeof (result);
782 tool.compose = 0;
784 if (pre_form->width > (gsize) width)
786 tool.ident = 0;
787 utf8_tool_copy_chars_to (&tool, width / 2);
788 utf8_tool_insert_char (&tool, '~');
790 tool.ident = 0;
791 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
792 utf8_tool_copy_chars_to_end (&tool);
794 else
796 utf8_tool_copy_chars_to_end (&tool);
799 tool.actual[0] = '\0';
800 if (tool.compose)
801 utf8_tool_compose (result, sizeof (result));
802 return result;
805 static int
806 str_utf8_offset_to_pos (const char *text, size_t length)
808 if (str_utf8_is_valid_string (text))
809 return g_utf8_offset_to_pointer (text, length) - text;
810 else
812 int result;
813 GString *buffer = g_string_new (text);
815 str_utf8_fix_string (buffer->str);
816 result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
817 g_string_free (buffer, TRUE);
818 return result;
822 static int
823 str_utf8_column_to_pos (const char *text, size_t pos)
825 static int result;
826 gunichar uni;
827 int width;
829 width = 0;
830 result = 0;
832 while (text[0] != '\0')
834 uni = g_utf8_get_char_validated (text, 6);
835 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
837 if (g_unichar_isprint (uni))
839 if (!str_unichar_iscombiningmark (uni))
841 width++;
842 if (g_unichar_iswide (uni))
843 width++;
846 else
848 width++;
850 text = g_utf8_next_char (text);
852 else
854 text++;
855 width++;
857 if ((gsize) width > pos)
858 return result;
860 result++;
863 return result;
866 static char *
867 str_utf8_create_search_needle (const char *needle, int case_sen)
869 if (needle != NULL)
871 if (case_sen)
873 return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
875 else
877 char *fold = g_utf8_casefold (needle, -1);
878 char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
879 g_free (fold);
880 return result;
883 else
884 return NULL;
/* Free a needle made by str_utf8_create_search_needle ().
 * case_sen is unused; a NULL needle is accepted. */
static void
str_utf8_release_search_needle (char *needle, int case_sen)
{
    (void) case_sen;

    /* g_free () ignores NULL */
    g_free (needle);
}
895 static const char *
896 str_utf8_search_first (const char *text, const char *search, int case_sen)
898 char *fold_text;
899 char *deco_text;
900 const char *match;
901 const char *result = NULL;
902 const char *m;
904 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
905 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
907 match = deco_text;
910 match = g_strstr_len (match, -1, search);
911 if (match != NULL)
913 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
914 !str_utf8_iscombiningmark (match + strlen (search)))
917 result = text;
918 m = deco_text;
919 while (m < match)
921 str_utf8_cnext_noncomb_char (&m);
922 str_utf8_cnext_noncomb_char (&result);
925 else
927 str_utf8_cnext_char (&match);
931 while (match != NULL && result == NULL);
933 g_free (deco_text);
934 if (!case_sen)
935 g_free (fold_text);
937 return result;
940 static const char *
941 str_utf8_search_last (const char *text, const char *search, int case_sen)
943 char *fold_text;
944 char *deco_text;
945 char *match;
946 const char *result = NULL;
947 const char *m;
949 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
950 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
954 match = g_strrstr_len (deco_text, -1, search);
955 if (match != NULL)
957 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
958 !str_utf8_iscombiningmark (match + strlen (search)))
961 result = text;
962 m = deco_text;
963 while (m < match)
965 str_utf8_cnext_noncomb_char (&m);
966 str_utf8_cnext_noncomb_char (&result);
969 else
971 match[0] = '\0';
975 while (match != NULL && result == NULL);
977 g_free (deco_text);
978 if (!case_sen)
979 g_free (fold_text);
981 return result;
984 static char *
985 str_utf8_normalize (const char *text)
987 GString *fixed = g_string_new ("");
988 char *tmp;
989 char *result;
990 const char *start;
991 const char *end;
993 start = text;
994 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
996 if (start != end)
998 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
999 g_string_append (fixed, tmp);
1000 g_free (tmp);
1002 g_string_append_c (fixed, end[0]);
1003 start = end + 1;
1006 if (start == text)
1008 result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1010 else
1012 if (start[0] != '\0' && start != end)
1014 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1015 g_string_append (fixed, tmp);
1016 g_free (tmp);
1018 result = g_strdup (fixed->str);
1020 g_string_free (fixed, TRUE);
1022 return result;
1025 static char *
1026 str_utf8_casefold_normalize (const char *text)
1028 GString *fixed = g_string_new ("");
1029 char *tmp, *fold;
1030 char *result;
1031 const char *start;
1032 const char *end;
1034 start = text;
1035 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1037 if (start != end)
1039 fold = g_utf8_casefold (start, end - start);
1040 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1041 g_string_append (fixed, tmp);
1042 g_free (tmp);
1043 g_free (fold);
1045 g_string_append_c (fixed, end[0]);
1046 start = end + 1;
1049 if (start == text)
1051 fold = g_utf8_casefold (text, -1);
1052 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1053 g_free (fold);
1055 else
1057 if (start[0] != '\0' && start != end)
1059 fold = g_utf8_casefold (start, end - start);
1060 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1061 g_string_append (fixed, tmp);
1062 g_free (tmp);
1063 g_free (fold);
1065 result = g_strdup (fixed->str);
1067 g_string_free (fixed, TRUE);
1069 return result;
/* Compare the normalized forms of t1 and t2 with strcmp (). */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
/* Compare the normalized forms of t1 and t2 over the length of the
 * shorter one, so a string compares equal to any of its extensions. */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, left_len < right_len ? left_len : right_len);

    g_free (left);
    g_free (right);

    return order;
}
/* Compare the case folded, normalized forms of t1 and t2 with strcmp (). */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
/* Compare the case folded, normalized forms of t1 and t2 over the length
 * of the shorter one. */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, left_len < right_len ? left_len : right_len);

    g_free (left);
    g_free (right);

    return order;
}
/* Return the length in bytes of the common prefix of the normalized forms
 * of text and prefix, compared character by character. */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *t_cur = norm_text;
    const char *p_cur = norm_prefix;
    const char *t_next = norm_text;
    const char *p_next = norm_prefix;
    int common;

    while (t_cur[0] != '\0' && p_cur[0] != '\0')
    {
        str_utf8_cnext_char_safe (&t_next);
        str_utf8_cnext_char_safe (&p_next);

        /* characters differ in size or in content */
        if (t_next - t_cur != p_next - p_cur
            || strncmp (t_cur, p_cur, t_next - t_cur) != 0)
            break;

        t_cur = t_next;
        p_cur = p_next;
    }

    common = p_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return common;
}
/* Return the length in bytes of the common prefix of the case folded,
 * normalized forms of text and prefix, compared character by character. */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_casefold_normalize (text);
    char *norm_prefix = str_utf8_casefold_normalize (prefix);
    const char *t_cur = norm_text;
    const char *p_cur = norm_prefix;
    const char *t_next = norm_text;
    const char *p_next = norm_prefix;
    int common;

    while (t_cur[0] != '\0' && p_cur[0] != '\0')
    {
        str_utf8_cnext_char_safe (&t_next);
        str_utf8_cnext_char_safe (&p_next);

        /* characters differ in size or in content */
        if (t_next - t_cur != p_next - p_cur
            || strncmp (t_cur, p_cur, t_next - t_cur) != 0)
            break;

        t_cur = t_next;
        p_cur = p_next;
    }

    common = p_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return common;
}
1202 static char *
1203 str_utf8_create_key_gen (const char *text, int case_sen,
1204 gchar * (*keygen) (const gchar * text, gssize size))
1206 char *result;
1208 if (case_sen)
1210 result = str_utf8_normalize (text);
1212 else
1214 gboolean dot;
1215 GString *fixed;
1216 const char *start, *end;
1217 char *fold, *key;
1219 dot = text[0] == '.';
1220 fixed = g_string_sized_new (16);
1222 if (!dot)
1223 start = text;
1224 else
1226 start = text + 1;
1227 g_string_append_c (fixed, '.');
1230 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1232 if (start != end)
1234 fold = g_utf8_casefold (start, end - start);
1235 key = keygen (fold, -1);
1236 g_string_append (fixed, key);
1237 g_free (key);
1238 g_free (fold);
1240 g_string_append_c (fixed, end[0]);
1241 start = end + 1;
1244 if (start == text)
1246 fold = g_utf8_casefold (start, -1);
1247 result = keygen (fold, -1);
1248 g_free (fold);
1249 g_string_free (fixed, TRUE);
1251 else if (dot && (start == text + 1))
1253 fold = g_utf8_casefold (start, -1);
1254 key = keygen (fold, -1);
1255 g_string_append (fixed, key);
1256 g_free (key);
1257 g_free (fold);
1258 result = g_string_free (fixed, FALSE);
1260 else
1262 if (start[0] != '\0' && start != end)
1264 fold = g_utf8_casefold (start, end - start);
1265 key = keygen (fold, -1);
1266 g_string_append (fixed, key);
1267 g_free (key);
1268 g_free (fold);
1270 result = g_string_free (fixed, FALSE);
1273 return result;
1276 static char *
1277 str_utf8_create_key (const char *text, int case_sen)
1279 return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/* Build a sort key for a file name using g_utf8_collate_key_for_filename ()
 * (see str_utf8_create_key_gen ()). */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    char *key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
/* Compare two sort keys byte-wise with strcmp ().  case_sen is unused. */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    const int order = strcmp (t1, t2);

    (void) case_sen;
    return order;
}
/* Free a sort key with g_free ().  case_sen is unused. */
static void
str_utf8_release_key (char *key, int case_sen)
{
    g_free (key);
    (void) case_sen;
}
1304 struct str_class
1305 str_utf8_init (void)
1307 struct str_class result;
1309 result.conv_gerror_message = str_utf8_conv_gerror_message;
1310 result.vfs_convert_to = str_utf8_vfs_convert_to;
1311 result.insert_replace_char = str_utf8_insert_replace_char;
1312 result.is_valid_string = str_utf8_is_valid_string;
1313 result.is_valid_char = str_utf8_is_valid_char;
1314 result.cnext_char = str_utf8_cnext_char;
1315 result.cprev_char = str_utf8_cprev_char;
1316 result.cnext_char_safe = str_utf8_cnext_char_safe;
1317 result.cprev_char_safe = str_utf8_cprev_char_safe;
1318 result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1319 result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1320 result.isspace = str_utf8_isspace;
1321 result.ispunct = str_utf8_ispunct;
1322 result.isalnum = str_utf8_isalnum;
1323 result.isdigit = str_utf8_isdigit;
1324 result.isprint = str_utf8_isprint;
1325 result.iscombiningmark = str_utf8_iscombiningmark;
1326 result.toupper = str_utf8_toupper;
1327 result.tolower = str_utf8_tolower;
1328 result.length = str_utf8_length;
1329 result.length2 = str_utf8_length2;
1330 result.length_noncomb = str_utf8_length_noncomb;
1331 result.fix_string = str_utf8_fix_string;
1332 result.term_form = str_utf8_term_form;
1333 result.fit_to_term = str_utf8_fit_to_term;
1334 result.term_trim = str_utf8_term_trim;
1335 result.term_width2 = str_utf8_term_width2;
1336 result.term_width1 = str_utf8_term_width1;
1337 result.term_char_width = str_utf8_term_char_width;
1338 result.term_substring = str_utf8_term_substring;
1339 result.trunc = str_utf8_trunc;
1340 result.offset_to_pos = str_utf8_offset_to_pos;
1341 result.column_to_pos = str_utf8_column_to_pos;
1342 result.create_search_needle = str_utf8_create_search_needle;
1343 result.release_search_needle = str_utf8_release_search_needle;
1344 result.search_first = str_utf8_search_first;
1345 result.search_last = str_utf8_search_last;
1346 result.compare = str_utf8_compare;
1347 result.ncompare = str_utf8_ncompare;
1348 result.casecmp = str_utf8_casecmp;
1349 result.ncasecmp = str_utf8_ncasecmp;
1350 result.prefix = str_utf8_prefix;
1351 result.caseprefix = str_utf8_caseprefix;
1352 result.create_key = str_utf8_create_key;
1353 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1354 /* case insensitive sort files in "a1 a2 a10" order */
1355 result.create_key_for_filename = str_utf8_create_key_for_filename;
1356 #else
1357 /* case insensitive sort files in "a1 a10 a2" order */
1358 result.create_key_for_filename = str_utf8_create_key;
1359 #endif
1360 result.key_collate = str_utf8_key_collate;
1361 result.release_key = str_utf8_release_key;
1363 return result;