Remove irrelevant comments about file_date() function.
[midnight-commander.git] / lib / strutil / strutilutf8.c
blobfccb8eacdaeb2290b1b281526ccd4378759b6e88
/*
   UTF-8 strings utilities

   Copyright (C) 2007, 2011
   The Free Software Foundation, Inc.

   Written by:
   Rostislav Benes, 2007

   This file is part of the Midnight Commander.

   The Midnight Commander is free software: you can redistribute it
   and/or modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation, either version 3 of the License,
   or (at your option) any later version.

   The Midnight Commander is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
26 #include <config.h>
27 #include <stdlib.h>
28 #include <stdio.h>
29 #include <errno.h>
30 #include <glib.h>
31 #include <langinfo.h>
32 #include <string.h>
34 #include "lib/global.h"
35 #include "lib/strutil.h"
/* The functions in this file rely on GLib's UTF-8 routines. */

/* UTF-8 encoding of U+FFFD (bytes EF BF BD); it is emitted in place of
 * input bytes that do not form a valid UTF-8 character. */
static const char replch[] = "\xEF\xBF\xBD";
41 static gboolean
42 str_unichar_iscombiningmark (gunichar uni)
44 GUnicodeType type;
46 type = g_unichar_type (uni);
47 return (type == G_UNICODE_COMBINING_MARK)
48 || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
51 static void
52 str_utf8_insert_replace_char (GString * buffer)
54 g_string_append (buffer, replch);
57 static int
58 str_utf8_is_valid_string (const char *text)
60 return g_utf8_validate (text, -1, NULL);
63 static int
64 str_utf8_is_valid_char (const char *ch, size_t size)
66 switch (g_utf8_get_char_validated (ch, size))
68 case (gunichar) (-2):
69 return -2;
70 case (gunichar) (-1):
71 return -1;
72 default:
73 return 1;
/* Advance *TEXT to the next character using g_utf8_next_char();
 * no validation is done here. */
static void
str_utf8_cnext_char (const char **text)
{
    (*text) = g_utf8_next_char (*text);
}
/* Move *TEXT back to the previous character using g_utf8_prev_char();
 * no validation is done here. */
static void
str_utf8_cprev_char (const char **text)
{
    (*text) = g_utf8_prev_char (*text);
}
/* Advance *TEXT by one character when a valid UTF-8 character starts there,
 * otherwise by exactly one byte. */
static void
str_utf8_cnext_char_safe (const char **text)
{
    if (str_utf8_is_valid_char (*text, -1) == 1)
        (*text) = g_utf8_next_char (*text);
    else
        (*text)++;
}
/* Move *TEXT one step backwards.
 *
 * The candidate found by g_utf8_prev_char() is accepted only if a safe
 * forward step from it lands exactly on the original position; otherwise
 * *TEXT is moved back by a single byte. */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *result = g_utf8_prev_char (*text);
    const char *t = result;

    str_utf8_cnext_char_safe (&t);
    if (t == *text)
        (*text) = result;
    else
        (*text)--;
}
110 static void
111 str_utf8_fix_string (char *text)
113 gunichar uni;
115 while (text[0] != '\0')
117 uni = g_utf8_get_char_validated (text, -1);
118 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
120 text = g_utf8_next_char (text);
122 else
124 text[0] = '?';
125 text++;
130 static int
131 str_utf8_isspace (const char *text)
133 gunichar uni = g_utf8_get_char_validated (text, -1);
134 return g_unichar_isspace (uni);
137 static int
138 str_utf8_ispunct (const char *text)
140 gunichar uni = g_utf8_get_char_validated (text, -1);
141 return g_unichar_ispunct (uni);
144 static int
145 str_utf8_isalnum (const char *text)
147 gunichar uni = g_utf8_get_char_validated (text, -1);
148 return g_unichar_isalnum (uni);
151 static int
152 str_utf8_isdigit (const char *text)
154 gunichar uni = g_utf8_get_char_validated (text, -1);
155 return g_unichar_isdigit (uni);
158 static int
159 str_utf8_isprint (const char *ch)
161 gunichar uni = g_utf8_get_char_validated (ch, -1);
162 return g_unichar_isprint (uni);
165 static gboolean
166 str_utf8_iscombiningmark (const char *ch)
168 gunichar uni = g_utf8_get_char_validated (ch, -1);
169 return str_unichar_iscombiningmark (uni);
/* Advance *TEXT (with safe steps) until it points at a character that is not
 * a combining mark, or at the end of the string.  At least one step is made
 * unless *TEXT is already at the end.
 *
 * Returns the number of steps taken. */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int count = 0;

    while ((*text)[0] != '\0')
    {
        str_utf8_cnext_char_safe (text);
        count++;
        if (!str_utf8_iscombiningmark (*text))
            break;
    }

    return count;
}
/* Move *TEXT backwards (with safe steps) until it points at a character that
 * is not a combining mark, never going before BEGIN.
 *
 * Returns the number of steps taken. */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int count = 0;

    while ((*text) != begin)
    {
        str_utf8_cprev_char_safe (text);
        count++;
        if (!str_utf8_iscombiningmark (*text))
            break;
    }

    return count;
}
200 static int
201 str_utf8_toupper (const char *text, char **out, size_t * remain)
203 gunichar uni;
204 size_t left;
206 uni = g_utf8_get_char_validated (text, -1);
207 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
208 return 0;
210 uni = g_unichar_toupper (uni);
211 left = g_unichar_to_utf8 (uni, NULL);
212 if (left >= *remain)
213 return 0;
215 left = g_unichar_to_utf8 (uni, *out);
216 (*out) += left;
217 (*remain) -= left;
218 return 1;
221 static int
222 str_utf8_tolower (const char *text, char **out, size_t * remain)
224 gunichar uni;
225 size_t left;
227 uni = g_utf8_get_char_validated (text, -1);
228 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
229 return 0;
231 uni = g_unichar_tolower (uni);
232 left = g_unichar_to_utf8 (uni, NULL);
233 if (left >= *remain)
234 return 0;
236 left = g_unichar_to_utf8 (uni, *out);
237 (*out) += left;
238 (*remain) -= left;
239 return 1;
/* Count the characters of TEXT.  Valid stretches are counted with
 * g_utf8_strlen(); every byte at which validation stops counts as one
 * character. */
static int
str_utf8_length (const char *text)
{
    int result = 0;
    const char *start;
    const char *end;

    start = text;
    /* each iteration consumes one valid stretch plus the invalid byte after it */
    while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
    {
        if (start != end)
        {
            result += g_utf8_strlen (start, end - start);
        }
        result++;
        start = end + 1;
    }

    if (start == text)
    {
        /* the whole string was valid */
        result = g_utf8_strlen (text, -1);
    }
    else
    {
        /* count the valid tail left after the last invalid byte */
        if (start[0] != '\0' && start != end)
        {
            result += g_utf8_strlen (start, end - start);
        }
    }

    return result;
}
/* Like str_utf8_length(), but SIZE is used as a byte budget: it is reduced as
 * the text is consumed and limits what g_utf8_strlen() is asked to count.
 * When TEXT is entirely valid, SIZE is handed to g_utf8_strlen() unchanged. */
static int
str_utf8_length2 (const char *text, int size)
{
    int result = 0;
    const char *start;
    const char *end;

    start = text;
    while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
    {
        if (start != end)
        {
            result += g_utf8_strlen (start, min (end - start, size));
            size -= end - start;
        }
        /* the invalid byte counts only if it is still inside the budget */
        result += (size > 0);
        size--;
        start = end + 1;
    }

    if (start == text)
    {
        result = g_utf8_strlen (text, size);
    }
    else
    {
        if (start[0] != '\0' && start != end && size > 0)
        {
            result += g_utf8_strlen (start, min (end - start, size));
        }
    }

    return result;
}
/* Count the steps str_utf8_cnext_noncomb_char() needs to reach the end of
 * TEXT, i.e. the characters of TEXT with trailing combining marks folded
 * into the character they follow. */
static int
str_utf8_length_noncomb (const char *text)
{
    int result = 0;
    const char *t = text;

    while (t[0] != '\0')
    {
        str_utf8_cnext_noncomb_char (&t);
        result++;
    }

    return result;
}
326 static void
327 str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
329 char *next = g_utf8_next_char (*string);
330 (*left) -= next - (*string);
331 (*string) = next;
332 g_string_append_c (buffer, '?');
336 static gchar *
337 str_utf8_conv_gerror_message (GError * error, const char *def_msg)
339 if ((error != NULL) && (error->message != NULL))
340 return g_strdup (error->message);
342 return g_strdup (def_msg != NULL ? def_msg : "");
345 static estr_t
346 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
348 estr_t result;
350 if (coder == str_cnv_not_convert)
352 g_string_append_len (buffer, string, size);
353 result = ESTR_SUCCESS;
355 else
356 result = str_nconvert (coder, (char *) string, size, buffer);
358 return result;
361 struct term_form
363 char text[BUF_MEDIUM * 6];
364 size_t width;
365 gboolean compose;
368 /* utiliti function, that make string valid in utf8 and all characters printable
369 * return width of string too*/
370 static const struct term_form *
371 str_utf8_make_make_term_form (const char *text, size_t length)
373 static struct term_form result;
374 gunichar uni;
375 size_t left;
376 char *actual;
378 result.text[0] = '\0';
379 result.width = 0;
380 result.compose = FALSE;
381 actual = result.text;
383 /* check if text start with combining character,
384 * add space at begin in this case */
385 if (length != 0 && text[0] != '\0')
387 uni = g_utf8_get_char_validated (text, -1);
388 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
390 if (str_unichar_iscombiningmark (uni))
392 actual[0] = ' ';
393 actual++;
394 result.width++;
395 result.compose = TRUE;
400 while (length != 0 && text[0] != '\0')
402 uni = g_utf8_get_char_validated (text, -1);
403 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
405 if (g_unichar_isprint (uni))
407 left = g_unichar_to_utf8 (uni, actual);
408 actual += left;
409 if (str_unichar_iscombiningmark (uni))
410 result.compose = TRUE;
411 else
413 result.width++;
414 if (g_unichar_iswide (uni))
415 result.width++;
418 else
420 actual[0] = '.';
421 actual++;
422 result.width++;
424 text = g_utf8_next_char (text);
426 else
428 text++;
429 /*actual[0] = '?'; */
430 memcpy (actual, replch, strlen (replch));
431 actual += strlen (replch);
432 result.width++;
434 if (length != (size_t) (-1))
435 length--;
437 actual[0] = '\0';
439 return &result;
442 static const char *
443 str_utf8_term_form (const char *text)
445 static char result[BUF_MEDIUM * 6];
446 const struct term_form *pre_form;
447 char *composed;
449 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
450 if (pre_form->compose)
452 composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
453 g_strlcpy (result, composed, sizeof (result));
454 g_free (composed);
456 else
458 g_strlcpy (result, pre_form->text, sizeof (result));
460 return result;
463 struct utf8_tool
465 char *actual;
466 size_t remain;
467 const char *cheked;
468 int ident;
469 gboolean compose;
472 /* utiliti function, that copy all characters from cheked to actual */
473 static gboolean
474 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
476 size_t left;
477 gunichar uni;
479 tool->compose = FALSE;
481 while (tool->cheked[0] != '\0')
483 uni = g_utf8_get_char (tool->cheked);
484 tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
485 left = g_unichar_to_utf8 (uni, NULL);
486 if (tool->remain <= left)
487 return FALSE;
488 left = g_unichar_to_utf8 (uni, tool->actual);
489 tool->actual += left;
490 tool->remain -= left;
491 tool->cheked = g_utf8_next_char (tool->cheked);
493 return TRUE;
496 /* utiliti function, that copy characters from cheked to actual until ident is
497 * smaller than to_ident */
498 static gboolean
499 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
501 size_t left;
502 gunichar uni;
503 int w;
505 tool->compose = FALSE;
507 while (tool->cheked[0] != '\0')
509 uni = g_utf8_get_char (tool->cheked);
510 if (!str_unichar_iscombiningmark (uni))
512 w = 1;
513 if (g_unichar_iswide (uni))
514 w++;
515 if (tool->ident + w > to_ident)
516 return TRUE;
518 else
520 w = 0;
521 tool->compose = TRUE;
524 left = g_unichar_to_utf8 (uni, NULL);
525 if (tool->remain <= left)
526 return FALSE;
527 left = g_unichar_to_utf8 (uni, tool->actual);
528 tool->actual += left;
529 tool->remain -= left;
530 tool->cheked = g_utf8_next_char (tool->cheked);
531 tool->ident += w;
533 return TRUE;
536 /* utiliti function, add count spaces to actual */
537 static int
538 utf8_tool_insert_space (struct utf8_tool *tool, int count)
540 if (count <= 0)
541 return 1;
542 if (tool->remain <= (gsize) count)
543 return 0;
544 memset (tool->actual, ' ', count);
545 tool->actual += count;
546 tool->remain -= count;
547 return 1;
550 /* utiliti function, add one characters to actual */
551 static int
552 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
554 if (tool->remain <= 1)
555 return 0;
556 tool->actual[0] = ch;
557 tool->actual++;
558 tool->remain--;
559 return 1;
562 /* utiliti function, thah skip characters from cheked until ident is greater or
563 * equal to to_ident */
564 static gboolean
565 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
567 gunichar uni;
569 while (to_ident > tool->ident && tool->cheked[0] != '\0')
571 uni = g_utf8_get_char (tool->cheked);
572 if (!str_unichar_iscombiningmark (uni))
574 tool->ident++;
575 if (g_unichar_iswide (uni))
576 tool->ident++;
578 tool->cheked = g_utf8_next_char (tool->cheked);
580 uni = g_utf8_get_char (tool->cheked);
581 while (str_unichar_iscombiningmark (uni))
583 tool->cheked = g_utf8_next_char (tool->cheked);
584 uni = g_utf8_get_char (tool->cheked);
586 return TRUE;
589 static void
590 utf8_tool_compose (char *buffer, size_t size)
592 char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
593 g_strlcpy (buffer, composed, size);
594 g_free (composed);
598 static const char *
599 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
601 static char result[BUF_MEDIUM * 6];
602 const struct term_form *pre_form;
603 struct utf8_tool tool;
605 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
606 tool.cheked = pre_form->text;
607 tool.actual = result;
608 tool.remain = sizeof (result);
609 tool.compose = FALSE;
611 if (pre_form->width <= (gsize) width)
613 tool.ident = 0;
614 switch (HIDE_FIT (just_mode))
616 case J_CENTER_LEFT:
617 case J_CENTER:
618 tool.ident = (width - pre_form->width) / 2;
619 break;
620 case J_RIGHT:
621 tool.ident = width - pre_form->width;
622 break;
625 utf8_tool_insert_space (&tool, tool.ident);
626 utf8_tool_copy_chars_to_end (&tool);
627 utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
629 else
631 if (IS_FIT (just_mode))
633 tool.ident = 0;
634 utf8_tool_copy_chars_to (&tool, width / 2);
635 utf8_tool_insert_char (&tool, '~');
637 tool.ident = 0;
638 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
639 utf8_tool_copy_chars_to_end (&tool);
640 utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
642 else
644 tool.ident = 0;
645 switch (HIDE_FIT (just_mode))
647 case J_CENTER:
648 tool.ident = (width - pre_form->width) / 2;
649 break;
650 case J_RIGHT:
651 tool.ident = width - pre_form->width;
652 break;
655 utf8_tool_skip_chars_to (&tool, 0);
656 utf8_tool_insert_space (&tool, tool.ident);
657 utf8_tool_copy_chars_to (&tool, width);
658 utf8_tool_insert_space (&tool, width - tool.ident);
662 tool.actual[0] = '\0';
663 if (tool.compose)
664 utf8_tool_compose (result, sizeof (result));
665 return result;
668 static const char *
669 str_utf8_term_trim (const char *text, int width)
671 static char result[BUF_MEDIUM * 6];
672 const struct term_form *pre_form;
673 struct utf8_tool tool;
675 if (width < 1)
677 result[0] = '\0';
678 return result;
681 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
683 tool.cheked = pre_form->text;
684 tool.actual = result;
685 tool.remain = sizeof (result);
686 tool.compose = FALSE;
688 if ((gsize) width < pre_form->width)
690 if (width <= 3)
692 memset (tool.actual, '.', width);
693 tool.actual += width;
694 tool.remain -= width;
696 else
698 memset (tool.actual, '.', 3);
699 tool.actual += 3;
700 tool.remain -= 3;
702 tool.ident = 0;
703 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
704 utf8_tool_copy_chars_to_end (&tool);
707 else
709 utf8_tool_copy_chars_to_end (&tool);
712 tool.actual[0] = '\0';
713 if (tool.compose)
714 utf8_tool_compose (result, sizeof (result));
715 return result;
718 static int
719 str_utf8_term_width2 (const char *text, size_t length)
721 const struct term_form *result;
723 result = str_utf8_make_make_term_form (text, length);
724 return result->width;
/* Return the width of the whole TEXT; same as str_utf8_term_width2() with no
 * length limit. */
static int
str_utf8_term_width1 (const char *text)
{
    return str_utf8_term_width2 (text, (size_t) (-1));
}
733 static int
734 str_utf8_term_char_width (const char *text)
736 gunichar uni = g_utf8_get_char_validated (text, -1);
737 return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
740 static const char *
741 str_utf8_term_substring (const char *text, int start, int width)
743 static char result[BUF_MEDIUM * 6];
744 const struct term_form *pre_form;
745 struct utf8_tool tool;
747 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
749 tool.cheked = pre_form->text;
750 tool.actual = result;
751 tool.remain = sizeof (result);
752 tool.compose = FALSE;
754 tool.ident = -start;
755 utf8_tool_skip_chars_to (&tool, 0);
756 if (tool.ident < 0)
757 tool.ident = 0;
758 utf8_tool_insert_space (&tool, tool.ident);
760 utf8_tool_copy_chars_to (&tool, width);
761 utf8_tool_insert_space (&tool, width - tool.ident);
763 tool.actual[0] = '\0';
764 if (tool.compose)
765 utf8_tool_compose (result, sizeof (result));
766 return result;
769 static const char *
770 str_utf8_trunc (const char *text, int width)
772 static char result[MC_MAXPATHLEN * 6 * 2];
773 const struct term_form *pre_form;
774 struct utf8_tool tool;
776 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
778 tool.cheked = pre_form->text;
779 tool.actual = result;
780 tool.remain = sizeof (result);
781 tool.compose = FALSE;
783 if (pre_form->width > (gsize) width)
785 tool.ident = 0;
786 utf8_tool_copy_chars_to (&tool, width / 2);
787 utf8_tool_insert_char (&tool, '~');
789 tool.ident = 0;
790 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
791 utf8_tool_copy_chars_to_end (&tool);
793 else
795 utf8_tool_copy_chars_to_end (&tool);
798 tool.actual[0] = '\0';
799 if (tool.compose)
800 utf8_tool_compose (result, sizeof (result));
801 return result;
804 static int
805 str_utf8_offset_to_pos (const char *text, size_t length)
807 if (str_utf8_is_valid_string (text))
808 return g_utf8_offset_to_pointer (text, length) - text;
809 else
811 int result;
812 GString *buffer = g_string_new (text);
814 str_utf8_fix_string (buffer->str);
815 result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
816 g_string_free (buffer, TRUE);
817 return result;
821 static int
822 str_utf8_column_to_pos (const char *text, size_t pos)
824 static int result;
825 gunichar uni;
826 int width;
828 width = 0;
829 result = 0;
831 while (text[0] != '\0')
833 uni = g_utf8_get_char_validated (text, 6);
834 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
836 if (g_unichar_isprint (uni))
838 if (!str_unichar_iscombiningmark (uni))
840 width++;
841 if (g_unichar_iswide (uni))
842 width++;
845 else
847 width++;
849 text = g_utf8_next_char (text);
851 else
853 text++;
854 width++;
856 if ((gsize) width > pos)
857 return result;
859 result++;
862 return result;
865 static char *
866 str_utf8_create_search_needle (const char *needle, int case_sen)
868 if (needle != NULL)
870 if (case_sen)
872 return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
874 else
876 char *fold = g_utf8_casefold (needle, -1);
877 char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
878 g_free (fold);
879 return result;
882 else
883 return NULL;
/* Free a needle created by str_utf8_create_search_needle().
 * NEEDLE may be NULL (g_free() accepts it); CASE_SEN is unused. */
static void
str_utf8_release_search_needle (char *needle, int case_sen)
{
    (void) case_sen;
    g_free (needle);
}
894 static const char *
895 str_utf8_search_first (const char *text, const char *search, int case_sen)
897 char *fold_text;
898 char *deco_text;
899 const char *match;
900 const char *result = NULL;
901 const char *m;
903 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
904 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
906 match = deco_text;
909 match = g_strstr_len (match, -1, search);
910 if (match != NULL)
912 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
913 !str_utf8_iscombiningmark (match + strlen (search)))
916 result = text;
917 m = deco_text;
918 while (m < match)
920 str_utf8_cnext_noncomb_char (&m);
921 str_utf8_cnext_noncomb_char (&result);
924 else
926 str_utf8_cnext_char (&match);
930 while (match != NULL && result == NULL);
932 g_free (deco_text);
933 if (!case_sen)
934 g_free (fold_text);
936 return result;
939 static const char *
940 str_utf8_search_last (const char *text, const char *search, int case_sen)
942 char *fold_text;
943 char *deco_text;
944 char *match;
945 const char *result = NULL;
946 const char *m;
948 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
949 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
953 match = g_strrstr_len (deco_text, -1, search);
954 if (match != NULL)
956 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
957 !str_utf8_iscombiningmark (match + strlen (search)))
960 result = text;
961 m = deco_text;
962 while (m < match)
964 str_utf8_cnext_noncomb_char (&m);
965 str_utf8_cnext_noncomb_char (&result);
968 else
970 match[0] = '\0';
974 while (match != NULL && result == NULL);
976 g_free (deco_text);
977 if (!case_sen)
978 g_free (fold_text);
980 return result;
983 static char *
984 str_utf8_normalize (const char *text)
986 GString *fixed;
987 char *tmp;
988 char *result;
989 const char *start;
990 const char *end;
992 fixed = g_string_sized_new (4);
994 start = text;
995 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
997 if (start != end)
999 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1000 g_string_append (fixed, tmp);
1001 g_free (tmp);
1003 g_string_append_c (fixed, end[0]);
1004 start = end + 1;
1007 if (start == text)
1009 result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1010 g_string_free (fixed, TRUE);
1012 else
1014 if (start[0] != '\0' && start != end)
1016 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1017 g_string_append (fixed, tmp);
1018 g_free (tmp);
1020 result = g_string_free (fixed, FALSE);
1023 return result;
1026 static char *
1027 str_utf8_casefold_normalize (const char *text)
1029 GString *fixed;
1030 char *tmp, *fold;
1031 char *result;
1032 const char *start;
1033 const char *end;
1035 fixed = g_string_sized_new (4);
1037 start = text;
1038 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1040 if (start != end)
1042 fold = g_utf8_casefold (start, end - start);
1043 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1044 g_string_append (fixed, tmp);
1045 g_free (tmp);
1046 g_free (fold);
1048 g_string_append_c (fixed, end[0]);
1049 start = end + 1;
1052 if (start == text)
1054 fold = g_utf8_casefold (text, -1);
1055 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1056 g_free (fold);
1057 g_string_free (fixed, TRUE);
1059 else
1061 if (start[0] != '\0' && start != end)
1063 fold = g_utf8_casefold (start, end - start);
1064 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1065 g_string_append (fixed, tmp);
1066 g_free (tmp);
1067 g_free (fold);
1069 result = g_string_free (fixed, FALSE);
1072 return result;
/* Compare T1 and T2 with strcmp() after normalizing both with
 * str_utf8_normalize(). */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *n1, *n2;
    int result;

    n1 = str_utf8_normalize (t1);
    n2 = str_utf8_normalize (t2);

    result = strcmp (n1, n2);

    g_free (n1);
    g_free (n2);

    return result;
}
/* Compare the normalized forms of T1 and T2 over the byte length of the
 * shorter one, so the result is 0 when one is a prefix of the other. */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *n1, *n2;
    int result;

    n1 = str_utf8_normalize (t1);
    n2 = str_utf8_normalize (t2);

    result = strncmp (n1, n2, min (strlen (n1), strlen (n2)));

    g_free (n1);
    g_free (n2);

    return result;
}
/* Compare T1 and T2 with strcmp() after case folding and normalizing both
 * with str_utf8_casefold_normalize(). */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *n1, *n2;
    int result;

    n1 = str_utf8_casefold_normalize (t1);
    n2 = str_utf8_casefold_normalize (t2);

    result = strcmp (n1, n2);

    g_free (n1);
    g_free (n2);

    return result;
}
/* Compare the case-folded, normalized forms of T1 and T2 over the byte
 * length of the shorter one, so the result is 0 when one is a prefix of the
 * other. */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *n1, *n2;
    int result;

    n1 = str_utf8_casefold_normalize (t1);
    n2 = str_utf8_casefold_normalize (t2);

    result = strncmp (n1, n2, min (strlen (n1), strlen (n2)));

    g_free (n1);
    g_free (n2);

    return result;
}
/* Return the length of the common prefix of TEXT and PREFIX.
 *
 * Both strings are normalized first and compared character by character;
 * the result is the number of bytes of the normalized PREFIX that matched. */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *t = str_utf8_normalize (text);
    char *p = str_utf8_normalize (prefix);
    const char *nt = t;
    const char *np = p;
    const char *nnt = t;
    const char *nnp = p;
    int result;

    while (nt[0] != '\0' && np[0] != '\0')
    {
        /* nnt / nnp run one character ahead of nt / np */
        str_utf8_cnext_char_safe (&nnt);
        str_utf8_cnext_char_safe (&nnp);
        if (nnt - nt != nnp - np)
            break;
        if (strncmp (nt, np, nnt - nt) != 0)
            break;
        nt = nnt;
        np = nnp;
    }

    result = np - p;

    g_free (t);
    g_free (p);

    return result;
}
/* Return the length of the common prefix of TEXT and PREFIX, ignoring case.
 *
 * Both strings are case-folded and normalized first and compared character
 * by character; the result is the number of bytes of the converted PREFIX
 * that matched. */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *t = str_utf8_casefold_normalize (text);
    char *p = str_utf8_casefold_normalize (prefix);
    const char *nt = t;
    const char *np = p;
    const char *nnt = t;
    const char *nnp = p;
    int result;

    while (nt[0] != '\0' && np[0] != '\0')
    {
        /* nnt / nnp run one character ahead of nt / np */
        str_utf8_cnext_char_safe (&nnt);
        str_utf8_cnext_char_safe (&nnp);
        if (nnt - nt != nnp - np)
            break;
        if (strncmp (nt, np, nnt - nt) != 0)
            break;
        nt = nnt;
        np = nnp;
    }

    result = np - p;

    g_free (t);
    g_free (p);

    return result;
}
1205 static char *
1206 str_utf8_create_key_gen (const char *text, int case_sen,
1207 gchar * (*keygen) (const gchar * text, gssize size))
1209 char *result;
1211 if (case_sen)
1213 result = str_utf8_normalize (text);
1215 else
1217 gboolean dot;
1218 GString *fixed;
1219 const char *start, *end;
1220 char *fold, *key;
1222 dot = text[0] == '.';
1223 fixed = g_string_sized_new (16);
1225 if (!dot)
1226 start = text;
1227 else
1229 start = text + 1;
1230 g_string_append_c (fixed, '.');
1233 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1235 if (start != end)
1237 fold = g_utf8_casefold (start, end - start);
1238 key = keygen (fold, -1);
1239 g_string_append (fixed, key);
1240 g_free (key);
1241 g_free (fold);
1243 g_string_append_c (fixed, end[0]);
1244 start = end + 1;
1247 if (start == text)
1249 fold = g_utf8_casefold (start, -1);
1250 result = keygen (fold, -1);
1251 g_free (fold);
1252 g_string_free (fixed, TRUE);
1254 else if (dot && (start == text + 1))
1256 fold = g_utf8_casefold (start, -1);
1257 key = keygen (fold, -1);
1258 g_string_append (fixed, key);
1259 g_free (key);
1260 g_free (fold);
1261 result = g_string_free (fixed, FALSE);
1263 else
1265 if (start[0] != '\0' && start != end)
1267 fold = g_utf8_casefold (start, end - start);
1268 key = keygen (fold, -1);
1269 g_string_append (fixed, key);
1270 g_free (key);
1271 g_free (fold);
1273 result = g_string_free (fixed, FALSE);
1276 return result;
1279 static char *
1280 str_utf8_create_key (const char *text, int case_sen)
1282 return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/* Build a sort key for TEXT using g_utf8_collate_key_for_filename() as key
 * generator; see str_utf8_create_key_gen(). */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);
}
#endif
/* Compare two sort keys with strcmp(); CASE_SEN is unused. */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    (void) case_sen;
    return strcmp (t1, t2);
}
/* Free a sort key with g_free(); CASE_SEN is unused. */
static void
str_utf8_release_key (char *key, int case_sen)
{
    (void) case_sen;
    g_free (key);
}
1307 struct str_class
1308 str_utf8_init (void)
1310 struct str_class result;
1312 result.conv_gerror_message = str_utf8_conv_gerror_message;
1313 result.vfs_convert_to = str_utf8_vfs_convert_to;
1314 result.insert_replace_char = str_utf8_insert_replace_char;
1315 result.is_valid_string = str_utf8_is_valid_string;
1316 result.is_valid_char = str_utf8_is_valid_char;
1317 result.cnext_char = str_utf8_cnext_char;
1318 result.cprev_char = str_utf8_cprev_char;
1319 result.cnext_char_safe = str_utf8_cnext_char_safe;
1320 result.cprev_char_safe = str_utf8_cprev_char_safe;
1321 result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1322 result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1323 result.char_isspace = str_utf8_isspace;
1324 result.char_ispunct = str_utf8_ispunct;
1325 result.char_isalnum = str_utf8_isalnum;
1326 result.char_isdigit = str_utf8_isdigit;
1327 result.char_isprint = str_utf8_isprint;
1328 result.char_iscombiningmark = str_utf8_iscombiningmark;
1329 result.char_toupper = str_utf8_toupper;
1330 result.char_tolower = str_utf8_tolower;
1331 result.length = str_utf8_length;
1332 result.length2 = str_utf8_length2;
1333 result.length_noncomb = str_utf8_length_noncomb;
1334 result.fix_string = str_utf8_fix_string;
1335 result.term_form = str_utf8_term_form;
1336 result.fit_to_term = str_utf8_fit_to_term;
1337 result.term_trim = str_utf8_term_trim;
1338 result.term_width2 = str_utf8_term_width2;
1339 result.term_width1 = str_utf8_term_width1;
1340 result.term_char_width = str_utf8_term_char_width;
1341 result.term_substring = str_utf8_term_substring;
1342 result.trunc = str_utf8_trunc;
1343 result.offset_to_pos = str_utf8_offset_to_pos;
1344 result.column_to_pos = str_utf8_column_to_pos;
1345 result.create_search_needle = str_utf8_create_search_needle;
1346 result.release_search_needle = str_utf8_release_search_needle;
1347 result.search_first = str_utf8_search_first;
1348 result.search_last = str_utf8_search_last;
1349 result.compare = str_utf8_compare;
1350 result.ncompare = str_utf8_ncompare;
1351 result.casecmp = str_utf8_casecmp;
1352 result.ncasecmp = str_utf8_ncasecmp;
1353 result.prefix = str_utf8_prefix;
1354 result.caseprefix = str_utf8_caseprefix;
1355 result.create_key = str_utf8_create_key;
1356 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1357 /* case insensitive sort files in "a1 a2 a10" order */
1358 result.create_key_for_filename = str_utf8_create_key_for_filename;
1359 #else
1360 /* case insensitive sort files in "a1 a10 a2" order */
1361 result.create_key_for_filename = str_utf8_create_key;
1362 #endif
1363 result.key_collate = str_utf8_key_collate;
1364 result.release_key = str_utf8_release_key;
1366 return result;