Fix potential segfault in term_trim() functions
[midnight-commander.git] / lib / strutil / strutilutf8.c
blobcde3302d0563172f3a73122cc44474b56616c528
1 /* UTF-8 strings utilities
2 Copyright (C) 2007 Free Software Foundation, Inc.
4 Written 2007 by:
5 Rostislav Benes
7 The file_date routine is mostly from GNU's fileutils package,
8 written by Richard Stallman and David MacKenzie.
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include <config.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <errno.h>
29 #include <glib.h>
30 #include <langinfo.h>
31 #include <string.h>
33 #include "lib/global.h"
34 #include "lib/strutil.h"
/* The UTF-8 handling below is built on GLib's Unicode and UTF-8 functions. */
/* Bytes EF BF BD followed by a NUL terminator: the UTF-8 encoding of
 * U+FFFD REPLACEMENT CHARACTER, emitted in place of undecodable input. */
static const char replch[] = { '\xEF', '\xBF', '\xBD', '\0' };
40 static int
41 str_unichar_iscombiningmark (gunichar uni)
43 int type = g_unichar_type (uni);
44 return (type == G_UNICODE_COMBINING_MARK)
45 || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
48 static void
49 str_utf8_insert_replace_char (GString * buffer)
51 g_string_append (buffer, replch);
54 static int
55 str_utf8_is_valid_string (const char *text)
57 return g_utf8_validate (text, -1, NULL);
60 static int
61 str_utf8_is_valid_char (const char *ch, size_t size)
63 switch (g_utf8_get_char_validated (ch, size))
65 case (gunichar) (-2):
66 return -2;
67 case (gunichar) (-1):
68 return -1;
69 default:
70 return 1;
/* Advance *TEXT by one character using g_utf8_next_char() (no validation). */
static void
str_utf8_cnext_char (const char **text)
{
    const char *here = *text;

    *text = g_utf8_next_char (here);
}
/* Move *TEXT back by one character using g_utf8_prev_char() (no validation). */
static void
str_utf8_cprev_char (const char **text)
{
    const char *here = *text;

    *text = g_utf8_prev_char (here);
}
/* Advance *TEXT by one character if a valid character starts there,
 * otherwise by exactly one byte. */
static void
str_utf8_cnext_char_safe (const char **text)
{
    const char *here = *text;

    *text = (str_utf8_is_valid_char (here, -1) == 1) ? g_utf8_next_char (here) : here + 1;
}
/* Move *TEXT back by one character when the position found by
 * g_utf8_prev_char() leads forward again to the current position;
 * otherwise step back by exactly one byte. */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *candidate = g_utf8_prev_char (*text);
    const char *probe = candidate;

    /* a safe forward step from the candidate must land on the start point */
    str_utf8_cnext_char_safe (&probe);
    *text = (probe == *text) ? candidate : *text - 1;
}
107 static void
108 str_utf8_fix_string (char *text)
110 gunichar uni;
112 while (text[0] != '\0')
114 uni = g_utf8_get_char_validated (text, -1);
115 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
117 text = g_utf8_next_char (text);
119 else
121 text[0] = '?';
122 text++;
/* Apply g_unichar_isspace() to the character decoded at TEXT. */
static int
str_utf8_isspace (const char *text)
{
    return g_unichar_isspace (g_utf8_get_char_validated (text, -1));
}
/* Apply g_unichar_ispunct() to the character decoded at TEXT. */
static int
str_utf8_ispunct (const char *text)
{
    return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
}
/* Apply g_unichar_isalnum() to the character decoded at TEXT. */
static int
str_utf8_isalnum (const char *text)
{
    return g_unichar_isalnum (g_utf8_get_char_validated (text, -1));
}
/* Apply g_unichar_isdigit() to the character decoded at TEXT. */
static int
str_utf8_isdigit (const char *text)
{
    return g_unichar_isdigit (g_utf8_get_char_validated (text, -1));
}
/* Apply g_unichar_isprint() to the character decoded at CH. */
static int
str_utf8_isprint (const char *ch)
{
    return g_unichar_isprint (g_utf8_get_char_validated (ch, -1));
}
/* Decode the character at CH and report whether it is a combining mark
 * according to str_unichar_iscombiningmark(). */
static int
str_utf8_iscombiningmark (const char *ch)
{
    return str_unichar_iscombiningmark (g_utf8_get_char_validated (ch, -1));
}
/* Advance *TEXT past one character and any combining marks that follow it,
 * stopping at the end of the string. Returns the number of safe steps taken
 * (0 when *TEXT already points at the terminator). */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps = 0;

    if ((*text)[0] == '\0')
        return 0;

    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

    return steps;
}
/* Move *TEXT backwards until it rests on a character that is not a combining
 * mark, never going before BEGIN. Returns the number of safe steps taken
 * (0 when *TEXT already equals BEGIN). */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps = 0;

    if (*text == begin)
        return 0;

    do
    {
        str_utf8_cprev_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && *text != begin);

    return steps;
}
197 static int
198 str_utf8_toupper (const char *text, char **out, size_t * remain)
200 gunichar uni;
201 size_t left;
203 uni = g_utf8_get_char_validated (text, -1);
204 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
205 return 0;
207 uni = g_unichar_toupper (uni);
208 left = g_unichar_to_utf8 (uni, NULL);
209 if (left >= *remain)
210 return 0;
212 left = g_unichar_to_utf8 (uni, *out);
213 (*out) += left;
214 (*remain) -= left;
215 return 1;
218 static int
219 str_utf8_tolower (const char *text, char **out, size_t * remain)
221 gunichar uni;
222 size_t left;
224 uni = g_utf8_get_char_validated (text, -1);
225 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
226 return 0;
228 uni = g_unichar_tolower (uni);
229 left = g_unichar_to_utf8 (uni, NULL);
230 if (left >= *remain)
231 return 0;
233 left = g_unichar_to_utf8 (uni, *out);
234 (*out) += left;
235 (*remain) -= left;
236 return 1;
/* Count the characters of TEXT. Valid stretches are counted with
 * g_utf8_strlen(); every byte at which validation stops counts as one
 * character of its own. */
static int
str_utf8_length (const char *text)
{
    const char *seg = text;
    const char *bad;
    int chars = 0;

    for (;;)
    {
        /* validation reports in BAD where the valid prefix of SEG ends */
        if (g_utf8_validate (seg, -1, &bad) || seg[0] == '\0')
            break;

        if (seg != bad)
            chars += g_utf8_strlen (seg, bad - seg);

        /* the offending byte itself */
        chars++;
        seg = bad + 1;
    }

    /* nothing invalid was ever seen: count the whole string at once */
    if (seg == text)
        return g_utf8_strlen (text, -1);

    if (seg[0] != '\0' && seg != bad)
        chars += g_utf8_strlen (seg, bad - seg);

    return chars;
}
/* Variant of str_utf8_length() with a SIZE budget: SIZE is passed to
 * g_utf8_strlen() as the byte limit and is reduced as segments and
 * undecodable bytes are consumed. */
static int
str_utf8_length2 (const char *text, int size)
{
    const char *seg = text;
    const char *bad;
    int chars = 0;

    for (;;)
    {
        if (g_utf8_validate (seg, -1, &bad) || seg[0] == '\0' || size <= 0)
            break;

        if (seg != bad)
        {
            chars += g_utf8_strlen (seg, min (bad - seg, size));
            size -= bad - seg;
        }

        /* the offending byte counts only while budget is left */
        if (size > 0)
            chars++;
        size--;
        seg = bad + 1;
    }

    /* nothing invalid was ever consumed: measure the whole string at once */
    if (seg == text)
        return g_utf8_strlen (text, size);

    if (seg[0] != '\0' && seg != bad && size > 0)
        chars += g_utf8_strlen (seg, min (bad - seg, size));

    return chars;
}
/* Count how many str_utf8_cnext_noncomb_char() steps are needed to reach
 * the end of TEXT. */
static int
str_utf8_length_noncomb (const char *text)
{
    const char *cursor;
    int clusters = 0;

    for (cursor = text; cursor[0] != '\0'; clusters++)
        str_utf8_cnext_noncomb_char (&cursor);

    return clusters;
}
323 static void
324 str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
326 char *next = g_utf8_next_char (*string);
327 (*left) -= next - (*string);
328 (*string) = next;
329 g_string_append_c (buffer, '?');
333 static gchar *
334 str_utf8_conv_gerror_message (GError * error, const char *def_msg)
336 if ((error != NULL) && (error->message != NULL))
337 return g_strdup (error->message);
339 return g_strdup (def_msg != NULL ? def_msg : "");
342 static estr_t
343 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
345 estr_t result;
347 if (coder == str_cnv_not_convert)
349 g_string_append_len (buffer, string, size);
350 result = ESTR_SUCCESS;
352 else
353 result = str_nconvert (coder, (char *) string, size, buffer);
355 return result;
358 struct term_form
360 char text[BUF_MEDIUM * 6];
361 size_t width;
362 int compose;
365 /* utiliti function, that make string valid in utf8 and all characters printable
366 * return width of string too*/
367 static const struct term_form *
368 str_utf8_make_make_term_form (const char *text, size_t length)
370 static struct term_form result;
371 gunichar uni;
372 size_t left;
373 char *actual;
375 result.text[0] = '\0';
376 result.width = 0;
377 result.compose = 0;
378 actual = result.text;
380 /* check if text start with combining character,
381 * add space at begin in this case */
382 if (length != 0 && text[0] != '\0')
384 uni = g_utf8_get_char_validated (text, -1);
385 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
387 if (str_unichar_iscombiningmark (uni))
389 actual[0] = ' ';
390 actual++;
391 result.width++;
392 result.compose = 1;
397 while (length != 0 && text[0] != '\0')
399 uni = g_utf8_get_char_validated (text, -1);
400 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
402 if (g_unichar_isprint (uni))
404 left = g_unichar_to_utf8 (uni, actual);
405 actual += left;
406 if (!str_unichar_iscombiningmark (uni))
408 result.width++;
409 if (g_unichar_iswide (uni))
410 result.width++;
412 else
413 result.compose = 1;
415 else
417 actual[0] = '.';
418 actual++;
419 result.width++;
421 text = g_utf8_next_char (text);
423 else
425 text++;
426 /*actual[0] = '?'; */
427 memcpy (actual, replch, strlen (replch));
428 actual += strlen (replch);
429 result.width++;
431 if (length != (size_t) (-1))
432 length--;
434 actual[0] = '\0';
436 return &result;
439 static const char *
440 str_utf8_term_form (const char *text)
442 static char result[BUF_MEDIUM * 6];
443 const struct term_form *pre_form;
444 char *composed;
446 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
447 if (pre_form->compose)
449 composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
450 g_strlcpy (result, composed, sizeof (result));
451 g_free (composed);
453 else
455 g_strlcpy (result, pre_form->text, sizeof (result));
457 return result;
/* Cursor state shared by the utf8_tool_* helpers while they assemble an
 * output string from a sanitized input string. */
struct utf8_tool
{
    /* write position in the output buffer */
    char *actual;
    /* bytes still available at the write position */
    size_t remain;
    /* read position in the input text */
    const char *cheked;
    /* column counter maintained by the copy/skip helpers */
    int ident;
    /* set when a combining mark has been copied */
    int compose;
};
469 /* utiliti function, that copy all characters from cheked to actual */
470 static int
471 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
473 size_t left;
474 gunichar uni;
476 tool->compose = 0;
478 while (tool->cheked[0] != '\0')
480 uni = g_utf8_get_char (tool->cheked);
481 tool->compose |= str_unichar_iscombiningmark (uni);
482 left = g_unichar_to_utf8 (uni, NULL);
483 if (tool->remain <= left)
484 return 0;
485 left = g_unichar_to_utf8 (uni, tool->actual);
486 tool->actual += left;
487 tool->remain -= left;
488 tool->cheked = g_utf8_next_char (tool->cheked);
490 return 1;
493 /* utiliti function, that copy characters from cheked to actual until ident is
494 * smaller than to_ident */
495 static int
496 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
498 size_t left;
499 gunichar uni;
500 int w;
502 tool->compose = 0;
504 while (tool->cheked[0] != '\0')
506 uni = g_utf8_get_char (tool->cheked);
507 if (!str_unichar_iscombiningmark (uni))
509 w = 1;
510 if (g_unichar_iswide (uni))
511 w++;
512 if (tool->ident + w > to_ident)
513 return 1;
515 else
517 w = 0;
518 tool->compose = 1;
521 left = g_unichar_to_utf8 (uni, NULL);
522 if (tool->remain <= left)
523 return 0;
524 left = g_unichar_to_utf8 (uni, tool->actual);
525 tool->actual += left;
526 tool->remain -= left;
527 tool->cheked = g_utf8_next_char (tool->cheked);
528 tool->ident += w;
530 return 1;
533 /* utiliti function, add count spaces to actual */
534 static int
535 utf8_tool_insert_space (struct utf8_tool *tool, int count)
537 if (count <= 0)
538 return 1;
539 if (tool->remain <= (gsize) count)
540 return 0;
541 memset (tool->actual, ' ', count);
542 tool->actual += count;
543 tool->remain -= count;
544 return 1;
547 /* utiliti function, add one characters to actual */
548 static int
549 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
551 if (tool->remain <= 1)
552 return 0;
553 tool->actual[0] = ch;
554 tool->actual++;
555 tool->remain--;
556 return 1;
559 /* utiliti function, thah skip characters from cheked until ident is greater or
560 * equal to to_ident */
561 static int
562 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
564 gunichar uni;
566 while (to_ident > tool->ident && tool->cheked[0] != '\0')
568 uni = g_utf8_get_char (tool->cheked);
569 if (!str_unichar_iscombiningmark (uni))
571 tool->ident++;
572 if (g_unichar_iswide (uni))
573 tool->ident++;
575 tool->cheked = g_utf8_next_char (tool->cheked);
577 uni = g_utf8_get_char (tool->cheked);
578 while (str_unichar_iscombiningmark (uni))
580 tool->cheked = g_utf8_next_char (tool->cheked);
581 uni = g_utf8_get_char (tool->cheked);
583 return 1;
586 static void
587 utf8_tool_compose (char *buffer, size_t size)
589 char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
590 g_strlcpy (buffer, composed, size);
591 g_free (composed);
595 static const char *
596 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
598 static char result[BUF_MEDIUM * 6];
599 const struct term_form *pre_form;
600 struct utf8_tool tool;
602 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
603 tool.cheked = pre_form->text;
604 tool.actual = result;
605 tool.remain = sizeof (result);
606 tool.compose = 0;
608 if (pre_form->width <= (gsize) width)
610 tool.ident = 0;
611 switch (HIDE_FIT (just_mode))
613 case J_CENTER_LEFT:
614 case J_CENTER:
615 tool.ident = (width - pre_form->width) / 2;
616 break;
617 case J_RIGHT:
618 tool.ident = width - pre_form->width;
619 break;
622 utf8_tool_insert_space (&tool, tool.ident);
623 utf8_tool_copy_chars_to_end (&tool);
624 utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
626 else
628 if (IS_FIT (just_mode))
630 tool.ident = 0;
631 utf8_tool_copy_chars_to (&tool, width / 2);
632 utf8_tool_insert_char (&tool, '~');
634 tool.ident = 0;
635 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
636 utf8_tool_copy_chars_to_end (&tool);
637 utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
639 else
641 tool.ident = 0;
642 switch (HIDE_FIT (just_mode))
644 case J_CENTER:
645 tool.ident = (width - pre_form->width) / 2;
646 break;
647 case J_RIGHT:
648 tool.ident = width - pre_form->width;
649 break;
652 utf8_tool_skip_chars_to (&tool, 0);
653 utf8_tool_insert_space (&tool, tool.ident);
654 utf8_tool_copy_chars_to (&tool, width);
655 utf8_tool_insert_space (&tool, width - tool.ident);
659 tool.actual[0] = '\0';
660 if (tool.compose)
661 utf8_tool_compose (result, sizeof (result));
662 return result;
665 static const char *
666 str_utf8_term_trim (const char *text, int width)
668 static char result[BUF_MEDIUM * 6];
669 const struct term_form *pre_form;
670 struct utf8_tool tool;
672 if (width < 1)
674 result [0] = '\0';
675 return result;
678 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
680 tool.cheked = pre_form->text;
681 tool.actual = result;
682 tool.remain = sizeof (result);
683 tool.compose = 0;
685 if ((gsize) width < pre_form->width)
687 if (width <= 3)
689 memset (tool.actual, '.', width);
690 tool.actual += width;
691 tool.remain -= width;
693 else
695 memset (tool.actual, '.', 3);
696 tool.actual += 3;
697 tool.remain -= 3;
699 tool.ident = 0;
700 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
701 utf8_tool_copy_chars_to_end (&tool);
704 else
706 utf8_tool_copy_chars_to_end (&tool);
709 tool.actual[0] = '\0';
710 if (tool.compose)
711 utf8_tool_compose (result, sizeof (result));
712 return result;
715 static int
716 str_utf8_term_width2 (const char *text, size_t length)
718 const struct term_form *result;
720 result = str_utf8_make_make_term_form (text, length);
721 return result->width;
/* Return the terminal width of the whole of TEXT (no length limit). */
static int
str_utf8_term_width1 (const char *text)
{
    const size_t no_limit = (size_t) (-1);

    return str_utf8_term_width2 (text, no_limit);
}
730 static int
731 str_utf8_term_char_width (const char *text)
733 gunichar uni = g_utf8_get_char_validated (text, -1);
734 return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
737 static const char *
738 str_utf8_term_substring (const char *text, int start, int width)
740 static char result[BUF_MEDIUM * 6];
741 const struct term_form *pre_form;
742 struct utf8_tool tool;
744 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
746 tool.cheked = pre_form->text;
747 tool.actual = result;
748 tool.remain = sizeof (result);
749 tool.compose = 0;
751 tool.ident = -start;
752 utf8_tool_skip_chars_to (&tool, 0);
753 if (tool.ident < 0)
754 tool.ident = 0;
755 utf8_tool_insert_space (&tool, tool.ident);
757 utf8_tool_copy_chars_to (&tool, width);
758 utf8_tool_insert_space (&tool, width - tool.ident);
760 tool.actual[0] = '\0';
761 if (tool.compose)
762 utf8_tool_compose (result, sizeof (result));
763 return result;
766 static const char *
767 str_utf8_trunc (const char *text, int width)
769 static char result[MC_MAXPATHLEN * 6 * 2];
770 const struct term_form *pre_form;
771 struct utf8_tool tool;
773 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
775 tool.cheked = pre_form->text;
776 tool.actual = result;
777 tool.remain = sizeof (result);
778 tool.compose = 0;
780 if (pre_form->width > (gsize) width)
782 tool.ident = 0;
783 utf8_tool_copy_chars_to (&tool, width / 2);
784 utf8_tool_insert_char (&tool, '~');
786 tool.ident = 0;
787 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
788 utf8_tool_copy_chars_to_end (&tool);
790 else
792 utf8_tool_copy_chars_to_end (&tool);
795 tool.actual[0] = '\0';
796 if (tool.compose)
797 utf8_tool_compose (result, sizeof (result));
798 return result;
801 static int
802 str_utf8_offset_to_pos (const char *text, size_t length)
804 if (str_utf8_is_valid_string (text))
805 return g_utf8_offset_to_pointer (text, length) - text;
806 else
808 int result;
809 GString *buffer = g_string_new (text);
811 str_utf8_fix_string (buffer->str);
812 result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
813 g_string_free (buffer, TRUE);
814 return result;
818 static int
819 str_utf8_column_to_pos (const char *text, size_t pos)
821 static int result;
822 gunichar uni;
823 int width;
825 width = 0;
826 result = 0;
828 while (text[0] != '\0')
830 uni = g_utf8_get_char_validated (text, 6);
831 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
833 if (g_unichar_isprint (uni))
835 if (!str_unichar_iscombiningmark (uni))
837 width++;
838 if (g_unichar_iswide (uni))
839 width++;
842 else
844 width++;
846 text = g_utf8_next_char (text);
848 else
850 text++;
851 width++;
853 if ((gsize) width > pos)
854 return result;
856 result++;
859 return result;
862 static char *
863 str_utf8_create_search_needle (const char *needle, int case_sen)
865 if (needle != NULL)
867 if (case_sen)
869 return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
871 else
873 char *fold = g_utf8_casefold (needle, -1);
874 char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
875 g_free (fold);
876 return result;
879 else
880 return NULL;
/* Free a needle obtained from str_utf8_create_search_needle().
 * CASE_SEN is unused; a NULL NEEDLE is accepted. */
static void
str_utf8_release_search_needle (char *needle, int case_sen)
{
    (void) case_sen;

    /* g_free() ignores NULL, so no separate check is required */
    g_free (needle);
}
891 static const char *
892 str_utf8_search_first (const char *text, const char *search, int case_sen)
894 char *fold_text;
895 char *deco_text;
896 const char *match;
897 const char *result = NULL;
898 const char *m;
900 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
901 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
903 match = deco_text;
906 match = g_strstr_len (match, -1, search);
907 if (match != NULL)
909 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
910 !str_utf8_iscombiningmark (match + strlen (search)))
913 result = text;
914 m = deco_text;
915 while (m < match)
917 str_utf8_cnext_noncomb_char (&m);
918 str_utf8_cnext_noncomb_char (&result);
921 else
923 str_utf8_cnext_char (&match);
927 while (match != NULL && result == NULL);
929 g_free (deco_text);
930 if (!case_sen)
931 g_free (fold_text);
933 return result;
936 static const char *
937 str_utf8_search_last (const char *text, const char *search, int case_sen)
939 char *fold_text;
940 char *deco_text;
941 char *match;
942 const char *result = NULL;
943 const char *m;
945 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
946 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
950 match = g_strrstr_len (deco_text, -1, search);
951 if (match != NULL)
953 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
954 !str_utf8_iscombiningmark (match + strlen (search)))
957 result = text;
958 m = deco_text;
959 while (m < match)
961 str_utf8_cnext_noncomb_char (&m);
962 str_utf8_cnext_noncomb_char (&result);
965 else
967 match[0] = '\0';
971 while (match != NULL && result == NULL);
973 g_free (deco_text);
974 if (!case_sen)
975 g_free (fold_text);
977 return result;
980 static char *
981 str_utf8_normalize (const char *text)
983 GString *fixed = g_string_new ("");
984 char *tmp;
985 char *result;
986 const char *start;
987 const char *end;
989 start = text;
990 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
992 if (start != end)
994 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
995 g_string_append (fixed, tmp);
996 g_free (tmp);
998 g_string_append_c (fixed, end[0]);
999 start = end + 1;
1002 if (start == text)
1004 result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1006 else
1008 if (start[0] != '\0' && start != end)
1010 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1011 g_string_append (fixed, tmp);
1012 g_free (tmp);
1014 result = g_strdup (fixed->str);
1016 g_string_free (fixed, TRUE);
1018 return result;
1021 static char *
1022 str_utf8_casefold_normalize (const char *text)
1024 GString *fixed = g_string_new ("");
1025 char *tmp, *fold;
1026 char *result;
1027 const char *start;
1028 const char *end;
1030 start = text;
1031 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1033 if (start != end)
1035 fold = g_utf8_casefold (start, end - start);
1036 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1037 g_string_append (fixed, tmp);
1038 g_free (tmp);
1039 g_free (fold);
1041 g_string_append_c (fixed, end[0]);
1042 start = end + 1;
1045 if (start == text)
1047 fold = g_utf8_casefold (text, -1);
1048 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1049 g_free (fold);
1051 else
1053 if (start[0] != '\0' && start != end)
1055 fold = g_utf8_casefold (start, end - start);
1056 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1057 g_string_append (fixed, tmp);
1058 g_free (tmp);
1059 g_free (fold);
1061 result = g_strdup (fixed->str);
1063 g_string_free (fixed, TRUE);
1065 return result;
/* Compare T1 and T2 with strcmp() after normalizing both. */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_normalize (t1);
    char *norm2 = str_utf8_normalize (t2);
    const int order = strcmp (norm1, norm2);

    g_free (norm1);
    g_free (norm2);
    return order;
}
/* Compare the normalized forms of T1 and T2 over the byte length of the
 * shorter of the two. */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_normalize (t1);
    char *norm2 = str_utf8_normalize (t2);
    const size_t len1 = strlen (norm1);
    const size_t len2 = strlen (norm2);
    const int order = strncmp (norm1, norm2, min (len1, len2));

    g_free (norm1);
    g_free (norm2);
    return order;
}
/* Compare T1 and T2 with strcmp() after case-folding and normalizing both. */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_casefold_normalize (t1);
    char *norm2 = str_utf8_casefold_normalize (t2);
    const int order = strcmp (norm1, norm2);

    g_free (norm1);
    g_free (norm2);
    return order;
}
/* Compare the case-folded, normalized forms of T1 and T2 over the byte
 * length of the shorter of the two. */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_casefold_normalize (t1);
    char *norm2 = str_utf8_casefold_normalize (t2);
    const size_t len1 = strlen (norm1);
    const size_t len2 = strlen (norm2);
    const int order = strncmp (norm1, norm2, min (len1, len2));

    g_free (norm1);
    g_free (norm2);
    return order;
}
/* Return the length in bytes, measured in the normalized form of PREFIX, of
 * the longest run of leading characters that the normalized TEXT and the
 * normalized PREFIX have in common. */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *cur_t = norm_text;
    const char *cur_p = norm_prefix;
    const char *next_t = norm_text;
    const char *next_p = norm_prefix;
    int common;

    while (cur_t[0] != '\0' && cur_p[0] != '\0')
    {
        str_utf8_cnext_char_safe (&next_t);
        str_utf8_cnext_char_safe (&next_p);

        /* characters differ if their byte lengths or their bytes differ */
        if (next_t - cur_t != next_p - cur_p || strncmp (cur_t, cur_p, next_t - cur_t) != 0)
            break;

        cur_t = next_t;
        cur_p = next_p;
    }

    common = cur_p - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return common;
}
/* Return the length in bytes, measured in the case-folded and normalized
 * form of PREFIX, of the longest run of leading characters that the
 * case-folded, normalized TEXT and PREFIX have in common. */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_casefold_normalize (text);
    char *norm_prefix = str_utf8_casefold_normalize (prefix);
    const char *cur_t = norm_text;
    const char *cur_p = norm_prefix;
    const char *next_t = norm_text;
    const char *next_p = norm_prefix;
    int common;

    while (cur_t[0] != '\0' && cur_p[0] != '\0')
    {
        str_utf8_cnext_char_safe (&next_t);
        str_utf8_cnext_char_safe (&next_p);

        /* characters differ if their byte lengths or their bytes differ */
        if (next_t - cur_t != next_p - cur_p || strncmp (cur_t, cur_p, next_t - cur_t) != 0)
            break;

        cur_t = next_t;
        cur_p = next_p;
    }

    common = cur_p - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return common;
}
1198 static char *
1199 str_utf8_create_key_gen (const char *text, int case_sen,
1200 gchar * (*keygen) (const gchar * text, gssize size))
1202 char *result;
1204 if (case_sen)
1206 result = str_utf8_normalize (text);
1208 else
1210 gboolean dot;
1211 GString *fixed;
1212 const char *start, *end;
1213 char *fold, *key;
1215 dot = text[0] == '.';
1216 fixed = g_string_sized_new (16);
1218 if (!dot)
1219 start = text;
1220 else
1222 start = text + 1;
1223 g_string_append_c (fixed, '.');
1226 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1228 if (start != end)
1230 fold = g_utf8_casefold (start, end - start);
1231 key = keygen (fold, -1);
1232 g_string_append (fixed, key);
1233 g_free (key);
1234 g_free (fold);
1236 g_string_append_c (fixed, end[0]);
1237 start = end + 1;
1240 if (start == text)
1242 fold = g_utf8_casefold (start, -1);
1243 result = keygen (fold, -1);
1244 g_free (fold);
1245 g_string_free (fixed, TRUE);
1247 else if (dot && (start == text + 1))
1249 fold = g_utf8_casefold (start, -1);
1250 key = keygen (fold, -1);
1251 g_string_append (fixed, key);
1252 g_free (key);
1253 g_free (fold);
1254 result = g_string_free (fixed, FALSE);
1256 else
1258 if (start[0] != '\0' && start != end)
1260 fold = g_utf8_casefold (start, end - start);
1261 key = keygen (fold, -1);
1262 g_string_append (fixed, key);
1263 g_free (key);
1264 g_free (fold);
1266 result = g_string_free (fixed, FALSE);
1269 return result;
1272 static char *
1273 str_utf8_create_key (const char *text, int case_sen)
1275 return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/* Build a sort key for TEXT using g_utf8_collate_key_for_filename() as key
 * generator. */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    char *key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
/* Compare two keys byte-wise with strcmp(). CASE_SEN is unused. */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    int order;

    (void) case_sen;
    order = strcmp (t1, t2);
    return order;
}
/* Free KEY with g_free(). CASE_SEN is unused. */
static void
str_utf8_release_key (char *key, int case_sen)
{
    g_free (key);
    (void) case_sen;
}
1300 struct str_class
1301 str_utf8_init (void)
1303 struct str_class result;
1305 result.conv_gerror_message = str_utf8_conv_gerror_message;
1306 result.vfs_convert_to = str_utf8_vfs_convert_to;
1307 result.insert_replace_char = str_utf8_insert_replace_char;
1308 result.is_valid_string = str_utf8_is_valid_string;
1309 result.is_valid_char = str_utf8_is_valid_char;
1310 result.cnext_char = str_utf8_cnext_char;
1311 result.cprev_char = str_utf8_cprev_char;
1312 result.cnext_char_safe = str_utf8_cnext_char_safe;
1313 result.cprev_char_safe = str_utf8_cprev_char_safe;
1314 result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1315 result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1316 result.isspace = str_utf8_isspace;
1317 result.ispunct = str_utf8_ispunct;
1318 result.isalnum = str_utf8_isalnum;
1319 result.isdigit = str_utf8_isdigit;
1320 result.isprint = str_utf8_isprint;
1321 result.iscombiningmark = str_utf8_iscombiningmark;
1322 result.toupper = str_utf8_toupper;
1323 result.tolower = str_utf8_tolower;
1324 result.length = str_utf8_length;
1325 result.length2 = str_utf8_length2;
1326 result.length_noncomb = str_utf8_length_noncomb;
1327 result.fix_string = str_utf8_fix_string;
1328 result.term_form = str_utf8_term_form;
1329 result.fit_to_term = str_utf8_fit_to_term;
1330 result.term_trim = str_utf8_term_trim;
1331 result.term_width2 = str_utf8_term_width2;
1332 result.term_width1 = str_utf8_term_width1;
1333 result.term_char_width = str_utf8_term_char_width;
1334 result.term_substring = str_utf8_term_substring;
1335 result.trunc = str_utf8_trunc;
1336 result.offset_to_pos = str_utf8_offset_to_pos;
1337 result.column_to_pos = str_utf8_column_to_pos;
1338 result.create_search_needle = str_utf8_create_search_needle;
1339 result.release_search_needle = str_utf8_release_search_needle;
1340 result.search_first = str_utf8_search_first;
1341 result.search_last = str_utf8_search_last;
1342 result.compare = str_utf8_compare;
1343 result.ncompare = str_utf8_ncompare;
1344 result.casecmp = str_utf8_casecmp;
1345 result.ncasecmp = str_utf8_ncasecmp;
1346 result.prefix = str_utf8_prefix;
1347 result.caseprefix = str_utf8_caseprefix;
1348 result.create_key = str_utf8_create_key;
1349 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1350 /* case insensitive sort files in "a1 a2 a10" order */
1351 result.create_key_for_filename = str_utf8_create_key_for_filename;
1352 #else
1353 /* case insensitive sort files in "a1 a10 a2" order */
1354 result.create_key_for_filename = str_utf8_create_key;
1355 #endif
1356 result.key_collate = str_utf8_key_collate;
1357 result.release_key = str_utf8_release_key;
1359 return result;