PCRE_CFLAGS are CPPFLAGS, actually.
[midnight-commander.git] / lib / strutil / strutilutf8.c
blobbe81c0259b936e0c7fc9dd7997ebdccc0bcb1d02
1 /*
2 UTF-8 strings utilities
4 Copyright (C) 2007, 2011
5 The Free Software Foundation, Inc.
7 Written by:
8 Rostislav Benes, 2007
10 The file_date routine is mostly from GNU's fileutils package,
11 written by Richard Stallman and David MacKenzie.
13 This file is part of the Midnight Commander.
15 The Midnight Commander is free software: you can redistribute it
16 and/or modify it under the terms of the GNU General Public License as
17 published by the Free Software Foundation, either version 3 of the License,
18 or (at your option) any later version.
20 The Midnight Commander is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 GNU General Public License for more details.
25 You should have received a copy of the GNU General Public License
26 along with this program. If not, see <http://www.gnu.org/licenses/>.
29 #include <config.h>
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <glib.h>
34 #include <langinfo.h>
35 #include <string.h>
37 #include "lib/global.h"
38 #include "lib/strutil.h"
/* The UTF-8 routines in this file are built on GLib's Unicode functions. */

/* U+FFFD REPLACEMENT CHARACTER encoded as UTF-8 (three bytes plus NUL). */
static const char replch[] = "\xEF\xBF\xBD";
44 static int
45 str_unichar_iscombiningmark (gunichar uni)
47 int type = g_unichar_type (uni);
48 return (type == G_UNICODE_COMBINING_MARK)
49 || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
52 static void
53 str_utf8_insert_replace_char (GString * buffer)
55 g_string_append (buffer, replch);
58 static int
59 str_utf8_is_valid_string (const char *text)
61 return g_utf8_validate (text, -1, NULL);
64 static int
65 str_utf8_is_valid_char (const char *ch, size_t size)
67 switch (g_utf8_get_char_validated (ch, size))
69 case (gunichar) (-2):
70 return -2;
71 case (gunichar) (-1):
72 return -1;
73 default:
74 return 1;
/* Move *TEXT forward by one character using g_utf8_next_char (); the input
 * is not validated here. */
static void
str_utf8_cnext_char (const char **text)
{
    const char *current = *text;

    *text = g_utf8_next_char (current);
}
/* Move *TEXT back by one character using g_utf8_prev_char (); the input
 * is not validated here. */
static void
str_utf8_cprev_char (const char **text)
{
    const char *current = *text;

    *text = g_utf8_prev_char (current);
}
/* Move *TEXT forward by one character if a valid one starts there, or by a
 * single byte if it does not. */
static void
str_utf8_cnext_char_safe (const char **text)
{
    const int decodable = (str_utf8_is_valid_char (*text, -1) == 1);

    *text = decodable ? g_utf8_next_char (*text) : (*text) + 1;
}
/* Move *TEXT back by one character, falling back to a single byte when the
 * preceding bytes do not form a character that ends exactly at *TEXT. */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *const here = *text;
    const char *const candidate = g_utf8_prev_char (here);
    const char *probe = candidate;

    /* The candidate is accepted only if stepping forward from it lands
     * exactly on the starting position. */
    str_utf8_cnext_char_safe (&probe);
    *text = (probe == here) ? candidate : here - 1;
}
111 static void
112 str_utf8_fix_string (char *text)
114 gunichar uni;
116 while (text[0] != '\0')
118 uni = g_utf8_get_char_validated (text, -1);
119 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
121 text = g_utf8_next_char (text);
123 else
125 text[0] = '?';
126 text++;
/* Return non-zero when the character at TEXT is whitespace per
 * g_unichar_isspace (). */
static int
str_utf8_isspace (const char *text)
{
    return g_unichar_isspace (g_utf8_get_char_validated (text, -1));
}
/* Return non-zero when the character at TEXT is punctuation per
 * g_unichar_ispunct (). */
static int
str_utf8_ispunct (const char *text)
{
    return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
}
/* Return non-zero when the character at TEXT is alphanumeric per
 * g_unichar_isalnum (). */
static int
str_utf8_isalnum (const char *text)
{
    return g_unichar_isalnum (g_utf8_get_char_validated (text, -1));
}
/* Return non-zero when the character at TEXT is a digit per
 * g_unichar_isdigit (). */
static int
str_utf8_isdigit (const char *text)
{
    return g_unichar_isdigit (g_utf8_get_char_validated (text, -1));
}
/* Return non-zero when the character at CH is printable per
 * g_unichar_isprint (). */
static int
str_utf8_isprint (const char *ch)
{
    return g_unichar_isprint (g_utf8_get_char_validated (ch, -1));
}
/* Return non-zero when the character at CH is a combining mark as defined by
 * str_unichar_iscombiningmark (). */
static int
str_utf8_iscombiningmark (const char *ch)
{
    return str_unichar_iscombiningmark (g_utf8_get_char_validated (ch, -1));
}
/* Advance *TEXT past one character and any combining marks that follow it.
 *
 * Returns the number of single-character steps taken (0 if *TEXT already
 * points at the terminating NUL).
 */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps = 0;

    if ((*text)[0] == '\0')
        return 0;

    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

    return steps;
}
/* Move *TEXT backwards until it points at a character that is not a combining
 * mark, never going before BEGIN.
 *
 * Returns the number of single-character steps taken (0 if *TEXT == BEGIN).
 */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps = 0;

    if (*text == begin)
        return 0;

    do
    {
        str_utf8_cprev_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && *text != begin);

    return steps;
}
201 static int
202 str_utf8_toupper (const char *text, char **out, size_t * remain)
204 gunichar uni;
205 size_t left;
207 uni = g_utf8_get_char_validated (text, -1);
208 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
209 return 0;
211 uni = g_unichar_toupper (uni);
212 left = g_unichar_to_utf8 (uni, NULL);
213 if (left >= *remain)
214 return 0;
216 left = g_unichar_to_utf8 (uni, *out);
217 (*out) += left;
218 (*remain) -= left;
219 return 1;
222 static int
223 str_utf8_tolower (const char *text, char **out, size_t * remain)
225 gunichar uni;
226 size_t left;
228 uni = g_utf8_get_char_validated (text, -1);
229 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
230 return 0;
232 uni = g_unichar_tolower (uni);
233 left = g_unichar_to_utf8 (uni, NULL);
234 if (left >= *remain)
235 return 0;
237 left = g_unichar_to_utf8 (uni, *out);
238 (*out) += left;
239 (*remain) -= left;
240 return 1;
/* Count the characters in TEXT.  Each byte at which validation fails is
 * counted as one character. */
static int
str_utf8_length (const char *text)
{
    const char *seg = text;     /* start of the not yet counted part */
    const char *bad;            /* where g_utf8_validate () stopped */
    int chars = 0;

    /* consume "valid run + one offending byte" until the remainder validates */
    for (; !g_utf8_validate (seg, -1, &bad) && seg[0] != '\0'; seg = bad + 1)
    {
        if (seg != bad)
            chars += g_utf8_strlen (seg, bad - seg);
        chars++;
    }

    /* nothing was consumed: the whole string is valid */
    if (seg == text)
        return g_utf8_strlen (text, -1);

    if (seg[0] != '\0' && seg != bad)
        chars += g_utf8_strlen (seg, bad - seg);

    return chars;
}
/* Count the characters in at most SIZE bytes of TEXT.  Each byte at which
 * validation fails is counted as one character while the byte budget lasts. */
static int
str_utf8_length2 (const char *text, int size)
{
    const char *seg = text;     /* start of the not yet counted part */
    const char *bad;            /* where g_utf8_validate () stopped */
    int chars = 0;

    for (; !g_utf8_validate (seg, -1, &bad) && seg[0] != '\0' && size > 0; seg = bad + 1)
    {
        if (seg != bad)
        {
            chars += g_utf8_strlen (seg, min (bad - seg, size));
            size -= bad - seg;
        }
        /* the offending byte counts only if it is still within the budget */
        if (size > 0)
            chars++;
        size--;
    }

    /* nothing was consumed: count straight from the original string */
    if (seg == text)
        return g_utf8_strlen (text, size);

    if (seg[0] != '\0' && seg != bad && size > 0)
        chars += g_utf8_strlen (seg, min (bad - seg, size));

    return chars;
}
/* Count the characters in TEXT, treating a character together with the
 * combining marks that follow it as one unit. */
static int
str_utf8_length_noncomb (const char *text)
{
    const char *cursor;
    int units = 0;

    for (cursor = text; cursor[0] != '\0'; units++)
        str_utf8_cnext_noncomb_char (&cursor);

    return units;
}
327 static void
328 str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
330 char *next = g_utf8_next_char (*string);
331 (*left) -= next - (*string);
332 (*string) = next;
333 g_string_append_c (buffer, '?');
337 static gchar *
338 str_utf8_conv_gerror_message (GError * error, const char *def_msg)
340 if ((error != NULL) && (error->message != NULL))
341 return g_strdup (error->message);
343 return g_strdup (def_msg != NULL ? def_msg : "");
346 static estr_t
347 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
349 estr_t result;
351 if (coder == str_cnv_not_convert)
353 g_string_append_len (buffer, string, size);
354 result = ESTR_SUCCESS;
356 else
357 result = str_nconvert (coder, (char *) string, size, buffer);
359 return result;
362 struct term_form
364 char text[BUF_MEDIUM * 6];
365 size_t width;
366 int compose;
369 /* utiliti function, that make string valid in utf8 and all characters printable
370 * return width of string too*/
371 static const struct term_form *
372 str_utf8_make_make_term_form (const char *text, size_t length)
374 static struct term_form result;
375 gunichar uni;
376 size_t left;
377 char *actual;
379 result.text[0] = '\0';
380 result.width = 0;
381 result.compose = 0;
382 actual = result.text;
384 /* check if text start with combining character,
385 * add space at begin in this case */
386 if (length != 0 && text[0] != '\0')
388 uni = g_utf8_get_char_validated (text, -1);
389 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
391 if (str_unichar_iscombiningmark (uni))
393 actual[0] = ' ';
394 actual++;
395 result.width++;
396 result.compose = 1;
401 while (length != 0 && text[0] != '\0')
403 uni = g_utf8_get_char_validated (text, -1);
404 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
406 if (g_unichar_isprint (uni))
408 left = g_unichar_to_utf8 (uni, actual);
409 actual += left;
410 if (!str_unichar_iscombiningmark (uni))
412 result.width++;
413 if (g_unichar_iswide (uni))
414 result.width++;
416 else
417 result.compose = 1;
419 else
421 actual[0] = '.';
422 actual++;
423 result.width++;
425 text = g_utf8_next_char (text);
427 else
429 text++;
430 /*actual[0] = '?'; */
431 memcpy (actual, replch, strlen (replch));
432 actual += strlen (replch);
433 result.width++;
435 if (length != (size_t) (-1))
436 length--;
438 actual[0] = '\0';
440 return &result;
443 static const char *
444 str_utf8_term_form (const char *text)
446 static char result[BUF_MEDIUM * 6];
447 const struct term_form *pre_form;
448 char *composed;
450 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
451 if (pre_form->compose)
453 composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
454 g_strlcpy (result, composed, sizeof (result));
455 g_free (composed);
457 else
459 g_strlcpy (result, pre_form->text, sizeof (result));
461 return result;
/* Working state shared by the utf8_tool_* helpers while they assemble an
 * output string from an input string. */
struct utf8_tool
{
    char *actual;               /* write position in the output buffer */
    size_t remain;              /* space left at actual; writers keep one byte spare */
    const char *cheked;         /* read position in the input string */
    int ident;                  /* column counter advanced by the copy/skip helpers */
    int compose;                /* set when a copied character is a combining mark */
};
473 /* utiliti function, that copy all characters from cheked to actual */
474 static int
475 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
477 size_t left;
478 gunichar uni;
480 tool->compose = 0;
482 while (tool->cheked[0] != '\0')
484 uni = g_utf8_get_char (tool->cheked);
485 tool->compose |= str_unichar_iscombiningmark (uni);
486 left = g_unichar_to_utf8 (uni, NULL);
487 if (tool->remain <= left)
488 return 0;
489 left = g_unichar_to_utf8 (uni, tool->actual);
490 tool->actual += left;
491 tool->remain -= left;
492 tool->cheked = g_utf8_next_char (tool->cheked);
494 return 1;
497 /* utiliti function, that copy characters from cheked to actual until ident is
498 * smaller than to_ident */
499 static int
500 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
502 size_t left;
503 gunichar uni;
504 int w;
506 tool->compose = 0;
508 while (tool->cheked[0] != '\0')
510 uni = g_utf8_get_char (tool->cheked);
511 if (!str_unichar_iscombiningmark (uni))
513 w = 1;
514 if (g_unichar_iswide (uni))
515 w++;
516 if (tool->ident + w > to_ident)
517 return 1;
519 else
521 w = 0;
522 tool->compose = 1;
525 left = g_unichar_to_utf8 (uni, NULL);
526 if (tool->remain <= left)
527 return 0;
528 left = g_unichar_to_utf8 (uni, tool->actual);
529 tool->actual += left;
530 tool->remain -= left;
531 tool->cheked = g_utf8_next_char (tool->cheked);
532 tool->ident += w;
534 return 1;
537 /* utiliti function, add count spaces to actual */
538 static int
539 utf8_tool_insert_space (struct utf8_tool *tool, int count)
541 if (count <= 0)
542 return 1;
543 if (tool->remain <= (gsize) count)
544 return 0;
545 memset (tool->actual, ' ', count);
546 tool->actual += count;
547 tool->remain -= count;
548 return 1;
551 /* utiliti function, add one characters to actual */
552 static int
553 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
555 if (tool->remain <= 1)
556 return 0;
557 tool->actual[0] = ch;
558 tool->actual++;
559 tool->remain--;
560 return 1;
563 /* utiliti function, thah skip characters from cheked until ident is greater or
564 * equal to to_ident */
565 static int
566 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
568 gunichar uni;
570 while (to_ident > tool->ident && tool->cheked[0] != '\0')
572 uni = g_utf8_get_char (tool->cheked);
573 if (!str_unichar_iscombiningmark (uni))
575 tool->ident++;
576 if (g_unichar_iswide (uni))
577 tool->ident++;
579 tool->cheked = g_utf8_next_char (tool->cheked);
581 uni = g_utf8_get_char (tool->cheked);
582 while (str_unichar_iscombiningmark (uni))
584 tool->cheked = g_utf8_next_char (tool->cheked);
585 uni = g_utf8_get_char (tool->cheked);
587 return 1;
590 static void
591 utf8_tool_compose (char *buffer, size_t size)
593 char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
594 g_strlcpy (buffer, composed, size);
595 g_free (composed);
599 static const char *
600 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
602 static char result[BUF_MEDIUM * 6];
603 const struct term_form *pre_form;
604 struct utf8_tool tool;
606 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
607 tool.cheked = pre_form->text;
608 tool.actual = result;
609 tool.remain = sizeof (result);
610 tool.compose = 0;
612 if (pre_form->width <= (gsize) width)
614 tool.ident = 0;
615 switch (HIDE_FIT (just_mode))
617 case J_CENTER_LEFT:
618 case J_CENTER:
619 tool.ident = (width - pre_form->width) / 2;
620 break;
621 case J_RIGHT:
622 tool.ident = width - pre_form->width;
623 break;
626 utf8_tool_insert_space (&tool, tool.ident);
627 utf8_tool_copy_chars_to_end (&tool);
628 utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
630 else
632 if (IS_FIT (just_mode))
634 tool.ident = 0;
635 utf8_tool_copy_chars_to (&tool, width / 2);
636 utf8_tool_insert_char (&tool, '~');
638 tool.ident = 0;
639 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
640 utf8_tool_copy_chars_to_end (&tool);
641 utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
643 else
645 tool.ident = 0;
646 switch (HIDE_FIT (just_mode))
648 case J_CENTER:
649 tool.ident = (width - pre_form->width) / 2;
650 break;
651 case J_RIGHT:
652 tool.ident = width - pre_form->width;
653 break;
656 utf8_tool_skip_chars_to (&tool, 0);
657 utf8_tool_insert_space (&tool, tool.ident);
658 utf8_tool_copy_chars_to (&tool, width);
659 utf8_tool_insert_space (&tool, width - tool.ident);
663 tool.actual[0] = '\0';
664 if (tool.compose)
665 utf8_tool_compose (result, sizeof (result));
666 return result;
669 static const char *
670 str_utf8_term_trim (const char *text, int width)
672 static char result[BUF_MEDIUM * 6];
673 const struct term_form *pre_form;
674 struct utf8_tool tool;
676 if (width < 1)
678 result[0] = '\0';
679 return result;
682 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
684 tool.cheked = pre_form->text;
685 tool.actual = result;
686 tool.remain = sizeof (result);
687 tool.compose = 0;
689 if ((gsize) width < pre_form->width)
691 if (width <= 3)
693 memset (tool.actual, '.', width);
694 tool.actual += width;
695 tool.remain -= width;
697 else
699 memset (tool.actual, '.', 3);
700 tool.actual += 3;
701 tool.remain -= 3;
703 tool.ident = 0;
704 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
705 utf8_tool_copy_chars_to_end (&tool);
708 else
710 utf8_tool_copy_chars_to_end (&tool);
713 tool.actual[0] = '\0';
714 if (tool.compose)
715 utf8_tool_compose (result, sizeof (result));
716 return result;
719 static int
720 str_utf8_term_width2 (const char *text, size_t length)
722 const struct term_form *result;
724 result = str_utf8_make_make_term_form (text, length);
725 return result->width;
/* Return the terminal width of the whole NUL-terminated TEXT. */
static int
str_utf8_term_width1 (const char *text)
{
    const size_t whole_string = (size_t) (-1);

    return str_utf8_term_width2 (text, whole_string);
}
734 static int
735 str_utf8_term_char_width (const char *text)
737 gunichar uni = g_utf8_get_char_validated (text, -1);
738 return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
741 static const char *
742 str_utf8_term_substring (const char *text, int start, int width)
744 static char result[BUF_MEDIUM * 6];
745 const struct term_form *pre_form;
746 struct utf8_tool tool;
748 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
750 tool.cheked = pre_form->text;
751 tool.actual = result;
752 tool.remain = sizeof (result);
753 tool.compose = 0;
755 tool.ident = -start;
756 utf8_tool_skip_chars_to (&tool, 0);
757 if (tool.ident < 0)
758 tool.ident = 0;
759 utf8_tool_insert_space (&tool, tool.ident);
761 utf8_tool_copy_chars_to (&tool, width);
762 utf8_tool_insert_space (&tool, width - tool.ident);
764 tool.actual[0] = '\0';
765 if (tool.compose)
766 utf8_tool_compose (result, sizeof (result));
767 return result;
770 static const char *
771 str_utf8_trunc (const char *text, int width)
773 static char result[MC_MAXPATHLEN * 6 * 2];
774 const struct term_form *pre_form;
775 struct utf8_tool tool;
777 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
779 tool.cheked = pre_form->text;
780 tool.actual = result;
781 tool.remain = sizeof (result);
782 tool.compose = 0;
784 if (pre_form->width > (gsize) width)
786 tool.ident = 0;
787 utf8_tool_copy_chars_to (&tool, width / 2);
788 utf8_tool_insert_char (&tool, '~');
790 tool.ident = 0;
791 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
792 utf8_tool_copy_chars_to_end (&tool);
794 else
796 utf8_tool_copy_chars_to_end (&tool);
799 tool.actual[0] = '\0';
800 if (tool.compose)
801 utf8_tool_compose (result, sizeof (result));
802 return result;
805 static int
806 str_utf8_offset_to_pos (const char *text, size_t length)
808 if (str_utf8_is_valid_string (text))
809 return g_utf8_offset_to_pointer (text, length) - text;
810 else
812 int result;
813 GString *buffer = g_string_new (text);
815 str_utf8_fix_string (buffer->str);
816 result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
817 g_string_free (buffer, TRUE);
818 return result;
822 static int
823 str_utf8_column_to_pos (const char *text, size_t pos)
825 static int result;
826 gunichar uni;
827 int width;
829 width = 0;
830 result = 0;
832 while (text[0] != '\0')
834 uni = g_utf8_get_char_validated (text, 6);
835 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
837 if (g_unichar_isprint (uni))
839 if (!str_unichar_iscombiningmark (uni))
841 width++;
842 if (g_unichar_iswide (uni))
843 width++;
846 else
848 width++;
850 text = g_utf8_next_char (text);
852 else
854 text++;
855 width++;
857 if ((gsize) width > pos)
858 return result;
860 result++;
863 return result;
866 static char *
867 str_utf8_create_search_needle (const char *needle, int case_sen)
869 if (needle != NULL)
871 if (case_sen)
873 return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
875 else
877 char *fold = g_utf8_casefold (needle, -1);
878 char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
879 g_free (fold);
880 return result;
883 else
884 return NULL;
/* Free a needle obtained from str_utf8_create_search_needle ().  CASE_SEN is
 * accepted for interface symmetry and is not used. */
static void
str_utf8_release_search_needle (char *needle, int case_sen)
{
    (void) case_sen;

    /* g_free () ignores NULL, so no guard is needed */
    g_free (needle);
}
895 static const char *
896 str_utf8_search_first (const char *text, const char *search, int case_sen)
898 char *fold_text;
899 char *deco_text;
900 const char *match;
901 const char *result = NULL;
902 const char *m;
904 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
905 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
907 match = deco_text;
910 match = g_strstr_len (match, -1, search);
911 if (match != NULL)
913 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
914 !str_utf8_iscombiningmark (match + strlen (search)))
917 result = text;
918 m = deco_text;
919 while (m < match)
921 str_utf8_cnext_noncomb_char (&m);
922 str_utf8_cnext_noncomb_char (&result);
925 else
927 str_utf8_cnext_char (&match);
931 while (match != NULL && result == NULL);
933 g_free (deco_text);
934 if (!case_sen)
935 g_free (fold_text);
937 return result;
940 static const char *
941 str_utf8_search_last (const char *text, const char *search, int case_sen)
943 char *fold_text;
944 char *deco_text;
945 char *match;
946 const char *result = NULL;
947 const char *m;
949 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
950 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
954 match = g_strrstr_len (deco_text, -1, search);
955 if (match != NULL)
957 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
958 !str_utf8_iscombiningmark (match + strlen (search)))
961 result = text;
962 m = deco_text;
963 while (m < match)
965 str_utf8_cnext_noncomb_char (&m);
966 str_utf8_cnext_noncomb_char (&result);
969 else
971 match[0] = '\0';
975 while (match != NULL && result == NULL);
977 g_free (deco_text);
978 if (!case_sen)
979 g_free (fold_text);
981 return result;
984 static char *
985 str_utf8_normalize (const char *text)
987 GString *fixed;
988 char *tmp;
989 char *result;
990 const char *start;
991 const char *end;
993 fixed = g_string_sized_new (4);
995 start = text;
996 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
998 if (start != end)
1000 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1001 g_string_append (fixed, tmp);
1002 g_free (tmp);
1004 g_string_append_c (fixed, end[0]);
1005 start = end + 1;
1008 if (start == text)
1010 result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1011 g_string_free (fixed, TRUE);
1013 else
1015 if (start[0] != '\0' && start != end)
1017 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1018 g_string_append (fixed, tmp);
1019 g_free (tmp);
1021 result = g_string_free (fixed, FALSE);
1024 return result;
1027 static char *
1028 str_utf8_casefold_normalize (const char *text)
1030 GString *fixed;
1031 char *tmp, *fold;
1032 char *result;
1033 const char *start;
1034 const char *end;
1036 fixed = g_string_sized_new (4);
1038 start = text;
1039 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1041 if (start != end)
1043 fold = g_utf8_casefold (start, end - start);
1044 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1045 g_string_append (fixed, tmp);
1046 g_free (tmp);
1047 g_free (fold);
1049 g_string_append_c (fixed, end[0]);
1050 start = end + 1;
1053 if (start == text)
1055 fold = g_utf8_casefold (text, -1);
1056 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1057 g_free (fold);
1058 g_string_free (fixed, TRUE);
1060 else
1062 if (start[0] != '\0' && start != end)
1064 fold = g_utf8_casefold (start, end - start);
1065 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1066 g_string_append (fixed, tmp);
1067 g_free (tmp);
1068 g_free (fold);
1070 result = g_string_free (fixed, FALSE);
1073 return result;
/* Compare T1 and T2 bytewise after normalization; the result has the sign
 * convention of strcmp (). */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *const left = str_utf8_normalize (t1);
    char *const right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
/* Compare the normalized forms of T1 and T2 over the length of the shorter
 * one; the result has the sign convention of strncmp (). */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *const left = str_utf8_normalize (t1);
    char *const right = str_utf8_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, left_len < right_len ? left_len : right_len);

    g_free (left);
    g_free (right);

    return order;
}
/* Compare T1 and T2 bytewise after case folding and normalization; the
 * result has the sign convention of strcmp (). */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *const left = str_utf8_casefold_normalize (t1);
    char *const right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
/* Compare the case-folded, normalized forms of T1 and T2 over the length of
 * the shorter one; the result has the sign convention of strncmp (). */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *const left = str_utf8_casefold_normalize (t1);
    char *const right = str_utf8_casefold_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, left_len < right_len ? left_len : right_len);

    g_free (left);
    g_free (right);

    return order;
}
/* Return the length in bytes of the common prefix of the normalized forms of
 * TEXT and PREFIX, compared character by character. */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *const norm_text = str_utf8_normalize (text);
    char *const norm_prefix = str_utf8_normalize (prefix);
    const char *t_cur = norm_text;      /* start of the current character */
    const char *p_cur = norm_prefix;
    const char *t_next = norm_text;     /* one past the current character */
    const char *p_next = norm_prefix;
    int common;

    while (t_cur[0] != '\0' && p_cur[0] != '\0')
    {
        str_utf8_cnext_char_safe (&t_next);
        str_utf8_cnext_char_safe (&p_next);

        /* characters match only if they have the same length and bytes */
        if (t_next - t_cur != p_next - p_cur
            || strncmp (t_cur, p_cur, t_next - t_cur) != 0)
            break;

        t_cur = t_next;
        p_cur = p_next;
    }

    common = p_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return common;
}
/* Return the length in bytes of the common prefix of the case-folded,
 * normalized forms of TEXT and PREFIX, compared character by character. */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *const norm_text = str_utf8_casefold_normalize (text);
    char *const norm_prefix = str_utf8_casefold_normalize (prefix);
    const char *t_cur = norm_text;      /* start of the current character */
    const char *p_cur = norm_prefix;
    const char *t_next = norm_text;     /* one past the current character */
    const char *p_next = norm_prefix;
    int common;

    while (t_cur[0] != '\0' && p_cur[0] != '\0')
    {
        str_utf8_cnext_char_safe (&t_next);
        str_utf8_cnext_char_safe (&p_next);

        /* characters match only if they have the same length and bytes */
        if (t_next - t_cur != p_next - p_cur
            || strncmp (t_cur, p_cur, t_next - t_cur) != 0)
            break;

        t_cur = t_next;
        p_cur = p_next;
    }

    common = p_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return common;
}
1206 static char *
1207 str_utf8_create_key_gen (const char *text, int case_sen,
1208 gchar * (*keygen) (const gchar * text, gssize size))
1210 char *result;
1212 if (case_sen)
1214 result = str_utf8_normalize (text);
1216 else
1218 gboolean dot;
1219 GString *fixed;
1220 const char *start, *end;
1221 char *fold, *key;
1223 dot = text[0] == '.';
1224 fixed = g_string_sized_new (16);
1226 if (!dot)
1227 start = text;
1228 else
1230 start = text + 1;
1231 g_string_append_c (fixed, '.');
1234 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1236 if (start != end)
1238 fold = g_utf8_casefold (start, end - start);
1239 key = keygen (fold, -1);
1240 g_string_append (fixed, key);
1241 g_free (key);
1242 g_free (fold);
1244 g_string_append_c (fixed, end[0]);
1245 start = end + 1;
1248 if (start == text)
1250 fold = g_utf8_casefold (start, -1);
1251 result = keygen (fold, -1);
1252 g_free (fold);
1253 g_string_free (fixed, TRUE);
1255 else if (dot && (start == text + 1))
1257 fold = g_utf8_casefold (start, -1);
1258 key = keygen (fold, -1);
1259 g_string_append (fixed, key);
1260 g_free (key);
1261 g_free (fold);
1262 result = g_string_free (fixed, FALSE);
1264 else
1266 if (start[0] != '\0' && start != end)
1268 fold = g_utf8_casefold (start, end - start);
1269 key = keygen (fold, -1);
1270 g_string_append (fixed, key);
1271 g_free (key);
1272 g_free (fold);
1274 result = g_string_free (fixed, FALSE);
1277 return result;
1280 static char *
1281 str_utf8_create_key (const char *text, int case_sen)
1283 return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/* Build a sort key for the file name TEXT using
 * g_utf8_collate_key_for_filename () as the key generator; see
 * str_utf8_create_key_gen (). */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    char *key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
/* Compare two sort keys bytewise with strcmp ().  CASE_SEN is accepted for
 * interface symmetry and is not used. */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    const int order = strcmp (t1, t2);

    (void) case_sen;
    return order;
}
/* Free a sort key with g_free ().  CASE_SEN is accepted for interface
 * symmetry and is not used. */
static void
str_utf8_release_key (char *key, int case_sen)
{
    g_free (key);
    (void) case_sen;
}
1308 struct str_class
1309 str_utf8_init (void)
1311 struct str_class result;
1313 result.conv_gerror_message = str_utf8_conv_gerror_message;
1314 result.vfs_convert_to = str_utf8_vfs_convert_to;
1315 result.insert_replace_char = str_utf8_insert_replace_char;
1316 result.is_valid_string = str_utf8_is_valid_string;
1317 result.is_valid_char = str_utf8_is_valid_char;
1318 result.cnext_char = str_utf8_cnext_char;
1319 result.cprev_char = str_utf8_cprev_char;
1320 result.cnext_char_safe = str_utf8_cnext_char_safe;
1321 result.cprev_char_safe = str_utf8_cprev_char_safe;
1322 result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1323 result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1324 result.isspace = str_utf8_isspace;
1325 result.ispunct = str_utf8_ispunct;
1326 result.isalnum = str_utf8_isalnum;
1327 result.isdigit = str_utf8_isdigit;
1328 result.isprint = str_utf8_isprint;
1329 result.iscombiningmark = str_utf8_iscombiningmark;
1330 result.toupper = str_utf8_toupper;
1331 result.tolower = str_utf8_tolower;
1332 result.length = str_utf8_length;
1333 result.length2 = str_utf8_length2;
1334 result.length_noncomb = str_utf8_length_noncomb;
1335 result.fix_string = str_utf8_fix_string;
1336 result.term_form = str_utf8_term_form;
1337 result.fit_to_term = str_utf8_fit_to_term;
1338 result.term_trim = str_utf8_term_trim;
1339 result.term_width2 = str_utf8_term_width2;
1340 result.term_width1 = str_utf8_term_width1;
1341 result.term_char_width = str_utf8_term_char_width;
1342 result.term_substring = str_utf8_term_substring;
1343 result.trunc = str_utf8_trunc;
1344 result.offset_to_pos = str_utf8_offset_to_pos;
1345 result.column_to_pos = str_utf8_column_to_pos;
1346 result.create_search_needle = str_utf8_create_search_needle;
1347 result.release_search_needle = str_utf8_release_search_needle;
1348 result.search_first = str_utf8_search_first;
1349 result.search_last = str_utf8_search_last;
1350 result.compare = str_utf8_compare;
1351 result.ncompare = str_utf8_ncompare;
1352 result.casecmp = str_utf8_casecmp;
1353 result.ncasecmp = str_utf8_ncasecmp;
1354 result.prefix = str_utf8_prefix;
1355 result.caseprefix = str_utf8_caseprefix;
1356 result.create_key = str_utf8_create_key;
1357 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1358 /* case insensitive sort files in "a1 a2 a10" order */
1359 result.create_key_for_filename = str_utf8_create_key_for_filename;
1360 #else
1361 /* case insensitive sort files in "a1 a10 a2" order */
1362 result.create_key_for_filename = str_utf8_create_key;
1363 #endif
1364 result.key_collate = str_utf8_key_collate;
1365 result.release_key = str_utf8_release_key;
1367 return result;