Merge branch '1858_segfault_in_search'
[midnight-commander.git] / src / strutilutf8.c
blob99c9e7a3d27e6e167dcbf303d8998252d81b6518
1 /* UTF-8 strings utilities
2 Copyright (C) 2007 Free Software Foundation, Inc.
4 Written 2007 by:
5 Rostislav Benes
7 The file_date routine is mostly from GNU's fileutils package,
8 written by Richard Stallman and David MacKenzie.
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include <config.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <errno.h>
29 #include <glib.h>
30 #include <langinfo.h>
31 #include <string.h>
33 #include "global.h"
34 #include "strutil.h"
/* The functions below are built on GLib's UTF-8 and Unicode helpers. */
/* Bytes EF BF BD (UTF-8 for U+FFFD), emitted where input cannot be decoded. */
static const char replch[] = "\xEF\xBF\xBD";
40 static int
41 str_unichar_iscombiningmark (gunichar uni)
43 int type = g_unichar_type (uni);
44 return (type == G_UNICODE_COMBINING_MARK)
45 || (type == G_UNICODE_ENCLOSING_MARK)
46 || (type == G_UNICODE_NON_SPACING_MARK);
49 static void
50 str_utf8_insert_replace_char (GString * buffer)
52 g_string_append (buffer, replch);
55 static int
56 str_utf8_is_valid_string (const char *text)
58 return g_utf8_validate (text, -1, NULL);
61 static int
62 str_utf8_is_valid_char (const char *ch, size_t size)
64 switch (g_utf8_get_char_validated (ch, size))
66 case (gunichar) (-2):
67 return -2;
68 case (gunichar) (-1):
69 return -1;
70 default:
71 return 1;
/* Move *text forward by one character using g_utf8_next_char().
 * No validation is performed here. */
static void
str_utf8_cnext_char (const char **text)
{
    const char *next = g_utf8_next_char (*text);

    *text = next;
}
/* Move *text back by one character using g_utf8_prev_char().
 * No validation is performed here. */
static void
str_utf8_cprev_char (const char **text)
{
    const char *prev = g_utf8_prev_char (*text);

    *text = prev;
}
/* Move *text forward by one character if a valid character starts there,
 * otherwise by a single byte. */
static void
str_utf8_cnext_char_safe (const char **text)
{
    const char *cur = *text;

    if (str_utf8_is_valid_char (cur, -1) != 1)
        cur++;
    else
        cur = g_utf8_next_char (cur);

    *text = cur;
}
/* Move *text back by one character when that is consistent with a forward
 * step, otherwise by a single byte.
 * The candidate from g_utf8_prev_char() is accepted only if stepping forward
 * from it with str_utf8_cnext_char_safe() lands exactly on the old position. */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *candidate = g_utf8_prev_char (*text);
    const char *probe = candidate;

    str_utf8_cnext_char_safe (&probe);
    *text = (probe == *text) ? candidate : *text - 1;
}
108 static void
109 str_utf8_fix_string (char *text)
111 gunichar uni;
113 while (text[0] != '\0')
115 uni = g_utf8_get_char_validated (text, -1);
116 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
118 text = g_utf8_next_char (text);
120 else
122 text[0] = '?';
123 text++;
/* g_unichar_isspace() applied to the character decoded at `text`. */
static int
str_utf8_isspace (const char *text)
{
    return g_unichar_isspace (g_utf8_get_char_validated (text, -1));
}
/* g_unichar_ispunct() applied to the character decoded at `text`. */
static int
str_utf8_ispunct (const char *text)
{
    return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
}
/* g_unichar_isalnum() applied to the character decoded at `text`. */
static int
str_utf8_isalnum (const char *text)
{
    return g_unichar_isalnum (g_utf8_get_char_validated (text, -1));
}
/* g_unichar_isdigit() applied to the character decoded at `text`. */
static int
str_utf8_isdigit (const char *text)
{
    return g_unichar_isdigit (g_utf8_get_char_validated (text, -1));
}
/* g_unichar_isprint() applied to the character decoded at `ch`. */
static int
str_utf8_isprint (const char *ch)
{
    return g_unichar_isprint (g_utf8_get_char_validated (ch, -1));
}
/* str_unichar_iscombiningmark() applied to the character decoded at `ch`. */
static int
str_utf8_iscombiningmark (const char *ch)
{
    return str_unichar_iscombiningmark (g_utf8_get_char_validated (ch, -1));
}
/* Advance *text past one character and any combining marks that follow it,
 * stopping at the end of the string.
 * Returns the number of single-character steps taken (0 at end of string). */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps;

    if (**text == '\0')
        return 0;

    steps = 0;
    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && **text != '\0');

    return steps;
}
/* Step *text backwards until it rests on a character that is not a combining
 * mark, never going before `begin`.
 * Returns the number of single-character steps taken. */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps;

    for (steps = 0; *text != begin;)
    {
        str_utf8_cprev_char_safe (text);
        steps++;
        if (!str_utf8_iscombiningmark (*text))
            break;
    }

    return steps;
}
198 static int
199 str_utf8_toupper (const char *text, char **out, size_t * remain)
201 gunichar uni;
202 size_t left;
204 uni = g_utf8_get_char_validated (text, -1);
205 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
206 return 0;
208 uni = g_unichar_toupper (uni);
209 left = g_unichar_to_utf8 (uni, NULL);
210 if (left >= *remain)
211 return 0;
213 left = g_unichar_to_utf8 (uni, *out);
214 (*out) += left;
215 (*remain) -= left;
216 return 1;
219 static int
220 str_utf8_tolower (const char *text, char **out, size_t * remain)
222 gunichar uni;
223 size_t left;
225 uni = g_utf8_get_char_validated (text, -1);
226 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
227 return 0;
229 uni = g_unichar_tolower (uni);
230 left = g_unichar_to_utf8 (uni, NULL);
231 if (left >= *remain)
232 return 0;
234 left = g_unichar_to_utf8 (uni, *out);
235 (*out) += left;
236 (*remain) -= left;
237 return 1;
/* Count the characters in `text`.
 * Valid stretches are counted with g_utf8_strlen(); every byte at which
 * validation fails counts as one character. */
static int
str_utf8_length (const char *text)
{
    const char *seg = text;
    const char *bad;
    int total = 0;

    for (;;)
    {
        if (g_utf8_validate (seg, -1, &bad) || seg[0] == '\0')
            break;

        /* valid run before the offending byte, then the byte itself */
        if (seg != bad)
            total += g_utf8_strlen (seg, bad - seg);
        total++;
        seg = bad + 1;
    }

    /* nothing invalid was met: count the whole string in one go */
    if (seg == text)
        return g_utf8_strlen (text, -1);

    /* valid tail after the last invalid byte */
    if (seg[0] != '\0' && seg != bad)
        total += g_utf8_strlen (seg, bad - seg);

    return total;
}
273 static int
274 str_utf8_length2 (const char *text, int size)
276 int result = 0;
277 const char *start;
278 const char *end;
280 start = text;
281 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
283 if (start != end)
285 result += g_utf8_strlen (start, min (end - start, size));
286 size -= end - start;
288 result += (size > 0);
289 size--;
290 start = end + 1;
293 if (start == text)
295 result = g_utf8_strlen (text, size);
297 else
299 if (start[0] != '\0' && start != end && size > 0)
301 result += g_utf8_strlen (start, min (end - start, size));
305 return result;
/* Count the steps str_utf8_cnext_noncomb_char() needs to reach the end of
 * `text`, i.e. characters with their trailing combining marks folded in. */
static int
str_utf8_length_noncomb (const char *text)
{
    const char *cursor;
    int count;

    for (count = 0, cursor = text; *cursor != '\0'; count++)
        str_utf8_cnext_noncomb_char (&cursor);

    return count;
}
323 static void
324 str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
326 char *next = g_utf8_next_char (*string);
327 (*left) -= next - (*string);
328 (*string) = next;
329 g_string_append_c (buffer, '?');
333 static gchar *
334 str_utf8_conv_gerror_message (GError *error, const char *def_msg)
336 if ((error != NULL) && (error->message != NULL))
337 return g_strdup (error->message);
339 return g_strdup (def_msg != NULL ? def_msg : "");
342 static estr_t
343 str_utf8_vfs_convert_to (GIConv coder, const char *string,
344 int size, GString * buffer)
346 estr_t result;
348 if (coder == str_cnv_not_convert)
350 g_string_append_len (buffer, string, size);
351 result = ESTR_SUCCESS;
353 else
354 result = str_nconvert (coder, (char *) string, size, buffer);
356 return result;
359 struct term_form
361 char text[BUF_MEDIUM * 6];
362 size_t width;
363 int compose;
366 /* utiliti function, that make string valid in utf8 and all characters printable
367 * return width of string too*/
368 static const struct term_form *
369 str_utf8_make_make_term_form (const char *text, size_t length)
371 static struct term_form result;
372 gunichar uni;
373 size_t left;
374 char *actual;
376 result.text[0] = '\0';
377 result.width = 0;
378 result.compose = 0;
379 actual = result.text;
381 /* check if text start with combining character,
382 * add space at begin in this case */
383 if (length != 0 && text[0] != '\0')
385 uni = g_utf8_get_char_validated (text, -1);
386 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
388 if (str_unichar_iscombiningmark (uni))
390 actual[0] = ' ';
391 actual++;
392 result.width++;
393 result.compose = 1;
398 while (length != 0 && text[0] != '\0') {
399 uni = g_utf8_get_char_validated (text, -1);
400 if ((uni != (gunichar)(-1)) && (uni != (gunichar)(-2))) {
401 if (g_unichar_isprint(uni)) {
402 left = g_unichar_to_utf8 (uni, actual);
403 actual+= left;
404 if (!str_unichar_iscombiningmark (uni)) {
405 result.width++;
406 if (g_unichar_iswide(uni)) result.width++;
407 } else result.compose = 1;
408 } else {
409 actual[0] = '.';
410 actual++;
411 result.width++;
413 text = g_utf8_next_char (text);
414 } else {
415 text++;
416 /*actual[0] = '?';*/
417 memcpy (actual, replch, strlen (replch));
418 actual+= strlen (replch);
419 result.width++;
421 if (length != (size_t) (-1)) length--; }
422 actual[0] = '\0';
424 return &result;
427 static const char *
428 str_utf8_term_form (const char *text)
430 static char result[BUF_MEDIUM * 6];
431 const struct term_form *pre_form;
432 char *composed;
434 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
435 if (pre_form->compose)
437 composed =
438 g_utf8_normalize (pre_form->text, -1,
439 G_NORMALIZE_DEFAULT_COMPOSE);
440 g_strlcpy (result, composed, sizeof (result));
441 g_free (composed);
443 else
445 g_strlcpy (result, pre_form->text, sizeof (result));
447 return result;
450 struct utf8_tool
452 char *actual;
453 size_t remain;
454 const char *cheked;
455 int ident;
456 int compose;
459 /* utiliti function, that copy all characters from cheked to actual */
460 static int
461 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
463 size_t left;
464 gunichar uni;
466 tool->compose = 0;
468 while (tool->cheked[0] != '\0')
470 uni = g_utf8_get_char (tool->cheked);
471 tool->compose |= str_unichar_iscombiningmark (uni);
472 left = g_unichar_to_utf8 (uni, NULL);
473 if (tool->remain <= left)
474 return 0;
475 left = g_unichar_to_utf8 (uni, tool->actual);
476 tool->actual += left;
477 tool->remain -= left;
478 tool->cheked = g_utf8_next_char (tool->cheked);
480 return 1;
483 /* utiliti function, that copy characters from cheked to actual until ident is
484 * smaller than to_ident */
485 static int
486 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
488 size_t left;
489 gunichar uni;
490 int w;
492 tool->compose = 0;
494 while (tool->cheked[0] != '\0')
496 uni = g_utf8_get_char (tool->cheked);
497 if (!str_unichar_iscombiningmark (uni))
499 w = 1;
500 if (g_unichar_iswide (uni))
501 w++;
502 if (tool->ident + w > to_ident)
503 return 1;
505 else
507 w = 0;
508 tool->compose = 1;
511 left = g_unichar_to_utf8 (uni, NULL);
512 if (tool->remain <= left)
513 return 0;
514 left = g_unichar_to_utf8 (uni, tool->actual);
515 tool->actual += left;
516 tool->remain -= left;
517 tool->cheked = g_utf8_next_char (tool->cheked);
518 tool->ident += w;
520 return 1;
523 /* utiliti function, add count spaces to actual */
524 static int
525 utf8_tool_insert_space (struct utf8_tool *tool, int count)
527 if (count <= 0)
528 return 1;
529 if (tool->remain <= (gsize) count)
530 return 0;
531 memset (tool->actual, ' ', count);
532 tool->actual += count;
533 tool->remain -= count;
534 return 1;
537 /* utiliti function, add one characters to actual */
538 static int
539 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
541 if (tool->remain <= 1)
542 return 0;
543 tool->actual[0] = ch;
544 tool->actual++;
545 tool->remain--;
546 return 1;
549 /* utiliti function, thah skip characters from cheked until ident is greater or
550 * equal to to_ident */
551 static int
552 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
554 gunichar uni;
556 while (to_ident > tool->ident && tool->cheked[0] != '\0')
558 uni = g_utf8_get_char (tool->cheked);
559 if (!str_unichar_iscombiningmark (uni))
561 tool->ident++;
562 if (g_unichar_iswide (uni))
563 tool->ident++;
565 tool->cheked = g_utf8_next_char (tool->cheked);
567 uni = g_utf8_get_char (tool->cheked);
568 while (str_unichar_iscombiningmark (uni))
570 tool->cheked = g_utf8_next_char (tool->cheked);
571 uni = g_utf8_get_char (tool->cheked);
573 return 1;
576 static void
577 utf8_tool_compose (char *buffer, size_t size)
579 char *composed =
580 g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
581 g_strlcpy (buffer, composed, size);
582 g_free (composed);
586 static const char *
587 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
589 static char result[BUF_MEDIUM * 6];
590 const struct term_form *pre_form;
591 struct utf8_tool tool;
593 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
594 tool.cheked = pre_form->text;
595 tool.actual = result;
596 tool.remain = sizeof (result);
597 tool.compose = 0;
599 if (pre_form->width <= (gsize)width)
601 tool.ident = 0;
602 switch (HIDE_FIT (just_mode))
604 case J_CENTER_LEFT:
605 case J_CENTER:
606 tool.ident = (width - pre_form->width) / 2;
607 break;
608 case J_RIGHT:
609 tool.ident = width - pre_form->width;
610 break;
613 utf8_tool_insert_space (&tool, tool.ident);
614 utf8_tool_copy_chars_to_end (&tool);
615 utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
617 else
619 if (IS_FIT (just_mode))
621 tool.ident = 0;
622 utf8_tool_copy_chars_to (&tool, width / 2);
623 utf8_tool_insert_char (&tool, '~');
625 tool.ident = 0;
626 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
627 utf8_tool_copy_chars_to_end (&tool);
628 utf8_tool_insert_space (&tool,
629 width - (pre_form->width - tool.ident +
630 1));
632 else
634 tool.ident = 0;
635 switch (HIDE_FIT (just_mode))
637 case J_CENTER:
638 tool.ident = (width - pre_form->width) / 2;
639 break;
640 case J_RIGHT:
641 tool.ident = width - pre_form->width;
642 break;
645 utf8_tool_skip_chars_to (&tool, 0);
646 utf8_tool_insert_space (&tool, tool.ident);
647 utf8_tool_copy_chars_to (&tool, width);
648 utf8_tool_insert_space (&tool, width - tool.ident);
652 tool.actual[0] = '\0';
653 if (tool.compose)
654 utf8_tool_compose (result, sizeof (result));
655 return result;
658 static const char *
659 str_utf8_term_trim (const char *text, int width)
661 static char result[BUF_MEDIUM * 6];
662 const struct term_form *pre_form;
663 struct utf8_tool tool;
665 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
667 tool.cheked = pre_form->text;
668 tool.actual = result;
669 tool.remain = sizeof (result);
670 tool.compose = 0;
672 if ((gsize)width < pre_form->width)
674 if (width <= 3)
676 memset (tool.actual, '.', width);
677 tool.actual += width;
678 tool.remain -= width;
680 else
682 memset (tool.actual, '.', 3);
683 tool.actual += 3;
684 tool.remain -= 3;
686 tool.ident = 0;
687 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
688 utf8_tool_copy_chars_to_end (&tool);
691 else
693 utf8_tool_copy_chars_to_end (&tool);
696 tool.actual[0] = '\0';
697 if (tool.compose)
698 utf8_tool_compose (result, sizeof (result));
699 return result;
702 static int
703 str_utf8_term_width2 (const char *text, size_t length)
705 const struct term_form *result;
707 result = str_utf8_make_make_term_form (text, length);
708 return result->width;
711 static int
712 str_utf8_term_width1 (const char *text)
714 return str_utf8_term_width2 (text, (size_t) (-1));
717 static int
718 str_utf8_term_char_width (const char *text)
720 gunichar uni = g_utf8_get_char_validated (text, -1);
721 return (str_unichar_iscombiningmark (uni)) ? 0
722 : ((g_unichar_iswide (uni)) ? 2 : 1);
/* Measure a multi-line message.
 * *lines receives the number of '\n'-separated lines (at least 1) and
 * *columns the terminal width of the widest line. */
static void
str_utf8_msg_term_size (const char *text, int *lines, int *columns)
{
    char *copy;
    char *line;
    char *eol;

    *lines = 1;
    *columns = 0;

    /* work on a copy so that lines can be terminated temporarily */
    copy = g_strdup (text);
    for (line = copy;; line = eol + 1, (*lines)++)
    {
        int w;

        eol = strchr (line, '\n');
        if (eol != NULL)
            *eol = '\0';

        w = str_utf8_term_width1 (line);
        if (w > *columns)
            *columns = w;

        if (eol == NULL)
            break;
        *eol = '\n';
    }
    g_free (copy);
}
760 static const char *
761 str_utf8_term_substring (const char *text, int start, int width)
763 static char result[BUF_MEDIUM * 6];
764 const struct term_form *pre_form;
765 struct utf8_tool tool;
767 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
769 tool.cheked = pre_form->text;
770 tool.actual = result;
771 tool.remain = sizeof (result);
772 tool.compose = 0;
774 tool.ident = -start;
775 utf8_tool_skip_chars_to (&tool, 0);
776 if (tool.ident < 0)
777 tool.ident = 0;
778 utf8_tool_insert_space (&tool, tool.ident);
780 utf8_tool_copy_chars_to (&tool, width);
781 utf8_tool_insert_space (&tool, width - tool.ident);
783 tool.actual[0] = '\0';
784 if (tool.compose)
785 utf8_tool_compose (result, sizeof (result));
786 return result;
789 static const char *
790 str_utf8_trunc (const char *text, int width)
792 static char result[MC_MAXPATHLEN * 6 * 2];
793 const struct term_form *pre_form;
794 struct utf8_tool tool;
796 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
798 tool.cheked = pre_form->text;
799 tool.actual = result;
800 tool.remain = sizeof (result);
801 tool.compose = 0;
803 if (pre_form->width > (gsize)width)
805 tool.ident = 0;
806 utf8_tool_copy_chars_to (&tool, width / 2);
807 utf8_tool_insert_char (&tool, '~');
809 tool.ident = 0;
810 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
811 utf8_tool_copy_chars_to_end (&tool);
813 else
815 utf8_tool_copy_chars_to_end (&tool);
818 tool.actual[0] = '\0';
819 if (tool.compose)
820 utf8_tool_compose (result, sizeof (result));
821 return result;
824 static int
825 str_utf8_offset_to_pos (const char *text, size_t length)
827 if (str_utf8_is_valid_string (text))
828 return g_utf8_offset_to_pointer (text, length) - text;
829 else
831 int result;
832 GString *buffer = g_string_new (text);
834 str_utf8_fix_string (buffer->str);
835 result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
836 g_string_free (buffer, TRUE);
837 return result;
841 static int
842 str_utf8_column_to_pos (const char *text, size_t pos)
844 static int result;
845 gunichar uni;
846 int width;
848 width = 0;
849 result = 0;
851 while (text[0] != '\0')
853 uni = g_utf8_get_char_validated (text, 6);
854 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
856 if (g_unichar_isprint (uni))
858 if (!str_unichar_iscombiningmark (uni))
860 width++;
861 if (g_unichar_iswide (uni))
862 width++;
865 else
867 width++;
869 text = g_utf8_next_char (text);
871 else
873 text++;
874 width++;
876 if ((gsize)width > pos)
877 return result;
879 result++;
882 return result;
885 static char *
886 str_utf8_create_search_needle (const char *needle, int case_sen)
888 if (needle != NULL)
890 if (case_sen)
892 return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
894 else
896 char *fold = g_utf8_casefold (needle, -1);
897 char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
898 g_free (fold);
899 return result;
902 else
903 return NULL;
/* Free a needle obtained from str_utf8_create_search_needle().
 * `case_sen` is unused.  A NULL needle is accepted: g_free() ignores NULL,
 * so no separate check is needed. */
static void
str_utf8_release_search_needle (char *needle, int case_sen)
{
    (void) case_sen;
    g_free (needle);
}
914 static const char *
915 str_utf8_search_first (const char *text, const char *search, int case_sen)
917 char *fold_text;
918 char *deco_text;
919 const char *match;
920 const char *result = NULL;
921 const char *m;
923 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
924 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
926 match = deco_text;
929 match = g_strstr_len (match, -1, search);
930 if (match != NULL)
932 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
933 !str_utf8_iscombiningmark (match + strlen (search)))
936 result = text;
937 m = deco_text;
938 while (m < match)
940 str_utf8_cnext_noncomb_char (&m);
941 str_utf8_cnext_noncomb_char (&result);
944 else
946 str_utf8_cnext_char (&match);
950 while (match != NULL && result == NULL);
952 g_free (deco_text);
953 if (!case_sen)
954 g_free (fold_text);
956 return result;
959 static const char *
960 str_utf8_search_last (const char *text, const char *search, int case_sen)
962 char *fold_text;
963 char *deco_text;
964 char *match;
965 const char *result = NULL;
966 const char *m;
968 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
969 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
973 match = g_strrstr_len (deco_text, -1, search);
974 if (match != NULL)
976 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
977 !str_utf8_iscombiningmark (match + strlen (search)))
980 result = text;
981 m = deco_text;
982 while (m < match)
984 str_utf8_cnext_noncomb_char (&m);
985 str_utf8_cnext_noncomb_char (&result);
988 else
990 match[0] = '\0';
994 while (match != NULL && result == NULL);
996 g_free (deco_text);
997 if (!case_sen)
998 g_free (fold_text);
1000 return result;
1003 static char *
1004 str_utf8_normalize (const char *text)
1006 GString *fixed = g_string_new ("");
1007 char *tmp;
1008 char *result;
1009 const char *start;
1010 const char *end;
1012 start = text;
1013 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1015 if (start != end)
1017 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1018 g_string_append (fixed, tmp);
1019 g_free (tmp);
1021 g_string_append_c (fixed, end[0]);
1022 start = end + 1;
1025 if (start == text)
1027 result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1029 else
1031 if (start[0] != '\0' && start != end)
1033 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1034 g_string_append (fixed, tmp);
1035 g_free (tmp);
1037 result = g_strdup (fixed->str);
1039 g_string_free (fixed, TRUE);
1041 return result;
1044 static char *
1045 str_utf8_casefold_normalize (const char *text)
1047 GString *fixed = g_string_new ("");
1048 char *tmp, *fold;
1049 char *result;
1050 const char *start;
1051 const char *end;
1053 start = text;
1054 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1056 if (start != end)
1058 fold = g_utf8_casefold (start, end - start);
1059 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1060 g_string_append (fixed, tmp);
1061 g_free (tmp);
1062 g_free (fold);
1064 g_string_append_c (fixed, end[0]);
1065 start = end + 1;
1068 if (start == text)
1070 fold = g_utf8_casefold (text, -1);
1071 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1072 g_free (fold);
1074 else
1076 if (start[0] != '\0' && start != end)
1078 fold = g_utf8_casefold (start, end - start);
1079 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1080 g_string_append (fixed, tmp);
1081 g_free (tmp);
1082 g_free (fold);
1084 result = g_strdup (fixed->str);
1086 g_string_free (fixed, TRUE);
1088 return result;
/* strcmp() on the normalized forms of `t1` and `t2`. */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
/* Compare the normalized forms of `t1` and `t2` over the length of the
 * shorter one, so a string and any of its prefixes compare equal. */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order =
        strncmp (left, right, left_len < right_len ? left_len : right_len);

    g_free (left);
    g_free (right);

    return order;
}
/* strcmp() on the case-folded, normalized forms of `t1` and `t2`. */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
/* Compare the case-folded, normalized forms of `t1` and `t2` over the length
 * of the shorter one. */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order =
        strncmp (left, right, left_len < right_len ? left_len : right_len);

    g_free (left);
    g_free (right);

    return order;
}
/* Length of the common prefix of `text` and `prefix`.
 * Both strings are normalized and then compared character by character;
 * the value returned is the number of matching bytes in the normalized
 * form of `prefix`. */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *t_cur = norm_text;
    const char *p_cur = norm_prefix;
    const char *t_next = norm_text;
    const char *p_next = norm_prefix;
    int matched;

    while (*t_cur != '\0' && *p_cur != '\0')
    {
        size_t span;

        str_utf8_cnext_char_safe (&t_next);
        str_utf8_cnext_char_safe (&p_next);

        /* characters differ if their sizes or their bytes differ */
        span = (size_t) (t_next - t_cur);
        if (span != (size_t) (p_next - p_cur)
            || strncmp (t_cur, p_cur, span) != 0)
            break;

        t_cur = t_next;
        p_cur = p_next;
    }

    matched = p_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
/* Case-insensitive length of the common prefix of `text` and `prefix`.
 * Both strings are case-folded and normalized, then compared character by
 * character; the value returned is the number of matching bytes in the
 * converted form of `prefix`. */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_casefold_normalize (text);
    char *norm_prefix = str_utf8_casefold_normalize (prefix);
    const char *t_cur = norm_text;
    const char *p_cur = norm_prefix;
    const char *t_next = norm_text;
    const char *p_next = norm_prefix;
    int matched;

    while (*t_cur != '\0' && *p_cur != '\0')
    {
        size_t span;

        str_utf8_cnext_char_safe (&t_next);
        str_utf8_cnext_char_safe (&p_next);

        /* characters differ if their sizes or their bytes differ */
        span = (size_t) (t_next - t_cur);
        if (span != (size_t) (p_next - p_cur)
            || strncmp (t_cur, p_cur, span) != 0)
            break;

        t_cur = t_next;
        p_cur = p_next;
    }

    matched = p_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1221 static char *
1222 str_utf8_create_key_gen (const char *text, int case_sen,
1223 gchar * (*keygen) (const gchar *, gssize size))
1225 char *result;
1227 if (case_sen) {
1228 result = str_utf8_normalize (text);
1229 } else {
1230 const char *start, *end;
1231 char *fold, *key;
1232 GString *fixed = g_string_new ("");
1234 start = text;
1235 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1237 if (start != end)
1239 fold = g_utf8_casefold (start, end - start);
1240 key = keygen (fold, -1);
1241 g_string_append (fixed, key);
1242 g_free (key);
1243 g_free (fold);
1245 g_string_append_c (fixed, end[0]);
1246 start = end + 1;
1249 if (start == text)
1251 fold = g_utf8_casefold (text, -1);
1252 result = keygen (fold, -1);
1253 g_free (fold);
1255 else
1257 if (start[0] != '\0' && start != end)
1259 fold = g_utf8_casefold (start, end - start);
1260 key = keygen (fold, -1);
1261 g_string_append (fixed, key);
1262 g_free (key);
1263 g_free (fold);
1265 result = g_strdup (fixed->str);
1267 g_string_free (fixed, TRUE);
1269 return result;
1272 static char *
1273 str_utf8_create_key (const char *text, int case_sen)
1275 return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1278 static char *
1279 str_utf8_create_key_for_filename (const char *text, int case_sen)
1281 return str_utf8_create_key_gen (text, case_sen,
1282 g_utf8_collate_key_for_filename);
/* Compare two keys with strcmp(); `case_sen` is unused. */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    int order;

    (void) case_sen;
    order = strcmp (t1, t2);

    return order;
}
/* Free a key with g_free(); `case_sen` is unused. */
static void
str_utf8_release_key (char *key, int case_sen)
{
    g_free (key);
    (void) case_sen;
}
1299 struct str_class
1300 str_utf8_init (void)
1302 struct str_class result;
1304 result.conv_gerror_message = str_utf8_conv_gerror_message;
1305 result.vfs_convert_to = str_utf8_vfs_convert_to;
1306 result.insert_replace_char = str_utf8_insert_replace_char;
1307 result.is_valid_string = str_utf8_is_valid_string;
1308 result.is_valid_char = str_utf8_is_valid_char;
1309 result.cnext_char = str_utf8_cnext_char;
1310 result.cprev_char = str_utf8_cprev_char;
1311 result.cnext_char_safe = str_utf8_cnext_char_safe;
1312 result.cprev_char_safe = str_utf8_cprev_char_safe;
1313 result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1314 result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1315 result.isspace = str_utf8_isspace;
1316 result.ispunct = str_utf8_ispunct;
1317 result.isalnum = str_utf8_isalnum;
1318 result.isdigit = str_utf8_isdigit;
1319 result.isprint = str_utf8_isprint;
1320 result.iscombiningmark = str_utf8_iscombiningmark;
1321 result.toupper = str_utf8_toupper;
1322 result.tolower = str_utf8_tolower;
1323 result.length = str_utf8_length;
1324 result.length2 = str_utf8_length2;
1325 result.length_noncomb = str_utf8_length_noncomb;
1326 result.fix_string = str_utf8_fix_string;
1327 result.term_form = str_utf8_term_form;
1328 result.fit_to_term = str_utf8_fit_to_term;
1329 result.term_trim = str_utf8_term_trim;
1330 result.term_width2 = str_utf8_term_width2;
1331 result.term_width1 = str_utf8_term_width1;
1332 result.term_char_width = str_utf8_term_char_width;
1333 result.msg_term_size = str_utf8_msg_term_size;
1334 result.term_substring = str_utf8_term_substring;
1335 result.trunc = str_utf8_trunc;
1336 result.offset_to_pos = str_utf8_offset_to_pos;
1337 result.column_to_pos = str_utf8_column_to_pos;
1338 result.create_search_needle = str_utf8_create_search_needle;
1339 result.release_search_needle = str_utf8_release_search_needle;
1340 result.search_first = str_utf8_search_first;
1341 result.search_last = str_utf8_search_last;
1342 result.compare = str_utf8_compare;
1343 result.ncompare = str_utf8_ncompare;
1344 result.casecmp = str_utf8_casecmp;
1345 result.ncasecmp = str_utf8_ncasecmp;
1346 result.prefix = str_utf8_prefix;
1347 result.caseprefix = str_utf8_caseprefix;
1348 result.create_key = str_utf8_create_key;
1349 result.create_key_for_filename = str_utf8_create_key_for_filename;
1350 result.key_collate = str_utf8_key_collate;
1351 result.release_key = str_utf8_release_key;
1353 return result;