Updated doc/NEWS file
[midnight-commander.git] / lib / strutil / strutilutf8.c
blobe6e7688b08af78f632a0c7068b93b4c5b939dc39
1 /*
2 UTF-8 strings utilities
4 Copyright (C) 2007, 2011
5 The Free Software Foundation, Inc.
7 Written by:
8 Rostislav Benes, 2007
10 The file_date routine is mostly from GNU's fileutils package,
11 written by Richard Stallman and David MacKenzie.
13 This file is part of the Midnight Commander.
15 The Midnight Commander is free software: you can redistribute it
16 and/or modify it under the terms of the GNU General Public License as
17 published by the Free Software Foundation, either version 3 of the License,
18 or (at your option) any later version.
20 The Midnight Commander is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 GNU General Public License for more details.
25 You should have received a copy of the GNU General Public License
26 along with this program. If not, see <http://www.gnu.org/licenses/>.
29 #include <config.h>
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <glib.h>
34 #include <langinfo.h>
35 #include <string.h>
37 #include "lib/global.h"
38 #include "lib/strutil.h"
40 /* using function for utf-8 from glib */
/* UTF-8 byte sequence EF BF BD (U+FFFD), written in place of invalid input */
static const char replch[] = "\xEF\xBF\xBD";
44 static gboolean
45 str_unichar_iscombiningmark (gunichar uni)
47 GUnicodeType type;
49 type = g_unichar_type (uni);
50 return (type == G_UNICODE_COMBINING_MARK)
51 || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
54 static void
55 str_utf8_insert_replace_char (GString * buffer)
57 g_string_append (buffer, replch);
60 static int
61 str_utf8_is_valid_string (const char *text)
63 return g_utf8_validate (text, -1, NULL);
66 static int
67 str_utf8_is_valid_char (const char *ch, size_t size)
69 switch (g_utf8_get_char_validated (ch, size))
71 case (gunichar) (-2):
72 return -2;
73 case (gunichar) (-1):
74 return -1;
75 default:
76 return 1;
/* Advance *text to the next character using g_utf8_next_char ();
 * no validation is performed. */
static void
str_utf8_cnext_char (const char **text)
{
    const char *cur = *text;

    *text = g_utf8_next_char (cur);
}
/* Move *text back to the previous character using g_utf8_prev_char ();
 * no validation is performed. */
static void
str_utf8_cprev_char (const char **text)
{
    const char *cur = *text;

    *text = g_utf8_prev_char (cur);
}
/* Advance *text by one whole character when a valid one starts there,
 * otherwise by a single byte. */
static void
str_utf8_cnext_char_safe (const char **text)
{
    const char *cur = *text;

    *text = (str_utf8_is_valid_char (cur, -1) == 1) ? g_utf8_next_char (cur) : cur + 1;
}
/* Step *text backwards.  The candidate found by g_utf8_prev_char () is accepted
 * only if a safe forward step from it lands exactly on the old position;
 * otherwise *text moves back by a single byte. */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *candidate = g_utf8_prev_char (*text);
    const char *probe = candidate;

    str_utf8_cnext_char_safe (&probe);
    *text = (probe == *text) ? candidate : *text - 1;
}
113 static void
114 str_utf8_fix_string (char *text)
116 gunichar uni;
118 while (text[0] != '\0')
120 uni = g_utf8_get_char_validated (text, -1);
121 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
123 text = g_utf8_next_char (text);
125 else
127 text[0] = '?';
128 text++;
/* Apply g_unichar_isspace () to the character decoded at text. */
static int
str_utf8_isspace (const char *text)
{
    return g_unichar_isspace (g_utf8_get_char_validated (text, -1));
}
/* Apply g_unichar_ispunct () to the character decoded at text. */
static int
str_utf8_ispunct (const char *text)
{
    return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
}
/* Apply g_unichar_isalnum () to the character decoded at text. */
static int
str_utf8_isalnum (const char *text)
{
    return g_unichar_isalnum (g_utf8_get_char_validated (text, -1));
}
/* Apply g_unichar_isdigit () to the character decoded at text. */
static int
str_utf8_isdigit (const char *text)
{
    return g_unichar_isdigit (g_utf8_get_char_validated (text, -1));
}
/* Apply g_unichar_isprint () to the character decoded at ch. */
static int
str_utf8_isprint (const char *ch)
{
    return g_unichar_isprint (g_utf8_get_char_validated (ch, -1));
}
168 static gboolean
169 str_utf8_iscombiningmark (const char *ch)
171 gunichar uni = g_utf8_get_char_validated (ch, -1);
172 return str_unichar_iscombiningmark (uni);
/* Advance *text past the current character and any combining marks that
 * follow it.  Returns the number of safe single-character steps taken
 * (0 when *text already points at the terminating NUL). */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps = 0;

    if ((*text)[0] == '\0')
        return steps;

    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

    return steps;
}
/* Move *text backwards until it rests on a character that is not a combining
 * mark, never going before begin.  Returns the number of safe
 * single-character steps taken (0 when *text equals begin). */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps = 0;

    if (*text == begin)
        return steps;

    do
    {
        str_utf8_cprev_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && *text != begin);

    return steps;
}
203 static int
204 str_utf8_toupper (const char *text, char **out, size_t * remain)
206 gunichar uni;
207 size_t left;
209 uni = g_utf8_get_char_validated (text, -1);
210 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
211 return 0;
213 uni = g_unichar_toupper (uni);
214 left = g_unichar_to_utf8 (uni, NULL);
215 if (left >= *remain)
216 return 0;
218 left = g_unichar_to_utf8 (uni, *out);
219 (*out) += left;
220 (*remain) -= left;
221 return 1;
224 static int
225 str_utf8_tolower (const char *text, char **out, size_t * remain)
227 gunichar uni;
228 size_t left;
230 uni = g_utf8_get_char_validated (text, -1);
231 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
232 return 0;
234 uni = g_unichar_tolower (uni);
235 left = g_unichar_to_utf8 (uni, NULL);
236 if (left >= *remain)
237 return 0;
239 left = g_unichar_to_utf8 (uni, *out);
240 (*out) += left;
241 (*remain) -= left;
242 return 1;
/* Count the characters of text.  Valid stretches are counted with
 * g_utf8_strlen (); each byte at which validation fails counts as one
 * character. */
static int
str_utf8_length (const char *text)
{
    int count = 0;
    const char *seg = text;
    const char *bad;

    for (;;)
    {
        if (g_utf8_validate (seg, -1, &bad) || seg[0] == '\0')
            break;

        if (seg != bad)
            count += g_utf8_strlen (seg, bad - seg);
        /* the offending byte itself */
        count++;
        seg = bad + 1;
    }

    /* nothing invalid was found: count the whole string in one go */
    if (seg == text)
        return g_utf8_strlen (text, -1);

    /* valid tail after the last invalid byte */
    if (seg[0] != '\0' && seg != bad)
        count += g_utf8_strlen (seg, bad - seg);

    return count;
}
/* Variant of str_utf8_length () with a budget: size is passed to
 * g_utf8_strlen () as its maximum and is decreased as the text is consumed
 * (presumably a byte limit -- confirm against callers). */
static int
str_utf8_length2 (const char *text, int size)
{
    int count = 0;
    const char *seg = text;
    const char *bad;

    for (;;)
    {
        if (g_utf8_validate (seg, -1, &bad) || seg[0] == '\0' || size <= 0)
            break;

        if (seg != bad)
        {
            count += g_utf8_strlen (seg, min (bad - seg, size));
            size -= bad - seg;
        }
        /* the offending byte counts only while budget is left */
        if (size > 0)
            count++;
        size--;
        seg = bad + 1;
    }

    if (seg == text)
        return g_utf8_strlen (text, size);

    if (seg[0] != '\0' && seg != bad && size > 0)
        count += g_utf8_strlen (seg, min (bad - seg, size));

    return count;
}
/* Count the characters of text, treating a character together with the
 * combining marks that follow it as a single unit. */
static int
str_utf8_length_noncomb (const char *text)
{
    int count;
    const char *pos;

    for (count = 0, pos = text; pos[0] != '\0'; count++)
        str_utf8_cnext_noncomb_char (&pos);

    return count;
}
329 static void
330 str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
332 char *next = g_utf8_next_char (*string);
333 (*left) -= next - (*string);
334 (*string) = next;
335 g_string_append_c (buffer, '?');
339 static gchar *
340 str_utf8_conv_gerror_message (GError * error, const char *def_msg)
342 if ((error != NULL) && (error->message != NULL))
343 return g_strdup (error->message);
345 return g_strdup (def_msg != NULL ? def_msg : "");
348 static estr_t
349 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
351 estr_t result;
353 if (coder == str_cnv_not_convert)
355 g_string_append_len (buffer, string, size);
356 result = ESTR_SUCCESS;
358 else
359 result = str_nconvert (coder, (char *) string, size, buffer);
361 return result;
364 struct term_form
366 char text[BUF_MEDIUM * 6];
367 size_t width;
368 gboolean compose;
371 /* utiliti function, that make string valid in utf8 and all characters printable
372 * return width of string too*/
373 static const struct term_form *
374 str_utf8_make_make_term_form (const char *text, size_t length)
376 static struct term_form result;
377 gunichar uni;
378 size_t left;
379 char *actual;
381 result.text[0] = '\0';
382 result.width = 0;
383 result.compose = FALSE;
384 actual = result.text;
386 /* check if text start with combining character,
387 * add space at begin in this case */
388 if (length != 0 && text[0] != '\0')
390 uni = g_utf8_get_char_validated (text, -1);
391 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
393 if (str_unichar_iscombiningmark (uni))
395 actual[0] = ' ';
396 actual++;
397 result.width++;
398 result.compose = TRUE;
403 while (length != 0 && text[0] != '\0')
405 uni = g_utf8_get_char_validated (text, -1);
406 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
408 if (g_unichar_isprint (uni))
410 left = g_unichar_to_utf8 (uni, actual);
411 actual += left;
412 if (str_unichar_iscombiningmark (uni))
413 result.compose = TRUE;
414 else
416 result.width++;
417 if (g_unichar_iswide (uni))
418 result.width++;
421 else
423 actual[0] = '.';
424 actual++;
425 result.width++;
427 text = g_utf8_next_char (text);
429 else
431 text++;
432 /*actual[0] = '?'; */
433 memcpy (actual, replch, strlen (replch));
434 actual += strlen (replch);
435 result.width++;
437 if (length != (size_t) (-1))
438 length--;
440 actual[0] = '\0';
442 return &result;
445 static const char *
446 str_utf8_term_form (const char *text)
448 static char result[BUF_MEDIUM * 6];
449 const struct term_form *pre_form;
450 char *composed;
452 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
453 if (pre_form->compose)
455 composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
456 g_strlcpy (result, composed, sizeof (result));
457 g_free (composed);
459 else
461 g_strlcpy (result, pre_form->text, sizeof (result));
463 return result;
466 struct utf8_tool
468 char *actual;
469 size_t remain;
470 const char *cheked;
471 int ident;
472 gboolean compose;
475 /* utiliti function, that copy all characters from cheked to actual */
476 static gboolean
477 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
479 size_t left;
480 gunichar uni;
482 tool->compose = FALSE;
484 while (tool->cheked[0] != '\0')
486 uni = g_utf8_get_char (tool->cheked);
487 tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
488 left = g_unichar_to_utf8 (uni, NULL);
489 if (tool->remain <= left)
490 return FALSE;
491 left = g_unichar_to_utf8 (uni, tool->actual);
492 tool->actual += left;
493 tool->remain -= left;
494 tool->cheked = g_utf8_next_char (tool->cheked);
496 return TRUE;
499 /* utiliti function, that copy characters from cheked to actual until ident is
500 * smaller than to_ident */
501 static gboolean
502 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
504 size_t left;
505 gunichar uni;
506 int w;
508 tool->compose = FALSE;
510 while (tool->cheked[0] != '\0')
512 uni = g_utf8_get_char (tool->cheked);
513 if (!str_unichar_iscombiningmark (uni))
515 w = 1;
516 if (g_unichar_iswide (uni))
517 w++;
518 if (tool->ident + w > to_ident)
519 return TRUE;
521 else
523 w = 0;
524 tool->compose = TRUE;
527 left = g_unichar_to_utf8 (uni, NULL);
528 if (tool->remain <= left)
529 return FALSE;
530 left = g_unichar_to_utf8 (uni, tool->actual);
531 tool->actual += left;
532 tool->remain -= left;
533 tool->cheked = g_utf8_next_char (tool->cheked);
534 tool->ident += w;
536 return TRUE;
539 /* utiliti function, add count spaces to actual */
540 static int
541 utf8_tool_insert_space (struct utf8_tool *tool, int count)
543 if (count <= 0)
544 return 1;
545 if (tool->remain <= (gsize) count)
546 return 0;
547 memset (tool->actual, ' ', count);
548 tool->actual += count;
549 tool->remain -= count;
550 return 1;
553 /* utiliti function, add one characters to actual */
554 static int
555 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
557 if (tool->remain <= 1)
558 return 0;
559 tool->actual[0] = ch;
560 tool->actual++;
561 tool->remain--;
562 return 1;
565 /* utiliti function, thah skip characters from cheked until ident is greater or
566 * equal to to_ident */
567 static gboolean
568 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
570 gunichar uni;
572 while (to_ident > tool->ident && tool->cheked[0] != '\0')
574 uni = g_utf8_get_char (tool->cheked);
575 if (!str_unichar_iscombiningmark (uni))
577 tool->ident++;
578 if (g_unichar_iswide (uni))
579 tool->ident++;
581 tool->cheked = g_utf8_next_char (tool->cheked);
583 uni = g_utf8_get_char (tool->cheked);
584 while (str_unichar_iscombiningmark (uni))
586 tool->cheked = g_utf8_next_char (tool->cheked);
587 uni = g_utf8_get_char (tool->cheked);
589 return TRUE;
592 static void
593 utf8_tool_compose (char *buffer, size_t size)
595 char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
596 g_strlcpy (buffer, composed, size);
597 g_free (composed);
601 static const char *
602 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
604 static char result[BUF_MEDIUM * 6];
605 const struct term_form *pre_form;
606 struct utf8_tool tool;
608 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
609 tool.cheked = pre_form->text;
610 tool.actual = result;
611 tool.remain = sizeof (result);
612 tool.compose = FALSE;
614 if (pre_form->width <= (gsize) width)
616 tool.ident = 0;
617 switch (HIDE_FIT (just_mode))
619 case J_CENTER_LEFT:
620 case J_CENTER:
621 tool.ident = (width - pre_form->width) / 2;
622 break;
623 case J_RIGHT:
624 tool.ident = width - pre_form->width;
625 break;
628 utf8_tool_insert_space (&tool, tool.ident);
629 utf8_tool_copy_chars_to_end (&tool);
630 utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
632 else
634 if (IS_FIT (just_mode))
636 tool.ident = 0;
637 utf8_tool_copy_chars_to (&tool, width / 2);
638 utf8_tool_insert_char (&tool, '~');
640 tool.ident = 0;
641 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
642 utf8_tool_copy_chars_to_end (&tool);
643 utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
645 else
647 tool.ident = 0;
648 switch (HIDE_FIT (just_mode))
650 case J_CENTER:
651 tool.ident = (width - pre_form->width) / 2;
652 break;
653 case J_RIGHT:
654 tool.ident = width - pre_form->width;
655 break;
658 utf8_tool_skip_chars_to (&tool, 0);
659 utf8_tool_insert_space (&tool, tool.ident);
660 utf8_tool_copy_chars_to (&tool, width);
661 utf8_tool_insert_space (&tool, width - tool.ident);
665 tool.actual[0] = '\0';
666 if (tool.compose)
667 utf8_tool_compose (result, sizeof (result));
668 return result;
671 static const char *
672 str_utf8_term_trim (const char *text, int width)
674 static char result[BUF_MEDIUM * 6];
675 const struct term_form *pre_form;
676 struct utf8_tool tool;
678 if (width < 1)
680 result[0] = '\0';
681 return result;
684 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
686 tool.cheked = pre_form->text;
687 tool.actual = result;
688 tool.remain = sizeof (result);
689 tool.compose = FALSE;
691 if ((gsize) width < pre_form->width)
693 if (width <= 3)
695 memset (tool.actual, '.', width);
696 tool.actual += width;
697 tool.remain -= width;
699 else
701 memset (tool.actual, '.', 3);
702 tool.actual += 3;
703 tool.remain -= 3;
705 tool.ident = 0;
706 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
707 utf8_tool_copy_chars_to_end (&tool);
710 else
712 utf8_tool_copy_chars_to_end (&tool);
715 tool.actual[0] = '\0';
716 if (tool.compose)
717 utf8_tool_compose (result, sizeof (result));
718 return result;
721 static int
722 str_utf8_term_width2 (const char *text, size_t length)
724 const struct term_form *result;
726 result = str_utf8_make_make_term_form (text, length);
727 return result->width;
/* Return the terminal width of the whole text. */
static int
str_utf8_term_width1 (const char *text)
{
    const size_t no_limit = (size_t) (-1);

    return str_utf8_term_width2 (text, no_limit);
}
736 static int
737 str_utf8_term_char_width (const char *text)
739 gunichar uni = g_utf8_get_char_validated (text, -1);
740 return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
743 static const char *
744 str_utf8_term_substring (const char *text, int start, int width)
746 static char result[BUF_MEDIUM * 6];
747 const struct term_form *pre_form;
748 struct utf8_tool tool;
750 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
752 tool.cheked = pre_form->text;
753 tool.actual = result;
754 tool.remain = sizeof (result);
755 tool.compose = FALSE;
757 tool.ident = -start;
758 utf8_tool_skip_chars_to (&tool, 0);
759 if (tool.ident < 0)
760 tool.ident = 0;
761 utf8_tool_insert_space (&tool, tool.ident);
763 utf8_tool_copy_chars_to (&tool, width);
764 utf8_tool_insert_space (&tool, width - tool.ident);
766 tool.actual[0] = '\0';
767 if (tool.compose)
768 utf8_tool_compose (result, sizeof (result));
769 return result;
772 static const char *
773 str_utf8_trunc (const char *text, int width)
775 static char result[MC_MAXPATHLEN * 6 * 2];
776 const struct term_form *pre_form;
777 struct utf8_tool tool;
779 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
781 tool.cheked = pre_form->text;
782 tool.actual = result;
783 tool.remain = sizeof (result);
784 tool.compose = FALSE;
786 if (pre_form->width > (gsize) width)
788 tool.ident = 0;
789 utf8_tool_copy_chars_to (&tool, width / 2);
790 utf8_tool_insert_char (&tool, '~');
792 tool.ident = 0;
793 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
794 utf8_tool_copy_chars_to_end (&tool);
796 else
798 utf8_tool_copy_chars_to_end (&tool);
801 tool.actual[0] = '\0';
802 if (tool.compose)
803 utf8_tool_compose (result, sizeof (result));
804 return result;
807 static int
808 str_utf8_offset_to_pos (const char *text, size_t length)
810 if (str_utf8_is_valid_string (text))
811 return g_utf8_offset_to_pointer (text, length) - text;
812 else
814 int result;
815 GString *buffer = g_string_new (text);
817 str_utf8_fix_string (buffer->str);
818 result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
819 g_string_free (buffer, TRUE);
820 return result;
824 static int
825 str_utf8_column_to_pos (const char *text, size_t pos)
827 static int result;
828 gunichar uni;
829 int width;
831 width = 0;
832 result = 0;
834 while (text[0] != '\0')
836 uni = g_utf8_get_char_validated (text, 6);
837 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
839 if (g_unichar_isprint (uni))
841 if (!str_unichar_iscombiningmark (uni))
843 width++;
844 if (g_unichar_iswide (uni))
845 width++;
848 else
850 width++;
852 text = g_utf8_next_char (text);
854 else
856 text++;
857 width++;
859 if ((gsize) width > pos)
860 return result;
862 result++;
865 return result;
868 static char *
869 str_utf8_create_search_needle (const char *needle, int case_sen)
871 if (needle != NULL)
873 if (case_sen)
875 return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
877 else
879 char *fold = g_utf8_casefold (needle, -1);
880 char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
881 g_free (fold);
882 return result;
885 else
886 return NULL;
/* Free a needle created by str_utf8_create_search_needle ().
 * case_sen is unused; a NULL needle is accepted. */
static void
str_utf8_release_search_needle (char *needle, int case_sen)
{
    (void) case_sen;
    /* g_free () ignores NULL, so no guard is needed */
    g_free (needle);
}
897 static const char *
898 str_utf8_search_first (const char *text, const char *search, int case_sen)
900 char *fold_text;
901 char *deco_text;
902 const char *match;
903 const char *result = NULL;
904 const char *m;
906 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
907 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
909 match = deco_text;
912 match = g_strstr_len (match, -1, search);
913 if (match != NULL)
915 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
916 !str_utf8_iscombiningmark (match + strlen (search)))
919 result = text;
920 m = deco_text;
921 while (m < match)
923 str_utf8_cnext_noncomb_char (&m);
924 str_utf8_cnext_noncomb_char (&result);
927 else
929 str_utf8_cnext_char (&match);
933 while (match != NULL && result == NULL);
935 g_free (deco_text);
936 if (!case_sen)
937 g_free (fold_text);
939 return result;
942 static const char *
943 str_utf8_search_last (const char *text, const char *search, int case_sen)
945 char *fold_text;
946 char *deco_text;
947 char *match;
948 const char *result = NULL;
949 const char *m;
951 fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
952 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
956 match = g_strrstr_len (deco_text, -1, search);
957 if (match != NULL)
959 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
960 !str_utf8_iscombiningmark (match + strlen (search)))
963 result = text;
964 m = deco_text;
965 while (m < match)
967 str_utf8_cnext_noncomb_char (&m);
968 str_utf8_cnext_noncomb_char (&result);
971 else
973 match[0] = '\0';
977 while (match != NULL && result == NULL);
979 g_free (deco_text);
980 if (!case_sen)
981 g_free (fold_text);
983 return result;
986 static char *
987 str_utf8_normalize (const char *text)
989 GString *fixed;
990 char *tmp;
991 char *result;
992 const char *start;
993 const char *end;
995 fixed = g_string_sized_new (4);
997 start = text;
998 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1000 if (start != end)
1002 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1003 g_string_append (fixed, tmp);
1004 g_free (tmp);
1006 g_string_append_c (fixed, end[0]);
1007 start = end + 1;
1010 if (start == text)
1012 result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1013 g_string_free (fixed, TRUE);
1015 else
1017 if (start[0] != '\0' && start != end)
1019 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1020 g_string_append (fixed, tmp);
1021 g_free (tmp);
1023 result = g_string_free (fixed, FALSE);
1026 return result;
1029 static char *
1030 str_utf8_casefold_normalize (const char *text)
1032 GString *fixed;
1033 char *tmp, *fold;
1034 char *result;
1035 const char *start;
1036 const char *end;
1038 fixed = g_string_sized_new (4);
1040 start = text;
1041 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1043 if (start != end)
1045 fold = g_utf8_casefold (start, end - start);
1046 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1047 g_string_append (fixed, tmp);
1048 g_free (tmp);
1049 g_free (fold);
1051 g_string_append_c (fixed, end[0]);
1052 start = end + 1;
1055 if (start == text)
1057 fold = g_utf8_casefold (text, -1);
1058 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1059 g_free (fold);
1060 g_string_free (fixed, TRUE);
1062 else
1064 if (start[0] != '\0' && start != end)
1066 fold = g_utf8_casefold (start, end - start);
1067 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1068 g_string_append (fixed, tmp);
1069 g_free (tmp);
1070 g_free (fold);
1072 result = g_string_free (fixed, FALSE);
1075 return result;
/* Compare the normalized forms of t1 and t2 with strcmp (). */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *lhs = str_utf8_normalize (t1);
    char *rhs = str_utf8_normalize (t2);
    const int order = strcmp (lhs, rhs);

    g_free (lhs);
    g_free (rhs);

    return order;
}
/* Compare the normalized forms of t1 and t2 over the length of the shorter
 * of the two. */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *lhs = str_utf8_normalize (t1);
    char *rhs = str_utf8_normalize (t2);
    const int order = strncmp (lhs, rhs, min (strlen (lhs), strlen (rhs)));

    g_free (lhs);
    g_free (rhs);

    return order;
}
/* Compare the case-folded, normalized forms of t1 and t2 with strcmp (). */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *lhs = str_utf8_casefold_normalize (t1);
    char *rhs = str_utf8_casefold_normalize (t2);
    const int order = strcmp (lhs, rhs);

    g_free (lhs);
    g_free (rhs);

    return order;
}
/* Compare the case-folded, normalized forms of t1 and t2 over the length of
 * the shorter of the two. */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *lhs = str_utf8_casefold_normalize (t1);
    char *rhs = str_utf8_casefold_normalize (t2);
    const int order = strncmp (lhs, rhs, min (strlen (lhs), strlen (rhs)));

    g_free (lhs);
    g_free (rhs);

    return order;
}
/* Return the length in bytes, measured in the normalized form of prefix, of
 * the leading part that text and prefix have in common.  The comparison
 * proceeds character by character on the normalized strings. */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *cur_t = norm_text;
    const char *cur_p = norm_prefix;
    const char *next_t = norm_text;
    const char *next_p = norm_prefix;
    int matched;

    while (cur_t[0] != '\0' && cur_p[0] != '\0')
    {
        str_utf8_cnext_char_safe (&next_t);
        str_utf8_cnext_char_safe (&next_p);

        /* characters differ in length or in content */
        if (next_t - cur_t != next_p - cur_p || strncmp (cur_t, cur_p, next_t - cur_t) != 0)
            break;

        cur_t = next_t;
        cur_p = next_p;
    }

    matched = cur_p - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
/* Return the length in bytes, measured in the case-folded normalized form of
 * prefix, of the leading part that text and prefix have in common when case
 * is ignored.  The comparison proceeds character by character. */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_casefold_normalize (text);
    char *norm_prefix = str_utf8_casefold_normalize (prefix);
    const char *cur_t = norm_text;
    const char *cur_p = norm_prefix;
    const char *next_t = norm_text;
    const char *next_p = norm_prefix;
    int matched;

    while (cur_t[0] != '\0' && cur_p[0] != '\0')
    {
        str_utf8_cnext_char_safe (&next_t);
        str_utf8_cnext_char_safe (&next_p);

        /* characters differ in length or in content */
        if (next_t - cur_t != next_p - cur_p || strncmp (cur_t, cur_p, next_t - cur_t) != 0)
            break;

        cur_t = next_t;
        cur_p = next_p;
    }

    matched = cur_p - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1208 static char *
1209 str_utf8_create_key_gen (const char *text, int case_sen,
1210 gchar * (*keygen) (const gchar * text, gssize size))
1212 char *result;
1214 if (case_sen)
1216 result = str_utf8_normalize (text);
1218 else
1220 gboolean dot;
1221 GString *fixed;
1222 const char *start, *end;
1223 char *fold, *key;
1225 dot = text[0] == '.';
1226 fixed = g_string_sized_new (16);
1228 if (!dot)
1229 start = text;
1230 else
1232 start = text + 1;
1233 g_string_append_c (fixed, '.');
1236 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1238 if (start != end)
1240 fold = g_utf8_casefold (start, end - start);
1241 key = keygen (fold, -1);
1242 g_string_append (fixed, key);
1243 g_free (key);
1244 g_free (fold);
1246 g_string_append_c (fixed, end[0]);
1247 start = end + 1;
1250 if (start == text)
1252 fold = g_utf8_casefold (start, -1);
1253 result = keygen (fold, -1);
1254 g_free (fold);
1255 g_string_free (fixed, TRUE);
1257 else if (dot && (start == text + 1))
1259 fold = g_utf8_casefold (start, -1);
1260 key = keygen (fold, -1);
1261 g_string_append (fixed, key);
1262 g_free (key);
1263 g_free (fold);
1264 result = g_string_free (fixed, FALSE);
1266 else
1268 if (start[0] != '\0' && start != end)
1270 fold = g_utf8_casefold (start, end - start);
1271 key = keygen (fold, -1);
1272 g_string_append (fixed, key);
1273 g_free (key);
1274 g_free (fold);
1276 result = g_string_free (fixed, FALSE);
1279 return result;
1282 static char *
1283 str_utf8_create_key (const char *text, int case_sen)
1285 return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/* Create a sort key for text based on g_utf8_collate_key_for_filename (). */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    char *key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
/* Order two keys with strcmp (); case_sen is unused. */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    const int order = strcmp (t1, t2);

    (void) case_sen;
    return order;
}
/* Free key with g_free (); case_sen is unused. */
static void
str_utf8_release_key (char *key, int case_sen)
{
    g_free (key);
    (void) case_sen;
}
1310 struct str_class
1311 str_utf8_init (void)
1313 struct str_class result;
1315 result.conv_gerror_message = str_utf8_conv_gerror_message;
1316 result.vfs_convert_to = str_utf8_vfs_convert_to;
1317 result.insert_replace_char = str_utf8_insert_replace_char;
1318 result.is_valid_string = str_utf8_is_valid_string;
1319 result.is_valid_char = str_utf8_is_valid_char;
1320 result.cnext_char = str_utf8_cnext_char;
1321 result.cprev_char = str_utf8_cprev_char;
1322 result.cnext_char_safe = str_utf8_cnext_char_safe;
1323 result.cprev_char_safe = str_utf8_cprev_char_safe;
1324 result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1325 result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1326 result.char_isspace = str_utf8_isspace;
1327 result.char_ispunct = str_utf8_ispunct;
1328 result.char_isalnum = str_utf8_isalnum;
1329 result.char_isdigit = str_utf8_isdigit;
1330 result.char_isprint = str_utf8_isprint;
1331 result.char_iscombiningmark = str_utf8_iscombiningmark;
1332 result.char_toupper = str_utf8_toupper;
1333 result.char_tolower = str_utf8_tolower;
1334 result.length = str_utf8_length;
1335 result.length2 = str_utf8_length2;
1336 result.length_noncomb = str_utf8_length_noncomb;
1337 result.fix_string = str_utf8_fix_string;
1338 result.term_form = str_utf8_term_form;
1339 result.fit_to_term = str_utf8_fit_to_term;
1340 result.term_trim = str_utf8_term_trim;
1341 result.term_width2 = str_utf8_term_width2;
1342 result.term_width1 = str_utf8_term_width1;
1343 result.term_char_width = str_utf8_term_char_width;
1344 result.term_substring = str_utf8_term_substring;
1345 result.trunc = str_utf8_trunc;
1346 result.offset_to_pos = str_utf8_offset_to_pos;
1347 result.column_to_pos = str_utf8_column_to_pos;
1348 result.create_search_needle = str_utf8_create_search_needle;
1349 result.release_search_needle = str_utf8_release_search_needle;
1350 result.search_first = str_utf8_search_first;
1351 result.search_last = str_utf8_search_last;
1352 result.compare = str_utf8_compare;
1353 result.ncompare = str_utf8_ncompare;
1354 result.casecmp = str_utf8_casecmp;
1355 result.ncasecmp = str_utf8_ncasecmp;
1356 result.prefix = str_utf8_prefix;
1357 result.caseprefix = str_utf8_caseprefix;
1358 result.create_key = str_utf8_create_key;
1359 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1360 /* case insensitive sort files in "a1 a2 a10" order */
1361 result.create_key_for_filename = str_utf8_create_key_for_filename;
1362 #else
1363 /* case insensitive sort files in "a1 a10 a2" order */
1364 result.create_key_for_filename = str_utf8_create_key;
1365 #endif
1366 result.key_collate = str_utf8_key_collate;
1367 result.release_key = str_utf8_release_key;
1369 return result;