lib/strutil/strutilutf8.c

   1 /*
   2    UTF-8 strings utilities
   3
   4    Copyright (C) 2007, 2011
   5    The Free Software Foundation, Inc.
   6
   7    Written by:
   8    Rostislav Benes, 2007
   9
  10    The file_date routine is mostly from GNU's fileutils package,
  11    written by Richard Stallman and David MacKenzie.
  12
  13    This file is part of the Midnight Commander.
  14
  15    The Midnight Commander is free software: you can redistribute it
  16    and/or modify it under the terms of the GNU General Public License as
  17    published by the Free Software Foundation, either version 3 of the License,
  18    or (at your option) any later version.
  19
  20    The Midnight Commander is distributed in the hope that it will be useful,
  21    but WITHOUT ANY WARRANTY; without even the implied warranty of
  22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23    GNU General Public License for more details.
  24
  25    You should have received a copy of the GNU General Public License
  26    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  27  */
  28
  29 #include <config.h>
  30 #include <stdlib.h>
  31 #include <stdio.h>
  32 #include <errno.h>
  33 #include <glib.h>
  34 #include <langinfo.h>
  35 #include <string.h>
  36
  37 #include "lib/global.h"
  38 #include "lib/strutil.h"
  39
/* using functions for UTF-8 from glib */
  41
/* UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER); emitted in place of invalid input bytes */
static const char replch[] = "\xEF\xBF\xBD";
  43
  44 static gboolean
  45 str_unichar_iscombiningmark (gunichar uni)
  46 {
  47     GUnicodeType type;
  48
  49     type = g_unichar_type (uni);
  50     return (type == G_UNICODE_COMBINING_MARK)
  51         || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  52 }
  53
  54 static void
  55 str_utf8_insert_replace_char (GString * buffer)
  56 {
  57     g_string_append (buffer, replch);
  58 }
  59
  60 static int
  61 str_utf8_is_valid_string (const char *text)
  62 {
  63     return g_utf8_validate (text, -1, NULL);
  64 }
  65
  66 static int
  67 str_utf8_is_valid_char (const char *ch, size_t size)
  68 {
  69     switch (g_utf8_get_char_validated (ch, size))
  70     {
  71     case (gunichar) (-2):
  72         return -2;
  73     case (gunichar) (-1):
  74         return -1;
  75     default:
  76         return 1;
  77     }
  78 }
  79
  80 static void
  81 str_utf8_cnext_char (const char **text)
  82 {
  83     (*text) = g_utf8_next_char (*text);
  84 }
  85
  86 static void
  87 str_utf8_cprev_char (const char **text)
  88 {
  89     (*text) = g_utf8_prev_char (*text);
  90 }
  91
  92 static void
  93 str_utf8_cnext_char_safe (const char **text)
  94 {
  95     if (str_utf8_is_valid_char (*text, -1) == 1)
  96         (*text) = g_utf8_next_char (*text);
  97     else
  98         (*text)++;
  99 }
 100
 101 static void
 102 str_utf8_cprev_char_safe (const char **text)
 103 {
 104     const char *result = g_utf8_prev_char (*text);
 105     const char *t = result;
 106     str_utf8_cnext_char_safe (&t);
 107     if (t == *text)
 108         (*text) = result;
 109     else
 110         (*text)--;
 111 }
 112
 113 static void
 114 str_utf8_fix_string (char *text)
 115 {
 116     gunichar uni;
 117
 118     while (text[0] != '\0')
 119     {
 120         uni = g_utf8_get_char_validated (text, -1);
 121         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 122         {
 123             text = g_utf8_next_char (text);
 124         }
 125         else
 126         {
 127             text[0] = '?';
 128             text++;
 129         }
 130     }
 131 }
 132
 133 static int
 134 str_utf8_isspace (const char *text)
 135 {
 136     gunichar uni = g_utf8_get_char_validated (text, -1);
 137     return g_unichar_isspace (uni);
 138 }
 139
 140 static int
 141 str_utf8_ispunct (const char *text)
 142 {
 143     gunichar uni = g_utf8_get_char_validated (text, -1);
 144     return g_unichar_ispunct (uni);
 145 }
 146
 147 static int
 148 str_utf8_isalnum (const char *text)
 149 {
 150     gunichar uni = g_utf8_get_char_validated (text, -1);
 151     return g_unichar_isalnum (uni);
 152 }
 153
 154 static int
 155 str_utf8_isdigit (const char *text)
 156 {
 157     gunichar uni = g_utf8_get_char_validated (text, -1);
 158     return g_unichar_isdigit (uni);
 159 }
 160
 161 static int
 162 str_utf8_isprint (const char *ch)
 163 {
 164     gunichar uni = g_utf8_get_char_validated (ch, -1);
 165     return g_unichar_isprint (uni);
 166 }
 167
 168 static gboolean
 169 str_utf8_iscombiningmark (const char *ch)
 170 {
 171     gunichar uni = g_utf8_get_char_validated (ch, -1);
 172     return str_unichar_iscombiningmark (uni);
 173 }
 174
 175 static int
 176 str_utf8_cnext_noncomb_char (const char **text)
 177 {
 178     int count = 0;
 179     while ((*text)[0] != '\0')
 180     {
 181         str_utf8_cnext_char_safe (text);
 182         count++;
 183         if (!str_utf8_iscombiningmark (*text))
 184             break;
 185     }
 186     return count;
 187 }
 188
 189 static int
 190 str_utf8_cprev_noncomb_char (const char **text, const char *begin)
 191 {
 192     int count = 0;
 193     while ((*text) != begin)
 194     {
 195         str_utf8_cprev_char_safe (text);
 196         count++;
 197         if (!str_utf8_iscombiningmark (*text))
 198             break;
 199     }
 200     return count;
 201 }
 202
 203 static int
 204 str_utf8_toupper (const char *text, char **out, size_t * remain)
 205 {
 206     gunichar uni;
 207     size_t left;
 208
 209     uni = g_utf8_get_char_validated (text, -1);
 210     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 211         return 0;
 212
 213     uni = g_unichar_toupper (uni);
 214     left = g_unichar_to_utf8 (uni, NULL);
 215     if (left >= *remain)
 216         return 0;
 217
 218     left = g_unichar_to_utf8 (uni, *out);
 219     (*out) += left;
 220     (*remain) -= left;
 221     return 1;
 222 }
 223
 224 static int
 225 str_utf8_tolower (const char *text, char **out, size_t * remain)
 226 {
 227     gunichar uni;
 228     size_t left;
 229
 230     uni = g_utf8_get_char_validated (text, -1);
 231     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 232         return 0;
 233
 234     uni = g_unichar_tolower (uni);
 235     left = g_unichar_to_utf8 (uni, NULL);
 236     if (left >= *remain)
 237         return 0;
 238
 239     left = g_unichar_to_utf8 (uni, *out);
 240     (*out) += left;
 241     (*remain) -= left;
 242     return 1;
 243 }
 244
 245 static int
 246 str_utf8_length (const char *text)
 247 {
 248     int result = 0;
 249     const char *start;
 250     const char *end;
 251
 252     start = text;
 253     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 254     {
 255         if (start != end)
 256         {
 257             result += g_utf8_strlen (start, end - start);
 258         }
 259         result++;
 260         start = end + 1;
 261     }
 262
 263     if (start == text)
 264     {
 265         result = g_utf8_strlen (text, -1);
 266     }
 267     else
 268     {
 269         if (start[0] != '\0' && start != end)
 270         {
 271             result += g_utf8_strlen (start, end - start);
 272         }
 273     }
 274
 275     return result;
 276 }
 277
 278 static int
 279 str_utf8_length2 (const char *text, int size)
 280 {
 281     int result = 0;
 282     const char *start;
 283     const char *end;
 284
 285     start = text;
 286     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
 287     {
 288         if (start != end)
 289         {
 290             result += g_utf8_strlen (start, min (end - start, size));
 291             size -= end - start;
 292         }
 293         result += (size > 0);
 294         size--;
 295         start = end + 1;
 296     }
 297
 298     if (start == text)
 299     {
 300         result = g_utf8_strlen (text, size);
 301     }
 302     else
 303     {
 304         if (start[0] != '\0' && start != end && size > 0)
 305         {
 306             result += g_utf8_strlen (start, min (end - start, size));
 307         }
 308     }
 309
 310     return result;
 311 }
 312
 313 static int
 314 str_utf8_length_noncomb (const char *text)
 315 {
 316     int result = 0;
 317     const char *t = text;
 318
 319     while (t[0] != '\0')
 320     {
 321         str_utf8_cnext_noncomb_char (&t);
 322         result++;
 323     }
 324
 325     return result;
 326 }
 327
 328 /*
 329    static void
 330    str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
 331    {
 332    char *next = g_utf8_next_char (*string);
 333    (*left) -= next - (*string);
 334    (*string) = next;
 335    g_string_append_c (buffer, '?');
 336    }
 337  */
 338
 339 static gchar *
 340 str_utf8_conv_gerror_message (GError * error, const char *def_msg)
 341 {
 342     if ((error != NULL) && (error->message != NULL))
 343         return g_strdup (error->message);
 344
 345     return g_strdup (def_msg != NULL ? def_msg : "");
 346 }
 347
 348 static estr_t
 349 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
 350 {
 351     estr_t result;
 352
 353     if (coder == str_cnv_not_convert)
 354     {
 355         g_string_append_len (buffer, string, size);
 356         result = ESTR_SUCCESS;
 357     }
 358     else
 359         result = str_nconvert (coder, (char *) string, size, buffer);
 360
 361     return result;
 362 }
 363
/* Result of str_utf8_make_make_term_form (): a printable, valid UTF-8
 * rendering of a string together with its measured width. */
struct term_form
{
    char text[BUF_MEDIUM * 6];  /* NUL-terminated converted text */
    size_t width;               /* columns: 1 per character, 2 per wide one, 0 per combining mark */
    gboolean compose;           /* TRUE when text contains a combining mark */
};
 370
 371 /* utiliti function, that make string valid in utf8 and all characters printable
 372  * return width of string too*/
 373 static const struct term_form *
 374 str_utf8_make_make_term_form (const char *text, size_t length)
 375 {
 376     static struct term_form result;
 377     gunichar uni;
 378     size_t left;
 379     char *actual;
 380
 381     result.text[0] = '\0';
 382     result.width = 0;
 383     result.compose = FALSE;
 384     actual = result.text;
 385
 386     /* check if text start with combining character,
 387      * add space at begin in this case */
 388     if (length != 0 && text[0] != '\0')
 389     {
 390         uni = g_utf8_get_char_validated (text, -1);
 391         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 392         {
 393             if (str_unichar_iscombiningmark (uni))
 394             {
 395                 actual[0] = ' ';
 396                 actual++;
 397                 result.width++;
 398                 result.compose = TRUE;
 399             }
 400         }
 401     }
 402
 403     while (length != 0 && text[0] != '\0')
 404     {
 405         uni = g_utf8_get_char_validated (text, -1);
 406         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 407         {
 408             if (g_unichar_isprint (uni))
 409             {
 410                 left = g_unichar_to_utf8 (uni, actual);
 411                 actual += left;
 412                 if (str_unichar_iscombiningmark (uni))
 413                     result.compose = TRUE;
 414                 else
 415                 {
 416                     result.width++;
 417                     if (g_unichar_iswide (uni))
 418                         result.width++;
 419                 }
 420             }
 421             else
 422             {
 423                 actual[0] = '.';
 424                 actual++;
 425                 result.width++;
 426             }
 427             text = g_utf8_next_char (text);
 428         }
 429         else
 430         {
 431             text++;
 432             /*actual[0] = '?'; */
 433             memcpy (actual, replch, strlen (replch));
 434             actual += strlen (replch);
 435             result.width++;
 436         }
 437         if (length != (size_t) (-1))
 438             length--;
 439     }
 440     actual[0] = '\0';
 441
 442     return &result;
 443 }
 444
 445 static const char *
 446 str_utf8_term_form (const char *text)
 447 {
 448     static char result[BUF_MEDIUM * 6];
 449     const struct term_form *pre_form;
 450     char *composed;
 451
 452     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 453     if (pre_form->compose)
 454     {
 455         composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 456         g_strlcpy (result, composed, sizeof (result));
 457         g_free (composed);
 458     }
 459     else
 460     {
 461         g_strlcpy (result, pre_form->text, sizeof (result));
 462     }
 463     return result;
 464 }
 465
/* Working state shared by the utf8_tool_* helpers while they build an
 * output string from already converted text. */
struct utf8_tool
{
    char *actual;               /* write position in the output buffer */
    size_t remain;              /* bytes still free at actual */
    const char *cheked;         /* read position in the source text */
    int ident;                  /* column counter advanced by the copy/skip helpers */
    gboolean compose;           /* set when a combining mark has been copied */
};
 474
 475 /* utiliti function, that copy all characters from cheked to actual */
 476 static gboolean
 477 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
 478 {
 479     size_t left;
 480     gunichar uni;
 481
 482     tool->compose = FALSE;
 483
 484     while (tool->cheked[0] != '\0')
 485     {
 486         uni = g_utf8_get_char (tool->cheked);
 487         tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
 488         left = g_unichar_to_utf8 (uni, NULL);
 489         if (tool->remain <= left)
 490             return FALSE;
 491         left = g_unichar_to_utf8 (uni, tool->actual);
 492         tool->actual += left;
 493         tool->remain -= left;
 494         tool->cheked = g_utf8_next_char (tool->cheked);
 495     }
 496     return TRUE;
 497 }
 498
 499 /* utiliti function, that copy characters from cheked to actual until ident is
 500  * smaller than to_ident */
 501 static gboolean
 502 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
 503 {
 504     size_t left;
 505     gunichar uni;
 506     int w;
 507
 508     tool->compose = FALSE;
 509
 510     while (tool->cheked[0] != '\0')
 511     {
 512         uni = g_utf8_get_char (tool->cheked);
 513         if (!str_unichar_iscombiningmark (uni))
 514         {
 515             w = 1;
 516             if (g_unichar_iswide (uni))
 517                 w++;
 518             if (tool->ident + w > to_ident)
 519                 return TRUE;
 520         }
 521         else
 522         {
 523             w = 0;
 524             tool->compose = TRUE;
 525         }
 526
 527         left = g_unichar_to_utf8 (uni, NULL);
 528         if (tool->remain <= left)
 529             return FALSE;
 530         left = g_unichar_to_utf8 (uni, tool->actual);
 531         tool->actual += left;
 532         tool->remain -= left;
 533         tool->cheked = g_utf8_next_char (tool->cheked);
 534         tool->ident += w;
 535     }
 536     return TRUE;
 537 }
 538
 539 /* utiliti function, add count spaces to actual */
 540 static int
 541 utf8_tool_insert_space (struct utf8_tool *tool, int count)
 542 {
 543     if (count <= 0)
 544         return 1;
 545     if (tool->remain <= (gsize) count)
 546         return 0;
 547     memset (tool->actual, ' ', count);
 548     tool->actual += count;
 549     tool->remain -= count;
 550     return 1;
 551 }
 552
 553 /* utiliti function, add one characters to actual */
 554 static int
 555 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
 556 {
 557     if (tool->remain <= 1)
 558         return 0;
 559     tool->actual[0] = ch;
 560     tool->actual++;
 561     tool->remain--;
 562     return 1;
 563 }
 564
 565 /* utiliti function, thah skip characters from cheked until ident is greater or
 566  * equal to to_ident */
 567 static gboolean
 568 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
 569 {
 570     gunichar uni;
 571
 572     while (to_ident > tool->ident && tool->cheked[0] != '\0')
 573     {
 574         uni = g_utf8_get_char (tool->cheked);
 575         if (!str_unichar_iscombiningmark (uni))
 576         {
 577             tool->ident++;
 578             if (g_unichar_iswide (uni))
 579                 tool->ident++;
 580         }
 581         tool->cheked = g_utf8_next_char (tool->cheked);
 582     }
 583     uni = g_utf8_get_char (tool->cheked);
 584     while (str_unichar_iscombiningmark (uni))
 585     {
 586         tool->cheked = g_utf8_next_char (tool->cheked);
 587         uni = g_utf8_get_char (tool->cheked);
 588     }
 589     return TRUE;
 590 }
 591
 592 static void
 593 utf8_tool_compose (char *buffer, size_t size)
 594 {
 595     char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 596     g_strlcpy (buffer, composed, size);
 597     g_free (composed);
 598 }
 599
 600
/* Produce a version of text laid out for a field of width columns,
 * padded with spaces or shortened according to just_mode.
 * The text is first made printable by str_utf8_make_make_term_form ().
 * The result lives in a static buffer overwritten by the next call. */
static const char *
str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
{
    static char result[BUF_MEDIUM * 6];
    const struct term_form *pre_form;
    struct utf8_tool tool;

    pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
    tool.cheked = pre_form->text;
    tool.actual = result;
    tool.remain = sizeof (result);
    tool.compose = FALSE;

    if (pre_form->width <= (gsize) width)
    {
        /* the text fits: tool.ident becomes the number of leading spaces */
        tool.ident = 0;
        switch (HIDE_FIT (just_mode))
        {
        case J_CENTER_LEFT:
        case J_CENTER:
            tool.ident = (width - pre_form->width) / 2;
            break;
        case J_RIGHT:
            tool.ident = width - pre_form->width;
            break;
        }

        utf8_tool_insert_space (&tool, tool.ident);
        utf8_tool_copy_chars_to_end (&tool);
        /* fill whatever is left of the field on the right */
        utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
    }
    else
    {
        /* the text is wider than the field */
        if (IS_FIT (just_mode))
        {
            /* keep the first width / 2 columns, put '~', then the end of the text */
            tool.ident = 0;
            utf8_tool_copy_chars_to (&tool, width / 2);
            utf8_tool_insert_char (&tool, '~');

            tool.ident = 0;
            utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
            utf8_tool_copy_chars_to_end (&tool);
            utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
        }
        else
        {
            tool.ident = 0;
            switch (HIDE_FIT (just_mode))
            {
            case J_CENTER:
                /* NOTE(review): width < pre_form->width here and pre_form->width is
                 * size_t, so this subtraction is done unsigned before the result is
                 * stored in an int - confirm the value is as intended on all targets */
                tool.ident = (width - pre_form->width) / 2;
                break;
            case J_RIGHT:
                tool.ident = width - pre_form->width;
                break;
            }

            utf8_tool_skip_chars_to (&tool, 0);
            utf8_tool_insert_space (&tool, tool.ident);
            utf8_tool_copy_chars_to (&tool, width);
            utf8_tool_insert_space (&tool, width - tool.ident);
        }
    }

    tool.actual[0] = '\0';
    /* tool.compose reflects the last copy helper that ran */
    if (tool.compose)
        utf8_tool_compose (result, sizeof (result));
    return result;
}
 670
 671 static const char *
 672 str_utf8_term_trim (const char *text, int width)
 673 {
 674     static char result[BUF_MEDIUM * 6];
 675     const struct term_form *pre_form;
 676     struct utf8_tool tool;
 677
 678     if (width < 1)
 679     {
 680         result[0] = '\0';
 681         return result;
 682     }
 683
 684     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 685
 686     tool.cheked = pre_form->text;
 687     tool.actual = result;
 688     tool.remain = sizeof (result);
 689     tool.compose = FALSE;
 690
 691     if ((gsize) width < pre_form->width)
 692     {
 693         if (width <= 3)
 694         {
 695             memset (tool.actual, '.', width);
 696             tool.actual += width;
 697             tool.remain -= width;
 698         }
 699         else
 700         {
 701             memset (tool.actual, '.', 3);
 702             tool.actual += 3;
 703             tool.remain -= 3;
 704
 705             tool.ident = 0;
 706             utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
 707             utf8_tool_copy_chars_to_end (&tool);
 708         }
 709     }
 710     else
 711     {
 712         utf8_tool_copy_chars_to_end (&tool);
 713     }
 714
 715     tool.actual[0] = '\0';
 716     if (tool.compose)
 717         utf8_tool_compose (result, sizeof (result));
 718     return result;
 719 }
 720
 721 static int
 722 str_utf8_term_width2 (const char *text, size_t length)
 723 {
 724     const struct term_form *result;
 725
 726     result = str_utf8_make_make_term_form (text, length);
 727     return result->width;
 728 }
 729
 730 static int
 731 str_utf8_term_width1 (const char *text)
 732 {
 733     return str_utf8_term_width2 (text, (size_t) (-1));
 734 }
 735
 736 static int
 737 str_utf8_term_char_width (const char *text)
 738 {
 739     gunichar uni = g_utf8_get_char_validated (text, -1);
 740     return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
 741 }
 742
/* Return the part of text that starts at column start and is width
 * columns wide, padded with spaces. The text is first made printable by
 * str_utf8_make_make_term_form (). The result lives in a static buffer
 * overwritten by the next call. */
static const char *
str_utf8_term_substring (const char *text, int start, int width)
{
    static char result[BUF_MEDIUM * 6];
    const struct term_form *pre_form;
    struct utf8_tool tool;

    pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));

    tool.cheked = pre_form->text;
    tool.actual = result;
    tool.remain = sizeof (result);
    tool.compose = FALSE;

    /* counting from -start up to 0 skips the first start columns */
    tool.ident = -start;
    utf8_tool_skip_chars_to (&tool, 0);
    /* the text ended before column start was reached */
    if (tool.ident < 0)
        tool.ident = 0;
    /* a positive ident here is the overshoot of the skip; pad it with spaces */
    utf8_tool_insert_space (&tool, tool.ident);

    utf8_tool_copy_chars_to (&tool, width);
    /* fill the rest of the field */
    utf8_tool_insert_space (&tool, width - tool.ident);

    tool.actual[0] = '\0';
    if (tool.compose)
        utf8_tool_compose (result, sizeof (result));
    return result;
}
 771
 772 static const char *
 773 str_utf8_trunc (const char *text, int width)
 774 {
 775     static char result[MC_MAXPATHLEN * 6 * 2];
 776     const struct term_form *pre_form;
 777     struct utf8_tool tool;
 778
 779     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 780
 781     tool.cheked = pre_form->text;
 782     tool.actual = result;
 783     tool.remain = sizeof (result);
 784     tool.compose = FALSE;
 785
 786     if (pre_form->width > (gsize) width)
 787     {
 788         tool.ident = 0;
 789         utf8_tool_copy_chars_to (&tool, width / 2);
 790         utf8_tool_insert_char (&tool, '~');
 791
 792         tool.ident = 0;
 793         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 794         utf8_tool_copy_chars_to_end (&tool);
 795     }
 796     else
 797     {
 798         utf8_tool_copy_chars_to_end (&tool);
 799     }
 800
 801     tool.actual[0] = '\0';
 802     if (tool.compose)
 803         utf8_tool_compose (result, sizeof (result));
 804     return result;
 805 }
 806
 807 static int
 808 str_utf8_offset_to_pos (const char *text, size_t length)
 809 {
 810     if (str_utf8_is_valid_string (text))
 811         return g_utf8_offset_to_pointer (text, length) - text;
 812     else
 813     {
 814         int result;
 815         GString *buffer = g_string_new (text);
 816
 817         str_utf8_fix_string (buffer->str);
 818         result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
 819         g_string_free (buffer, TRUE);
 820         return result;
 821     }
 822 }
 823
 824 static int
 825 str_utf8_column_to_pos (const char *text, size_t pos)
 826 {
 827     static int result;
 828     gunichar uni;
 829     int width;
 830
 831     width = 0;
 832     result = 0;
 833
 834     while (text[0] != '\0')
 835     {
 836         uni = g_utf8_get_char_validated (text, 6);
 837         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 838         {
 839             if (g_unichar_isprint (uni))
 840             {
 841                 if (!str_unichar_iscombiningmark (uni))
 842                 {
 843                     width++;
 844                     if (g_unichar_iswide (uni))
 845                         width++;
 846                 }
 847             }
 848             else
 849             {
 850                 width++;
 851             }
 852             text = g_utf8_next_char (text);
 853         }
 854         else
 855         {
 856             text++;
 857             width++;
 858         }
 859         if ((gsize) width > pos)
 860             return result;
 861
 862         result++;
 863     }
 864
 865     return result;
 866 }
 867
 868 static char *
 869 str_utf8_create_search_needle (const char *needle, int case_sen)
 870 {
 871     if (needle != NULL)
 872     {
 873         if (case_sen)
 874         {
 875             return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
 876         }
 877         else
 878         {
 879             char *fold = g_utf8_casefold (needle, -1);
 880             char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
 881             g_free (fold);
 882             return result;
 883         }
 884     }
 885     else
 886         return NULL;
 887 }
 888
 889 static void
 890 str_utf8_release_search_needle (char *needle, int case_sen)
 891 {
 892     (void) case_sen;
 893     if (needle != NULL)
 894         g_free (needle);
 895 }
 896
 897 static const char *
 898 str_utf8_search_first (const char *text, const char *search, int case_sen)
 899 {
 900     char *fold_text;
 901     char *deco_text;
 902     const char *match;
 903     const char *result = NULL;
 904     const char *m;
 905
 906     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 907     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 908
 909     match = deco_text;
 910     do
 911     {
 912         match = g_strstr_len (match, -1, search);
 913         if (match != NULL)
 914         {
 915             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
 916                 !str_utf8_iscombiningmark (match + strlen (search)))
 917             {
 918
 919                 result = text;
 920                 m = deco_text;
 921                 while (m < match)
 922                 {
 923                     str_utf8_cnext_noncomb_char (&m);
 924                     str_utf8_cnext_noncomb_char (&result);
 925                 }
 926             }
 927             else
 928             {
 929                 str_utf8_cnext_char (&match);
 930             }
 931         }
 932     }
 933     while (match != NULL && result == NULL);
 934
 935     g_free (deco_text);
 936     if (!case_sen)
 937         g_free (fold_text);
 938
 939     return result;
 940 }
 941
 942 static const char *
 943 str_utf8_search_last (const char *text, const char *search, int case_sen)
 944 {
 945     char *fold_text;
 946     char *deco_text;
 947     char *match;
 948     const char *result = NULL;
 949     const char *m;
 950
 951     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 952     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 953
 954     do
 955     {
 956         match = g_strrstr_len (deco_text, -1, search);
 957         if (match != NULL)
 958         {
 959             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
 960                 !str_utf8_iscombiningmark (match + strlen (search)))
 961             {
 962
 963                 result = text;
 964                 m = deco_text;
 965                 while (m < match)
 966                 {
 967                     str_utf8_cnext_noncomb_char (&m);
 968                     str_utf8_cnext_noncomb_char (&result);
 969                 }
 970             }
 971             else
 972             {
 973                 match[0] = '\0';
 974             }
 975         }
 976     }
 977     while (match != NULL && result == NULL);
 978
 979     g_free (deco_text);
 980     if (!case_sen)
 981         g_free (fold_text);
 982
 983     return result;
 984 }
 985
 986 static char *
 987 str_utf8_normalize (const char *text)
 988 {
 989     GString *fixed;
 990     char *tmp;
 991     char *result;
 992     const char *start;
 993     const char *end;
 994
 995     fixed = g_string_sized_new (4);
 996
 997     start = text;
 998     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 999     {
1000         if (start != end)
1001         {
1002             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1003             g_string_append (fixed, tmp);
1004             g_free (tmp);
1005         }
1006         g_string_append_c (fixed, end[0]);
1007         start = end + 1;
1008     }
1009
1010     if (start == text)
1011     {
1012         result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1013         g_string_free (fixed, TRUE);
1014     }
1015     else
1016     {
1017         if (start[0] != '\0' && start != end)
1018         {
1019             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1020             g_string_append (fixed, tmp);
1021             g_free (tmp);
1022         }
1023         result = g_string_free (fixed, FALSE);
1024     }
1025
1026     return result;
1027 }
1028
1029 static char *
1030 str_utf8_casefold_normalize (const char *text)
1031 {
1032     GString *fixed;
1033     char *tmp, *fold;
1034     char *result;
1035     const char *start;
1036     const char *end;
1037
1038     fixed = g_string_sized_new (4);
1039
1040     start = text;
1041     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1042     {
1043         if (start != end)
1044         {
1045             fold = g_utf8_casefold (start, end - start);
1046             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1047             g_string_append (fixed, tmp);
1048             g_free (tmp);
1049             g_free (fold);
1050         }
1051         g_string_append_c (fixed, end[0]);
1052         start = end + 1;
1053     }
1054
1055     if (start == text)
1056     {
1057         fold = g_utf8_casefold (text, -1);
1058         result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1059         g_free (fold);
1060         g_string_free (fixed, TRUE);
1061     }
1062     else
1063     {
1064         if (start[0] != '\0' && start != end)
1065         {
1066             fold = g_utf8_casefold (start, end - start);
1067             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1068             g_string_append (fixed, tmp);
1069             g_free (tmp);
1070             g_free (fold);
1071         }
1072         result = g_string_free (fixed, FALSE);
1073     }
1074
1075     return result;
1076 }
1077
/* Compare t1 and t2 bytewise after normalizing both with
 * str_utf8_normalize (). Returns the strcmp () result. */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1094
/* Compare t1 and t2 after normalizing both with str_utf8_normalize (),
 * looking only at as many bytes as the shorter normalized string has.
 * Returns the strncmp () result. */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    int order;

    order = strncmp (left, right, min (strlen (left), strlen (right)));

    g_free (left);
    g_free (right);

    return order;
}
1111
/* Compare t1 and t2 bytewise after case-folding and normalizing both
 * with str_utf8_casefold_normalize (). Returns the strcmp () result. */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1128
/* Compare t1 and t2 after case-folding and normalizing both with
 * str_utf8_casefold_normalize (), looking only at as many bytes as the
 * shorter resulting string has. Returns the strncmp () result. */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    int order;

    order = strncmp (left, right, min (strlen (left), strlen (right)));

    g_free (left);
    g_free (right);

    return order;
}
1145
/*
 * Measure how much of PREFIX matches the beginning of TEXT (case-sensitive).
 *
 * Both strings are normalized with str_utf8_normalize() and then walked one
 * character at a time (as delimited by str_utf8_cnext_char_safe()).  The walk
 * stops at the end of either string or at the first pair of characters that
 * differ in byte length or in content.
 *
 * Returns the number of bytes of the *normalized* prefix that matched.
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text, *norm_prefix;
    const char *text_cur, *prefix_cur;
    const char *text_next, *prefix_next;
    int matched;

    norm_text = str_utf8_normalize (text);
    norm_prefix = str_utf8_normalize (prefix);

    text_cur = text_next = norm_text;
    prefix_cur = prefix_next = norm_prefix;

    while (*text_cur != '\0' && *prefix_cur != '\0')
    {
        /* step the look-ahead pointers over one character in each string */
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* characters must have the same byte length and the same bytes */
        if ((text_next - text_cur) != (prefix_next - prefix_cur)
            || strncmp (text_cur, prefix_cur, text_next - text_cur) != 0)
            break;

        text_cur = text_next;
        prefix_cur = prefix_next;
    }

    matched = prefix_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1176
/*
 * Measure how much of PREFIX matches the beginning of TEXT, ignoring case.
 *
 * Both strings are converted with str_utf8_casefold_normalize() and then
 * walked one character at a time (as delimited by
 * str_utf8_cnext_char_safe()).  The walk stops at the end of either string or
 * at the first pair of characters that differ in byte length or in content.
 *
 * Returns the number of bytes of the *casefolded, normalized* prefix that
 * matched.
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *folded_text = str_utf8_casefold_normalize (text);
    char *folded_prefix = str_utf8_casefold_normalize (prefix);
    const char *tpos = folded_text;
    const char *ppos = folded_prefix;
    const char *tahead = folded_text;
    const char *pahead = folded_prefix;
    int matched;

    for (;;)
    {
        if (tpos[0] == '\0' || ppos[0] == '\0')
            break;

        /* advance the look-ahead pointers by one character each */
        str_utf8_cnext_char_safe (&tahead);
        str_utf8_cnext_char_safe (&pahead);

        /* a difference in byte length already means a mismatch */
        if (tahead - tpos != pahead - ppos)
            break;
        if (strncmp (tpos, ppos, tahead - tpos) != 0)
            break;

        tpos = tahead;
        ppos = pahead;
    }

    matched = ppos - folded_prefix;

    g_free (folded_text);
    g_free (folded_prefix);

    return matched;
}
1207
1208 static char *
1209 str_utf8_create_key_gen (const char *text, int case_sen,
1210                          gchar * (*keygen) (const gchar * text, gssize size))
1211 {
1212     char *result;
1213
1214     if (case_sen)
1215     {
1216         result = str_utf8_normalize (text);
1217     }
1218     else
1219     {
1220         gboolean dot;
1221         GString *fixed;
1222         const char *start, *end;
1223         char *fold, *key;
1224
1225         dot = text[0] == '.';
1226         fixed = g_string_sized_new (16);
1227
1228         if (!dot)
1229             start = text;
1230         else
1231         {
1232             start = text + 1;
1233             g_string_append_c (fixed, '.');
1234         }
1235
1236         while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1237         {
1238             if (start != end)
1239             {
1240                 fold = g_utf8_casefold (start, end - start);
1241                 key = keygen (fold, -1);
1242                 g_string_append (fixed, key);
1243                 g_free (key);
1244                 g_free (fold);
1245             }
1246             g_string_append_c (fixed, end[0]);
1247             start = end + 1;
1248         }
1249
1250         if (start == text)
1251         {
1252             fold = g_utf8_casefold (start, -1);
1253             result = keygen (fold, -1);
1254             g_free (fold);
1255             g_string_free (fixed, TRUE);
1256         }
1257         else if (dot && (start == text + 1))
1258         {
1259             fold = g_utf8_casefold (start, -1);
1260             key = keygen (fold, -1);
1261             g_string_append (fixed, key);
1262             g_free (key);
1263             g_free (fold);
1264             result = g_string_free (fixed, FALSE);
1265         }
1266         else
1267         {
1268             if (start[0] != '\0' && start != end)
1269             {
1270                 fold = g_utf8_casefold (start, end - start);
1271                 key = keygen (fold, -1);
1272                 g_string_append (fixed, key);
1273                 g_free (key);
1274                 g_free (fold);
1275             }
1276             result = g_string_free (fixed, FALSE);
1277         }
1278     }
1279     return result;
1280 }
1281
1282 static char *
1283 str_utf8_create_key (const char *text, int case_sen)
1284 {
1285     return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1286 }
1287
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/*
 * Build a sort key for the file name TEXT using
 * g_utf8_collate_key_for_filename() as the key generator.
 * All the work is delegated to str_utf8_create_key_gen().
 */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
1295
/*
 * Compare two previously created keys.
 *
 * The keys are compared bytewise with strcmp(); case_sen is accepted only
 * for interface compatibility and is ignored.
 *
 * Returns the strcmp() result.
 */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    int order;

    (void) case_sen;

    order = strcmp (t1, t2);
    return order;
}
1302
/*
 * Release a key with g_free().
 * case_sen is accepted only for interface compatibility and is ignored.
 */
static void
str_utf8_release_key (char *key, int case_sen)
{
    g_free (key);
    (void) case_sen;
}
1309
1310 struct str_class
1311 str_utf8_init (void)
1312 {
1313     struct str_class result;
1314
1315     result.conv_gerror_message = str_utf8_conv_gerror_message;
1316     result.vfs_convert_to = str_utf8_vfs_convert_to;
1317     result.insert_replace_char = str_utf8_insert_replace_char;
1318     result.is_valid_string = str_utf8_is_valid_string;
1319     result.is_valid_char = str_utf8_is_valid_char;
1320     result.cnext_char = str_utf8_cnext_char;
1321     result.cprev_char = str_utf8_cprev_char;
1322     result.cnext_char_safe = str_utf8_cnext_char_safe;
1323     result.cprev_char_safe = str_utf8_cprev_char_safe;
1324     result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1325     result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1326     result.char_isspace = str_utf8_isspace;
1327     result.char_ispunct = str_utf8_ispunct;
1328     result.char_isalnum = str_utf8_isalnum;
1329     result.char_isdigit = str_utf8_isdigit;
1330     result.char_isprint = str_utf8_isprint;
1331     result.char_iscombiningmark = str_utf8_iscombiningmark;
1332     result.char_toupper = str_utf8_toupper;
1333     result.char_tolower = str_utf8_tolower;
1334     result.length = str_utf8_length;
1335     result.length2 = str_utf8_length2;
1336     result.length_noncomb = str_utf8_length_noncomb;
1337     result.fix_string = str_utf8_fix_string;
1338     result.term_form = str_utf8_term_form;
1339     result.fit_to_term = str_utf8_fit_to_term;
1340     result.term_trim = str_utf8_term_trim;
1341     result.term_width2 = str_utf8_term_width2;
1342     result.term_width1 = str_utf8_term_width1;
1343     result.term_char_width = str_utf8_term_char_width;
1344     result.term_substring = str_utf8_term_substring;
1345     result.trunc = str_utf8_trunc;
1346     result.offset_to_pos = str_utf8_offset_to_pos;
1347     result.column_to_pos = str_utf8_column_to_pos;
1348     result.create_search_needle = str_utf8_create_search_needle;
1349     result.release_search_needle = str_utf8_release_search_needle;
1350     result.search_first = str_utf8_search_first;
1351     result.search_last = str_utf8_search_last;
1352     result.compare = str_utf8_compare;
1353     result.ncompare = str_utf8_ncompare;
1354     result.casecmp = str_utf8_casecmp;
1355     result.ncasecmp = str_utf8_ncasecmp;
1356     result.prefix = str_utf8_prefix;
1357     result.caseprefix = str_utf8_caseprefix;
1358     result.create_key = str_utf8_create_key;
1359 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1360     /* case insensitive sort files in "a1 a2 a10" order */
1361     result.create_key_for_filename = str_utf8_create_key_for_filename;
1362 #else
1363     /* case insensitive sort files in "a1 a10 a2" order */
1364     result.create_key_for_filename = str_utf8_create_key;
1365 #endif
1366     result.key_collate = str_utf8_key_collate;
1367     result.release_key = str_utf8_release_key;
1368
1369     return result;
1370 }