lib/strutil/strutilutf8.c

   1 /*
   2    UTF-8 strings utilities
   3
   4    Copyright (C) 2007, 2011
   5    The Free Software Foundation, Inc.
   6
   7    Written by:
   8    Rostislav Benes, 2007
   9
  10    The file_date routine is mostly from GNU's fileutils package,
  11    written by Richard Stallman and David MacKenzie.
  12
  13    This file is part of the Midnight Commander.
  14
  15    The Midnight Commander is free software: you can redistribute it
  16    and/or modify it under the terms of the GNU General Public License as
  17    published by the Free Software Foundation, either version 3 of the License,
  18    or (at your option) any later version.
  19
  20    The Midnight Commander is distributed in the hope that it will be useful,
  21    but WITHOUT ANY WARRANTY; without even the implied warranty of
  22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23    GNU General Public License for more details.
  24
  25    You should have received a copy of the GNU General Public License
  26    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  27  */
  28
  29 #include <config.h>
  30 #include <stdlib.h>
  31 #include <stdio.h>
  32 #include <errno.h>
  33 #include <glib.h>
  34 #include <langinfo.h>
  35 #include <string.h>
  36
  37 #include "lib/global.h"
  38 #include "lib/strutil.h"
  39
  40 /* UTF-8 handling in this file is built on GLib's UTF-8/Unicode functions. */
  41
  /* Bytes EF BF BD: the UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER),
   * emitted in place of bytes that are not valid UTF-8. */
  42 static const char replch[] = "\xEF\xBF\xBD";
  43
  44 static int
  45 str_unichar_iscombiningmark (gunichar uni)
  46 {
  47     int type = g_unichar_type (uni);
  48     return (type == G_UNICODE_COMBINING_MARK)
  49         || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  50 }
  51
  52 static void
  53 str_utf8_insert_replace_char (GString * buffer)
  54 {
  55     g_string_append (buffer, replch);
  56 }
  57
  58 static int
  59 str_utf8_is_valid_string (const char *text)
  60 {
  61     return g_utf8_validate (text, -1, NULL);
  62 }
  63
  64 static int
  65 str_utf8_is_valid_char (const char *ch, size_t size)
  66 {
  67     switch (g_utf8_get_char_validated (ch, size))
  68     {
  69     case (gunichar) (-2):
  70         return -2;
  71     case (gunichar) (-1):
  72         return -1;
  73     default:
  74         return 1;
  75     }
  76 }
  77
  /* Advance *TEXT to the next UTF-8 character; no validity check is done. */
  static void
  str_utf8_cnext_char (const char **text)
  {
      const char *following = g_utf8_next_char (*text);

      *text = following;
  }
  83
  /* Move *TEXT back to the previous UTF-8 character; no validity check is
   * done. */
  static void
  str_utf8_cprev_char (const char **text)
  {
      const char *preceding = g_utf8_prev_char (*text);

      *text = preceding;
  }
  89
  /* Advance *TEXT by one character; when the bytes at *TEXT do not form a
   * valid character, advance by a single byte instead. */
  static void
  str_utf8_cnext_char_safe (const char **text)
  {
      if (str_utf8_is_valid_char (*text, -1) != 1)
      {
          (*text)++;
          return;
      }

      (*text) = g_utf8_next_char (*text);
  }
  98
  /* Move *TEXT back by one character.  The candidate found by
   * g_utf8_prev_char() is accepted only if stepping forward from it with
   * str_utf8_cnext_char_safe() lands exactly on the original position;
   * otherwise *TEXT moves back by a single byte. */
  static void
  str_utf8_cprev_char_safe (const char **text)
  {
      const char *candidate = g_utf8_prev_char (*text);
      const char *probe = candidate;

      str_utf8_cnext_char_safe (&probe);
      (*text) = (probe == *text) ? candidate : (*text) - 1;
  }
 110
 111 static void
 112 str_utf8_fix_string (char *text)
 113 {
 114     gunichar uni;
 115
 116     while (text[0] != '\0')
 117     {
 118         uni = g_utf8_get_char_validated (text, -1);
 119         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 120         {
 121             text = g_utf8_next_char (text);
 122         }
 123         else
 124         {
 125             text[0] = '?';
 126             text++;
 127         }
 128     }
 129 }
 130
 131 static int
 132 str_utf8_isspace (const char *text)
 133 {
 134     gunichar uni = g_utf8_get_char_validated (text, -1);
 135     return g_unichar_isspace (uni);
 136 }
 137
 138 static int
 139 str_utf8_ispunct (const char *text)
 140 {
 141     gunichar uni = g_utf8_get_char_validated (text, -1);
 142     return g_unichar_ispunct (uni);
 143 }
 144
 145 static int
 146 str_utf8_isalnum (const char *text)
 147 {
 148     gunichar uni = g_utf8_get_char_validated (text, -1);
 149     return g_unichar_isalnum (uni);
 150 }
 151
 152 static int
 153 str_utf8_isdigit (const char *text)
 154 {
 155     gunichar uni = g_utf8_get_char_validated (text, -1);
 156     return g_unichar_isdigit (uni);
 157 }
 158
 159 static int
 160 str_utf8_isprint (const char *ch)
 161 {
 162     gunichar uni = g_utf8_get_char_validated (ch, -1);
 163     return g_unichar_isprint (uni);
 164 }
 165
 166 static int
 167 str_utf8_iscombiningmark (const char *ch)
 168 {
 169     gunichar uni = g_utf8_get_char_validated (ch, -1);
 170     return str_unichar_iscombiningmark (uni);
 171 }
 172
 173 static int
 174 str_utf8_cnext_noncomb_char (const char **text)
 175 {
 176     int count = 0;
 177     while ((*text)[0] != '\0')
 178     {
 179         str_utf8_cnext_char_safe (text);
 180         count++;
 181         if (!str_utf8_iscombiningmark (*text))
 182             break;
 183     }
 184     return count;
 185 }
 186
 187 static int
 188 str_utf8_cprev_noncomb_char (const char **text, const char *begin)
 189 {
 190     int count = 0;
 191     while ((*text) != begin)
 192     {
 193         str_utf8_cprev_char_safe (text);
 194         count++;
 195         if (!str_utf8_iscombiningmark (*text))
 196             break;
 197     }
 198     return count;
 199 }
 200
 201 static int
 202 str_utf8_toupper (const char *text, char **out, size_t * remain)
 203 {
 204     gunichar uni;
 205     size_t left;
 206
 207     uni = g_utf8_get_char_validated (text, -1);
 208     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 209         return 0;
 210
 211     uni = g_unichar_toupper (uni);
 212     left = g_unichar_to_utf8 (uni, NULL);
 213     if (left >= *remain)
 214         return 0;
 215
 216     left = g_unichar_to_utf8 (uni, *out);
 217     (*out) += left;
 218     (*remain) -= left;
 219     return 1;
 220 }
 221
 222 static int
 223 str_utf8_tolower (const char *text, char **out, size_t * remain)
 224 {
 225     gunichar uni;
 226     size_t left;
 227
 228     uni = g_utf8_get_char_validated (text, -1);
 229     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 230         return 0;
 231
 232     uni = g_unichar_tolower (uni);
 233     left = g_unichar_to_utf8 (uni, NULL);
 234     if (left >= *remain)
 235         return 0;
 236
 237     left = g_unichar_to_utf8 (uni, *out);
 238     (*out) += left;
 239     (*remain) -= left;
 240     return 1;
 241 }
 242
 243 static int
 244 str_utf8_length (const char *text)
 245 {
 246     int result = 0;
 247     const char *start;
 248     const char *end;
 249
 250     start = text;
 251     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 252     {
 253         if (start != end)
 254         {
 255             result += g_utf8_strlen (start, end - start);
 256         }
 257         result++;
 258         start = end + 1;
 259     }
 260
 261     if (start == text)
 262     {
 263         result = g_utf8_strlen (text, -1);
 264     }
 265     else
 266     {
 267         if (start[0] != '\0' && start != end)
 268         {
 269             result += g_utf8_strlen (start, end - start);
 270         }
 271     }
 272
 273     return result;
 274 }
 275
 276 static int
 277 str_utf8_length2 (const char *text, int size)
 278 {
 279     int result = 0;
 280     const char *start;
 281     const char *end;
 282
 283     start = text;
 284     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
 285     {
 286         if (start != end)
 287         {
 288             result += g_utf8_strlen (start, min (end - start, size));
 289             size -= end - start;
 290         }
 291         result += (size > 0);
 292         size--;
 293         start = end + 1;
 294     }
 295
 296     if (start == text)
 297     {
 298         result = g_utf8_strlen (text, size);
 299     }
 300     else
 301     {
 302         if (start[0] != '\0' && start != end && size > 0)
 303         {
 304             result += g_utf8_strlen (start, min (end - start, size));
 305         }
 306     }
 307
 308     return result;
 309 }
 310
 311 static int
 312 str_utf8_length_noncomb (const char *text)
 313 {
 314     int result = 0;
 315     const char *t = text;
 316
 317     while (t[0] != '\0')
 318     {
 319         str_utf8_cnext_noncomb_char (&t);
 320         result++;
 321     }
 322
 323     return result;
 324 }
 325
 326 /*
 327    static void
 328    str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
 329    {
 330    char *next = g_utf8_next_char (*string);
 331    (*left) -= next - (*string);
 332    (*string) = next;
 333    g_string_append_c (buffer, '?');
 334    }
 335  */
 336
 337 static gchar *
 338 str_utf8_conv_gerror_message (GError * error, const char *def_msg)
 339 {
 340     if ((error != NULL) && (error->message != NULL))
 341         return g_strdup (error->message);
 342
 343     return g_strdup (def_msg != NULL ? def_msg : "");
 344 }
 345
 346 static estr_t
 347 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
 348 {
 349     estr_t result;
 350
 351     if (coder == str_cnv_not_convert)
 352     {
 353         g_string_append_len (buffer, string, size);
 354         result = ESTR_SUCCESS;
 355     }
 356     else
 357         result = str_nconvert (coder, (char *) string, size, buffer);
 358
 359     return result;
 360 }
 361
  /* Result of str_utf8_make_make_term_form(): a sanitized copy of a string
   * together with the measurements taken while building it. */
  362 struct term_form
  363 {
  364     char text[BUF_MEDIUM * 6];  /* NUL-terminated sanitized text */
  365     size_t width;               /* columns: +1 per non-combining character, +1 more if wide */
  366     int compose;                /* set to 1 when a combining mark was seen */
  367 };
 368
 369 /* utiliti function, that make string valid in utf8 and all characters printable
 370  * return width of string too*/
 371 static const struct term_form *
 372 str_utf8_make_make_term_form (const char *text, size_t length)
 373 {
 374     static struct term_form result;
 375     gunichar uni;
 376     size_t left;
 377     char *actual;
 378
 379     result.text[0] = '\0';
 380     result.width = 0;
 381     result.compose = 0;
 382     actual = result.text;
 383
 384     /* check if text start with combining character,
 385      * add space at begin in this case */
 386     if (length != 0 && text[0] != '\0')
 387     {
 388         uni = g_utf8_get_char_validated (text, -1);
 389         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 390         {
 391             if (str_unichar_iscombiningmark (uni))
 392             {
 393                 actual[0] = ' ';
 394                 actual++;
 395                 result.width++;
 396                 result.compose = 1;
 397             }
 398         }
 399     }
 400
 401     while (length != 0 && text[0] != '\0')
 402     {
 403         uni = g_utf8_get_char_validated (text, -1);
 404         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 405         {
 406             if (g_unichar_isprint (uni))
 407             {
 408                 left = g_unichar_to_utf8 (uni, actual);
 409                 actual += left;
 410                 if (!str_unichar_iscombiningmark (uni))
 411                 {
 412                     result.width++;
 413                     if (g_unichar_iswide (uni))
 414                         result.width++;
 415                 }
 416                 else
 417                     result.compose = 1;
 418             }
 419             else
 420             {
 421                 actual[0] = '.';
 422                 actual++;
 423                 result.width++;
 424             }
 425             text = g_utf8_next_char (text);
 426         }
 427         else
 428         {
 429             text++;
 430             /*actual[0] = '?'; */
 431             memcpy (actual, replch, strlen (replch));
 432             actual += strlen (replch);
 433             result.width++;
 434         }
 435         if (length != (size_t) (-1))
 436             length--;
 437     }
 438     actual[0] = '\0';
 439
 440     return &result;
 441 }
 442
 443 static const char *
 444 str_utf8_term_form (const char *text)
 445 {
 446     static char result[BUF_MEDIUM * 6];
 447     const struct term_form *pre_form;
 448     char *composed;
 449
 450     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 451     if (pre_form->compose)
 452     {
 453         composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 454         g_strlcpy (result, composed, sizeof (result));
 455         g_free (composed);
 456     }
 457     else
 458     {
 459         g_strlcpy (result, pre_form->text, sizeof (result));
 460     }
 461     return result;
 462 }
 463
  /* Working state shared by the utf8_tool_* helpers while they copy text
   * from a source string into an output buffer. */
  464 struct utf8_tool
  465 {
  466     char *actual;           /* write position in the output buffer */
  467     size_t remain;          /* bytes still available at `actual' */
  468     const char *cheked;     /* read position in the source text (name is spelled this way throughout) */
  469     int ident;              /* column counter compared against the to_ident arguments; also used as a padding amount */
  470     int compose;            /* set when a combining mark has been copied */
  471 };
 472
 473 /* utiliti function, that copy all characters from cheked to actual */
 474 static int
 475 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
 476 {
 477     size_t left;
 478     gunichar uni;
 479
 480     tool->compose = 0;
 481
 482     while (tool->cheked[0] != '\0')
 483     {
 484         uni = g_utf8_get_char (tool->cheked);
 485         tool->compose |= str_unichar_iscombiningmark (uni);
 486         left = g_unichar_to_utf8 (uni, NULL);
 487         if (tool->remain <= left)
 488             return 0;
 489         left = g_unichar_to_utf8 (uni, tool->actual);
 490         tool->actual += left;
 491         tool->remain -= left;
 492         tool->cheked = g_utf8_next_char (tool->cheked);
 493     }
 494     return 1;
 495 }
 496
 497 /* utiliti function, that copy characters from cheked to actual until ident is
 498  * smaller than to_ident */
 499 static int
 500 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
 501 {
 502     size_t left;
 503     gunichar uni;
 504     int w;
 505
 506     tool->compose = 0;
 507
 508     while (tool->cheked[0] != '\0')
 509     {
 510         uni = g_utf8_get_char (tool->cheked);
 511         if (!str_unichar_iscombiningmark (uni))
 512         {
 513             w = 1;
 514             if (g_unichar_iswide (uni))
 515                 w++;
 516             if (tool->ident + w > to_ident)
 517                 return 1;
 518         }
 519         else
 520         {
 521             w = 0;
 522             tool->compose = 1;
 523         }
 524
 525         left = g_unichar_to_utf8 (uni, NULL);
 526         if (tool->remain <= left)
 527             return 0;
 528         left = g_unichar_to_utf8 (uni, tool->actual);
 529         tool->actual += left;
 530         tool->remain -= left;
 531         tool->cheked = g_utf8_next_char (tool->cheked);
 532         tool->ident += w;
 533     }
 534     return 1;
 535 }
 536
 537 /* utiliti function, add count spaces to actual */
 538 static int
 539 utf8_tool_insert_space (struct utf8_tool *tool, int count)
 540 {
 541     if (count <= 0)
 542         return 1;
 543     if (tool->remain <= (gsize) count)
 544         return 0;
 545     memset (tool->actual, ' ', count);
 546     tool->actual += count;
 547     tool->remain -= count;
 548     return 1;
 549 }
 550
 551 /* utiliti function, add one characters to actual */
 552 static int
 553 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
 554 {
 555     if (tool->remain <= 1)
 556         return 0;
 557     tool->actual[0] = ch;
 558     tool->actual++;
 559     tool->remain--;
 560     return 1;
 561 }
 562
 563 /* utiliti function, thah skip characters from cheked until ident is greater or
 564  * equal to to_ident */
 565 static int
 566 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
 567 {
 568     gunichar uni;
 569
 570     while (to_ident > tool->ident && tool->cheked[0] != '\0')
 571     {
 572         uni = g_utf8_get_char (tool->cheked);
 573         if (!str_unichar_iscombiningmark (uni))
 574         {
 575             tool->ident++;
 576             if (g_unichar_iswide (uni))
 577                 tool->ident++;
 578         }
 579         tool->cheked = g_utf8_next_char (tool->cheked);
 580     }
 581     uni = g_utf8_get_char (tool->cheked);
 582     while (str_unichar_iscombiningmark (uni))
 583     {
 584         tool->cheked = g_utf8_next_char (tool->cheked);
 585         uni = g_utf8_get_char (tool->cheked);
 586     }
 587     return 1;
 588 }
 589
 590 static void
 591 utf8_tool_compose (char *buffer, size_t size)
 592 {
 593     char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 594     g_strlcpy (buffer, composed, size);
 595     g_free (composed);
 596 }
 597
 598
 599 static const char *
 600 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
 601 {
 602     static char result[BUF_MEDIUM * 6];
 603     const struct term_form *pre_form;
 604     struct utf8_tool tool;
 605
 606     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 607     tool.cheked = pre_form->text;
 608     tool.actual = result;
 609     tool.remain = sizeof (result);
 610     tool.compose = 0;
 611
 612     if (pre_form->width <= (gsize) width)
 613     {
 614         tool.ident = 0;
 615         switch (HIDE_FIT (just_mode))
 616         {
 617         case J_CENTER_LEFT:
 618         case J_CENTER:
 619             tool.ident = (width - pre_form->width) / 2;
 620             break;
 621         case J_RIGHT:
 622             tool.ident = width - pre_form->width;
 623             break;
 624         }
 625
 626         utf8_tool_insert_space (&tool, tool.ident);
 627         utf8_tool_copy_chars_to_end (&tool);
 628         utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
 629     }
 630     else
 631     {
 632         if (IS_FIT (just_mode))
 633         {
 634             tool.ident = 0;
 635             utf8_tool_copy_chars_to (&tool, width / 2);
 636             utf8_tool_insert_char (&tool, '~');
 637
 638             tool.ident = 0;
 639             utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 640             utf8_tool_copy_chars_to_end (&tool);
 641             utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
 642         }
 643         else
 644         {
 645             tool.ident = 0;
 646             switch (HIDE_FIT (just_mode))
 647             {
 648             case J_CENTER:
 649                 tool.ident = (width - pre_form->width) / 2;
 650                 break;
 651             case J_RIGHT:
 652                 tool.ident = width - pre_form->width;
 653                 break;
 654             }
 655
 656             utf8_tool_skip_chars_to (&tool, 0);
 657             utf8_tool_insert_space (&tool, tool.ident);
 658             utf8_tool_copy_chars_to (&tool, width);
 659             utf8_tool_insert_space (&tool, width - tool.ident);
 660         }
 661     }
 662
 663     tool.actual[0] = '\0';
 664     if (tool.compose)
 665         utf8_tool_compose (result, sizeof (result));
 666     return result;
 667 }
 668
 669 static const char *
 670 str_utf8_term_trim (const char *text, int width)
 671 {
 672     static char result[BUF_MEDIUM * 6];
 673     const struct term_form *pre_form;
 674     struct utf8_tool tool;
 675
 676     if (width < 1)
 677     {
 678         result [0] = '\0';
 679         return result;
 680     }
 681
 682     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 683
 684     tool.cheked = pre_form->text;
 685     tool.actual = result;
 686     tool.remain = sizeof (result);
 687     tool.compose = 0;
 688
 689     if ((gsize) width < pre_form->width)
 690     {
 691         if (width <= 3)
 692         {
 693             memset (tool.actual, '.', width);
 694             tool.actual += width;
 695             tool.remain -= width;
 696         }
 697         else
 698         {
 699             memset (tool.actual, '.', 3);
 700             tool.actual += 3;
 701             tool.remain -= 3;
 702
 703             tool.ident = 0;
 704             utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
 705             utf8_tool_copy_chars_to_end (&tool);
 706         }
 707     }
 708     else
 709     {
 710         utf8_tool_copy_chars_to_end (&tool);
 711     }
 712
 713     tool.actual[0] = '\0';
 714     if (tool.compose)
 715         utf8_tool_compose (result, sizeof (result));
 716     return result;
 717 }
 718
 719 static int
 720 str_utf8_term_width2 (const char *text, size_t length)
 721 {
 722     const struct term_form *result;
 723
 724     result = str_utf8_make_make_term_form (text, length);
 725     return result->width;
 726 }
 727
 728 static int
 729 str_utf8_term_width1 (const char *text)
 730 {
 731     return str_utf8_term_width2 (text, (size_t) (-1));
 732 }
 733
 734 static int
 735 str_utf8_term_char_width (const char *text)
 736 {
 737     gunichar uni = g_utf8_get_char_validated (text, -1);
 738     return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
 739 }
 740
 741 static const char *
 742 str_utf8_term_substring (const char *text, int start, int width)
 743 {
 744     static char result[BUF_MEDIUM * 6];
 745     const struct term_form *pre_form;
 746     struct utf8_tool tool;
 747
 748     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 749
 750     tool.cheked = pre_form->text;
 751     tool.actual = result;
 752     tool.remain = sizeof (result);
 753     tool.compose = 0;
 754
 755     tool.ident = -start;
 756     utf8_tool_skip_chars_to (&tool, 0);
 757     if (tool.ident < 0)
 758         tool.ident = 0;
 759     utf8_tool_insert_space (&tool, tool.ident);
 760
 761     utf8_tool_copy_chars_to (&tool, width);
 762     utf8_tool_insert_space (&tool, width - tool.ident);
 763
 764     tool.actual[0] = '\0';
 765     if (tool.compose)
 766         utf8_tool_compose (result, sizeof (result));
 767     return result;
 768 }
 769
 770 static const char *
 771 str_utf8_trunc (const char *text, int width)
 772 {
 773     static char result[MC_MAXPATHLEN * 6 * 2];
 774     const struct term_form *pre_form;
 775     struct utf8_tool tool;
 776
 777     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 778
 779     tool.cheked = pre_form->text;
 780     tool.actual = result;
 781     tool.remain = sizeof (result);
 782     tool.compose = 0;
 783
 784     if (pre_form->width > (gsize) width)
 785     {
 786         tool.ident = 0;
 787         utf8_tool_copy_chars_to (&tool, width / 2);
 788         utf8_tool_insert_char (&tool, '~');
 789
 790         tool.ident = 0;
 791         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 792         utf8_tool_copy_chars_to_end (&tool);
 793     }
 794     else
 795     {
 796         utf8_tool_copy_chars_to_end (&tool);
 797     }
 798
 799     tool.actual[0] = '\0';
 800     if (tool.compose)
 801         utf8_tool_compose (result, sizeof (result));
 802     return result;
 803 }
 804
 805 static int
 806 str_utf8_offset_to_pos (const char *text, size_t length)
 807 {
 808     if (str_utf8_is_valid_string (text))
 809         return g_utf8_offset_to_pointer (text, length) - text;
 810     else
 811     {
 812         int result;
 813         GString *buffer = g_string_new (text);
 814
 815         str_utf8_fix_string (buffer->str);
 816         result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
 817         g_string_free (buffer, TRUE);
 818         return result;
 819     }
 820 }
 821
 822 static int
 823 str_utf8_column_to_pos (const char *text, size_t pos)
 824 {
 825     static int result;
 826     gunichar uni;
 827     int width;
 828
 829     width = 0;
 830     result = 0;
 831
 832     while (text[0] != '\0')
 833     {
 834         uni = g_utf8_get_char_validated (text, 6);
 835         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 836         {
 837             if (g_unichar_isprint (uni))
 838             {
 839                 if (!str_unichar_iscombiningmark (uni))
 840                 {
 841                     width++;
 842                     if (g_unichar_iswide (uni))
 843                         width++;
 844                 }
 845             }
 846             else
 847             {
 848                 width++;
 849             }
 850             text = g_utf8_next_char (text);
 851         }
 852         else
 853         {
 854             text++;
 855             width++;
 856         }
 857         if ((gsize) width > pos)
 858             return result;
 859
 860         result++;
 861     }
 862
 863     return result;
 864 }
 865
 866 static char *
 867 str_utf8_create_search_needle (const char *needle, int case_sen)
 868 {
 869     if (needle != NULL)
 870     {
 871         if (case_sen)
 872         {
 873             return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
 874         }
 875         else
 876         {
 877             char *fold = g_utf8_casefold (needle, -1);
 878             char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
 879             g_free (fold);
 880             return result;
 881         }
 882     }
 883     else
 884         return NULL;
 885 }
 886
 887 static void
 888 str_utf8_release_search_needle (char *needle, int case_sen)
 889 {
 890     (void) case_sen;
 891     if (needle != NULL)
 892         g_free (needle);
 893 }
 894
 895 static const char *
 896 str_utf8_search_first (const char *text, const char *search, int case_sen)
 897 {
 898     char *fold_text;
 899     char *deco_text;
 900     const char *match;
 901     const char *result = NULL;
 902     const char *m;
 903
 904     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 905     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 906
 907     match = deco_text;
 908     do
 909     {
 910         match = g_strstr_len (match, -1, search);
 911         if (match != NULL)
 912         {
 913             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
 914                 !str_utf8_iscombiningmark (match + strlen (search)))
 915             {
 916
 917                 result = text;
 918                 m = deco_text;
 919                 while (m < match)
 920                 {
 921                     str_utf8_cnext_noncomb_char (&m);
 922                     str_utf8_cnext_noncomb_char (&result);
 923                 }
 924             }
 925             else
 926             {
 927                 str_utf8_cnext_char (&match);
 928             }
 929         }
 930     }
 931     while (match != NULL && result == NULL);
 932
 933     g_free (deco_text);
 934     if (!case_sen)
 935         g_free (fold_text);
 936
 937     return result;
 938 }
 939
 940 static const char *
 941 str_utf8_search_last (const char *text, const char *search, int case_sen)
 942 {
 943     char *fold_text;
 944     char *deco_text;
 945     char *match;
 946     const char *result = NULL;
 947     const char *m;
 948
 949     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 950     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 951
 952     do
 953     {
 954         match = g_strrstr_len (deco_text, -1, search);
 955         if (match != NULL)
 956         {
 957             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
 958                 !str_utf8_iscombiningmark (match + strlen (search)))
 959             {
 960
 961                 result = text;
 962                 m = deco_text;
 963                 while (m < match)
 964                 {
 965                     str_utf8_cnext_noncomb_char (&m);
 966                     str_utf8_cnext_noncomb_char (&result);
 967                 }
 968             }
 969             else
 970             {
 971                 match[0] = '\0';
 972             }
 973         }
 974     }
 975     while (match != NULL && result == NULL);
 976
 977     g_free (deco_text);
 978     if (!case_sen)
 979         g_free (fold_text);
 980
 981     return result;
 982 }
 983
 984 static char *
 985 str_utf8_normalize (const char *text)
 986 {
 987     GString *fixed = g_string_new ("");
 988     char *tmp;
 989     char *result;
 990     const char *start;
 991     const char *end;
 992
 993     start = text;
 994     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 995     {
 996         if (start != end)
 997         {
 998             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
 999             g_string_append (fixed, tmp);
1000             g_free (tmp);
1001         }
1002         g_string_append_c (fixed, end[0]);
1003         start = end + 1;
1004     }
1005
1006     if (start == text)
1007     {
1008         result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1009     }
1010     else
1011     {
1012         if (start[0] != '\0' && start != end)
1013         {
1014             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1015             g_string_append (fixed, tmp);
1016             g_free (tmp);
1017         }
1018         result = g_strdup (fixed->str);
1019     }
1020     g_string_free (fixed, TRUE);
1021
1022     return result;
1023 }
1024
1025 static char *
1026 str_utf8_casefold_normalize (const char *text)
1027 {
1028     GString *fixed = g_string_new ("");
1029     char *tmp, *fold;
1030     char *result;
1031     const char *start;
1032     const char *end;
1033
1034     start = text;
1035     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1036     {
1037         if (start != end)
1038         {
1039             fold = g_utf8_casefold (start, end - start);
1040             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1041             g_string_append (fixed, tmp);
1042             g_free (tmp);
1043             g_free (fold);
1044         }
1045         g_string_append_c (fixed, end[0]);
1046         start = end + 1;
1047     }
1048
1049     if (start == text)
1050     {
1051         fold = g_utf8_casefold (text, -1);
1052         result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1053         g_free (fold);
1054     }
1055     else
1056     {
1057         if (start[0] != '\0' && start != end)
1058         {
1059             fold = g_utf8_casefold (start, end - start);
1060             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1061             g_string_append (fixed, tmp);
1062             g_free (tmp);
1063             g_free (fold);
1064         }
1065         result = g_strdup (fixed->str);
1066     }
1067     g_string_free (fixed, TRUE);
1068
1069     return result;
1070 }
1071
1072 static int
1073 str_utf8_compare (const char *t1, const char *t2)
1074 {
1075     char *n1, *n2;
1076     int result;
1077
1078     n1 = str_utf8_normalize (t1);
1079     n2 = str_utf8_normalize (t2);
1080
1081     result = strcmp (n1, n2);
1082
1083     g_free (n1);
1084     g_free (n2);
1085
1086     return result;
1087 }
1088
1089 static int
1090 str_utf8_ncompare (const char *t1, const char *t2)
1091 {
1092     char *n1, *n2;
1093     int result;
1094
1095     n1 = str_utf8_normalize (t1);
1096     n2 = str_utf8_normalize (t2);
1097
1098     result = strncmp (n1, n2, min (strlen (n1), strlen (n2)));
1099
1100     g_free (n1);
1101     g_free (n2);
1102
1103     return result;
1104 }
1105
1106 static int
1107 str_utf8_casecmp (const char *t1, const char *t2)
1108 {
1109     char *n1, *n2;
1110     int result;
1111
1112     n1 = str_utf8_casefold_normalize (t1);
1113     n2 = str_utf8_casefold_normalize (t2);
1114
1115     result = strcmp (n1, n2);
1116
1117     g_free (n1);
1118     g_free (n2);
1119
1120     return result;
1121 }
1122
1123 static int
1124 str_utf8_ncasecmp (const char *t1, const char *t2)
1125 {
1126     char *n1, *n2;
1127     int result;
1128
1129     n1 = str_utf8_casefold_normalize (t1);
1130     n2 = str_utf8_casefold_normalize (t2);
1131
1132     result = strncmp (n1, n2, min (strlen (n1), strlen (n2)));
1133
1134     g_free (n1);
1135     g_free (n2);
1136
1137     return result;
1138 }
1139
/*
 * Measure how much of 'prefix' matches the beginning of 'text'
 * (case-sensitive).
 *
 * Both strings are normalized with str_utf8_normalize() and then walked one
 * character at a time using str_utf8_cnext_char_safe().  The walk stops at the
 * first pair of characters that differ in byte length or in content, or when
 * either string ends.
 *
 * Returns the number of bytes of the NORMALIZED prefix that matched.
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *text_pos = norm_text;
    const char *prefix_pos = norm_prefix;
    int matched;

    while (*text_pos != '\0' && *prefix_pos != '\0')
    {
        const char *text_next = text_pos;
        const char *prefix_next = prefix_pos;

        /* step over exactly one character in each string */
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* characters of different byte length cannot be equal */
        if (text_next - text_pos != prefix_next - prefix_pos
            || strncmp (text_pos, prefix_pos, text_next - text_pos) != 0)
            break;

        text_pos = text_next;
        prefix_pos = prefix_next;
    }

    matched = prefix_pos - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1170
/*
 * Measure how much of 'prefix' matches the beginning of 'text'
 * (case-insensitive).
 *
 * Both strings are converted with str_utf8_casefold_normalize() and then
 * walked one character at a time using str_utf8_cnext_char_safe().  The walk
 * stops at the first pair of characters that differ in byte length or in
 * content, or when either string ends.
 *
 * Returns the number of bytes of the CONVERTED (case-folded, normalized)
 * prefix that matched.
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *folded_text = str_utf8_casefold_normalize (text);
    char *folded_prefix = str_utf8_casefold_normalize (prefix);
    const char *text_pos = folded_text;
    const char *prefix_pos = folded_prefix;
    int matched;

    while (*text_pos != '\0' && *prefix_pos != '\0')
    {
        const char *text_next = text_pos;
        const char *prefix_next = prefix_pos;

        /* step over exactly one character in each string */
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* characters of different byte length cannot be equal */
        if (text_next - text_pos != prefix_next - prefix_pos
            || strncmp (text_pos, prefix_pos, text_next - text_pos) != 0)
            break;

        text_pos = text_next;
        prefix_pos = prefix_next;
    }

    matched = prefix_pos - folded_prefix;

    g_free (folded_text);
    g_free (folded_prefix);

    return matched;
}
1201
1202 static char *
1203 str_utf8_create_key_gen (const char *text, int case_sen,
1204                          gchar * (*keygen) (const gchar * text, gssize size))
1205 {
1206     char *result;
1207
1208     if (case_sen)
1209     {
1210         result = str_utf8_normalize (text);
1211     }
1212     else
1213     {
1214         gboolean dot;
1215         GString *fixed;
1216         const char *start, *end;
1217         char *fold, *key;
1218
1219         dot = text[0] == '.';
1220         fixed = g_string_sized_new (16);
1221
1222         if (!dot)
1223             start = text;
1224         else
1225         {
1226             start = text + 1;
1227             g_string_append_c (fixed, '.');
1228         }
1229
1230         while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1231         {
1232             if (start != end)
1233             {
1234                 fold = g_utf8_casefold (start, end - start);
1235                 key = keygen (fold, -1);
1236                 g_string_append (fixed, key);
1237                 g_free (key);
1238                 g_free (fold);
1239             }
1240             g_string_append_c (fixed, end[0]);
1241             start = end + 1;
1242         }
1243
1244         if (start == text)
1245         {
1246             fold = g_utf8_casefold (start, -1);
1247             result = keygen (fold, -1);
1248             g_free (fold);
1249             g_string_free (fixed, TRUE);
1250         }
1251         else if (dot && (start == text + 1))
1252         {
1253             fold = g_utf8_casefold (start, -1);
1254             key = keygen (fold, -1);
1255             g_string_append (fixed, key);
1256             g_free (key);
1257             g_free (fold);
1258             result = g_string_free (fixed, FALSE);
1259         }
1260         else
1261         {
1262             if (start[0] != '\0' && start != end)
1263             {
1264                 fold = g_utf8_casefold (start, end - start);
1265                 key = keygen (fold, -1);
1266                 g_string_append (fixed, key);
1267                 g_free (key);
1268                 g_free (fold);
1269             }
1270             result = g_string_free (fixed, FALSE);
1271         }
1272     }
1273     return result;
1274 }
1275
1276 static char *
1277 str_utf8_create_key (const char *text, int case_sen)
1278 {
1279     return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1280 }
1281
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/*
 * Build a sort key for a file name via str_utf8_create_key_gen(), using
 * g_utf8_collate_key_for_filename() as the key generator.
 * Compiled only when MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME is defined.
 */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);
    return key;
}
#endif
1289
/*
 * Compare two previously created sort keys.
 *
 * The keys are compared byte-wise with strcmp().  'case_sen' is not used
 * here; str_utf8_create_key_gen() is where the flag influences the key.
 *
 * Returns the strcmp() result for t1 and t2.
 */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    int order;

    (void) case_sen;            /* unused */

    order = strcmp (t1, t2);
    return order;
}
1296
/*
 * Release a sort key with g_free().
 * 'case_sen' is accepted for interface compatibility and is not used.
 */
static void
str_utf8_release_key (char *key, int case_sen)
{
    g_free (key);

    (void) case_sen;            /* unused */
}
1303
1304 struct str_class
1305 str_utf8_init (void)
1306 {
1307     struct str_class result;
1308
1309     result.conv_gerror_message = str_utf8_conv_gerror_message;
1310     result.vfs_convert_to = str_utf8_vfs_convert_to;
1311     result.insert_replace_char = str_utf8_insert_replace_char;
1312     result.is_valid_string = str_utf8_is_valid_string;
1313     result.is_valid_char = str_utf8_is_valid_char;
1314     result.cnext_char = str_utf8_cnext_char;
1315     result.cprev_char = str_utf8_cprev_char;
1316     result.cnext_char_safe = str_utf8_cnext_char_safe;
1317     result.cprev_char_safe = str_utf8_cprev_char_safe;
1318     result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1319     result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1320     result.isspace = str_utf8_isspace;
1321     result.ispunct = str_utf8_ispunct;
1322     result.isalnum = str_utf8_isalnum;
1323     result.isdigit = str_utf8_isdigit;
1324     result.isprint = str_utf8_isprint;
1325     result.iscombiningmark = str_utf8_iscombiningmark;
1326     result.toupper = str_utf8_toupper;
1327     result.tolower = str_utf8_tolower;
1328     result.length = str_utf8_length;
1329     result.length2 = str_utf8_length2;
1330     result.length_noncomb = str_utf8_length_noncomb;
1331     result.fix_string = str_utf8_fix_string;
1332     result.term_form = str_utf8_term_form;
1333     result.fit_to_term = str_utf8_fit_to_term;
1334     result.term_trim = str_utf8_term_trim;
1335     result.term_width2 = str_utf8_term_width2;
1336     result.term_width1 = str_utf8_term_width1;
1337     result.term_char_width = str_utf8_term_char_width;
1338     result.term_substring = str_utf8_term_substring;
1339     result.trunc = str_utf8_trunc;
1340     result.offset_to_pos = str_utf8_offset_to_pos;
1341     result.column_to_pos = str_utf8_column_to_pos;
1342     result.create_search_needle = str_utf8_create_search_needle;
1343     result.release_search_needle = str_utf8_release_search_needle;
1344     result.search_first = str_utf8_search_first;
1345     result.search_last = str_utf8_search_last;
1346     result.compare = str_utf8_compare;
1347     result.ncompare = str_utf8_ncompare;
1348     result.casecmp = str_utf8_casecmp;
1349     result.ncasecmp = str_utf8_ncasecmp;
1350     result.prefix = str_utf8_prefix;
1351     result.caseprefix = str_utf8_caseprefix;
1352     result.create_key = str_utf8_create_key;
1353 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1354     /* case insensitive sort files in "a1 a2 a10" order */
1355     result.create_key_for_filename = str_utf8_create_key_for_filename;
1356 #else
1357     /* case insensitive sort files in "a1 a10 a2" order */
1358     result.create_key_for_filename = str_utf8_create_key;
1359 #endif
1360     result.key_collate = str_utf8_key_collate;
1361     result.release_key = str_utf8_release_key;
1362
1363     return result;
1364 }