lib/strutil/strutilutf8.c

   1 /*
   2    UTF-8 strings utilities
   3
   4    Copyright (C) 2007, 2011
   5    The Free Software Foundation, Inc.
   6
   7    Written by:
   8    Rostislav Benes, 2007
   9
  10    The file_date routine is mostly from GNU's fileutils package,
  11    written by Richard Stallman and David MacKenzie.
  12
  13    This file is part of the Midnight Commander.
  14
  15    The Midnight Commander is free software: you can redistribute it
  16    and/or modify it under the terms of the GNU General Public License as
  17    published by the Free Software Foundation, either version 3 of the License,
  18    or (at your option) any later version.
  19
  20    The Midnight Commander is distributed in the hope that it will be useful,
  21    but WITHOUT ANY WARRANTY; without even the implied warranty of
  22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23    GNU General Public License for more details.
  24
  25    You should have received a copy of the GNU General Public License
  26    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  27  */
  28
  29 #include <config.h>
  30 #include <stdlib.h>
  31 #include <stdio.h>
  32 #include <errno.h>
  33 #include <glib.h>
  34 #include <langinfo.h>
  35 #include <string.h>
  36
  37 #include "lib/global.h"
  38 #include "lib/strutil.h"
  39
/* UTF-8 support is implemented with the UTF-8 functions from glib */
  41
/* Bytes EF BF BD: the UTF-8 encoding of U+FFFD, emitted in place of invalid input */
static const char replch[] = "\xEF\xBF\xBD";
  43
  44 static int
  45 str_unichar_iscombiningmark (gunichar uni)
  46 {
  47     int type = g_unichar_type (uni);
  48     return (type == G_UNICODE_COMBINING_MARK)
  49         || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  50 }
  51
  52 static void
  53 str_utf8_insert_replace_char (GString * buffer)
  54 {
  55     g_string_append (buffer, replch);
  56 }
  57
  58 static int
  59 str_utf8_is_valid_string (const char *text)
  60 {
  61     return g_utf8_validate (text, -1, NULL);
  62 }
  63
  64 static int
  65 str_utf8_is_valid_char (const char *ch, size_t size)
  66 {
  67     switch (g_utf8_get_char_validated (ch, size))
  68     {
  69     case (gunichar) (-2):
  70         return -2;
  71     case (gunichar) (-1):
  72         return -1;
  73     default:
  74         return 1;
  75     }
  76 }
  77
  78 static void
  79 str_utf8_cnext_char (const char **text)
  80 {
  81     (*text) = g_utf8_next_char (*text);
  82 }
  83
  84 static void
  85 str_utf8_cprev_char (const char **text)
  86 {
  87     (*text) = g_utf8_prev_char (*text);
  88 }
  89
  90 static void
  91 str_utf8_cnext_char_safe (const char **text)
  92 {
  93     if (str_utf8_is_valid_char (*text, -1) == 1)
  94         (*text) = g_utf8_next_char (*text);
  95     else
  96         (*text)++;
  97 }
  98
  99 static void
 100 str_utf8_cprev_char_safe (const char **text)
 101 {
 102     const char *result = g_utf8_prev_char (*text);
 103     const char *t = result;
 104     str_utf8_cnext_char_safe (&t);
 105     if (t == *text)
 106         (*text) = result;
 107     else
 108         (*text)--;
 109 }
 110
 111 static void
 112 str_utf8_fix_string (char *text)
 113 {
 114     gunichar uni;
 115
 116     while (text[0] != '\0')
 117     {
 118         uni = g_utf8_get_char_validated (text, -1);
 119         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 120         {
 121             text = g_utf8_next_char (text);
 122         }
 123         else
 124         {
 125             text[0] = '?';
 126             text++;
 127         }
 128     }
 129 }
 130
 131 static int
 132 str_utf8_isspace (const char *text)
 133 {
 134     gunichar uni = g_utf8_get_char_validated (text, -1);
 135     return g_unichar_isspace (uni);
 136 }
 137
 138 static int
 139 str_utf8_ispunct (const char *text)
 140 {
 141     gunichar uni = g_utf8_get_char_validated (text, -1);
 142     return g_unichar_ispunct (uni);
 143 }
 144
 145 static int
 146 str_utf8_isalnum (const char *text)
 147 {
 148     gunichar uni = g_utf8_get_char_validated (text, -1);
 149     return g_unichar_isalnum (uni);
 150 }
 151
 152 static int
 153 str_utf8_isdigit (const char *text)
 154 {
 155     gunichar uni = g_utf8_get_char_validated (text, -1);
 156     return g_unichar_isdigit (uni);
 157 }
 158
 159 static int
 160 str_utf8_isprint (const char *ch)
 161 {
 162     gunichar uni = g_utf8_get_char_validated (ch, -1);
 163     return g_unichar_isprint (uni);
 164 }
 165
 166 static int
 167 str_utf8_iscombiningmark (const char *ch)
 168 {
 169     gunichar uni = g_utf8_get_char_validated (ch, -1);
 170     return str_unichar_iscombiningmark (uni);
 171 }
 172
 173 static int
 174 str_utf8_cnext_noncomb_char (const char **text)
 175 {
 176     int count = 0;
 177     while ((*text)[0] != '\0')
 178     {
 179         str_utf8_cnext_char_safe (text);
 180         count++;
 181         if (!str_utf8_iscombiningmark (*text))
 182             break;
 183     }
 184     return count;
 185 }
 186
 187 static int
 188 str_utf8_cprev_noncomb_char (const char **text, const char *begin)
 189 {
 190     int count = 0;
 191     while ((*text) != begin)
 192     {
 193         str_utf8_cprev_char_safe (text);
 194         count++;
 195         if (!str_utf8_iscombiningmark (*text))
 196             break;
 197     }
 198     return count;
 199 }
 200
 201 static int
 202 str_utf8_toupper (const char *text, char **out, size_t * remain)
 203 {
 204     gunichar uni;
 205     size_t left;
 206
 207     uni = g_utf8_get_char_validated (text, -1);
 208     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 209         return 0;
 210
 211     uni = g_unichar_toupper (uni);
 212     left = g_unichar_to_utf8 (uni, NULL);
 213     if (left >= *remain)
 214         return 0;
 215
 216     left = g_unichar_to_utf8 (uni, *out);
 217     (*out) += left;
 218     (*remain) -= left;
 219     return 1;
 220 }
 221
 222 static int
 223 str_utf8_tolower (const char *text, char **out, size_t * remain)
 224 {
 225     gunichar uni;
 226     size_t left;
 227
 228     uni = g_utf8_get_char_validated (text, -1);
 229     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 230         return 0;
 231
 232     uni = g_unichar_tolower (uni);
 233     left = g_unichar_to_utf8 (uni, NULL);
 234     if (left >= *remain)
 235         return 0;
 236
 237     left = g_unichar_to_utf8 (uni, *out);
 238     (*out) += left;
 239     (*remain) -= left;
 240     return 1;
 241 }
 242
 243 static int
 244 str_utf8_length (const char *text)
 245 {
 246     int result = 0;
 247     const char *start;
 248     const char *end;
 249
 250     start = text;
 251     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 252     {
 253         if (start != end)
 254         {
 255             result += g_utf8_strlen (start, end - start);
 256         }
 257         result++;
 258         start = end + 1;
 259     }
 260
 261     if (start == text)
 262     {
 263         result = g_utf8_strlen (text, -1);
 264     }
 265     else
 266     {
 267         if (start[0] != '\0' && start != end)
 268         {
 269             result += g_utf8_strlen (start, end - start);
 270         }
 271     }
 272
 273     return result;
 274 }
 275
 276 static int
 277 str_utf8_length2 (const char *text, int size)
 278 {
 279     int result = 0;
 280     const char *start;
 281     const char *end;
 282
 283     start = text;
 284     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
 285     {
 286         if (start != end)
 287         {
 288             result += g_utf8_strlen (start, min (end - start, size));
 289             size -= end - start;
 290         }
 291         result += (size > 0);
 292         size--;
 293         start = end + 1;
 294     }
 295
 296     if (start == text)
 297     {
 298         result = g_utf8_strlen (text, size);
 299     }
 300     else
 301     {
 302         if (start[0] != '\0' && start != end && size > 0)
 303         {
 304             result += g_utf8_strlen (start, min (end - start, size));
 305         }
 306     }
 307
 308     return result;
 309 }
 310
 311 static int
 312 str_utf8_length_noncomb (const char *text)
 313 {
 314     int result = 0;
 315     const char *t = text;
 316
 317     while (t[0] != '\0')
 318     {
 319         str_utf8_cnext_noncomb_char (&t);
 320         result++;
 321     }
 322
 323     return result;
 324 }
 325
 326 /*
 327    static void
 328    str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
 329    {
 330    char *next = g_utf8_next_char (*string);
 331    (*left) -= next - (*string);
 332    (*string) = next;
 333    g_string_append_c (buffer, '?');
 334    }
 335  */
 336
 337 static gchar *
 338 str_utf8_conv_gerror_message (GError * error, const char *def_msg)
 339 {
 340     if ((error != NULL) && (error->message != NULL))
 341         return g_strdup (error->message);
 342
 343     return g_strdup (def_msg != NULL ? def_msg : "");
 344 }
 345
 346 static estr_t
 347 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
 348 {
 349     estr_t result;
 350
 351     if (coder == str_cnv_not_convert)
 352     {
 353         g_string_append_len (buffer, string, size);
 354         result = ESTR_SUCCESS;
 355     }
 356     else
 357         result = str_nconvert (coder, (char *) string, size, buffer);
 358
 359     return result;
 360 }
 361
/* Result of str_utf8_make_make_term_form (): text prepared for terminal output */
struct term_form
{
    char text[BUF_MEDIUM * 6];  /* sanitized, NUL-terminated copy of the input */
    size_t width;               /* display width counted while building text */
    int compose;                /* set to 1 when text contains a mark character */
};
 368
 369 /* utiliti function, that make string valid in utf8 and all characters printable
 370  * return width of string too*/
 371 static const struct term_form *
 372 str_utf8_make_make_term_form (const char *text, size_t length)
 373 {
 374     static struct term_form result;
 375     gunichar uni;
 376     size_t left;
 377     char *actual;
 378
 379     result.text[0] = '\0';
 380     result.width = 0;
 381     result.compose = 0;
 382     actual = result.text;
 383
 384     /* check if text start with combining character,
 385      * add space at begin in this case */
 386     if (length != 0 && text[0] != '\0')
 387     {
 388         uni = g_utf8_get_char_validated (text, -1);
 389         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 390         {
 391             if (str_unichar_iscombiningmark (uni))
 392             {
 393                 actual[0] = ' ';
 394                 actual++;
 395                 result.width++;
 396                 result.compose = 1;
 397             }
 398         }
 399     }
 400
 401     while (length != 0 && text[0] != '\0')
 402     {
 403         uni = g_utf8_get_char_validated (text, -1);
 404         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 405         {
 406             if (g_unichar_isprint (uni))
 407             {
 408                 left = g_unichar_to_utf8 (uni, actual);
 409                 actual += left;
 410                 if (!str_unichar_iscombiningmark (uni))
 411                 {
 412                     result.width++;
 413                     if (g_unichar_iswide (uni))
 414                         result.width++;
 415                 }
 416                 else
 417                     result.compose = 1;
 418             }
 419             else
 420             {
 421                 actual[0] = '.';
 422                 actual++;
 423                 result.width++;
 424             }
 425             text = g_utf8_next_char (text);
 426         }
 427         else
 428         {
 429             text++;
 430             /*actual[0] = '?'; */
 431             memcpy (actual, replch, strlen (replch));
 432             actual += strlen (replch);
 433             result.width++;
 434         }
 435         if (length != (size_t) (-1))
 436             length--;
 437     }
 438     actual[0] = '\0';
 439
 440     return &result;
 441 }
 442
 443 static const char *
 444 str_utf8_term_form (const char *text)
 445 {
 446     static char result[BUF_MEDIUM * 6];
 447     const struct term_form *pre_form;
 448     char *composed;
 449
 450     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 451     if (pre_form->compose)
 452     {
 453         composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 454         g_strlcpy (result, composed, sizeof (result));
 455         g_free (composed);
 456     }
 457     else
 458     {
 459         g_strlcpy (result, pre_form->text, sizeof (result));
 460     }
 461     return result;
 462 }
 463
/* Working state shared by the utf8_tool_* helpers */
struct utf8_tool
{
    char *actual;               /* current write position in the output buffer */
    size_t remain;              /* bytes still available at actual */
    const char *cheked;         /* current read position in the source text */
    int ident;                  /* column counter; callers also preload it with offsets/padding */
    int compose;                /* set when a copied character was a mark character */
};
 472
 473 /* utiliti function, that copy all characters from cheked to actual */
 474 static int
 475 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
 476 {
 477     size_t left;
 478     gunichar uni;
 479
 480     tool->compose = 0;
 481
 482     while (tool->cheked[0] != '\0')
 483     {
 484         uni = g_utf8_get_char (tool->cheked);
 485         tool->compose |= str_unichar_iscombiningmark (uni);
 486         left = g_unichar_to_utf8 (uni, NULL);
 487         if (tool->remain <= left)
 488             return 0;
 489         left = g_unichar_to_utf8 (uni, tool->actual);
 490         tool->actual += left;
 491         tool->remain -= left;
 492         tool->cheked = g_utf8_next_char (tool->cheked);
 493     }
 494     return 1;
 495 }
 496
 497 /* utiliti function, that copy characters from cheked to actual until ident is
 498  * smaller than to_ident */
 499 static int
 500 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
 501 {
 502     size_t left;
 503     gunichar uni;
 504     int w;
 505
 506     tool->compose = 0;
 507
 508     while (tool->cheked[0] != '\0')
 509     {
 510         uni = g_utf8_get_char (tool->cheked);
 511         if (!str_unichar_iscombiningmark (uni))
 512         {
 513             w = 1;
 514             if (g_unichar_iswide (uni))
 515                 w++;
 516             if (tool->ident + w > to_ident)
 517                 return 1;
 518         }
 519         else
 520         {
 521             w = 0;
 522             tool->compose = 1;
 523         }
 524
 525         left = g_unichar_to_utf8 (uni, NULL);
 526         if (tool->remain <= left)
 527             return 0;
 528         left = g_unichar_to_utf8 (uni, tool->actual);
 529         tool->actual += left;
 530         tool->remain -= left;
 531         tool->cheked = g_utf8_next_char (tool->cheked);
 532         tool->ident += w;
 533     }
 534     return 1;
 535 }
 536
 537 /* utiliti function, add count spaces to actual */
 538 static int
 539 utf8_tool_insert_space (struct utf8_tool *tool, int count)
 540 {
 541     if (count <= 0)
 542         return 1;
 543     if (tool->remain <= (gsize) count)
 544         return 0;
 545     memset (tool->actual, ' ', count);
 546     tool->actual += count;
 547     tool->remain -= count;
 548     return 1;
 549 }
 550
 551 /* utiliti function, add one characters to actual */
 552 static int
 553 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
 554 {
 555     if (tool->remain <= 1)
 556         return 0;
 557     tool->actual[0] = ch;
 558     tool->actual++;
 559     tool->remain--;
 560     return 1;
 561 }
 562
 563 /* utiliti function, thah skip characters from cheked until ident is greater or
 564  * equal to to_ident */
 565 static int
 566 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
 567 {
 568     gunichar uni;
 569
 570     while (to_ident > tool->ident && tool->cheked[0] != '\0')
 571     {
 572         uni = g_utf8_get_char (tool->cheked);
 573         if (!str_unichar_iscombiningmark (uni))
 574         {
 575             tool->ident++;
 576             if (g_unichar_iswide (uni))
 577                 tool->ident++;
 578         }
 579         tool->cheked = g_utf8_next_char (tool->cheked);
 580     }
 581     uni = g_utf8_get_char (tool->cheked);
 582     while (str_unichar_iscombiningmark (uni))
 583     {
 584         tool->cheked = g_utf8_next_char (tool->cheked);
 585         uni = g_utf8_get_char (tool->cheked);
 586     }
 587     return 1;
 588 }
 589
 590 static void
 591 utf8_tool_compose (char *buffer, size_t size)
 592 {
 593     char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 594     g_strlcpy (buffer, composed, size);
 595     g_free (composed);
 596 }
 597
 598
 599 static const char *
 600 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
 601 {
 602     static char result[BUF_MEDIUM * 6];
 603     const struct term_form *pre_form;
 604     struct utf8_tool tool;
 605
 606     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 607     tool.cheked = pre_form->text;
 608     tool.actual = result;
 609     tool.remain = sizeof (result);
 610     tool.compose = 0;
 611
 612     if (pre_form->width <= (gsize) width)
 613     {
 614         tool.ident = 0;
 615         switch (HIDE_FIT (just_mode))
 616         {
 617         case J_CENTER_LEFT:
 618         case J_CENTER:
 619             tool.ident = (width - pre_form->width) / 2;
 620             break;
 621         case J_RIGHT:
 622             tool.ident = width - pre_form->width;
 623             break;
 624         }
 625
 626         utf8_tool_insert_space (&tool, tool.ident);
 627         utf8_tool_copy_chars_to_end (&tool);
 628         utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
 629     }
 630     else
 631     {
 632         if (IS_FIT (just_mode))
 633         {
 634             tool.ident = 0;
 635             utf8_tool_copy_chars_to (&tool, width / 2);
 636             utf8_tool_insert_char (&tool, '~');
 637
 638             tool.ident = 0;
 639             utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 640             utf8_tool_copy_chars_to_end (&tool);
 641             utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
 642         }
 643         else
 644         {
 645             tool.ident = 0;
 646             switch (HIDE_FIT (just_mode))
 647             {
 648             case J_CENTER:
 649                 tool.ident = (width - pre_form->width) / 2;
 650                 break;
 651             case J_RIGHT:
 652                 tool.ident = width - pre_form->width;
 653                 break;
 654             }
 655
 656             utf8_tool_skip_chars_to (&tool, 0);
 657             utf8_tool_insert_space (&tool, tool.ident);
 658             utf8_tool_copy_chars_to (&tool, width);
 659             utf8_tool_insert_space (&tool, width - tool.ident);
 660         }
 661     }
 662
 663     tool.actual[0] = '\0';
 664     if (tool.compose)
 665         utf8_tool_compose (result, sizeof (result));
 666     return result;
 667 }
 668
 669 static const char *
 670 str_utf8_term_trim (const char *text, int width)
 671 {
 672     static char result[BUF_MEDIUM * 6];
 673     const struct term_form *pre_form;
 674     struct utf8_tool tool;
 675
 676     if (width < 1)
 677     {
 678         result[0] = '\0';
 679         return result;
 680     }
 681
 682     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 683
 684     tool.cheked = pre_form->text;
 685     tool.actual = result;
 686     tool.remain = sizeof (result);
 687     tool.compose = 0;
 688
 689     if ((gsize) width < pre_form->width)
 690     {
 691         if (width <= 3)
 692         {
 693             memset (tool.actual, '.', width);
 694             tool.actual += width;
 695             tool.remain -= width;
 696         }
 697         else
 698         {
 699             memset (tool.actual, '.', 3);
 700             tool.actual += 3;
 701             tool.remain -= 3;
 702
 703             tool.ident = 0;
 704             utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
 705             utf8_tool_copy_chars_to_end (&tool);
 706         }
 707     }
 708     else
 709     {
 710         utf8_tool_copy_chars_to_end (&tool);
 711     }
 712
 713     tool.actual[0] = '\0';
 714     if (tool.compose)
 715         utf8_tool_compose (result, sizeof (result));
 716     return result;
 717 }
 718
 719 static int
 720 str_utf8_term_width2 (const char *text, size_t length)
 721 {
 722     const struct term_form *result;
 723
 724     result = str_utf8_make_make_term_form (text, length);
 725     return result->width;
 726 }
 727
 728 static int
 729 str_utf8_term_width1 (const char *text)
 730 {
 731     return str_utf8_term_width2 (text, (size_t) (-1));
 732 }
 733
 734 static int
 735 str_utf8_term_char_width (const char *text)
 736 {
 737     gunichar uni = g_utf8_get_char_validated (text, -1);
 738     return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
 739 }
 740
 741 static const char *
 742 str_utf8_term_substring (const char *text, int start, int width)
 743 {
 744     static char result[BUF_MEDIUM * 6];
 745     const struct term_form *pre_form;
 746     struct utf8_tool tool;
 747
 748     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 749
 750     tool.cheked = pre_form->text;
 751     tool.actual = result;
 752     tool.remain = sizeof (result);
 753     tool.compose = 0;
 754
 755     tool.ident = -start;
 756     utf8_tool_skip_chars_to (&tool, 0);
 757     if (tool.ident < 0)
 758         tool.ident = 0;
 759     utf8_tool_insert_space (&tool, tool.ident);
 760
 761     utf8_tool_copy_chars_to (&tool, width);
 762     utf8_tool_insert_space (&tool, width - tool.ident);
 763
 764     tool.actual[0] = '\0';
 765     if (tool.compose)
 766         utf8_tool_compose (result, sizeof (result));
 767     return result;
 768 }
 769
 770 static const char *
 771 str_utf8_trunc (const char *text, int width)
 772 {
 773     static char result[MC_MAXPATHLEN * 6 * 2];
 774     const struct term_form *pre_form;
 775     struct utf8_tool tool;
 776
 777     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 778
 779     tool.cheked = pre_form->text;
 780     tool.actual = result;
 781     tool.remain = sizeof (result);
 782     tool.compose = 0;
 783
 784     if (pre_form->width > (gsize) width)
 785     {
 786         tool.ident = 0;
 787         utf8_tool_copy_chars_to (&tool, width / 2);
 788         utf8_tool_insert_char (&tool, '~');
 789
 790         tool.ident = 0;
 791         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 792         utf8_tool_copy_chars_to_end (&tool);
 793     }
 794     else
 795     {
 796         utf8_tool_copy_chars_to_end (&tool);
 797     }
 798
 799     tool.actual[0] = '\0';
 800     if (tool.compose)
 801         utf8_tool_compose (result, sizeof (result));
 802     return result;
 803 }
 804
 805 static int
 806 str_utf8_offset_to_pos (const char *text, size_t length)
 807 {
 808     if (str_utf8_is_valid_string (text))
 809         return g_utf8_offset_to_pointer (text, length) - text;
 810     else
 811     {
 812         int result;
 813         GString *buffer = g_string_new (text);
 814
 815         str_utf8_fix_string (buffer->str);
 816         result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
 817         g_string_free (buffer, TRUE);
 818         return result;
 819     }
 820 }
 821
 822 static int
 823 str_utf8_column_to_pos (const char *text, size_t pos)
 824 {
 825     static int result;
 826     gunichar uni;
 827     int width;
 828
 829     width = 0;
 830     result = 0;
 831
 832     while (text[0] != '\0')
 833     {
 834         uni = g_utf8_get_char_validated (text, 6);
 835         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 836         {
 837             if (g_unichar_isprint (uni))
 838             {
 839                 if (!str_unichar_iscombiningmark (uni))
 840                 {
 841                     width++;
 842                     if (g_unichar_iswide (uni))
 843                         width++;
 844                 }
 845             }
 846             else
 847             {
 848                 width++;
 849             }
 850             text = g_utf8_next_char (text);
 851         }
 852         else
 853         {
 854             text++;
 855             width++;
 856         }
 857         if ((gsize) width > pos)
 858             return result;
 859
 860         result++;
 861     }
 862
 863     return result;
 864 }
 865
 866 static char *
 867 str_utf8_create_search_needle (const char *needle, int case_sen)
 868 {
 869     if (needle != NULL)
 870     {
 871         if (case_sen)
 872         {
 873             return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
 874         }
 875         else
 876         {
 877             char *fold = g_utf8_casefold (needle, -1);
 878             char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
 879             g_free (fold);
 880             return result;
 881         }
 882     }
 883     else
 884         return NULL;
 885 }
 886
 887 static void
 888 str_utf8_release_search_needle (char *needle, int case_sen)
 889 {
 890     (void) case_sen;
 891     if (needle != NULL)
 892         g_free (needle);
 893 }
 894
 895 static const char *
 896 str_utf8_search_first (const char *text, const char *search, int case_sen)
 897 {
 898     char *fold_text;
 899     char *deco_text;
 900     const char *match;
 901     const char *result = NULL;
 902     const char *m;
 903
 904     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 905     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 906
 907     match = deco_text;
 908     do
 909     {
 910         match = g_strstr_len (match, -1, search);
 911         if (match != NULL)
 912         {
 913             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
 914                 !str_utf8_iscombiningmark (match + strlen (search)))
 915             {
 916
 917                 result = text;
 918                 m = deco_text;
 919                 while (m < match)
 920                 {
 921                     str_utf8_cnext_noncomb_char (&m);
 922                     str_utf8_cnext_noncomb_char (&result);
 923                 }
 924             }
 925             else
 926             {
 927                 str_utf8_cnext_char (&match);
 928             }
 929         }
 930     }
 931     while (match != NULL && result == NULL);
 932
 933     g_free (deco_text);
 934     if (!case_sen)
 935         g_free (fold_text);
 936
 937     return result;
 938 }
 939
 940 static const char *
 941 str_utf8_search_last (const char *text, const char *search, int case_sen)
 942 {
 943     char *fold_text;
 944     char *deco_text;
 945     char *match;
 946     const char *result = NULL;
 947     const char *m;
 948
 949     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 950     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 951
 952     do
 953     {
 954         match = g_strrstr_len (deco_text, -1, search);
 955         if (match != NULL)
 956         {
 957             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
 958                 !str_utf8_iscombiningmark (match + strlen (search)))
 959             {
 960
 961                 result = text;
 962                 m = deco_text;
 963                 while (m < match)
 964                 {
 965                     str_utf8_cnext_noncomb_char (&m);
 966                     str_utf8_cnext_noncomb_char (&result);
 967                 }
 968             }
 969             else
 970             {
 971                 match[0] = '\0';
 972             }
 973         }
 974     }
 975     while (match != NULL && result == NULL);
 976
 977     g_free (deco_text);
 978     if (!case_sen)
 979         g_free (fold_text);
 980
 981     return result;
 982 }
 983
 984 static char *
 985 str_utf8_normalize (const char *text)
 986 {
 987     GString *fixed;
 988     char *tmp;
 989     char *result;
 990     const char *start;
 991     const char *end;
 992
 993     fixed = g_string_sized_new (4);
 994
 995     start = text;
 996     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 997     {
 998         if (start != end)
 999         {
1000             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1001             g_string_append (fixed, tmp);
1002             g_free (tmp);
1003         }
1004         g_string_append_c (fixed, end[0]);
1005         start = end + 1;
1006     }
1007
1008     if (start == text)
1009     {
1010         result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1011         g_string_free (fixed, TRUE);
1012     }
1013     else
1014     {
1015         if (start[0] != '\0' && start != end)
1016         {
1017             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1018             g_string_append (fixed, tmp);
1019             g_free (tmp);
1020         }
1021         result = g_string_free (fixed, FALSE);
1022     }
1023
1024     return result;
1025 }
1026
1027 static char *
1028 str_utf8_casefold_normalize (const char *text)
1029 {
1030     GString *fixed;
1031     char *tmp, *fold;
1032     char *result;
1033     const char *start;
1034     const char *end;
1035
1036     fixed = g_string_sized_new (4);
1037
1038     start = text;
1039     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1040     {
1041         if (start != end)
1042         {
1043             fold = g_utf8_casefold (start, end - start);
1044             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1045             g_string_append (fixed, tmp);
1046             g_free (tmp);
1047             g_free (fold);
1048         }
1049         g_string_append_c (fixed, end[0]);
1050         start = end + 1;
1051     }
1052
1053     if (start == text)
1054     {
1055         fold = g_utf8_casefold (text, -1);
1056         result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1057         g_free (fold);
1058         g_string_free (fixed, TRUE);
1059     }
1060     else
1061     {
1062         if (start[0] != '\0' && start != end)
1063         {
1064             fold = g_utf8_casefold (start, end - start);
1065             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1066             g_string_append (fixed, tmp);
1067             g_free (tmp);
1068             g_free (fold);
1069         }
1070         result = g_string_free (fixed, FALSE);
1071     }
1072
1073     return result;
1074 }
1075
/* strcmp () of the normalized forms of t1 and t2. */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_normalize (t1);
    char *norm2 = str_utf8_normalize (t2);
    const int order = strcmp (norm1, norm2);

    g_free (norm1);
    g_free (norm2);

    return order;
}
1092
/* Compare the normalized forms of t1 and t2 over the byte length of the
 * shorter of the two normalized strings. */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_normalize (t1);
    char *norm2 = str_utf8_normalize (t2);
    const size_t len1 = strlen (norm1);
    const size_t len2 = strlen (norm2);
    const int order = strncmp (norm1, norm2, min (len1, len2));

    g_free (norm1);
    g_free (norm2);

    return order;
}
1109
/* strcmp () of the case-folded, normalized forms of t1 and t2. */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_casefold_normalize (t1);
    char *norm2 = str_utf8_casefold_normalize (t2);
    const int order = strcmp (norm1, norm2);

    g_free (norm1);
    g_free (norm2);

    return order;
}
1126
/* Compare the case-folded, normalized forms of t1 and t2 over the byte
 * length of the shorter of the two resulting strings. */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_casefold_normalize (t1);
    char *norm2 = str_utf8_casefold_normalize (t2);
    const size_t len1 = strlen (norm1);
    const size_t len2 = strlen (norm2);
    const int order = strncmp (norm1, norm2, min (len1, len2));

    g_free (norm1);
    g_free (norm2);

    return order;
}
1143
/*
 * Measure how much of PREFIX matches the beginning of TEXT.
 *
 * Both arguments are normalized with str_utf8_normalize () and the copies
 * are walked in parallel, one character at a time (as delimited by
 * str_utf8_cnext_char_safe ()).  Walking stops at the first pair of
 * characters that differ in byte length or in content, or when either copy
 * is exhausted.
 *
 * Returns the number of matched bytes, counted in the normalized copy of
 * PREFIX (not in the PREFIX argument itself).
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text, *norm_prefix;
    const char *text_pos, *prefix_pos;
    int matched;

    norm_text = str_utf8_normalize (text);
    norm_prefix = str_utf8_normalize (prefix);
    text_pos = norm_text;
    prefix_pos = norm_prefix;

    while (text_pos[0] != '\0' && prefix_pos[0] != '\0')
    {
        const char *text_next = text_pos;
        const char *prefix_next = prefix_pos;

        /* find the end of the current character in both strings */
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* a character matches only if both its length and its bytes agree */
        if ((text_next - text_pos != prefix_next - prefix_pos)
            || (strncmp (text_pos, prefix_pos, text_next - text_pos) != 0))
            break;

        text_pos = text_next;
        prefix_pos = prefix_next;
    }

    matched = prefix_pos - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1174
/*
 * Measure how much of PREFIX matches the beginning of TEXT, ignoring case.
 *
 * Both arguments are converted with str_utf8_casefold_normalize () and the
 * copies are walked in parallel, one character at a time (as delimited by
 * str_utf8_cnext_char_safe ()).  Walking stops at the first pair of
 * characters that differ in byte length or in content, or when either copy
 * is exhausted.
 *
 * Returns the number of matched bytes, counted in the case-folded,
 * normalized copy of PREFIX (not in the PREFIX argument itself).
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *folded_text, *folded_prefix;
    const char *text_pos, *prefix_pos;
    int matched;

    folded_text = str_utf8_casefold_normalize (text);
    folded_prefix = str_utf8_casefold_normalize (prefix);
    text_pos = folded_text;
    prefix_pos = folded_prefix;

    while (text_pos[0] != '\0' && prefix_pos[0] != '\0')
    {
        const char *text_next = text_pos;
        const char *prefix_next = prefix_pos;

        /* find the end of the current character in both strings */
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* a character matches only if both its length and its bytes agree */
        if ((text_next - text_pos != prefix_next - prefix_pos)
            || (strncmp (text_pos, prefix_pos, text_next - text_pos) != 0))
            break;

        text_pos = text_next;
        prefix_pos = prefix_next;
    }

    matched = prefix_pos - folded_prefix;

    g_free (folded_text);
    g_free (folded_prefix);

    return matched;
}
1205
1206 static char *
1207 str_utf8_create_key_gen (const char *text, int case_sen,
1208                          gchar * (*keygen) (const gchar * text, gssize size))
1209 {
1210     char *result;
1211
1212     if (case_sen)
1213     {
1214         result = str_utf8_normalize (text);
1215     }
1216     else
1217     {
1218         gboolean dot;
1219         GString *fixed;
1220         const char *start, *end;
1221         char *fold, *key;
1222
1223         dot = text[0] == '.';
1224         fixed = g_string_sized_new (16);
1225
1226         if (!dot)
1227             start = text;
1228         else
1229         {
1230             start = text + 1;
1231             g_string_append_c (fixed, '.');
1232         }
1233
1234         while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1235         {
1236             if (start != end)
1237             {
1238                 fold = g_utf8_casefold (start, end - start);
1239                 key = keygen (fold, -1);
1240                 g_string_append (fixed, key);
1241                 g_free (key);
1242                 g_free (fold);
1243             }
1244             g_string_append_c (fixed, end[0]);
1245             start = end + 1;
1246         }
1247
1248         if (start == text)
1249         {
1250             fold = g_utf8_casefold (start, -1);
1251             result = keygen (fold, -1);
1252             g_free (fold);
1253             g_string_free (fixed, TRUE);
1254         }
1255         else if (dot && (start == text + 1))
1256         {
1257             fold = g_utf8_casefold (start, -1);
1258             key = keygen (fold, -1);
1259             g_string_append (fixed, key);
1260             g_free (key);
1261             g_free (fold);
1262             result = g_string_free (fixed, FALSE);
1263         }
1264         else
1265         {
1266             if (start[0] != '\0' && start != end)
1267             {
1268                 fold = g_utf8_casefold (start, end - start);
1269                 key = keygen (fold, -1);
1270                 g_string_append (fixed, key);
1271                 g_free (key);
1272                 g_free (fold);
1273             }
1274             result = g_string_free (fixed, FALSE);
1275         }
1276     }
1277     return result;
1278 }
1279
1280 static char *
1281 str_utf8_create_key (const char *text, int case_sen)
1282 {
1283     return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1284 }
1285
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/*
 * Build a sort key for the file name TEXT using
 * g_utf8_collate_key_for_filename () as the key generator.  All the work is
 * done by str_utf8_create_key_gen (); its result is returned unchanged.
 * Compiled only when MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME is defined.
 */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
1293
/*
 * Compare two keys.
 *
 * The keys are compared byte-wise with strcmp (); case_sen is accepted but
 * not used by this implementation.
 *
 * Returns the strcmp () result: negative, zero or positive.
 */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    int order;

    /* unused parameter */
    (void) case_sen;

    order = strcmp (t1, t2);

    return order;
}
1300
/*
 * Release a key.
 *
 * KEY is freed with g_free (); case_sen is accepted but not used by this
 * implementation.
 */
static void
str_utf8_release_key (char *key, int case_sen)
{
    /* unused parameter */
    (void) case_sen;

    g_free (key);
}
1307
1308 struct str_class
1309 str_utf8_init (void)
1310 {
1311     struct str_class result;
1312
1313     result.conv_gerror_message = str_utf8_conv_gerror_message;
1314     result.vfs_convert_to = str_utf8_vfs_convert_to;
1315     result.insert_replace_char = str_utf8_insert_replace_char;
1316     result.is_valid_string = str_utf8_is_valid_string;
1317     result.is_valid_char = str_utf8_is_valid_char;
1318     result.cnext_char = str_utf8_cnext_char;
1319     result.cprev_char = str_utf8_cprev_char;
1320     result.cnext_char_safe = str_utf8_cnext_char_safe;
1321     result.cprev_char_safe = str_utf8_cprev_char_safe;
1322     result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1323     result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1324     result.isspace = str_utf8_isspace;
1325     result.ispunct = str_utf8_ispunct;
1326     result.isalnum = str_utf8_isalnum;
1327     result.isdigit = str_utf8_isdigit;
1328     result.isprint = str_utf8_isprint;
1329     result.iscombiningmark = str_utf8_iscombiningmark;
1330     result.toupper = str_utf8_toupper;
1331     result.tolower = str_utf8_tolower;
1332     result.length = str_utf8_length;
1333     result.length2 = str_utf8_length2;
1334     result.length_noncomb = str_utf8_length_noncomb;
1335     result.fix_string = str_utf8_fix_string;
1336     result.term_form = str_utf8_term_form;
1337     result.fit_to_term = str_utf8_fit_to_term;
1338     result.term_trim = str_utf8_term_trim;
1339     result.term_width2 = str_utf8_term_width2;
1340     result.term_width1 = str_utf8_term_width1;
1341     result.term_char_width = str_utf8_term_char_width;
1342     result.term_substring = str_utf8_term_substring;
1343     result.trunc = str_utf8_trunc;
1344     result.offset_to_pos = str_utf8_offset_to_pos;
1345     result.column_to_pos = str_utf8_column_to_pos;
1346     result.create_search_needle = str_utf8_create_search_needle;
1347     result.release_search_needle = str_utf8_release_search_needle;
1348     result.search_first = str_utf8_search_first;
1349     result.search_last = str_utf8_search_last;
1350     result.compare = str_utf8_compare;
1351     result.ncompare = str_utf8_ncompare;
1352     result.casecmp = str_utf8_casecmp;
1353     result.ncasecmp = str_utf8_ncasecmp;
1354     result.prefix = str_utf8_prefix;
1355     result.caseprefix = str_utf8_caseprefix;
1356     result.create_key = str_utf8_create_key;
1357 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1358     /* case insensitive sort files in "a1 a2 a10" order */
1359     result.create_key_for_filename = str_utf8_create_key_for_filename;
1360 #else
1361     /* case insensitive sort files in "a1 a10 a2" order */
1362     result.create_key_for_filename = str_utf8_create_key;
1363 #endif
1364     result.key_collate = str_utf8_key_collate;
1365     result.release_key = str_utf8_release_key;
1366
1367     return result;
1368 }