lib/strutil/strutilutf8.c

   1 /* UTF-8 strings utilities
   2    Copyright (C) 2007 Free Software Foundation, Inc.
   3
   4    Written 2007 by:
   5    Rostislav Benes
   6
   7    The file_date routine is mostly from GNU's fileutils package,
   8    written by Richard Stallman and David MacKenzie.
   9
  10    This program is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 2 of the License, or
  13    (at your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18    GNU General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  23  */
  24
  25 #include <config.h>
  26 #include <stdlib.h>
  27 #include <stdio.h>
  28 #include <errno.h>
  29 #include <glib.h>
  30 #include <langinfo.h>
  31 #include <string.h>
  32
  33 #include "lib/global.h"
  34 #include "lib/strutil.h"
  35
/* The routines in this file are built on GLib's UTF-8 functions. */
  37
/* Byte sequence EF BF BD, i.e. the UTF-8 encoding of U+FFFD (REPLACEMENT
 * CHARACTER).  It is written in place of input bytes that are not valid
 * UTF-8. */
static const char replch[] = "\xEF\xBF\xBD";
  39
  40 static int
  41 str_unichar_iscombiningmark (gunichar uni)
  42 {
  43     int type = g_unichar_type (uni);
  44     return (type == G_UNICODE_COMBINING_MARK)
  45         || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  46 }
  47
  48 static void
  49 str_utf8_insert_replace_char (GString * buffer)
  50 {
  51     g_string_append (buffer, replch);
  52 }
  53
  54 static int
  55 str_utf8_is_valid_string (const char *text)
  56 {
  57     return g_utf8_validate (text, -1, NULL);
  58 }
  59
  60 static int
  61 str_utf8_is_valid_char (const char *ch, size_t size)
  62 {
  63     switch (g_utf8_get_char_validated (ch, size))
  64     {
  65     case (gunichar) (-2):
  66         return -2;
  67     case (gunichar) (-1):
  68         return -1;
  69     default:
  70         return 1;
  71     }
  72 }
  73
  74 static void
  75 str_utf8_cnext_char (const char **text)
  76 {
  77     (*text) = g_utf8_next_char (*text);
  78 }
  79
  80 static void
  81 str_utf8_cprev_char (const char **text)
  82 {
  83     (*text) = g_utf8_prev_char (*text);
  84 }
  85
  86 static void
  87 str_utf8_cnext_char_safe (const char **text)
  88 {
  89     if (str_utf8_is_valid_char (*text, -1) == 1)
  90         (*text) = g_utf8_next_char (*text);
  91     else
  92         (*text)++;
  93 }
  94
  95 static void
  96 str_utf8_cprev_char_safe (const char **text)
  97 {
  98     const char *result = g_utf8_prev_char (*text);
  99     const char *t = result;
 100     str_utf8_cnext_char_safe (&t);
 101     if (t == *text)
 102         (*text) = result;
 103     else
 104         (*text)--;
 105 }
 106
 107 static void
 108 str_utf8_fix_string (char *text)
 109 {
 110     gunichar uni;
 111
 112     while (text[0] != '\0')
 113     {
 114         uni = g_utf8_get_char_validated (text, -1);
 115         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 116         {
 117             text = g_utf8_next_char (text);
 118         }
 119         else
 120         {
 121             text[0] = '?';
 122             text++;
 123         }
 124     }
 125 }
 126
 127 static int
 128 str_utf8_isspace (const char *text)
 129 {
 130     gunichar uni = g_utf8_get_char_validated (text, -1);
 131     return g_unichar_isspace (uni);
 132 }
 133
 134 static int
 135 str_utf8_ispunct (const char *text)
 136 {
 137     gunichar uni = g_utf8_get_char_validated (text, -1);
 138     return g_unichar_ispunct (uni);
 139 }
 140
 141 static int
 142 str_utf8_isalnum (const char *text)
 143 {
 144     gunichar uni = g_utf8_get_char_validated (text, -1);
 145     return g_unichar_isalnum (uni);
 146 }
 147
 148 static int
 149 str_utf8_isdigit (const char *text)
 150 {
 151     gunichar uni = g_utf8_get_char_validated (text, -1);
 152     return g_unichar_isdigit (uni);
 153 }
 154
 155 static int
 156 str_utf8_isprint (const char *ch)
 157 {
 158     gunichar uni = g_utf8_get_char_validated (ch, -1);
 159     return g_unichar_isprint (uni);
 160 }
 161
 162 static int
 163 str_utf8_iscombiningmark (const char *ch)
 164 {
 165     gunichar uni = g_utf8_get_char_validated (ch, -1);
 166     return str_unichar_iscombiningmark (uni);
 167 }
 168
 169 static int
 170 str_utf8_cnext_noncomb_char (const char **text)
 171 {
 172     int count = 0;
 173     while ((*text)[0] != '\0')
 174     {
 175         str_utf8_cnext_char_safe (text);
 176         count++;
 177         if (!str_utf8_iscombiningmark (*text))
 178             break;
 179     }
 180     return count;
 181 }
 182
 183 static int
 184 str_utf8_cprev_noncomb_char (const char **text, const char *begin)
 185 {
 186     int count = 0;
 187     while ((*text) != begin)
 188     {
 189         str_utf8_cprev_char_safe (text);
 190         count++;
 191         if (!str_utf8_iscombiningmark (*text))
 192             break;
 193     }
 194     return count;
 195 }
 196
 197 static int
 198 str_utf8_toupper (const char *text, char **out, size_t * remain)
 199 {
 200     gunichar uni;
 201     size_t left;
 202
 203     uni = g_utf8_get_char_validated (text, -1);
 204     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 205         return 0;
 206
 207     uni = g_unichar_toupper (uni);
 208     left = g_unichar_to_utf8 (uni, NULL);
 209     if (left >= *remain)
 210         return 0;
 211
 212     left = g_unichar_to_utf8 (uni, *out);
 213     (*out) += left;
 214     (*remain) -= left;
 215     return 1;
 216 }
 217
 218 static int
 219 str_utf8_tolower (const char *text, char **out, size_t * remain)
 220 {
 221     gunichar uni;
 222     size_t left;
 223
 224     uni = g_utf8_get_char_validated (text, -1);
 225     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 226         return 0;
 227
 228     uni = g_unichar_tolower (uni);
 229     left = g_unichar_to_utf8 (uni, NULL);
 230     if (left >= *remain)
 231         return 0;
 232
 233     left = g_unichar_to_utf8 (uni, *out);
 234     (*out) += left;
 235     (*remain) -= left;
 236     return 1;
 237 }
 238
 239 static int
 240 str_utf8_length (const char *text)
 241 {
 242     int result = 0;
 243     const char *start;
 244     const char *end;
 245
 246     start = text;
 247     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 248     {
 249         if (start != end)
 250         {
 251             result += g_utf8_strlen (start, end - start);
 252         }
 253         result++;
 254         start = end + 1;
 255     }
 256
 257     if (start == text)
 258     {
 259         result = g_utf8_strlen (text, -1);
 260     }
 261     else
 262     {
 263         if (start[0] != '\0' && start != end)
 264         {
 265             result += g_utf8_strlen (start, end - start);
 266         }
 267     }
 268
 269     return result;
 270 }
 271
 272 static int
 273 str_utf8_length2 (const char *text, int size)
 274 {
 275     int result = 0;
 276     const char *start;
 277     const char *end;
 278
 279     start = text;
 280     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
 281     {
 282         if (start != end)
 283         {
 284             result += g_utf8_strlen (start, min (end - start, size));
 285             size -= end - start;
 286         }
 287         result += (size > 0);
 288         size--;
 289         start = end + 1;
 290     }
 291
 292     if (start == text)
 293     {
 294         result = g_utf8_strlen (text, size);
 295     }
 296     else
 297     {
 298         if (start[0] != '\0' && start != end && size > 0)
 299         {
 300             result += g_utf8_strlen (start, min (end - start, size));
 301         }
 302     }
 303
 304     return result;
 305 }
 306
 307 static int
 308 str_utf8_length_noncomb (const char *text)
 309 {
 310     int result = 0;
 311     const char *t = text;
 312
 313     while (t[0] != '\0')
 314     {
 315         str_utf8_cnext_noncomb_char (&t);
 316         result++;
 317     }
 318
 319     return result;
 320 }
 321
 322 /*
 323    static void
 324    str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
 325    {
 326    char *next = g_utf8_next_char (*string);
 327    (*left) -= next - (*string);
 328    (*string) = next;
 329    g_string_append_c (buffer, '?');
 330    }
 331  */
 332
 333 static gchar *
 334 str_utf8_conv_gerror_message (GError * error, const char *def_msg)
 335 {
 336     if ((error != NULL) && (error->message != NULL))
 337         return g_strdup (error->message);
 338
 339     return g_strdup (def_msg != NULL ? def_msg : "");
 340 }
 341
 342 static estr_t
 343 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
 344 {
 345     estr_t result;
 346
 347     if (coder == str_cnv_not_convert)
 348     {
 349         g_string_append_len (buffer, string, size);
 350         result = ESTR_SUCCESS;
 351     }
 352     else
 353         result = str_nconvert (coder, (char *) string, size, buffer);
 354
 355     return result;
 356 }
 357
/* Result of str_utf8_make_make_term_form(): a printable, valid UTF-8
 * rendition of a string together with its terminal metrics. */
struct term_form
{
    /* NUL-terminated converted text */
    char text[BUF_MEDIUM * 6];
    /* number of terminal columns the text occupies */
    size_t width;
    /* non-zero when the text contains a combining mark */
    int compose;
};
 364
 365 /* utiliti function, that make string valid in utf8 and all characters printable
 366  * return width of string too*/
 367 static const struct term_form *
 368 str_utf8_make_make_term_form (const char *text, size_t length)
 369 {
 370     static struct term_form result;
 371     gunichar uni;
 372     size_t left;
 373     char *actual;
 374
 375     result.text[0] = '\0';
 376     result.width = 0;
 377     result.compose = 0;
 378     actual = result.text;
 379
 380     /* check if text start with combining character,
 381      * add space at begin in this case */
 382     if (length != 0 && text[0] != '\0')
 383     {
 384         uni = g_utf8_get_char_validated (text, -1);
 385         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 386         {
 387             if (str_unichar_iscombiningmark (uni))
 388             {
 389                 actual[0] = ' ';
 390                 actual++;
 391                 result.width++;
 392                 result.compose = 1;
 393             }
 394         }
 395     }
 396
 397     while (length != 0 && text[0] != '\0')
 398     {
 399         uni = g_utf8_get_char_validated (text, -1);
 400         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 401         {
 402             if (g_unichar_isprint (uni))
 403             {
 404                 left = g_unichar_to_utf8 (uni, actual);
 405                 actual += left;
 406                 if (!str_unichar_iscombiningmark (uni))
 407                 {
 408                     result.width++;
 409                     if (g_unichar_iswide (uni))
 410                         result.width++;
 411                 }
 412                 else
 413                     result.compose = 1;
 414             }
 415             else
 416             {
 417                 actual[0] = '.';
 418                 actual++;
 419                 result.width++;
 420             }
 421             text = g_utf8_next_char (text);
 422         }
 423         else
 424         {
 425             text++;
 426             /*actual[0] = '?'; */
 427             memcpy (actual, replch, strlen (replch));
 428             actual += strlen (replch);
 429             result.width++;
 430         }
 431         if (length != (size_t) (-1))
 432             length--;
 433     }
 434     actual[0] = '\0';
 435
 436     return &result;
 437 }
 438
 439 static const char *
 440 str_utf8_term_form (const char *text)
 441 {
 442     static char result[BUF_MEDIUM * 6];
 443     const struct term_form *pre_form;
 444     char *composed;
 445
 446     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 447     if (pre_form->compose)
 448     {
 449         composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 450         g_strlcpy (result, composed, sizeof (result));
 451         g_free (composed);
 452     }
 453     else
 454     {
 455         g_strlcpy (result, pre_form->text, sizeof (result));
 456     }
 457     return result;
 458 }
 459
/* Working state shared by the utf8_tool_* helpers while they assemble an
 * output string from an already valid source string. */
struct utf8_tool
{
    /* write position in the output buffer */
    char *actual;
    /* space left in the output buffer; the helpers only write when it is
     * strictly larger than what they need */
    size_t remain;
    /* read position in the source string */
    const char *cheked;
    /* running column counter used by the copy/skip helpers */
    int ident;
    /* set by the copy helpers when a combining mark was copied */
    int compose;
};
 468
 469 /* utiliti function, that copy all characters from cheked to actual */
 470 static int
 471 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
 472 {
 473     size_t left;
 474     gunichar uni;
 475
 476     tool->compose = 0;
 477
 478     while (tool->cheked[0] != '\0')
 479     {
 480         uni = g_utf8_get_char (tool->cheked);
 481         tool->compose |= str_unichar_iscombiningmark (uni);
 482         left = g_unichar_to_utf8 (uni, NULL);
 483         if (tool->remain <= left)
 484             return 0;
 485         left = g_unichar_to_utf8 (uni, tool->actual);
 486         tool->actual += left;
 487         tool->remain -= left;
 488         tool->cheked = g_utf8_next_char (tool->cheked);
 489     }
 490     return 1;
 491 }
 492
 493 /* utiliti function, that copy characters from cheked to actual until ident is
 494  * smaller than to_ident */
 495 static int
 496 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
 497 {
 498     size_t left;
 499     gunichar uni;
 500     int w;
 501
 502     tool->compose = 0;
 503
 504     while (tool->cheked[0] != '\0')
 505     {
 506         uni = g_utf8_get_char (tool->cheked);
 507         if (!str_unichar_iscombiningmark (uni))
 508         {
 509             w = 1;
 510             if (g_unichar_iswide (uni))
 511                 w++;
 512             if (tool->ident + w > to_ident)
 513                 return 1;
 514         }
 515         else
 516         {
 517             w = 0;
 518             tool->compose = 1;
 519         }
 520
 521         left = g_unichar_to_utf8 (uni, NULL);
 522         if (tool->remain <= left)
 523             return 0;
 524         left = g_unichar_to_utf8 (uni, tool->actual);
 525         tool->actual += left;
 526         tool->remain -= left;
 527         tool->cheked = g_utf8_next_char (tool->cheked);
 528         tool->ident += w;
 529     }
 530     return 1;
 531 }
 532
 533 /* utiliti function, add count spaces to actual */
 534 static int
 535 utf8_tool_insert_space (struct utf8_tool *tool, int count)
 536 {
 537     if (count <= 0)
 538         return 1;
 539     if (tool->remain <= (gsize) count)
 540         return 0;
 541     memset (tool->actual, ' ', count);
 542     tool->actual += count;
 543     tool->remain -= count;
 544     return 1;
 545 }
 546
 547 /* utiliti function, add one characters to actual */
 548 static int
 549 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
 550 {
 551     if (tool->remain <= 1)
 552         return 0;
 553     tool->actual[0] = ch;
 554     tool->actual++;
 555     tool->remain--;
 556     return 1;
 557 }
 558
 559 /* utiliti function, thah skip characters from cheked until ident is greater or
 560  * equal to to_ident */
 561 static int
 562 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
 563 {
 564     gunichar uni;
 565
 566     while (to_ident > tool->ident && tool->cheked[0] != '\0')
 567     {
 568         uni = g_utf8_get_char (tool->cheked);
 569         if (!str_unichar_iscombiningmark (uni))
 570         {
 571             tool->ident++;
 572             if (g_unichar_iswide (uni))
 573                 tool->ident++;
 574         }
 575         tool->cheked = g_utf8_next_char (tool->cheked);
 576     }
 577     uni = g_utf8_get_char (tool->cheked);
 578     while (str_unichar_iscombiningmark (uni))
 579     {
 580         tool->cheked = g_utf8_next_char (tool->cheked);
 581         uni = g_utf8_get_char (tool->cheked);
 582     }
 583     return 1;
 584 }
 585
 586 static void
 587 utf8_tool_compose (char *buffer, size_t size)
 588 {
 589     char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 590     g_strlcpy (buffer, composed, size);
 591     g_free (composed);
 592 }
 593
 594
 595 static const char *
 596 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
 597 {
 598     static char result[BUF_MEDIUM * 6];
 599     const struct term_form *pre_form;
 600     struct utf8_tool tool;
 601
 602     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 603     tool.cheked = pre_form->text;
 604     tool.actual = result;
 605     tool.remain = sizeof (result);
 606     tool.compose = 0;
 607
 608     if (pre_form->width <= (gsize) width)
 609     {
 610         tool.ident = 0;
 611         switch (HIDE_FIT (just_mode))
 612         {
 613         case J_CENTER_LEFT:
 614         case J_CENTER:
 615             tool.ident = (width - pre_form->width) / 2;
 616             break;
 617         case J_RIGHT:
 618             tool.ident = width - pre_form->width;
 619             break;
 620         }
 621
 622         utf8_tool_insert_space (&tool, tool.ident);
 623         utf8_tool_copy_chars_to_end (&tool);
 624         utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
 625     }
 626     else
 627     {
 628         if (IS_FIT (just_mode))
 629         {
 630             tool.ident = 0;
 631             utf8_tool_copy_chars_to (&tool, width / 2);
 632             utf8_tool_insert_char (&tool, '~');
 633
 634             tool.ident = 0;
 635             utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 636             utf8_tool_copy_chars_to_end (&tool);
 637             utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
 638         }
 639         else
 640         {
 641             tool.ident = 0;
 642             switch (HIDE_FIT (just_mode))
 643             {
 644             case J_CENTER:
 645                 tool.ident = (width - pre_form->width) / 2;
 646                 break;
 647             case J_RIGHT:
 648                 tool.ident = width - pre_form->width;
 649                 break;
 650             }
 651
 652             utf8_tool_skip_chars_to (&tool, 0);
 653             utf8_tool_insert_space (&tool, tool.ident);
 654             utf8_tool_copy_chars_to (&tool, width);
 655             utf8_tool_insert_space (&tool, width - tool.ident);
 656         }
 657     }
 658
 659     tool.actual[0] = '\0';
 660     if (tool.compose)
 661         utf8_tool_compose (result, sizeof (result));
 662     return result;
 663 }
 664
 665 static const char *
 666 str_utf8_term_trim (const char *text, int width)
 667 {
 668     static char result[BUF_MEDIUM * 6];
 669     const struct term_form *pre_form;
 670     struct utf8_tool tool;
 671
 672     if (width < 1)
 673     {
 674         result [0] = '\0';
 675         return result;
 676     }
 677
 678     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 679
 680     tool.cheked = pre_form->text;
 681     tool.actual = result;
 682     tool.remain = sizeof (result);
 683     tool.compose = 0;
 684
 685     if ((gsize) width < pre_form->width)
 686     {
 687         if (width <= 3)
 688         {
 689             memset (tool.actual, '.', width);
 690             tool.actual += width;
 691             tool.remain -= width;
 692         }
 693         else
 694         {
 695             memset (tool.actual, '.', 3);
 696             tool.actual += 3;
 697             tool.remain -= 3;
 698
 699             tool.ident = 0;
 700             utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
 701             utf8_tool_copy_chars_to_end (&tool);
 702         }
 703     }
 704     else
 705     {
 706         utf8_tool_copy_chars_to_end (&tool);
 707     }
 708
 709     tool.actual[0] = '\0';
 710     if (tool.compose)
 711         utf8_tool_compose (result, sizeof (result));
 712     return result;
 713 }
 714
 715 static int
 716 str_utf8_term_width2 (const char *text, size_t length)
 717 {
 718     const struct term_form *result;
 719
 720     result = str_utf8_make_make_term_form (text, length);
 721     return result->width;
 722 }
 723
 724 static int
 725 str_utf8_term_width1 (const char *text)
 726 {
 727     return str_utf8_term_width2 (text, (size_t) (-1));
 728 }
 729
 730 static int
 731 str_utf8_term_char_width (const char *text)
 732 {
 733     gunichar uni = g_utf8_get_char_validated (text, -1);
 734     return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
 735 }
 736
 737 static const char *
 738 str_utf8_term_substring (const char *text, int start, int width)
 739 {
 740     static char result[BUF_MEDIUM * 6];
 741     const struct term_form *pre_form;
 742     struct utf8_tool tool;
 743
 744     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 745
 746     tool.cheked = pre_form->text;
 747     tool.actual = result;
 748     tool.remain = sizeof (result);
 749     tool.compose = 0;
 750
 751     tool.ident = -start;
 752     utf8_tool_skip_chars_to (&tool, 0);
 753     if (tool.ident < 0)
 754         tool.ident = 0;
 755     utf8_tool_insert_space (&tool, tool.ident);
 756
 757     utf8_tool_copy_chars_to (&tool, width);
 758     utf8_tool_insert_space (&tool, width - tool.ident);
 759
 760     tool.actual[0] = '\0';
 761     if (tool.compose)
 762         utf8_tool_compose (result, sizeof (result));
 763     return result;
 764 }
 765
 766 static const char *
 767 str_utf8_trunc (const char *text, int width)
 768 {
 769     static char result[MC_MAXPATHLEN * 6 * 2];
 770     const struct term_form *pre_form;
 771     struct utf8_tool tool;
 772
 773     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 774
 775     tool.cheked = pre_form->text;
 776     tool.actual = result;
 777     tool.remain = sizeof (result);
 778     tool.compose = 0;
 779
 780     if (pre_form->width > (gsize) width)
 781     {
 782         tool.ident = 0;
 783         utf8_tool_copy_chars_to (&tool, width / 2);
 784         utf8_tool_insert_char (&tool, '~');
 785
 786         tool.ident = 0;
 787         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 788         utf8_tool_copy_chars_to_end (&tool);
 789     }
 790     else
 791     {
 792         utf8_tool_copy_chars_to_end (&tool);
 793     }
 794
 795     tool.actual[0] = '\0';
 796     if (tool.compose)
 797         utf8_tool_compose (result, sizeof (result));
 798     return result;
 799 }
 800
 801 static int
 802 str_utf8_offset_to_pos (const char *text, size_t length)
 803 {
 804     if (str_utf8_is_valid_string (text))
 805         return g_utf8_offset_to_pointer (text, length) - text;
 806     else
 807     {
 808         int result;
 809         GString *buffer = g_string_new (text);
 810
 811         str_utf8_fix_string (buffer->str);
 812         result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
 813         g_string_free (buffer, TRUE);
 814         return result;
 815     }
 816 }
 817
 818 static int
 819 str_utf8_column_to_pos (const char *text, size_t pos)
 820 {
 821     static int result;
 822     gunichar uni;
 823     int width;
 824
 825     width = 0;
 826     result = 0;
 827
 828     while (text[0] != '\0')
 829     {
 830         uni = g_utf8_get_char_validated (text, 6);
 831         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 832         {
 833             if (g_unichar_isprint (uni))
 834             {
 835                 if (!str_unichar_iscombiningmark (uni))
 836                 {
 837                     width++;
 838                     if (g_unichar_iswide (uni))
 839                         width++;
 840                 }
 841             }
 842             else
 843             {
 844                 width++;
 845             }
 846             text = g_utf8_next_char (text);
 847         }
 848         else
 849         {
 850             text++;
 851             width++;
 852         }
 853         if ((gsize) width > pos)
 854             return result;
 855
 856         result++;
 857     }
 858
 859     return result;
 860 }
 861
 862 static char *
 863 str_utf8_create_search_needle (const char *needle, int case_sen)
 864 {
 865     if (needle != NULL)
 866     {
 867         if (case_sen)
 868         {
 869             return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
 870         }
 871         else
 872         {
 873             char *fold = g_utf8_casefold (needle, -1);
 874             char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
 875             g_free (fold);
 876             return result;
 877         }
 878     }
 879     else
 880         return NULL;
 881 }
 882
 883 static void
 884 str_utf8_release_search_needle (char *needle, int case_sen)
 885 {
 886     (void) case_sen;
 887     if (needle != NULL)
 888         g_free (needle);
 889 }
 890
 891 static const char *
 892 str_utf8_search_first (const char *text, const char *search, int case_sen)
 893 {
 894     char *fold_text;
 895     char *deco_text;
 896     const char *match;
 897     const char *result = NULL;
 898     const char *m;
 899
 900     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 901     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 902
 903     match = deco_text;
 904     do
 905     {
 906         match = g_strstr_len (match, -1, search);
 907         if (match != NULL)
 908         {
 909             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
 910                 !str_utf8_iscombiningmark (match + strlen (search)))
 911             {
 912
 913                 result = text;
 914                 m = deco_text;
 915                 while (m < match)
 916                 {
 917                     str_utf8_cnext_noncomb_char (&m);
 918                     str_utf8_cnext_noncomb_char (&result);
 919                 }
 920             }
 921             else
 922             {
 923                 str_utf8_cnext_char (&match);
 924             }
 925         }
 926     }
 927     while (match != NULL && result == NULL);
 928
 929     g_free (deco_text);
 930     if (!case_sen)
 931         g_free (fold_text);
 932
 933     return result;
 934 }
 935
 936 static const char *
 937 str_utf8_search_last (const char *text, const char *search, int case_sen)
 938 {
 939     char *fold_text;
 940     char *deco_text;
 941     char *match;
 942     const char *result = NULL;
 943     const char *m;
 944
 945     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 946     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 947
 948     do
 949     {
 950         match = g_strrstr_len (deco_text, -1, search);
 951         if (match != NULL)
 952         {
 953             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
 954                 !str_utf8_iscombiningmark (match + strlen (search)))
 955             {
 956
 957                 result = text;
 958                 m = deco_text;
 959                 while (m < match)
 960                 {
 961                     str_utf8_cnext_noncomb_char (&m);
 962                     str_utf8_cnext_noncomb_char (&result);
 963                 }
 964             }
 965             else
 966             {
 967                 match[0] = '\0';
 968             }
 969         }
 970     }
 971     while (match != NULL && result == NULL);
 972
 973     g_free (deco_text);
 974     if (!case_sen)
 975         g_free (fold_text);
 976
 977     return result;
 978 }
 979
 980 static char *
 981 str_utf8_normalize (const char *text)
 982 {
 983     GString *fixed = g_string_new ("");
 984     char *tmp;
 985     char *result;
 986     const char *start;
 987     const char *end;
 988
 989     start = text;
 990     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 991     {
 992         if (start != end)
 993         {
 994             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
 995             g_string_append (fixed, tmp);
 996             g_free (tmp);
 997         }
 998         g_string_append_c (fixed, end[0]);
 999         start = end + 1;
1000     }
1001
1002     if (start == text)
1003     {
1004         result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1005     }
1006     else
1007     {
1008         if (start[0] != '\0' && start != end)
1009         {
1010             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1011             g_string_append (fixed, tmp);
1012             g_free (tmp);
1013         }
1014         result = g_strdup (fixed->str);
1015     }
1016     g_string_free (fixed, TRUE);
1017
1018     return result;
1019 }
1020
1021 static char *
1022 str_utf8_casefold_normalize (const char *text)
1023 {
1024     GString *fixed = g_string_new ("");
1025     char *tmp, *fold;
1026     char *result;
1027     const char *start;
1028     const char *end;
1029
1030     start = text;
1031     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1032     {
1033         if (start != end)
1034         {
1035             fold = g_utf8_casefold (start, end - start);
1036             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1037             g_string_append (fixed, tmp);
1038             g_free (tmp);
1039             g_free (fold);
1040         }
1041         g_string_append_c (fixed, end[0]);
1042         start = end + 1;
1043     }
1044
1045     if (start == text)
1046     {
1047         fold = g_utf8_casefold (text, -1);
1048         result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1049         g_free (fold);
1050     }
1051     else
1052     {
1053         if (start[0] != '\0' && start != end)
1054         {
1055             fold = g_utf8_casefold (start, end - start);
1056             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1057             g_string_append (fixed, tmp);
1058             g_free (tmp);
1059             g_free (fold);
1060         }
1061         result = g_strdup (fixed->str);
1062     }
1063     g_string_free (fixed, TRUE);
1064
1065     return result;
1066 }
1067
/* Compare T1 and T2 after normalizing both with str_utf8_normalize().
 * Returns the strcmp() result for the normalized strings. */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *lhs = str_utf8_normalize (t1);
    char *rhs = str_utf8_normalize (t2);
    const int order = strcmp (lhs, rhs);

    g_free (lhs);
    g_free (rhs);

    return order;
}
1084
/* Compare T1 and T2 after normalizing both with str_utf8_normalize(), but
 * only over the length of the shorter normalized string.
 * Returns the strncmp() result. */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *lhs = str_utf8_normalize (t1);
    char *rhs = str_utf8_normalize (t2);
    const int order = strncmp (lhs, rhs, min (strlen (lhs), strlen (rhs)));

    g_free (lhs);
    g_free (rhs);

    return order;
}
1101
/* Compare T1 and T2 ignoring case: both are passed through
 * str_utf8_casefold_normalize() and then compared with strcmp(). */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *lhs = str_utf8_casefold_normalize (t1);
    char *rhs = str_utf8_casefold_normalize (t2);
    const int order = strcmp (lhs, rhs);

    g_free (lhs);
    g_free (rhs);

    return order;
}
1118
/* Compare T1 and T2 ignoring case, but only over the length of the shorter
 * converted string: both are passed through str_utf8_casefold_normalize()
 * and then compared with strncmp(). */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *lhs = str_utf8_casefold_normalize (t1);
    char *rhs = str_utf8_casefold_normalize (t2);
    const int order = strncmp (lhs, rhs, min (strlen (lhs), strlen (rhs)));

    g_free (lhs);
    g_free (rhs);

    return order;
}
1135
/* Measure how much of PREFIX matches the beginning of TEXT.
 * Both strings are normalized with str_utf8_normalize() and compared
 * character by character.  Returns the number of bytes of the normalized
 * PREFIX that matched. */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *t_cur = norm_text;
    const char *p_cur = norm_prefix;
    const char *t_next = norm_text;
    const char *p_next = norm_prefix;
    int matched;

    while (t_cur[0] != '\0' && p_cur[0] != '\0')
    {
        str_utf8_cnext_char_safe (&t_next);
        str_utf8_cnext_char_safe (&p_next);

        /* characters differ when their byte lengths or their bytes differ */
        if (t_next - t_cur != p_next - p_cur
            || strncmp (t_cur, p_cur, t_next - t_cur) != 0)
            break;

        t_cur = t_next;
        p_cur = p_next;
    }

    matched = p_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1166
/*
 * Measure how much of PREFIX matches the beginning of TEXT, ignoring case.
 *
 * Both arguments are first passed through str_utf8_casefold_normalize().
 * The two converted copies are then walked in lock step, one
 * str_utf8_cnext_char_safe() step at a time; the walk stops at the end of
 * either copy or at the first step whose byte length or bytes differ.
 *
 * Returns the number of bytes of the converted prefix that matched.  Note
 * that this is an offset into the converted copy, not into PREFIX itself.
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *fold_text = str_utf8_casefold_normalize (text);
    char *fold_prefix = str_utf8_casefold_normalize (prefix);
    const char *text_pos = fold_text;
    const char *prefix_pos = fold_prefix;
    const char *text_next = fold_text;
    const char *prefix_next = fold_prefix;
    int matched;

    for (; text_pos[0] != '\0' && prefix_pos[0] != '\0';
         text_pos = text_next, prefix_pos = prefix_next)
    {
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* stop on the first step that differs in length or in content */
        if ((text_next - text_pos != prefix_next - prefix_pos)
            || strncmp (text_pos, prefix_pos, text_next - text_pos) != 0)
            break;
    }

    matched = prefix_pos - fold_prefix;

    g_free (fold_text);
    g_free (fold_prefix);

    return matched;
}
1197
1198 static char *
1199 str_utf8_create_key_gen (const char *text, int case_sen,
1200                          gchar * (*keygen) (const gchar * text, gssize size))
1201 {
1202     char *result;
1203
1204     if (case_sen)
1205     {
1206         result = str_utf8_normalize (text);
1207     }
1208     else
1209     {
1210         gboolean dot;
1211         GString *fixed;
1212         const char *start, *end;
1213         char *fold, *key;
1214
1215         dot = text[0] == '.';
1216         fixed = g_string_sized_new (16);
1217
1218         if (!dot)
1219             start = text;
1220         else
1221         {
1222             start = text + 1;
1223             g_string_append_c (fixed, '.');
1224         }
1225
1226         while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1227         {
1228             if (start != end)
1229             {
1230                 fold = g_utf8_casefold (start, end - start);
1231                 key = keygen (fold, -1);
1232                 g_string_append (fixed, key);
1233                 g_free (key);
1234                 g_free (fold);
1235             }
1236             g_string_append_c (fixed, end[0]);
1237             start = end + 1;
1238         }
1239
1240         if (start == text)
1241         {
1242             fold = g_utf8_casefold (start, -1);
1243             result = keygen (fold, -1);
1244             g_free (fold);
1245             g_string_free (fixed, TRUE);
1246         }
1247         else if (dot && (start == text + 1))
1248         {
1249             fold = g_utf8_casefold (start, -1);
1250             key = keygen (fold, -1);
1251             g_string_append (fixed, key);
1252             g_free (key);
1253             g_free (fold);
1254             result = g_string_free (fixed, FALSE);
1255         }
1256         else
1257         {
1258             if (start[0] != '\0' && start != end)
1259             {
1260                 fold = g_utf8_casefold (start, end - start);
1261                 key = keygen (fold, -1);
1262                 g_string_append (fixed, key);
1263                 g_free (key);
1264                 g_free (fold);
1265             }
1266             result = g_string_free (fixed, FALSE);
1267         }
1268     }
1269     return result;
1270 }
1271
1272 static char *
1273 str_utf8_create_key (const char *text, int case_sen)
1274 {
1275     return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1276 }
1277
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/*
 * Build a sort key for the file name TEXT by calling
 * str_utf8_create_key_gen() with g_utf8_collate_key_for_filename as the key
 * generator.  CASE_SEN is forwarded unchanged.
 */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    char *key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif /* MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME */
1285
/*
 * Compare two keys byte-wise with strcmp().
 *
 * CASE_SEN is accepted but not used here.
 *
 * Returns the strcmp() result: negative, zero or positive.
 */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    int order;

    (void) case_sen;

    order = strcmp (t1, t2);
    return order;
}
1292
/*
 * Release KEY with g_free().
 *
 * CASE_SEN is accepted but not used: the key is freed the same way
 * regardless of its value.
 */
static void
str_utf8_release_key (char *key, int case_sen)
{
    (void) case_sen;

    g_free (key);
}
1299
1300 struct str_class
1301 str_utf8_init (void)
1302 {
1303     struct str_class result;
1304
1305     result.conv_gerror_message = str_utf8_conv_gerror_message;
1306     result.vfs_convert_to = str_utf8_vfs_convert_to;
1307     result.insert_replace_char = str_utf8_insert_replace_char;
1308     result.is_valid_string = str_utf8_is_valid_string;
1309     result.is_valid_char = str_utf8_is_valid_char;
1310     result.cnext_char = str_utf8_cnext_char;
1311     result.cprev_char = str_utf8_cprev_char;
1312     result.cnext_char_safe = str_utf8_cnext_char_safe;
1313     result.cprev_char_safe = str_utf8_cprev_char_safe;
1314     result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1315     result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1316     result.isspace = str_utf8_isspace;
1317     result.ispunct = str_utf8_ispunct;
1318     result.isalnum = str_utf8_isalnum;
1319     result.isdigit = str_utf8_isdigit;
1320     result.isprint = str_utf8_isprint;
1321     result.iscombiningmark = str_utf8_iscombiningmark;
1322     result.toupper = str_utf8_toupper;
1323     result.tolower = str_utf8_tolower;
1324     result.length = str_utf8_length;
1325     result.length2 = str_utf8_length2;
1326     result.length_noncomb = str_utf8_length_noncomb;
1327     result.fix_string = str_utf8_fix_string;
1328     result.term_form = str_utf8_term_form;
1329     result.fit_to_term = str_utf8_fit_to_term;
1330     result.term_trim = str_utf8_term_trim;
1331     result.term_width2 = str_utf8_term_width2;
1332     result.term_width1 = str_utf8_term_width1;
1333     result.term_char_width = str_utf8_term_char_width;
1334     result.term_substring = str_utf8_term_substring;
1335     result.trunc = str_utf8_trunc;
1336     result.offset_to_pos = str_utf8_offset_to_pos;
1337     result.column_to_pos = str_utf8_column_to_pos;
1338     result.create_search_needle = str_utf8_create_search_needle;
1339     result.release_search_needle = str_utf8_release_search_needle;
1340     result.search_first = str_utf8_search_first;
1341     result.search_last = str_utf8_search_last;
1342     result.compare = str_utf8_compare;
1343     result.ncompare = str_utf8_ncompare;
1344     result.casecmp = str_utf8_casecmp;
1345     result.ncasecmp = str_utf8_ncasecmp;
1346     result.prefix = str_utf8_prefix;
1347     result.caseprefix = str_utf8_caseprefix;
1348     result.create_key = str_utf8_create_key;
1349 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1350     /* case insensitive sort files in "a1 a2 a10" order */
1351     result.create_key_for_filename = str_utf8_create_key_for_filename;
1352 #else
1353     /* case insensitive sort files in "a1 a10 a2" order */
1354     result.create_key_for_filename = str_utf8_create_key;
1355 #endif
1356     result.key_collate = str_utf8_key_collate;
1357     result.release_key = str_utf8_release_key;
1358
1359     return result;
1360 }