lib/strutil/strutilutf8.c

   1 /*
   2    UTF-8 strings utilities
   3
   4    Copyright (C) 2007, 2011
   5    The Free Software Foundation, Inc.
   6
   7    Written by:
   8    Rostislav Benes, 2007
   9
  10    This file is part of the Midnight Commander.
  11
  12    The Midnight Commander is free software: you can redistribute it
  13    and/or modify it under the terms of the GNU General Public License as
  14    published by the Free Software Foundation, either version 3 of the License,
  15    or (at your option) any later version.
  16
  17    The Midnight Commander is distributed in the hope that it will be useful,
  18    but WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20    GNU General Public License for more details.
  21
  22    You should have received a copy of the GNU General Public License
  23    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  24  */
  25
  26 #include <config.h>
  27 #include <stdlib.h>
  28 #include <stdio.h>
  29 #include <errno.h>
  30 #include <glib.h>
  31 #include <langinfo.h>
  32 #include <string.h>
  33
  34 #include "lib/global.h"
  35 #include "lib/strutil.h"
  36
/* UTF-8 handling in this file is built on the UTF-8 functions provided by GLib */
  38
  39 static const char replch[] = "\xEF\xBF\xBD";
  40
  41 static gboolean
  42 str_unichar_iscombiningmark (gunichar uni)
  43 {
  44     GUnicodeType type;
  45
  46     type = g_unichar_type (uni);
  47     return (type == G_UNICODE_COMBINING_MARK)
  48         || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  49 }
  50
  51 static void
  52 str_utf8_insert_replace_char (GString * buffer)
  53 {
  54     g_string_append (buffer, replch);
  55 }
  56
  57 static int
  58 str_utf8_is_valid_string (const char *text)
  59 {
  60     return g_utf8_validate (text, -1, NULL);
  61 }
  62
  63 static int
  64 str_utf8_is_valid_char (const char *ch, size_t size)
  65 {
  66     switch (g_utf8_get_char_validated (ch, size))
  67     {
  68     case (gunichar) (-2):
  69         return -2;
  70     case (gunichar) (-1):
  71         return -1;
  72     default:
  73         return 1;
  74     }
  75 }
  76
  77 static void
  78 str_utf8_cnext_char (const char **text)
  79 {
  80     (*text) = g_utf8_next_char (*text);
  81 }
  82
  83 static void
  84 str_utf8_cprev_char (const char **text)
  85 {
  86     (*text) = g_utf8_prev_char (*text);
  87 }
  88
  89 static void
  90 str_utf8_cnext_char_safe (const char **text)
  91 {
  92     if (str_utf8_is_valid_char (*text, -1) == 1)
  93         (*text) = g_utf8_next_char (*text);
  94     else
  95         (*text)++;
  96 }
  97
  98 static void
  99 str_utf8_cprev_char_safe (const char **text)
 100 {
 101     const char *result = g_utf8_prev_char (*text);
 102     const char *t = result;
 103     str_utf8_cnext_char_safe (&t);
 104     if (t == *text)
 105         (*text) = result;
 106     else
 107         (*text)--;
 108 }
 109
 110 static void
 111 str_utf8_fix_string (char *text)
 112 {
 113     gunichar uni;
 114
 115     while (text[0] != '\0')
 116     {
 117         uni = g_utf8_get_char_validated (text, -1);
 118         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 119         {
 120             text = g_utf8_next_char (text);
 121         }
 122         else
 123         {
 124             text[0] = '?';
 125             text++;
 126         }
 127     }
 128 }
 129
 130 static int
 131 str_utf8_isspace (const char *text)
 132 {
 133     gunichar uni = g_utf8_get_char_validated (text, -1);
 134     return g_unichar_isspace (uni);
 135 }
 136
 137 static int
 138 str_utf8_ispunct (const char *text)
 139 {
 140     gunichar uni = g_utf8_get_char_validated (text, -1);
 141     return g_unichar_ispunct (uni);
 142 }
 143
 144 static int
 145 str_utf8_isalnum (const char *text)
 146 {
 147     gunichar uni = g_utf8_get_char_validated (text, -1);
 148     return g_unichar_isalnum (uni);
 149 }
 150
 151 static int
 152 str_utf8_isdigit (const char *text)
 153 {
 154     gunichar uni = g_utf8_get_char_validated (text, -1);
 155     return g_unichar_isdigit (uni);
 156 }
 157
 158 static int
 159 str_utf8_isprint (const char *ch)
 160 {
 161     gunichar uni = g_utf8_get_char_validated (ch, -1);
 162     return g_unichar_isprint (uni);
 163 }
 164
 165 static gboolean
 166 str_utf8_iscombiningmark (const char *ch)
 167 {
 168     gunichar uni = g_utf8_get_char_validated (ch, -1);
 169     return str_unichar_iscombiningmark (uni);
 170 }
 171
 172 static int
 173 str_utf8_cnext_noncomb_char (const char **text)
 174 {
 175     int count = 0;
 176     while ((*text)[0] != '\0')
 177     {
 178         str_utf8_cnext_char_safe (text);
 179         count++;
 180         if (!str_utf8_iscombiningmark (*text))
 181             break;
 182     }
 183     return count;
 184 }
 185
 186 static int
 187 str_utf8_cprev_noncomb_char (const char **text, const char *begin)
 188 {
 189     int count = 0;
 190     while ((*text) != begin)
 191     {
 192         str_utf8_cprev_char_safe (text);
 193         count++;
 194         if (!str_utf8_iscombiningmark (*text))
 195             break;
 196     }
 197     return count;
 198 }
 199
 200 static int
 201 str_utf8_toupper (const char *text, char **out, size_t * remain)
 202 {
 203     gunichar uni;
 204     size_t left;
 205
 206     uni = g_utf8_get_char_validated (text, -1);
 207     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 208         return 0;
 209
 210     uni = g_unichar_toupper (uni);
 211     left = g_unichar_to_utf8 (uni, NULL);
 212     if (left >= *remain)
 213         return 0;
 214
 215     left = g_unichar_to_utf8 (uni, *out);
 216     (*out) += left;
 217     (*remain) -= left;
 218     return 1;
 219 }
 220
 221 static int
 222 str_utf8_tolower (const char *text, char **out, size_t * remain)
 223 {
 224     gunichar uni;
 225     size_t left;
 226
 227     uni = g_utf8_get_char_validated (text, -1);
 228     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 229         return 0;
 230
 231     uni = g_unichar_tolower (uni);
 232     left = g_unichar_to_utf8 (uni, NULL);
 233     if (left >= *remain)
 234         return 0;
 235
 236     left = g_unichar_to_utf8 (uni, *out);
 237     (*out) += left;
 238     (*remain) -= left;
 239     return 1;
 240 }
 241
 242 static int
 243 str_utf8_length (const char *text)
 244 {
 245     int result = 0;
 246     const char *start;
 247     const char *end;
 248
 249     start = text;
 250     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 251     {
 252         if (start != end)
 253         {
 254             result += g_utf8_strlen (start, end - start);
 255         }
 256         result++;
 257         start = end + 1;
 258     }
 259
 260     if (start == text)
 261     {
 262         result = g_utf8_strlen (text, -1);
 263     }
 264     else
 265     {
 266         if (start[0] != '\0' && start != end)
 267         {
 268             result += g_utf8_strlen (start, end - start);
 269         }
 270     }
 271
 272     return result;
 273 }
 274
 275 static int
 276 str_utf8_length2 (const char *text, int size)
 277 {
 278     int result = 0;
 279     const char *start;
 280     const char *end;
 281
 282     start = text;
 283     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
 284     {
 285         if (start != end)
 286         {
 287             result += g_utf8_strlen (start, min (end - start, size));
 288             size -= end - start;
 289         }
 290         result += (size > 0);
 291         size--;
 292         start = end + 1;
 293     }
 294
 295     if (start == text)
 296     {
 297         result = g_utf8_strlen (text, size);
 298     }
 299     else
 300     {
 301         if (start[0] != '\0' && start != end && size > 0)
 302         {
 303             result += g_utf8_strlen (start, min (end - start, size));
 304         }
 305     }
 306
 307     return result;
 308 }
 309
 310 static int
 311 str_utf8_length_noncomb (const char *text)
 312 {
 313     int result = 0;
 314     const char *t = text;
 315
 316     while (t[0] != '\0')
 317     {
 318         str_utf8_cnext_noncomb_char (&t);
 319         result++;
 320     }
 321
 322     return result;
 323 }
 324
 325 /*
 326    static void
 327    str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
 328    {
 329    char *next = g_utf8_next_char (*string);
 330    (*left) -= next - (*string);
 331    (*string) = next;
 332    g_string_append_c (buffer, '?');
 333    }
 334  */
 335
 336 static gchar *
 337 str_utf8_conv_gerror_message (GError * error, const char *def_msg)
 338 {
 339     if ((error != NULL) && (error->message != NULL))
 340         return g_strdup (error->message);
 341
 342     return g_strdup (def_msg != NULL ? def_msg : "");
 343 }
 344
 345 static estr_t
 346 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
 347 {
 348     estr_t result;
 349
 350     if (coder == str_cnv_not_convert)
 351     {
 352         g_string_append_len (buffer, string, size);
 353         result = ESTR_SUCCESS;
 354     }
 355     else
 356         result = str_nconvert (coder, (char *) string, size, buffer);
 357
 358     return result;
 359 }
 360
 361 struct term_form
 362 {
 363     char text[BUF_MEDIUM * 6];
 364     size_t width;
 365     gboolean compose;
 366 };
 367
 368 /* utiliti function, that make string valid in utf8 and all characters printable
 369  * return width of string too*/
 370 static const struct term_form *
 371 str_utf8_make_make_term_form (const char *text, size_t length)
 372 {
 373     static struct term_form result;
 374     gunichar uni;
 375     size_t left;
 376     char *actual;
 377
 378     result.text[0] = '\0';
 379     result.width = 0;
 380     result.compose = FALSE;
 381     actual = result.text;
 382
 383     /* check if text start with combining character,
 384      * add space at begin in this case */
 385     if (length != 0 && text[0] != '\0')
 386     {
 387         uni = g_utf8_get_char_validated (text, -1);
 388         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 389         {
 390             if (str_unichar_iscombiningmark (uni))
 391             {
 392                 actual[0] = ' ';
 393                 actual++;
 394                 result.width++;
 395                 result.compose = TRUE;
 396             }
 397         }
 398     }
 399
 400     while (length != 0 && text[0] != '\0')
 401     {
 402         uni = g_utf8_get_char_validated (text, -1);
 403         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 404         {
 405             if (g_unichar_isprint (uni))
 406             {
 407                 left = g_unichar_to_utf8 (uni, actual);
 408                 actual += left;
 409                 if (str_unichar_iscombiningmark (uni))
 410                     result.compose = TRUE;
 411                 else
 412                 {
 413                     result.width++;
 414                     if (g_unichar_iswide (uni))
 415                         result.width++;
 416                 }
 417             }
 418             else
 419             {
 420                 actual[0] = '.';
 421                 actual++;
 422                 result.width++;
 423             }
 424             text = g_utf8_next_char (text);
 425         }
 426         else
 427         {
 428             text++;
 429             /*actual[0] = '?'; */
 430             memcpy (actual, replch, strlen (replch));
 431             actual += strlen (replch);
 432             result.width++;
 433         }
 434         if (length != (size_t) (-1))
 435             length--;
 436     }
 437     actual[0] = '\0';
 438
 439     return &result;
 440 }
 441
 442 static const char *
 443 str_utf8_term_form (const char *text)
 444 {
 445     static char result[BUF_MEDIUM * 6];
 446     const struct term_form *pre_form;
 447     char *composed;
 448
 449     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 450     if (pre_form->compose)
 451     {
 452         composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 453         g_strlcpy (result, composed, sizeof (result));
 454         g_free (composed);
 455     }
 456     else
 457     {
 458         g_strlcpy (result, pre_form->text, sizeof (result));
 459     }
 460     return result;
 461 }
 462
 463 struct utf8_tool
 464 {
 465     char *actual;
 466     size_t remain;
 467     const char *cheked;
 468     int ident;
 469     gboolean compose;
 470 };
 471
 472 /* utiliti function, that copy all characters from cheked to actual */
 473 static gboolean
 474 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
 475 {
 476     size_t left;
 477     gunichar uni;
 478
 479     tool->compose = FALSE;
 480
 481     while (tool->cheked[0] != '\0')
 482     {
 483         uni = g_utf8_get_char (tool->cheked);
 484         tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
 485         left = g_unichar_to_utf8 (uni, NULL);
 486         if (tool->remain <= left)
 487             return FALSE;
 488         left = g_unichar_to_utf8 (uni, tool->actual);
 489         tool->actual += left;
 490         tool->remain -= left;
 491         tool->cheked = g_utf8_next_char (tool->cheked);
 492     }
 493     return TRUE;
 494 }
 495
 496 /* utiliti function, that copy characters from cheked to actual until ident is
 497  * smaller than to_ident */
 498 static gboolean
 499 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
 500 {
 501     size_t left;
 502     gunichar uni;
 503     int w;
 504
 505     tool->compose = FALSE;
 506
 507     while (tool->cheked[0] != '\0')
 508     {
 509         uni = g_utf8_get_char (tool->cheked);
 510         if (!str_unichar_iscombiningmark (uni))
 511         {
 512             w = 1;
 513             if (g_unichar_iswide (uni))
 514                 w++;
 515             if (tool->ident + w > to_ident)
 516                 return TRUE;
 517         }
 518         else
 519         {
 520             w = 0;
 521             tool->compose = TRUE;
 522         }
 523
 524         left = g_unichar_to_utf8 (uni, NULL);
 525         if (tool->remain <= left)
 526             return FALSE;
 527         left = g_unichar_to_utf8 (uni, tool->actual);
 528         tool->actual += left;
 529         tool->remain -= left;
 530         tool->cheked = g_utf8_next_char (tool->cheked);
 531         tool->ident += w;
 532     }
 533     return TRUE;
 534 }
 535
 536 /* utiliti function, add count spaces to actual */
 537 static int
 538 utf8_tool_insert_space (struct utf8_tool *tool, int count)
 539 {
 540     if (count <= 0)
 541         return 1;
 542     if (tool->remain <= (gsize) count)
 543         return 0;
 544     memset (tool->actual, ' ', count);
 545     tool->actual += count;
 546     tool->remain -= count;
 547     return 1;
 548 }
 549
 550 /* utiliti function, add one characters to actual */
 551 static int
 552 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
 553 {
 554     if (tool->remain <= 1)
 555         return 0;
 556     tool->actual[0] = ch;
 557     tool->actual++;
 558     tool->remain--;
 559     return 1;
 560 }
 561
 562 /* utiliti function, thah skip characters from cheked until ident is greater or
 563  * equal to to_ident */
 564 static gboolean
 565 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
 566 {
 567     gunichar uni;
 568
 569     while (to_ident > tool->ident && tool->cheked[0] != '\0')
 570     {
 571         uni = g_utf8_get_char (tool->cheked);
 572         if (!str_unichar_iscombiningmark (uni))
 573         {
 574             tool->ident++;
 575             if (g_unichar_iswide (uni))
 576                 tool->ident++;
 577         }
 578         tool->cheked = g_utf8_next_char (tool->cheked);
 579     }
 580     uni = g_utf8_get_char (tool->cheked);
 581     while (str_unichar_iscombiningmark (uni))
 582     {
 583         tool->cheked = g_utf8_next_char (tool->cheked);
 584         uni = g_utf8_get_char (tool->cheked);
 585     }
 586     return TRUE;
 587 }
 588
 589 static void
 590 utf8_tool_compose (char *buffer, size_t size)
 591 {
 592     char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 593     g_strlcpy (buffer, composed, size);
 594     g_free (composed);
 595 }
 596
 597
 598 static const char *
 599 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
 600 {
 601     static char result[BUF_MEDIUM * 6];
 602     const struct term_form *pre_form;
 603     struct utf8_tool tool;
 604
 605     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 606     tool.cheked = pre_form->text;
 607     tool.actual = result;
 608     tool.remain = sizeof (result);
 609     tool.compose = FALSE;
 610
 611     if (pre_form->width <= (gsize) width)
 612     {
 613         tool.ident = 0;
 614         switch (HIDE_FIT (just_mode))
 615         {
 616         case J_CENTER_LEFT:
 617         case J_CENTER:
 618             tool.ident = (width - pre_form->width) / 2;
 619             break;
 620         case J_RIGHT:
 621             tool.ident = width - pre_form->width;
 622             break;
 623         }
 624
 625         utf8_tool_insert_space (&tool, tool.ident);
 626         utf8_tool_copy_chars_to_end (&tool);
 627         utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
 628     }
 629     else
 630     {
 631         if (IS_FIT (just_mode))
 632         {
 633             tool.ident = 0;
 634             utf8_tool_copy_chars_to (&tool, width / 2);
 635             utf8_tool_insert_char (&tool, '~');
 636
 637             tool.ident = 0;
 638             utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 639             utf8_tool_copy_chars_to_end (&tool);
 640             utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
 641         }
 642         else
 643         {
 644             tool.ident = 0;
 645             switch (HIDE_FIT (just_mode))
 646             {
 647             case J_CENTER:
 648                 tool.ident = (width - pre_form->width) / 2;
 649                 break;
 650             case J_RIGHT:
 651                 tool.ident = width - pre_form->width;
 652                 break;
 653             }
 654
 655             utf8_tool_skip_chars_to (&tool, 0);
 656             utf8_tool_insert_space (&tool, tool.ident);
 657             utf8_tool_copy_chars_to (&tool, width);
 658             utf8_tool_insert_space (&tool, width - tool.ident);
 659         }
 660     }
 661
 662     tool.actual[0] = '\0';
 663     if (tool.compose)
 664         utf8_tool_compose (result, sizeof (result));
 665     return result;
 666 }
 667
 668 static const char *
 669 str_utf8_term_trim (const char *text, int width)
 670 {
 671     static char result[BUF_MEDIUM * 6];
 672     const struct term_form *pre_form;
 673     struct utf8_tool tool;
 674
 675     if (width < 1)
 676     {
 677         result[0] = '\0';
 678         return result;
 679     }
 680
 681     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 682
 683     tool.cheked = pre_form->text;
 684     tool.actual = result;
 685     tool.remain = sizeof (result);
 686     tool.compose = FALSE;
 687
 688     if ((gsize) width < pre_form->width)
 689     {
 690         if (width <= 3)
 691         {
 692             memset (tool.actual, '.', width);
 693             tool.actual += width;
 694             tool.remain -= width;
 695         }
 696         else
 697         {
 698             memset (tool.actual, '.', 3);
 699             tool.actual += 3;
 700             tool.remain -= 3;
 701
 702             tool.ident = 0;
 703             utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
 704             utf8_tool_copy_chars_to_end (&tool);
 705         }
 706     }
 707     else
 708     {
 709         utf8_tool_copy_chars_to_end (&tool);
 710     }
 711
 712     tool.actual[0] = '\0';
 713     if (tool.compose)
 714         utf8_tool_compose (result, sizeof (result));
 715     return result;
 716 }
 717
 718 static int
 719 str_utf8_term_width2 (const char *text, size_t length)
 720 {
 721     const struct term_form *result;
 722
 723     result = str_utf8_make_make_term_form (text, length);
 724     return result->width;
 725 }
 726
 727 static int
 728 str_utf8_term_width1 (const char *text)
 729 {
 730     return str_utf8_term_width2 (text, (size_t) (-1));
 731 }
 732
 733 static int
 734 str_utf8_term_char_width (const char *text)
 735 {
 736     gunichar uni = g_utf8_get_char_validated (text, -1);
 737     return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
 738 }
 739
 740 static const char *
 741 str_utf8_term_substring (const char *text, int start, int width)
 742 {
 743     static char result[BUF_MEDIUM * 6];
 744     const struct term_form *pre_form;
 745     struct utf8_tool tool;
 746
 747     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 748
 749     tool.cheked = pre_form->text;
 750     tool.actual = result;
 751     tool.remain = sizeof (result);
 752     tool.compose = FALSE;
 753
 754     tool.ident = -start;
 755     utf8_tool_skip_chars_to (&tool, 0);
 756     if (tool.ident < 0)
 757         tool.ident = 0;
 758     utf8_tool_insert_space (&tool, tool.ident);
 759
 760     utf8_tool_copy_chars_to (&tool, width);
 761     utf8_tool_insert_space (&tool, width - tool.ident);
 762
 763     tool.actual[0] = '\0';
 764     if (tool.compose)
 765         utf8_tool_compose (result, sizeof (result));
 766     return result;
 767 }
 768
 769 static const char *
 770 str_utf8_trunc (const char *text, int width)
 771 {
 772     static char result[MC_MAXPATHLEN * 6 * 2];
 773     const struct term_form *pre_form;
 774     struct utf8_tool tool;
 775
 776     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 777
 778     tool.cheked = pre_form->text;
 779     tool.actual = result;
 780     tool.remain = sizeof (result);
 781     tool.compose = FALSE;
 782
 783     if (pre_form->width > (gsize) width)
 784     {
 785         tool.ident = 0;
 786         utf8_tool_copy_chars_to (&tool, width / 2);
 787         utf8_tool_insert_char (&tool, '~');
 788
 789         tool.ident = 0;
 790         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 791         utf8_tool_copy_chars_to_end (&tool);
 792     }
 793     else
 794     {
 795         utf8_tool_copy_chars_to_end (&tool);
 796     }
 797
 798     tool.actual[0] = '\0';
 799     if (tool.compose)
 800         utf8_tool_compose (result, sizeof (result));
 801     return result;
 802 }
 803
 804 static int
 805 str_utf8_offset_to_pos (const char *text, size_t length)
 806 {
 807     if (str_utf8_is_valid_string (text))
 808         return g_utf8_offset_to_pointer (text, length) - text;
 809     else
 810     {
 811         int result;
 812         GString *buffer = g_string_new (text);
 813
 814         str_utf8_fix_string (buffer->str);
 815         result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
 816         g_string_free (buffer, TRUE);
 817         return result;
 818     }
 819 }
 820
 821 static int
 822 str_utf8_column_to_pos (const char *text, size_t pos)
 823 {
 824     static int result;
 825     gunichar uni;
 826     int width;
 827
 828     width = 0;
 829     result = 0;
 830
 831     while (text[0] != '\0')
 832     {
 833         uni = g_utf8_get_char_validated (text, 6);
 834         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 835         {
 836             if (g_unichar_isprint (uni))
 837             {
 838                 if (!str_unichar_iscombiningmark (uni))
 839                 {
 840                     width++;
 841                     if (g_unichar_iswide (uni))
 842                         width++;
 843                 }
 844             }
 845             else
 846             {
 847                 width++;
 848             }
 849             text = g_utf8_next_char (text);
 850         }
 851         else
 852         {
 853             text++;
 854             width++;
 855         }
 856         if ((gsize) width > pos)
 857             return result;
 858
 859         result++;
 860     }
 861
 862     return result;
 863 }
 864
 865 static char *
 866 str_utf8_create_search_needle (const char *needle, int case_sen)
 867 {
 868     if (needle != NULL)
 869     {
 870         if (case_sen)
 871         {
 872             return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
 873         }
 874         else
 875         {
 876             char *fold = g_utf8_casefold (needle, -1);
 877             char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
 878             g_free (fold);
 879             return result;
 880         }
 881     }
 882     else
 883         return NULL;
 884 }
 885
 886 static void
 887 str_utf8_release_search_needle (char *needle, int case_sen)
 888 {
 889     (void) case_sen;
 890     if (needle != NULL)
 891         g_free (needle);
 892 }
 893
 894 static const char *
 895 str_utf8_search_first (const char *text, const char *search, int case_sen)
 896 {
 897     char *fold_text;
 898     char *deco_text;
 899     const char *match;
 900     const char *result = NULL;
 901     const char *m;
 902
 903     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 904     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 905
 906     match = deco_text;
 907     do
 908     {
 909         match = g_strstr_len (match, -1, search);
 910         if (match != NULL)
 911         {
 912             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
 913                 !str_utf8_iscombiningmark (match + strlen (search)))
 914             {
 915
 916                 result = text;
 917                 m = deco_text;
 918                 while (m < match)
 919                 {
 920                     str_utf8_cnext_noncomb_char (&m);
 921                     str_utf8_cnext_noncomb_char (&result);
 922                 }
 923             }
 924             else
 925             {
 926                 str_utf8_cnext_char (&match);
 927             }
 928         }
 929     }
 930     while (match != NULL && result == NULL);
 931
 932     g_free (deco_text);
 933     if (!case_sen)
 934         g_free (fold_text);
 935
 936     return result;
 937 }
 938
 939 static const char *
 940 str_utf8_search_last (const char *text, const char *search, int case_sen)
 941 {
 942     char *fold_text;
 943     char *deco_text;
 944     char *match;
 945     const char *result = NULL;
 946     const char *m;
 947
 948     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 949     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 950
 951     do
 952     {
 953         match = g_strrstr_len (deco_text, -1, search);
 954         if (match != NULL)
 955         {
 956             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
 957                 !str_utf8_iscombiningmark (match + strlen (search)))
 958             {
 959
 960                 result = text;
 961                 m = deco_text;
 962                 while (m < match)
 963                 {
 964                     str_utf8_cnext_noncomb_char (&m);
 965                     str_utf8_cnext_noncomb_char (&result);
 966                 }
 967             }
 968             else
 969             {
 970                 match[0] = '\0';
 971             }
 972         }
 973     }
 974     while (match != NULL && result == NULL);
 975
 976     g_free (deco_text);
 977     if (!case_sen)
 978         g_free (fold_text);
 979
 980     return result;
 981 }
 982
 983 static char *
 984 str_utf8_normalize (const char *text)
 985 {
 986     GString *fixed;
 987     char *tmp;
 988     char *result;
 989     const char *start;
 990     const char *end;
 991
 992     fixed = g_string_sized_new (4);
 993
 994     start = text;
 995     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 996     {
 997         if (start != end)
 998         {
 999             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1000             g_string_append (fixed, tmp);
1001             g_free (tmp);
1002         }
1003         g_string_append_c (fixed, end[0]);
1004         start = end + 1;
1005     }
1006
1007     if (start == text)
1008     {
1009         result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1010         g_string_free (fixed, TRUE);
1011     }
1012     else
1013     {
1014         if (start[0] != '\0' && start != end)
1015         {
1016             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1017             g_string_append (fixed, tmp);
1018             g_free (tmp);
1019         }
1020         result = g_string_free (fixed, FALSE);
1021     }
1022
1023     return result;
1024 }
1025
1026 static char *
1027 str_utf8_casefold_normalize (const char *text)
1028 {
1029     GString *fixed;
1030     char *tmp, *fold;
1031     char *result;
1032     const char *start;
1033     const char *end;
1034
1035     fixed = g_string_sized_new (4);
1036
1037     start = text;
1038     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1039     {
1040         if (start != end)
1041         {
1042             fold = g_utf8_casefold (start, end - start);
1043             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1044             g_string_append (fixed, tmp);
1045             g_free (tmp);
1046             g_free (fold);
1047         }
1048         g_string_append_c (fixed, end[0]);
1049         start = end + 1;
1050     }
1051
1052     if (start == text)
1053     {
1054         fold = g_utf8_casefold (text, -1);
1055         result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1056         g_free (fold);
1057         g_string_free (fixed, TRUE);
1058     }
1059     else
1060     {
1061         if (start[0] != '\0' && start != end)
1062         {
1063             fold = g_utf8_casefold (start, end - start);
1064             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1065             g_string_append (fixed, tmp);
1066             g_free (tmp);
1067             g_free (fold);
1068         }
1069         result = g_string_free (fixed, FALSE);
1070     }
1071
1072     return result;
1073 }
1074
/* Compare t1 and t2 with strcmp () after normalizing both. */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1091
/* Compare the normalized forms of t1 and t2 over the length of the shorter
 * one, so the result is 0 when one is a prefix of the other. */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, (left_len < right_len) ? left_len : right_len);

    g_free (left);
    g_free (right);

    return order;
}
1108
/* Compare t1 and t2 with strcmp () after case folding and normalizing
 * both. */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1125
/* Compare the case-folded, normalized forms of t1 and t2 over the length of
 * the shorter one, so the result is 0 when one is a prefix of the other. */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, (left_len < right_len) ? left_len : right_len);

    g_free (left);
    g_free (right);

    return order;
}
1142
/* Walk the normalized forms of TEXT and PREFIX one character at a time and
   stop at the first character that differs in byte length or in content,
   or when either string ends.
   Returns how many bytes of the normalized PREFIX were matched. */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *tcur = norm_text;
    const char *pcur = norm_prefix;
    const char *tnext = norm_text;
    const char *pnext = norm_prefix;
    int matched;

    /* the cursors are advanced only after the current characters matched */
    for (; tcur[0] != '\0' && pcur[0] != '\0'; tcur = tnext, pcur = pnext)
    {
        str_utf8_cnext_char_safe (&tnext);
        str_utf8_cnext_char_safe (&pnext);

        if ((tnext - tcur) != (pnext - pcur) || strncmp (tcur, pcur, tnext - tcur) != 0)
            break;
    }

    matched = pcur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1173
/* Walk the casefolded, normalized forms of TEXT and PREFIX one character at
   a time and stop at the first character that differs in byte length or in
   content, or when either string ends.
   Returns how many bytes of the folded PREFIX were matched. */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *folded_text = str_utf8_casefold_normalize (text);
    char *folded_prefix = str_utf8_casefold_normalize (prefix);
    const char *tcur = folded_text;
    const char *pcur = folded_prefix;
    const char *tnext = folded_text;
    const char *pnext = folded_prefix;
    int matched;

    /* the cursors are advanced only after the current characters matched */
    for (; tcur[0] != '\0' && pcur[0] != '\0'; tcur = tnext, pcur = pnext)
    {
        str_utf8_cnext_char_safe (&tnext);
        str_utf8_cnext_char_safe (&pnext);

        if ((tnext - tcur) != (pnext - pcur) || strncmp (tcur, pcur, tnext - tcur) != 0)
            break;
    }

    matched = pcur - folded_prefix;

    g_free (folded_text);
    g_free (folded_prefix);

    return matched;
}
1204
1205 static char *
1206 str_utf8_create_key_gen (const char *text, int case_sen,
1207                          gchar * (*keygen) (const gchar * text, gssize size))
1208 {
1209     char *result;
1210
1211     if (case_sen)
1212     {
1213         result = str_utf8_normalize (text);
1214     }
1215     else
1216     {
1217         gboolean dot;
1218         GString *fixed;
1219         const char *start, *end;
1220         char *fold, *key;
1221
1222         dot = text[0] == '.';
1223         fixed = g_string_sized_new (16);
1224
1225         if (!dot)
1226             start = text;
1227         else
1228         {
1229             start = text + 1;
1230             g_string_append_c (fixed, '.');
1231         }
1232
1233         while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1234         {
1235             if (start != end)
1236             {
1237                 fold = g_utf8_casefold (start, end - start);
1238                 key = keygen (fold, -1);
1239                 g_string_append (fixed, key);
1240                 g_free (key);
1241                 g_free (fold);
1242             }
1243             g_string_append_c (fixed, end[0]);
1244             start = end + 1;
1245         }
1246
1247         if (start == text)
1248         {
1249             fold = g_utf8_casefold (start, -1);
1250             result = keygen (fold, -1);
1251             g_free (fold);
1252             g_string_free (fixed, TRUE);
1253         }
1254         else if (dot && (start == text + 1))
1255         {
1256             fold = g_utf8_casefold (start, -1);
1257             key = keygen (fold, -1);
1258             g_string_append (fixed, key);
1259             g_free (key);
1260             g_free (fold);
1261             result = g_string_free (fixed, FALSE);
1262         }
1263         else
1264         {
1265             if (start[0] != '\0' && start != end)
1266             {
1267                 fold = g_utf8_casefold (start, end - start);
1268                 key = keygen (fold, -1);
1269                 g_string_append (fixed, key);
1270                 g_free (key);
1271                 g_free (fold);
1272             }
1273             result = g_string_free (fixed, FALSE);
1274         }
1275     }
1276     return result;
1277 }
1278
1279 static char *
1280 str_utf8_create_key (const char *text, int case_sen)
1281 {
1282     return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1283 }
1284
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/* Build a key for TEXT through str_utf8_create_key_gen (), with
   g_utf8_collate_key_for_filename () as the key generator.
   Compiled only when MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME is defined. */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
1292
/* Order two keys with a plain strcmp ().
   CASE_SEN is accepted but not used here. */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    const int order = strcmp (t1, t2);

    (void) case_sen;

    return order;
}
1299
/* Free KEY with g_free ().
   CASE_SEN is accepted but not used here. */
static void
str_utf8_release_key (char *key, int case_sen)
{
    g_free (key);

    (void) case_sen;
}
1306
1307 struct str_class
1308 str_utf8_init (void)
1309 {
1310     struct str_class result;
1311
1312     result.conv_gerror_message = str_utf8_conv_gerror_message;
1313     result.vfs_convert_to = str_utf8_vfs_convert_to;
1314     result.insert_replace_char = str_utf8_insert_replace_char;
1315     result.is_valid_string = str_utf8_is_valid_string;
1316     result.is_valid_char = str_utf8_is_valid_char;
1317     result.cnext_char = str_utf8_cnext_char;
1318     result.cprev_char = str_utf8_cprev_char;
1319     result.cnext_char_safe = str_utf8_cnext_char_safe;
1320     result.cprev_char_safe = str_utf8_cprev_char_safe;
1321     result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1322     result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1323     result.char_isspace = str_utf8_isspace;
1324     result.char_ispunct = str_utf8_ispunct;
1325     result.char_isalnum = str_utf8_isalnum;
1326     result.char_isdigit = str_utf8_isdigit;
1327     result.char_isprint = str_utf8_isprint;
1328     result.char_iscombiningmark = str_utf8_iscombiningmark;
1329     result.char_toupper = str_utf8_toupper;
1330     result.char_tolower = str_utf8_tolower;
1331     result.length = str_utf8_length;
1332     result.length2 = str_utf8_length2;
1333     result.length_noncomb = str_utf8_length_noncomb;
1334     result.fix_string = str_utf8_fix_string;
1335     result.term_form = str_utf8_term_form;
1336     result.fit_to_term = str_utf8_fit_to_term;
1337     result.term_trim = str_utf8_term_trim;
1338     result.term_width2 = str_utf8_term_width2;
1339     result.term_width1 = str_utf8_term_width1;
1340     result.term_char_width = str_utf8_term_char_width;
1341     result.term_substring = str_utf8_term_substring;
1342     result.trunc = str_utf8_trunc;
1343     result.offset_to_pos = str_utf8_offset_to_pos;
1344     result.column_to_pos = str_utf8_column_to_pos;
1345     result.create_search_needle = str_utf8_create_search_needle;
1346     result.release_search_needle = str_utf8_release_search_needle;
1347     result.search_first = str_utf8_search_first;
1348     result.search_last = str_utf8_search_last;
1349     result.compare = str_utf8_compare;
1350     result.ncompare = str_utf8_ncompare;
1351     result.casecmp = str_utf8_casecmp;
1352     result.ncasecmp = str_utf8_ncasecmp;
1353     result.prefix = str_utf8_prefix;
1354     result.caseprefix = str_utf8_caseprefix;
1355     result.create_key = str_utf8_create_key;
1356 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1357     /* case insensitive sort files in "a1 a2 a10" order */
1358     result.create_key_for_filename = str_utf8_create_key_for_filename;
1359 #else
1360     /* case insensitive sort files in "a1 a10 a2" order */
1361     result.create_key_for_filename = str_utf8_create_key;
1362 #endif
1363     result.key_collate = str_utf8_key_collate;
1364     result.release_key = str_utf8_release_key;
1365
1366     return result;
1367 }