lib/strutil/strutilutf8.c

   1 /*
   2    UTF-8 strings utilities
   3
   4    Copyright (C) 2007-2019
   5    Free Software Foundation, Inc.
   6
   7    Written by:
   8    Rostislav Benes, 2007
   9
  10    This file is part of the Midnight Commander.
  11
  12    The Midnight Commander is free software: you can redistribute it
  13    and/or modify it under the terms of the GNU General Public License as
  14    published by the Free Software Foundation, either version 3 of the License,
  15    or (at your option) any later version.
  16
  17    The Midnight Commander is distributed in the hope that it will be useful,
  18    but WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20    GNU General Public License for more details.
  21
  22    You should have received a copy of the GNU General Public License
  23    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  24  */
  25
  26 #include <config.h>
  27
  28 #include <stdlib.h>
  29 #include <langinfo.h>
  30 #include <string.h>
  31
  32 #include "lib/global.h"
  33 #include "lib/strutil.h"
  34
/* UTF-8 handling in this file relies on functions provided by GLib */
  36
  37 /*** global variables ****************************************************************************/
  38
  39 /*** file scope macro definitions ****************************************************************/
  40
  41 /*** file scope type declarations ****************************************************************/
  42
  43 struct utf8_tool
  44 {
  45     char *actual;
  46     size_t remain;
  47     const char *checked;
  48     int ident;
  49     gboolean compose;
  50 };
  51
  52 struct term_form
  53 {
  54     char text[BUF_MEDIUM * 6];
  55     size_t width;
  56     gboolean compose;
  57 };
  58
  59 /*** file scope variables ************************************************************************/
  60
  61 static const char replch[] = "\xEF\xBF\xBD";
  62
  63 /* --------------------------------------------------------------------------------------------- */
  64 /*** file scope functions ************************************************************************/
  65 /* --------------------------------------------------------------------------------------------- */
  66
  67 static gboolean
  68 str_unichar_iscombiningmark (gunichar uni)
  69 {
  70     GUnicodeType type;
  71
  72     type = g_unichar_type (uni);
  73     return (type == G_UNICODE_COMBINING_MARK)
  74         || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  75 }
  76
  77 /* --------------------------------------------------------------------------------------------- */
  78
  79 static void
  80 str_utf8_insert_replace_char (GString * buffer)
  81 {
  82     g_string_append (buffer, replch);
  83 }
  84
  85 /* --------------------------------------------------------------------------------------------- */
  86
  87 static gboolean
  88 str_utf8_is_valid_string (const char *text)
  89 {
  90     return g_utf8_validate (text, -1, NULL);
  91 }
  92
  93 /* --------------------------------------------------------------------------------------------- */
  94
  95 static int
  96 str_utf8_is_valid_char (const char *ch, size_t size)
  97 {
  98     switch (g_utf8_get_char_validated (ch, size))
  99     {
 100     case (gunichar) (-2):
 101         return (-2);
 102     case (gunichar) (-1):
 103         return (-1);
 104     default:
 105         return 1;
 106     }
 107 }
 108
 109 /* --------------------------------------------------------------------------------------------- */
 110
 111 static void
 112 str_utf8_cnext_char (const char **text)
 113 {
 114     (*text) = g_utf8_next_char (*text);
 115 }
 116
 117 /* --------------------------------------------------------------------------------------------- */
 118
 119 static void
 120 str_utf8_cprev_char (const char **text)
 121 {
 122     (*text) = g_utf8_prev_char (*text);
 123 }
 124
 125 /* --------------------------------------------------------------------------------------------- */
 126
 127 static void
 128 str_utf8_cnext_char_safe (const char **text)
 129 {
 130     if (str_utf8_is_valid_char (*text, -1) == 1)
 131         (*text) = g_utf8_next_char (*text);
 132     else
 133         (*text)++;
 134 }
 135
 136 /* --------------------------------------------------------------------------------------------- */
 137
 138 static void
 139 str_utf8_cprev_char_safe (const char **text)
 140 {
 141     const char *result, *t;
 142
 143     result = g_utf8_prev_char (*text);
 144     t = result;
 145     str_utf8_cnext_char_safe (&t);
 146     if (t == *text)
 147         (*text) = result;
 148     else
 149         (*text)--;
 150 }
 151
 152 /* --------------------------------------------------------------------------------------------- */
 153
 154 static void
 155 str_utf8_fix_string (char *text)
 156 {
 157     while (text[0] != '\0')
 158     {
 159         gunichar uni;
 160
 161         uni = g_utf8_get_char_validated (text, -1);
 162         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 163             text = g_utf8_next_char (text);
 164         else
 165         {
 166             text[0] = '?';
 167             text++;
 168         }
 169     }
 170 }
 171
 172 /* --------------------------------------------------------------------------------------------- */
 173
 174 static gboolean
 175 str_utf8_isspace (const char *text)
 176 {
 177     gunichar uni;
 178
 179     uni = g_utf8_get_char_validated (text, -1);
 180     return g_unichar_isspace (uni);
 181 }
 182
 183 /* --------------------------------------------------------------------------------------------- */
 184
 185 static gboolean
 186 str_utf8_ispunct (const char *text)
 187 {
 188     gunichar uni;
 189
 190     uni = g_utf8_get_char_validated (text, -1);
 191     return g_unichar_ispunct (uni);
 192 }
 193
 194 /* --------------------------------------------------------------------------------------------- */
 195
 196 static gboolean
 197 str_utf8_isalnum (const char *text)
 198 {
 199     gunichar uni;
 200
 201     uni = g_utf8_get_char_validated (text, -1);
 202     return g_unichar_isalnum (uni);
 203 }
 204
 205 /* --------------------------------------------------------------------------------------------- */
 206
 207 static gboolean
 208 str_utf8_isdigit (const char *text)
 209 {
 210     gunichar uni;
 211
 212     uni = g_utf8_get_char_validated (text, -1);
 213     return g_unichar_isdigit (uni);
 214 }
 215
 216 /* --------------------------------------------------------------------------------------------- */
 217
 218 static gboolean
 219 str_utf8_isprint (const char *ch)
 220 {
 221     gunichar uni;
 222
 223     uni = g_utf8_get_char_validated (ch, -1);
 224     return g_unichar_isprint (uni);
 225 }
 226
 227 /* --------------------------------------------------------------------------------------------- */
 228
 229 static gboolean
 230 str_utf8_iscombiningmark (const char *ch)
 231 {
 232     gunichar uni;
 233
 234     uni = g_utf8_get_char_validated (ch, -1);
 235     return str_unichar_iscombiningmark (uni);
 236 }
 237
 238 /* --------------------------------------------------------------------------------------------- */
 239
 240 static int
 241 str_utf8_cnext_noncomb_char (const char **text)
 242 {
 243     int count = 0;
 244
 245     while ((*text)[0] != '\0')
 246     {
 247         str_utf8_cnext_char_safe (text);
 248         count++;
 249         if (!str_utf8_iscombiningmark (*text))
 250             break;
 251     }
 252
 253     return count;
 254 }
 255
 256 /* --------------------------------------------------------------------------------------------- */
 257
 258 static int
 259 str_utf8_cprev_noncomb_char (const char **text, const char *begin)
 260 {
 261     int count = 0;
 262
 263     while ((*text) != begin)
 264     {
 265         str_utf8_cprev_char_safe (text);
 266         count++;
 267         if (!str_utf8_iscombiningmark (*text))
 268             break;
 269     }
 270
 271     return count;
 272 }
 273
 274 /* --------------------------------------------------------------------------------------------- */
 275
 276 static gboolean
 277 str_utf8_toupper (const char *text, char **out, size_t * remain)
 278 {
 279     gunichar uni;
 280     size_t left;
 281
 282     uni = g_utf8_get_char_validated (text, -1);
 283     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 284         return FALSE;
 285
 286     uni = g_unichar_toupper (uni);
 287     left = g_unichar_to_utf8 (uni, NULL);
 288     if (left >= *remain)
 289         return FALSE;
 290
 291     left = g_unichar_to_utf8 (uni, *out);
 292     (*out) += left;
 293     (*remain) -= left;
 294     return TRUE;
 295 }
 296
 297 /* --------------------------------------------------------------------------------------------- */
 298
 299 static gboolean
 300 str_utf8_tolower (const char *text, char **out, size_t * remain)
 301 {
 302     gunichar uni;
 303     size_t left;
 304
 305     uni = g_utf8_get_char_validated (text, -1);
 306     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 307         return FALSE;
 308
 309     uni = g_unichar_tolower (uni);
 310     left = g_unichar_to_utf8 (uni, NULL);
 311     if (left >= *remain)
 312         return FALSE;
 313
 314     left = g_unichar_to_utf8 (uni, *out);
 315     (*out) += left;
 316     (*remain) -= left;
 317     return TRUE;
 318 }
 319
 320 /* --------------------------------------------------------------------------------------------- */
 321
 322 static int
 323 str_utf8_length (const char *text)
 324 {
 325     int result = 0;
 326     const char *start;
 327     const char *end;
 328
 329     start = text;
 330     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 331     {
 332         if (start != end)
 333             result += g_utf8_strlen (start, end - start);
 334
 335         result++;
 336         start = end + 1;
 337     }
 338
 339     if (start == text)
 340         result = g_utf8_strlen (text, -1);
 341     else if (start[0] != '\0' && start != end)
 342         result += g_utf8_strlen (start, end - start);
 343
 344     return result;
 345 }
 346
 347 /* --------------------------------------------------------------------------------------------- */
 348
 349 static int
 350 str_utf8_length2 (const char *text, int size)
 351 {
 352     int result = 0;
 353     const char *start;
 354     const char *end;
 355
 356     start = text;
 357     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
 358     {
 359         if (start != end)
 360         {
 361             result += g_utf8_strlen (start, MIN (end - start, size));
 362             size -= end - start;
 363         }
 364         result += (size > 0);
 365         size--;
 366         start = end + 1;
 367     }
 368
 369     if (start == text)
 370         result = g_utf8_strlen (text, size);
 371     else if (start[0] != '\0' && start != end && size > 0)
 372         result += g_utf8_strlen (start, MIN (end - start, size));
 373
 374     return result;
 375 }
 376
 377 /* --------------------------------------------------------------------------------------------- */
 378
 379 static int
 380 str_utf8_length_noncomb (const char *text)
 381 {
 382     int result = 0;
 383     const char *t = text;
 384
 385     while (t[0] != '\0')
 386     {
 387         str_utf8_cnext_noncomb_char (&t);
 388         result++;
 389     }
 390
 391     return result;
 392 }
 393
 394 /* --------------------------------------------------------------------------------------------- */
 395
 396 #if 0
 397 static void
 398 str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
 399 {
 400     char *next;
 401
 402     next = g_utf8_next_char (*string);
 403     (*left) -= next - (*string);
 404     (*string) = next;
 405     g_string_append_c (buffer, '?');
 406 }
 407 #endif
 408
 409 /* --------------------------------------------------------------------------------------------- */
 410
 411 static gchar *
 412 str_utf8_conv_gerror_message (GError * mcerror, const char *def_msg)
 413 {
 414     if (mcerror != NULL)
 415         return g_strdup (mcerror->message);
 416
 417     return g_strdup (def_msg != NULL ? def_msg : "");
 418 }
 419
 420 /* --------------------------------------------------------------------------------------------- */
 421
 422 static estr_t
 423 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
 424 {
 425     estr_t result = ESTR_SUCCESS;
 426
 427     if (coder == str_cnv_not_convert)
 428         g_string_append_len (buffer, string, size);
 429     else
 430         result = str_nconvert (coder, string, size, buffer);
 431
 432     return result;
 433 }
 434
 435 /* --------------------------------------------------------------------------------------------- */
 436 /* utility function, that makes string valid in utf8 and all characters printable
 437  * return width of string too */
 438
 439 static const struct term_form *
 440 str_utf8_make_make_term_form (const char *text, size_t length)
 441 {
 442     static struct term_form result;
 443     gunichar uni;
 444     size_t left;
 445     char *actual;
 446
 447     result.text[0] = '\0';
 448     result.width = 0;
 449     result.compose = FALSE;
 450     actual = result.text;
 451
 452     /* check if text start with combining character,
 453      * add space at begin in this case */
 454     if (length != 0 && text[0] != '\0')
 455     {
 456         uni = g_utf8_get_char_validated (text, -1);
 457         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
 458             && str_unichar_iscombiningmark (uni))
 459         {
 460             actual[0] = ' ';
 461             actual++;
 462             result.width++;
 463             result.compose = TRUE;
 464         }
 465     }
 466
 467     while (length != 0 && text[0] != '\0')
 468     {
 469         uni = g_utf8_get_char_validated (text, -1);
 470         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 471         {
 472             if (g_unichar_isprint (uni))
 473             {
 474                 left = g_unichar_to_utf8 (uni, actual);
 475                 actual += left;
 476                 if (str_unichar_iscombiningmark (uni))
 477                     result.compose = TRUE;
 478                 else
 479                 {
 480                     result.width++;
 481                     if (g_unichar_iswide (uni))
 482                         result.width++;
 483                 }
 484             }
 485             else
 486             {
 487                 actual[0] = '.';
 488                 actual++;
 489                 result.width++;
 490             }
 491             text = g_utf8_next_char (text);
 492         }
 493         else
 494         {
 495             text++;
 496             /*actual[0] = '?'; */
 497             memcpy (actual, replch, strlen (replch));
 498             actual += strlen (replch);
 499             result.width++;
 500         }
 501
 502         if (length != (size_t) (-1))
 503             length--;
 504     }
 505     actual[0] = '\0';
 506
 507     return &result;
 508 }
 509
 510 /* --------------------------------------------------------------------------------------------- */
 511
 512 static const char *
 513 str_utf8_term_form (const char *text)
 514 {
 515     static char result[BUF_MEDIUM * 6];
 516     const struct term_form *pre_form;
 517
 518     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 519     if (pre_form->compose)
 520     {
 521         char *composed;
 522
 523         composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 524         g_strlcpy (result, composed, sizeof (result));
 525         g_free (composed);
 526     }
 527     else
 528         g_strlcpy (result, pre_form->text, sizeof (result));
 529
 530     return result;
 531 }
 532
 533 /* --------------------------------------------------------------------------------------------- */
 534 /* utility function, that copies all characters from checked to actual */
 535
 536 static gboolean
 537 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
 538 {
 539     tool->compose = FALSE;
 540
 541     while (tool->checked[0] != '\0')
 542     {
 543         gunichar uni;
 544         size_t left;
 545
 546         uni = g_utf8_get_char (tool->checked);
 547         tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
 548         left = g_unichar_to_utf8 (uni, NULL);
 549         if (tool->remain <= left)
 550             return FALSE;
 551         left = g_unichar_to_utf8 (uni, tool->actual);
 552         tool->actual += left;
 553         tool->remain -= left;
 554         tool->checked = g_utf8_next_char (tool->checked);
 555     }
 556
 557     return TRUE;
 558 }
 559
 560 /* --------------------------------------------------------------------------------------------- */
 561 /* utility function, that copies characters from checked to actual until ident is
 562  * smaller than to_ident */
 563
 564 static gboolean
 565 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
 566 {
 567     tool->compose = FALSE;
 568
 569     while (tool->checked[0] != '\0')
 570     {
 571         gunichar uni;
 572         size_t left;
 573         int w = 0;
 574
 575         uni = g_utf8_get_char (tool->checked);
 576         if (str_unichar_iscombiningmark (uni))
 577             tool->compose = TRUE;
 578         else
 579         {
 580             w = 1;
 581             if (g_unichar_iswide (uni))
 582                 w++;
 583             if (tool->ident + w > to_ident)
 584                 return TRUE;
 585         }
 586
 587         left = g_unichar_to_utf8 (uni, NULL);
 588         if (tool->remain <= left)
 589             return FALSE;
 590         left = g_unichar_to_utf8 (uni, tool->actual);
 591         tool->actual += left;
 592         tool->remain -= left;
 593         tool->checked = g_utf8_next_char (tool->checked);
 594         tool->ident += w;
 595     }
 596
 597     return TRUE;
 598 }
 599
 600 /* --------------------------------------------------------------------------------------------- */
 601 /* utility function, adds count spaces to actual */
 602
 603 static int
 604 utf8_tool_insert_space (struct utf8_tool *tool, int count)
 605 {
 606     if (count <= 0)
 607         return 1;
 608     if (tool->remain <= (gsize) count)
 609         return 0;
 610
 611     memset (tool->actual, ' ', count);
 612     tool->actual += count;
 613     tool->remain -= count;
 614     return 1;
 615 }
 616
 617 /* --------------------------------------------------------------------------------------------- */
 618 /* utility function, adds one characters to actual */
 619
 620 static int
 621 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
 622 {
 623     if (tool->remain <= 1)
 624         return 0;
 625
 626     tool->actual[0] = ch;
 627     tool->actual++;
 628     tool->remain--;
 629     return 1;
 630 }
 631
 632 /* --------------------------------------------------------------------------------------------- */
 633 /* utility function, thah skips characters from checked until ident is greater or
 634  * equal to to_ident */
 635
 636 static gboolean
 637 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
 638 {
 639     gunichar uni;
 640
 641     while (to_ident > tool->ident && tool->checked[0] != '\0')
 642     {
 643         uni = g_utf8_get_char (tool->checked);
 644         if (!str_unichar_iscombiningmark (uni))
 645         {
 646             tool->ident++;
 647             if (g_unichar_iswide (uni))
 648                 tool->ident++;
 649         }
 650         tool->checked = g_utf8_next_char (tool->checked);
 651     }
 652
 653     uni = g_utf8_get_char (tool->checked);
 654     while (str_unichar_iscombiningmark (uni))
 655     {
 656         tool->checked = g_utf8_next_char (tool->checked);
 657         uni = g_utf8_get_char (tool->checked);
 658     }
 659
 660     return TRUE;
 661 }
 662
 663 /* --------------------------------------------------------------------------------------------- */
 664
 665 static void
 666 utf8_tool_compose (char *buffer, size_t size)
 667 {
 668     char *composed;
 669
 670     composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 671     g_strlcpy (buffer, composed, size);
 672     g_free (composed);
 673 }
 674
 675 /* --------------------------------------------------------------------------------------------- */
 676
 677 static const char *
 678 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
 679 {
 680     static char result[BUF_MEDIUM * 6];
 681     const struct term_form *pre_form;
 682     struct utf8_tool tool;
 683
 684     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 685     tool.checked = pre_form->text;
 686     tool.actual = result;
 687     tool.remain = sizeof (result);
 688     tool.compose = FALSE;
 689
 690     if (pre_form->width <= (gsize) width)
 691     {
 692         switch (HIDE_FIT (just_mode))
 693         {
 694         case J_CENTER_LEFT:
 695         case J_CENTER:
 696             tool.ident = (width - pre_form->width) / 2;
 697             break;
 698         case J_RIGHT:
 699             tool.ident = width - pre_form->width;
 700             break;
 701         default:
 702             tool.ident = 0;
 703             break;
 704         }
 705
 706         utf8_tool_insert_space (&tool, tool.ident);
 707         utf8_tool_copy_chars_to_end (&tool);
 708         utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
 709     }
 710     else if (IS_FIT (just_mode))
 711     {
 712         tool.ident = 0;
 713         utf8_tool_copy_chars_to (&tool, width / 2);
 714         utf8_tool_insert_char (&tool, '~');
 715
 716         tool.ident = 0;
 717         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 718         utf8_tool_copy_chars_to_end (&tool);
 719         utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
 720     }
 721     else
 722     {
 723         switch (HIDE_FIT (just_mode))
 724         {
 725         case J_CENTER:
 726             tool.ident = (width - pre_form->width) / 2;
 727             break;
 728         case J_RIGHT:
 729             tool.ident = width - pre_form->width;
 730             break;
 731         default:
 732             tool.ident = 0;
 733             break;
 734         }
 735
 736         utf8_tool_skip_chars_to (&tool, 0);
 737         utf8_tool_insert_space (&tool, tool.ident);
 738         utf8_tool_copy_chars_to (&tool, width);
 739         utf8_tool_insert_space (&tool, width - tool.ident);
 740     }
 741
 742     tool.actual[0] = '\0';
 743     if (tool.compose)
 744         utf8_tool_compose (result, sizeof (result));
 745     return result;
 746 }
 747
 748 /* --------------------------------------------------------------------------------------------- */
 749
 750 static const char *
 751 str_utf8_term_trim (const char *text, int width)
 752 {
 753     static char result[BUF_MEDIUM * 6];
 754     const struct term_form *pre_form;
 755     struct utf8_tool tool;
 756
 757     if (width < 1)
 758     {
 759         result[0] = '\0';
 760         return result;
 761     }
 762
 763     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 764
 765     tool.checked = pre_form->text;
 766     tool.actual = result;
 767     tool.remain = sizeof (result);
 768     tool.compose = FALSE;
 769
 770     if ((gsize) width >= pre_form->width)
 771         utf8_tool_copy_chars_to_end (&tool);
 772     else if (width <= 3)
 773     {
 774         memset (tool.actual, '.', width);
 775         tool.actual += width;
 776         tool.remain -= width;
 777     }
 778     else
 779     {
 780         memset (tool.actual, '.', 3);
 781         tool.actual += 3;
 782         tool.remain -= 3;
 783
 784         tool.ident = 0;
 785         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
 786         utf8_tool_copy_chars_to_end (&tool);
 787     }
 788
 789     tool.actual[0] = '\0';
 790     if (tool.compose)
 791         utf8_tool_compose (result, sizeof (result));
 792     return result;
 793 }
 794
 795 /* --------------------------------------------------------------------------------------------- */
 796
 797 static int
 798 str_utf8_term_width2 (const char *text, size_t length)
 799 {
 800     const struct term_form *result;
 801
 802     result = str_utf8_make_make_term_form (text, length);
 803     return result->width;
 804 }
 805
 806 /* --------------------------------------------------------------------------------------------- */
 807
 808 static int
 809 str_utf8_term_width1 (const char *text)
 810 {
 811     return str_utf8_term_width2 (text, (size_t) (-1));
 812 }
 813
 814 /* --------------------------------------------------------------------------------------------- */
 815
 816 static int
 817 str_utf8_term_char_width (const char *text)
 818 {
 819     gunichar uni;
 820
 821     uni = g_utf8_get_char_validated (text, -1);
 822     return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
 823 }
 824
 825 /* --------------------------------------------------------------------------------------------- */
 826
 827 static const char *
 828 str_utf8_term_substring (const char *text, int start, int width)
 829 {
 830     static char result[BUF_MEDIUM * 6];
 831     const struct term_form *pre_form;
 832     struct utf8_tool tool;
 833
 834     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 835
 836     tool.checked = pre_form->text;
 837     tool.actual = result;
 838     tool.remain = sizeof (result);
 839     tool.compose = FALSE;
 840
 841     tool.ident = -start;
 842     utf8_tool_skip_chars_to (&tool, 0);
 843     if (tool.ident < 0)
 844         tool.ident = 0;
 845     utf8_tool_insert_space (&tool, tool.ident);
 846
 847     utf8_tool_copy_chars_to (&tool, width);
 848     utf8_tool_insert_space (&tool, width - tool.ident);
 849
 850     tool.actual[0] = '\0';
 851     if (tool.compose)
 852         utf8_tool_compose (result, sizeof (result));
 853     return result;
 854 }
 855
 856 /* --------------------------------------------------------------------------------------------- */
 857
 858 static const char *
 859 str_utf8_trunc (const char *text, int width)
 860 {
 861     static char result[MC_MAXPATHLEN * 6 * 2];
 862     const struct term_form *pre_form;
 863     struct utf8_tool tool;
 864
 865     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 866
 867     tool.checked = pre_form->text;
 868     tool.actual = result;
 869     tool.remain = sizeof (result);
 870     tool.compose = FALSE;
 871
 872     if (pre_form->width <= (gsize) width)
 873         utf8_tool_copy_chars_to_end (&tool);
 874     else
 875     {
 876         tool.ident = 0;
 877         utf8_tool_copy_chars_to (&tool, width / 2);
 878         utf8_tool_insert_char (&tool, '~');
 879
 880         tool.ident = 0;
 881         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 882         utf8_tool_copy_chars_to_end (&tool);
 883     }
 884
 885     tool.actual[0] = '\0';
 886     if (tool.compose)
 887         utf8_tool_compose (result, sizeof (result));
 888     return result;
 889 }
 890
 891 /* --------------------------------------------------------------------------------------------- */
 892
 893 static int
 894 str_utf8_offset_to_pos (const char *text, size_t length)
 895 {
 896     if (str_utf8_is_valid_string (text))
 897         return g_utf8_offset_to_pointer (text, length) - text;
 898     else
 899     {
 900         int result;
 901         GString *buffer;
 902
 903         buffer = g_string_new (text);
 904         str_utf8_fix_string (buffer->str);
 905         result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
 906         g_string_free (buffer, TRUE);
 907         return result;
 908     }
 909 }
 910
 911 /* --------------------------------------------------------------------------------------------- */
 912
 913 static int
 914 str_utf8_column_to_pos (const char *text, size_t pos)
 915 {
 916     int result = 0;
 917     int width = 0;
 918
 919     while (text[0] != '\0')
 920     {
 921         gunichar uni;
 922
 923         uni = g_utf8_get_char_validated (text, 6);
 924         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 925         {
 926             if (g_unichar_isprint (uni))
 927             {
 928                 if (!str_unichar_iscombiningmark (uni))
 929                 {
 930                     width++;
 931                     if (g_unichar_iswide (uni))
 932                         width++;
 933                 }
 934             }
 935             else
 936             {
 937                 width++;
 938             }
 939             text = g_utf8_next_char (text);
 940         }
 941         else
 942         {
 943             text++;
 944             width++;
 945         }
 946
 947         if ((gsize) width > pos)
 948             return result;
 949
 950         result++;
 951     }
 952
 953     return result;
 954 }
 955
 956 /* --------------------------------------------------------------------------------------------- */
 957
 958 static char *
 959 str_utf8_create_search_needle (const char *needle, gboolean case_sen)
 960 {
 961     char *fold, *result;
 962
 963     if (needle == NULL)
 964         return NULL;
 965
 966     if (case_sen)
 967         return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
 968
 969     fold = g_utf8_casefold (needle, -1);
 970     result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
 971     g_free (fold);
 972     return result;
 973 }
 974
 975 /* --------------------------------------------------------------------------------------------- */
 976
 977 static void
 978 str_utf8_release_search_needle (char *needle, gboolean case_sen)
 979 {
 980     (void) case_sen;
 981     g_free (needle);
 982 }
 983
 984 /* --------------------------------------------------------------------------------------------- */
 985
 986 static const char *
 987 str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
 988 {
 989     char *fold_text;
 990     char *deco_text;
 991     const char *match;
 992     const char *result = NULL;
 993     const char *m;
 994
 995     fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
 996     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 997
 998     match = deco_text;
 999     do
1000     {
1001         match = g_strstr_len (match, -1, search);
1002         if (match != NULL)
1003         {
1004             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1005                 !str_utf8_iscombiningmark (match + strlen (search)))
1006             {
1007                 result = text;
1008                 m = deco_text;
1009                 while (m < match)
1010                 {
1011                     str_utf8_cnext_noncomb_char (&m);
1012                     str_utf8_cnext_noncomb_char (&result);
1013                 }
1014             }
1015             else
1016                 str_utf8_cnext_char (&match);
1017         }
1018     }
1019     while (match != NULL && result == NULL);
1020
1021     g_free (deco_text);
1022     if (!case_sen)
1023         g_free (fold_text);
1024
1025     return result;
1026 }
1027
1028 /* --------------------------------------------------------------------------------------------- */
1029
1030 static const char *
1031 str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
1032 {
1033     char *fold_text;
1034     char *deco_text;
1035     char *match;
1036     const char *result = NULL;
1037     const char *m;
1038
1039     fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
1040     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1041
1042     do
1043     {
1044         match = g_strrstr_len (deco_text, -1, search);
1045         if (match != NULL)
1046         {
1047             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1048                 !str_utf8_iscombiningmark (match + strlen (search)))
1049             {
1050                 result = text;
1051                 m = deco_text;
1052                 while (m < match)
1053                 {
1054                     str_utf8_cnext_noncomb_char (&m);
1055                     str_utf8_cnext_noncomb_char (&result);
1056                 }
1057             }
1058             else
1059                 match[0] = '\0';
1060         }
1061     }
1062     while (match != NULL && result == NULL);
1063
1064     g_free (deco_text);
1065     if (!case_sen)
1066         g_free (fold_text);
1067
1068     return result;
1069 }
1070
1071 /* --------------------------------------------------------------------------------------------- */
1072
1073 static char *
1074 str_utf8_normalize (const char *text)
1075 {
1076     GString *fixed;
1077     char *tmp;
1078     char *result;
1079     const char *start;
1080     const char *end;
1081
1082     /* g_utf8_normalize() is a heavyweight function, that converts UTF-8 into UCS-4,
1083      * does the normalization and then converts UCS-4 back into UTF-8.
1084      * Since file names are composed of ASCII characters in most cases, we can speed up
1085      * utf8 normalization by checking if the heavyweight Unicode normalization is actually
1086      * needed. Normalization of ASCII string is no-op.
1087      */
1088
1089     /* find out whether text is ASCII only */
1090     for (end = text; *end != '\0'; end++)
1091         if ((*end & 0x80) != 0)
1092         {
1093             /* found 2nd byte of utf8-encoded symbol */
1094             break;
1095         }
1096
1097     /* if text is ASCII-only, return copy, normalize otherwise */
1098     if (*end == '\0')
1099         return g_strndup (text, end - text);
1100
1101     fixed = g_string_sized_new (4);
1102
1103     start = text;
1104     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1105     {
1106         if (start != end)
1107         {
1108             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1109             g_string_append (fixed, tmp);
1110             g_free (tmp);
1111         }
1112         g_string_append_c (fixed, end[0]);
1113         start = end + 1;
1114     }
1115
1116     if (start == text)
1117     {
1118         result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1119         g_string_free (fixed, TRUE);
1120     }
1121     else
1122     {
1123         if (start[0] != '\0' && start != end)
1124         {
1125             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1126             g_string_append (fixed, tmp);
1127             g_free (tmp);
1128         }
1129         result = g_string_free (fixed, FALSE);
1130     }
1131
1132     return result;
1133 }
1134
1135 /* --------------------------------------------------------------------------------------------- */
1136
1137 static char *
1138 str_utf8_casefold_normalize (const char *text)
1139 {
1140     GString *fixed;
1141     char *tmp, *fold;
1142     char *result;
1143     const char *start;
1144     const char *end;
1145
1146     fixed = g_string_sized_new (4);
1147
1148     start = text;
1149     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1150     {
1151         if (start != end)
1152         {
1153             fold = g_utf8_casefold (start, end - start);
1154             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1155             g_string_append (fixed, tmp);
1156             g_free (tmp);
1157             g_free (fold);
1158         }
1159         g_string_append_c (fixed, end[0]);
1160         start = end + 1;
1161     }
1162
1163     if (start == text)
1164     {
1165         fold = g_utf8_casefold (text, -1);
1166         result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1167         g_free (fold);
1168         g_string_free (fixed, TRUE);
1169     }
1170     else
1171     {
1172         if (start[0] != '\0' && start != end)
1173         {
1174             fold = g_utf8_casefold (start, end - start);
1175             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1176             g_string_append (fixed, tmp);
1177             g_free (tmp);
1178             g_free (fold);
1179         }
1180         result = g_string_free (fixed, FALSE);
1181     }
1182
1183     return result;
1184 }
1185
1186 /* --------------------------------------------------------------------------------------------- */
1187
/**
 * Compare two strings after normalizing both with str_utf8_normalize().
 *
 * @return the strcmp() result for the two normalized strings
 */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_normalize (t1);
    char *norm2 = str_utf8_normalize (t2);
    const int order = strcmp (norm1, norm2);

    g_free (norm2);
    g_free (norm1);

    return order;
}
1204
1205 /* --------------------------------------------------------------------------------------------- */
1206
/**
 * Compare the common-length leading part of two strings after normalizing both with
 * str_utf8_normalize().
 *
 * Only as many bytes as the shorter normalized string has are compared, so the result
 * is 0 whenever one normalized string is a prefix of the other.
 *
 * @return the strncmp() result over that common length
 */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *norm1, *norm2;
    size_t len1, len2, common;
    int order;

    norm1 = str_utf8_normalize (t1);
    norm2 = str_utf8_normalize (t2);

    len1 = strlen (norm1);
    len2 = strlen (norm2);
    /* length of the shorter normalized string */
    common = (len1 < len2) ? len1 : len2;

    order = strncmp (norm1, norm2, common);

    g_free (norm2);
    g_free (norm1);

    return order;
}
1226
1227 /* --------------------------------------------------------------------------------------------- */
1228
/**
 * Compare two strings after passing both through str_utf8_casefold_normalize().
 *
 * @return the strcmp() result for the two folded and normalized strings
 */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *folded1 = str_utf8_casefold_normalize (t1);
    char *folded2 = str_utf8_casefold_normalize (t2);
    const int order = strcmp (folded1, folded2);

    g_free (folded2);
    g_free (folded1);

    return order;
}
1245
1246 /* --------------------------------------------------------------------------------------------- */
1247
/**
 * Compare the common-length leading part of two strings after passing both through
 * str_utf8_casefold_normalize().
 *
 * Only as many bytes as the shorter folded string has are compared, so the result
 * is 0 whenever one folded string is a prefix of the other.
 *
 * @return the strncmp() result over that common length
 */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *folded1, *folded2;
    size_t len1, len2, common;
    int order;

    folded1 = str_utf8_casefold_normalize (t1);
    folded2 = str_utf8_casefold_normalize (t2);

    len1 = strlen (folded1);
    len2 = strlen (folded2);
    /* length of the shorter folded string */
    common = (len1 < len2) ? len1 : len2;

    order = strncmp (folded1, folded2, common);

    g_free (folded2);
    g_free (folded1);

    return order;
}
1267
1268 /* --------------------------------------------------------------------------------------------- */
1269
/**
 * Measure how much of PREFIX matches the beginning of TEXT, character by character.
 *
 * Both strings are first normalized with str_utf8_normalize(); the walk stops at the
 * first character whose byte length or bytes differ, or when either string ends.
 *
 * @return number of matching leading bytes, counted in the normalized form of PREFIX
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text, *norm_prefix;
    const char *text_pos, *prefix_pos;
    int matched;

    norm_text = str_utf8_normalize (text);
    norm_prefix = str_utf8_normalize (prefix);
    text_pos = norm_text;
    prefix_pos = norm_prefix;

    while (text_pos[0] != '\0' && prefix_pos[0] != '\0')
    {
        const char *text_next = text_pos;
        const char *prefix_next = prefix_pos;

        /* step over one character on each side */
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* characters differ if their byte lengths or their bytes differ */
        if (text_next - text_pos != prefix_next - prefix_pos ||
            strncmp (text_pos, prefix_pos, text_next - text_pos) != 0)
            break;

        text_pos = text_next;
        prefix_pos = prefix_next;
    }

    matched = prefix_pos - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1304
1305 /* --------------------------------------------------------------------------------------------- */
1306
/**
 * Measure how much of PREFIX matches the beginning of TEXT, character by character,
 * after case folding.
 *
 * Both strings are first passed through str_utf8_casefold_normalize(); the walk stops at
 * the first character whose byte length or bytes differ, or when either string ends.
 *
 * @return number of matching leading bytes, counted in the folded and normalized form
 *         of PREFIX
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *folded_text, *folded_prefix;
    const char *text_pos, *prefix_pos;
    int matched;

    folded_text = str_utf8_casefold_normalize (text);
    folded_prefix = str_utf8_casefold_normalize (prefix);
    text_pos = folded_text;
    prefix_pos = folded_prefix;

    while (text_pos[0] != '\0' && prefix_pos[0] != '\0')
    {
        const char *text_next = text_pos;
        const char *prefix_next = prefix_pos;

        /* step over one character on each side */
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* characters differ if their byte lengths or their bytes differ */
        if (text_next - text_pos != prefix_next - prefix_pos ||
            strncmp (text_pos, prefix_pos, text_next - text_pos) != 0)
            break;

        text_pos = text_next;
        prefix_pos = prefix_next;
    }

    matched = prefix_pos - folded_prefix;

    g_free (folded_text);
    g_free (folded_prefix);

    return matched;
}
1341
1342 /* --------------------------------------------------------------------------------------------- */
1343
1344 static char *
1345 str_utf8_create_key_gen (const char *text, gboolean case_sen,
1346                          gchar * (*keygen) (const gchar * text, gssize size))
1347 {
1348     char *result;
1349
1350     if (case_sen)
1351         result = str_utf8_normalize (text);
1352     else
1353     {
1354         gboolean dot;
1355         GString *fixed;
1356         const char *start, *end;
1357         char *fold, *key;
1358
1359         dot = text[0] == '.';
1360         fixed = g_string_sized_new (16);
1361
1362         if (!dot)
1363             start = text;
1364         else
1365         {
1366             start = text + 1;
1367             g_string_append_c (fixed, '.');
1368         }
1369
1370         while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1371         {
1372             if (start != end)
1373             {
1374                 fold = g_utf8_casefold (start, end - start);
1375                 key = keygen (fold, -1);
1376                 g_string_append (fixed, key);
1377                 g_free (key);
1378                 g_free (fold);
1379             }
1380             g_string_append_c (fixed, end[0]);
1381             start = end + 1;
1382         }
1383
1384         if (start == text)
1385         {
1386             fold = g_utf8_casefold (start, -1);
1387             result = keygen (fold, -1);
1388             g_free (fold);
1389             g_string_free (fixed, TRUE);
1390         }
1391         else if (dot && (start == text + 1))
1392         {
1393             fold = g_utf8_casefold (start, -1);
1394             key = keygen (fold, -1);
1395             g_string_append (fixed, key);
1396             g_free (key);
1397             g_free (fold);
1398             result = g_string_free (fixed, FALSE);
1399         }
1400         else
1401         {
1402             if (start[0] != '\0' && start != end)
1403             {
1404                 fold = g_utf8_casefold (start, end - start);
1405                 key = keygen (fold, -1);
1406                 g_string_append (fixed, key);
1407                 g_free (key);
1408                 g_free (fold);
1409             }
1410             result = g_string_free (fixed, FALSE);
1411         }
1412     }
1413     return result;
1414 }
1415
1416 /* --------------------------------------------------------------------------------------------- */
1417
1418 static char *
1419 str_utf8_create_key (const char *text, gboolean case_sen)
1420 {
1421     return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1422 }
1423
1424 /* --------------------------------------------------------------------------------------------- */
1425
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/**
 * Build a key for TEXT via str_utf8_create_key_gen() with
 * g_utf8_collate_key_for_filename() as the key generator.
 * Compiled only when MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME is defined.
 *
 * @return whatever str_utf8_create_key_gen() returns for these arguments
 */
static char *
str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
1433
1434 /* --------------------------------------------------------------------------------------------- */
1435
1436 static int
1437 str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
1438 {
1439     (void) case_sen;
1440     return strcmp (t1, t2);
1441 }
1442
1443 /* --------------------------------------------------------------------------------------------- */
1444
1445 static void
1446 str_utf8_release_key (char *key, gboolean case_sen)
1447 {
1448     (void) case_sen;
1449     g_free (key);
1450 }
1451
1452 /* --------------------------------------------------------------------------------------------- */
1453 /*** public functions ****************************************************************************/
1454 /* --------------------------------------------------------------------------------------------- */
1455
1456 struct str_class
1457 str_utf8_init (void)
1458 {
1459     struct str_class result;
1460
1461     result.conv_gerror_message = str_utf8_conv_gerror_message;
1462     result.vfs_convert_to = str_utf8_vfs_convert_to;
1463     result.insert_replace_char = str_utf8_insert_replace_char;
1464     result.is_valid_string = str_utf8_is_valid_string;
1465     result.is_valid_char = str_utf8_is_valid_char;
1466     result.cnext_char = str_utf8_cnext_char;
1467     result.cprev_char = str_utf8_cprev_char;
1468     result.cnext_char_safe = str_utf8_cnext_char_safe;
1469     result.cprev_char_safe = str_utf8_cprev_char_safe;
1470     result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1471     result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1472     result.char_isspace = str_utf8_isspace;
1473     result.char_ispunct = str_utf8_ispunct;
1474     result.char_isalnum = str_utf8_isalnum;
1475     result.char_isdigit = str_utf8_isdigit;
1476     result.char_isprint = str_utf8_isprint;
1477     result.char_iscombiningmark = str_utf8_iscombiningmark;
1478     result.char_toupper = str_utf8_toupper;
1479     result.char_tolower = str_utf8_tolower;
1480     result.length = str_utf8_length;
1481     result.length2 = str_utf8_length2;
1482     result.length_noncomb = str_utf8_length_noncomb;
1483     result.fix_string = str_utf8_fix_string;
1484     result.term_form = str_utf8_term_form;
1485     result.fit_to_term = str_utf8_fit_to_term;
1486     result.term_trim = str_utf8_term_trim;
1487     result.term_width2 = str_utf8_term_width2;
1488     result.term_width1 = str_utf8_term_width1;
1489     result.term_char_width = str_utf8_term_char_width;
1490     result.term_substring = str_utf8_term_substring;
1491     result.trunc = str_utf8_trunc;
1492     result.offset_to_pos = str_utf8_offset_to_pos;
1493     result.column_to_pos = str_utf8_column_to_pos;
1494     result.create_search_needle = str_utf8_create_search_needle;
1495     result.release_search_needle = str_utf8_release_search_needle;
1496     result.search_first = str_utf8_search_first;
1497     result.search_last = str_utf8_search_last;
1498     result.compare = str_utf8_compare;
1499     result.ncompare = str_utf8_ncompare;
1500     result.casecmp = str_utf8_casecmp;
1501     result.ncasecmp = str_utf8_ncasecmp;
1502     result.prefix = str_utf8_prefix;
1503     result.caseprefix = str_utf8_caseprefix;
1504     result.create_key = str_utf8_create_key;
1505 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1506     /* case insensitive sort files in "a1 a2 a10" order */
1507     result.create_key_for_filename = str_utf8_create_key_for_filename;
1508 #else
1509     /* case insensitive sort files in "a1 a10 a2" order */
1510     result.create_key_for_filename = str_utf8_create_key;
1511 #endif
1512     result.key_collate = str_utf8_key_collate;
1513     result.release_key = str_utf8_release_key;
1514
1515     return result;
1516 }
1517
1518 /* --------------------------------------------------------------------------------------------- */