lib/strutil/strutilutf8.c

   1 /*
   2    UTF-8 strings utilities
   3
   4    Copyright (C) 2007-2017
   5    Free Software Foundation, Inc.
   6
   7    Written by:
   8    Rostislav Benes, 2007
   9
  10    This file is part of the Midnight Commander.
  11
  12    The Midnight Commander is free software: you can redistribute it
  13    and/or modify it under the terms of the GNU General Public License as
  14    published by the Free Software Foundation, either version 3 of the License,
  15    or (at your option) any later version.
  16
  17    The Midnight Commander is distributed in the hope that it will be useful,
  18    but WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20    GNU General Public License for more details.
  21
  22    You should have received a copy of the GNU General Public License
  23    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  24  */
  25
  26 #include <config.h>
  27
  28 #include <stdlib.h>
  29 #include <langinfo.h>
  30 #include <string.h>
  31
  32 #include "lib/global.h"
  33 #include "lib/strutil.h"
  34
/* UTF-8 handling in this file is built on the UTF-8 functions provided by GLib */
  36
  37 /*** global variables ****************************************************************************/
  38
  39 /*** file scope macro definitions ****************************************************************/
  40
  41 /*** file scope type declarations ****************************************************************/
  42
  43 struct utf8_tool
  44 {
  45     char *actual;
  46     size_t remain;
  47     const char *checked;
  48     int ident;
  49     gboolean compose;
  50 };
  51
  52 struct term_form
  53 {
  54     char text[BUF_MEDIUM * 6];
  55     size_t width;
  56     gboolean compose;
  57 };
  58
  59 /*** file scope variables ************************************************************************/
  60
  61 static const char replch[] = "\xEF\xBF\xBD";
  62
  63 /* --------------------------------------------------------------------------------------------- */
  64 /*** file scope functions ************************************************************************/
  65 /* --------------------------------------------------------------------------------------------- */
  66
  67 static gboolean
  68 str_unichar_iscombiningmark (gunichar uni)
  69 {
  70     GUnicodeType type;
  71
  72     type = g_unichar_type (uni);
  73     return (type == G_UNICODE_COMBINING_MARK)
  74         || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  75 }
  76
  77 /* --------------------------------------------------------------------------------------------- */
  78
  79 static void
  80 str_utf8_insert_replace_char (GString * buffer)
  81 {
  82     g_string_append (buffer, replch);
  83 }
  84
  85 /* --------------------------------------------------------------------------------------------- */
  86
  87 static int
  88 str_utf8_is_valid_string (const char *text)
  89 {
  90     return g_utf8_validate (text, -1, NULL);
  91 }
  92
  93 /* --------------------------------------------------------------------------------------------- */
  94
  95 static int
  96 str_utf8_is_valid_char (const char *ch, size_t size)
  97 {
  98     switch (g_utf8_get_char_validated (ch, size))
  99     {
 100     case (gunichar) (-2):
 101         return (-2);
 102     case (gunichar) (-1):
 103         return (-1);
 104     default:
 105         return 1;
 106     }
 107 }
 108
 109 /* --------------------------------------------------------------------------------------------- */
 110
 111 static void
 112 str_utf8_cnext_char (const char **text)
 113 {
 114     (*text) = g_utf8_next_char (*text);
 115 }
 116
 117 /* --------------------------------------------------------------------------------------------- */
 118
 119 static void
 120 str_utf8_cprev_char (const char **text)
 121 {
 122     (*text) = g_utf8_prev_char (*text);
 123 }
 124
 125 /* --------------------------------------------------------------------------------------------- */
 126
 127 static void
 128 str_utf8_cnext_char_safe (const char **text)
 129 {
 130     if (str_utf8_is_valid_char (*text, -1) == 1)
 131         (*text) = g_utf8_next_char (*text);
 132     else
 133         (*text)++;
 134 }
 135
 136 /* --------------------------------------------------------------------------------------------- */
 137
 138 static void
 139 str_utf8_cprev_char_safe (const char **text)
 140 {
 141     const char *result, *t;
 142
 143     result = g_utf8_prev_char (*text);
 144     t = result;
 145     str_utf8_cnext_char_safe (&t);
 146     if (t == *text)
 147         (*text) = result;
 148     else
 149         (*text)--;
 150 }
 151
 152 /* --------------------------------------------------------------------------------------------- */
 153
 154 static void
 155 str_utf8_fix_string (char *text)
 156 {
 157     while (text[0] != '\0')
 158     {
 159         gunichar uni;
 160
 161         uni = g_utf8_get_char_validated (text, -1);
 162         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 163             text = g_utf8_next_char (text);
 164         else
 165         {
 166             text[0] = '?';
 167             text++;
 168         }
 169     }
 170 }
 171
 172 /* --------------------------------------------------------------------------------------------- */
 173
 174 static int
 175 str_utf8_isspace (const char *text)
 176 {
 177     gunichar uni;
 178
 179     uni = g_utf8_get_char_validated (text, -1);
 180     return g_unichar_isspace (uni);
 181 }
 182
 183 /* --------------------------------------------------------------------------------------------- */
 184
 185 static int
 186 str_utf8_ispunct (const char *text)
 187 {
 188     gunichar uni;
 189
 190     uni = g_utf8_get_char_validated (text, -1);
 191     return g_unichar_ispunct (uni);
 192 }
 193
 194 /* --------------------------------------------------------------------------------------------- */
 195
 196 static int
 197 str_utf8_isalnum (const char *text)
 198 {
 199     gunichar uni;
 200
 201     uni = g_utf8_get_char_validated (text, -1);
 202     return g_unichar_isalnum (uni);
 203 }
 204
 205 /* --------------------------------------------------------------------------------------------- */
 206
 207 static int
 208 str_utf8_isdigit (const char *text)
 209 {
 210     gunichar uni;
 211
 212     uni = g_utf8_get_char_validated (text, -1);
 213     return g_unichar_isdigit (uni);
 214 }
 215
 216 /* --------------------------------------------------------------------------------------------- */
 217
 218 static int
 219 str_utf8_isprint (const char *ch)
 220 {
 221     gunichar uni;
 222
 223     uni = g_utf8_get_char_validated (ch, -1);
 224     return g_unichar_isprint (uni);
 225 }
 226
 227 /* --------------------------------------------------------------------------------------------- */
 228
 229 static gboolean
 230 str_utf8_iscombiningmark (const char *ch)
 231 {
 232     gunichar uni;
 233
 234     uni = g_utf8_get_char_validated (ch, -1);
 235     return str_unichar_iscombiningmark (uni);
 236 }
 237
 238 /* --------------------------------------------------------------------------------------------- */
 239
 240 static int
 241 str_utf8_cnext_noncomb_char (const char **text)
 242 {
 243     int count = 0;
 244
 245     while ((*text)[0] != '\0')
 246     {
 247         str_utf8_cnext_char_safe (text);
 248         count++;
 249         if (!str_utf8_iscombiningmark (*text))
 250             break;
 251     }
 252
 253     return count;
 254 }
 255
 256 /* --------------------------------------------------------------------------------------------- */
 257
 258 static int
 259 str_utf8_cprev_noncomb_char (const char **text, const char *begin)
 260 {
 261     int count = 0;
 262
 263     while ((*text) != begin)
 264     {
 265         str_utf8_cprev_char_safe (text);
 266         count++;
 267         if (!str_utf8_iscombiningmark (*text))
 268             break;
 269     }
 270
 271     return count;
 272 }
 273
 274 /* --------------------------------------------------------------------------------------------- */
 275
 276 static int
 277 str_utf8_toupper (const char *text, char **out, size_t * remain)
 278 {
 279     gunichar uni;
 280     size_t left;
 281
 282     uni = g_utf8_get_char_validated (text, -1);
 283     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 284         return 0;
 285
 286     uni = g_unichar_toupper (uni);
 287     left = g_unichar_to_utf8 (uni, NULL);
 288     if (left >= *remain)
 289         return 0;
 290
 291     left = g_unichar_to_utf8 (uni, *out);
 292     (*out) += left;
 293     (*remain) -= left;
 294     return 1;
 295 }
 296
 297 /* --------------------------------------------------------------------------------------------- */
 298
 299 static int
 300 str_utf8_tolower (const char *text, char **out, size_t * remain)
 301 {
 302     gunichar uni;
 303     size_t left;
 304
 305     uni = g_utf8_get_char_validated (text, -1);
 306     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 307         return 0;
 308
 309     uni = g_unichar_tolower (uni);
 310     left = g_unichar_to_utf8 (uni, NULL);
 311     if (left >= *remain)
 312         return 0;
 313
 314     left = g_unichar_to_utf8 (uni, *out);
 315     (*out) += left;
 316     (*remain) -= left;
 317     return 1;
 318 }
 319
 320 /* --------------------------------------------------------------------------------------------- */
 321
 322 static int
 323 str_utf8_length (const char *text)
 324 {
 325     int result = 0;
 326     const char *start;
 327     const char *end;
 328
 329     start = text;
 330     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 331     {
 332         if (start != end)
 333             result += g_utf8_strlen (start, end - start);
 334
 335         result++;
 336         start = end + 1;
 337     }
 338
 339     if (start == text)
 340         result = g_utf8_strlen (text, -1);
 341     else if (start[0] != '\0' && start != end)
 342         result += g_utf8_strlen (start, end - start);
 343
 344     return result;
 345 }
 346
 347 /* --------------------------------------------------------------------------------------------- */
 348
 349 static int
 350 str_utf8_length2 (const char *text, int size)
 351 {
 352     int result = 0;
 353     const char *start;
 354     const char *end;
 355
 356     start = text;
 357     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
 358     {
 359         if (start != end)
 360         {
 361             result += g_utf8_strlen (start, MIN (end - start, size));
 362             size -= end - start;
 363         }
 364         result += (size > 0);
 365         size--;
 366         start = end + 1;
 367     }
 368
 369     if (start == text)
 370         result = g_utf8_strlen (text, size);
 371     else if (start[0] != '\0' && start != end && size > 0)
 372         result += g_utf8_strlen (start, MIN (end - start, size));
 373
 374     return result;
 375 }
 376
 377 /* --------------------------------------------------------------------------------------------- */
 378
 379 static int
 380 str_utf8_length_noncomb (const char *text)
 381 {
 382     int result = 0;
 383     const char *t = text;
 384
 385     while (t[0] != '\0')
 386     {
 387         str_utf8_cnext_noncomb_char (&t);
 388         result++;
 389     }
 390
 391     return result;
 392 }
 393
 394 /* --------------------------------------------------------------------------------------------- */
 395
 396 #if 0
 397 static void
 398 str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
 399 {
 400     char *next;
 401
 402     next = g_utf8_next_char (*string);
 403     (*left) -= next - (*string);
 404     (*string) = next;
 405     g_string_append_c (buffer, '?');
 406 }
 407 #endif
 408
 409 /* --------------------------------------------------------------------------------------------- */
 410
 411 static gchar *
 412 str_utf8_conv_gerror_message (GError * mcerror, const char *def_msg)
 413 {
 414     if (mcerror != NULL)
 415         return g_strdup (mcerror->message);
 416
 417     return g_strdup (def_msg != NULL ? def_msg : "");
 418 }
 419
 420 /* --------------------------------------------------------------------------------------------- */
 421
 422 static estr_t
 423 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
 424 {
 425     estr_t result = ESTR_SUCCESS;
 426
 427     if (coder == str_cnv_not_convert)
 428         g_string_append_len (buffer, string, size);
 429     else
 430         result = str_nconvert (coder, string, size, buffer);
 431
 432     return result;
 433 }
 434
 435 /* --------------------------------------------------------------------------------------------- */
 436 /* utility function, that makes string valid in utf8 and all characters printable
 437  * return width of string too */
 438
 439 static const struct term_form *
 440 str_utf8_make_make_term_form (const char *text, size_t length)
 441 {
 442     static struct term_form result;
 443     gunichar uni;
 444     size_t left;
 445     char *actual;
 446
 447     result.text[0] = '\0';
 448     result.width = 0;
 449     result.compose = FALSE;
 450     actual = result.text;
 451
 452     /* check if text start with combining character,
 453      * add space at begin in this case */
 454     if (length != 0 && text[0] != '\0')
 455     {
 456         uni = g_utf8_get_char_validated (text, -1);
 457         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
 458             && str_unichar_iscombiningmark (uni))
 459         {
 460             actual[0] = ' ';
 461             actual++;
 462             result.width++;
 463             result.compose = TRUE;
 464         }
 465     }
 466
 467     while (length != 0 && text[0] != '\0')
 468     {
 469         uni = g_utf8_get_char_validated (text, -1);
 470         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 471         {
 472             if (g_unichar_isprint (uni))
 473             {
 474                 left = g_unichar_to_utf8 (uni, actual);
 475                 actual += left;
 476                 if (str_unichar_iscombiningmark (uni))
 477                     result.compose = TRUE;
 478                 else
 479                 {
 480                     result.width++;
 481                     if (g_unichar_iswide (uni))
 482                         result.width++;
 483                 }
 484             }
 485             else
 486             {
 487                 actual[0] = '.';
 488                 actual++;
 489                 result.width++;
 490             }
 491             text = g_utf8_next_char (text);
 492         }
 493         else
 494         {
 495             text++;
 496             /*actual[0] = '?'; */
 497             memcpy (actual, replch, strlen (replch));
 498             actual += strlen (replch);
 499             result.width++;
 500         }
 501
 502         if (length != (size_t) (-1))
 503             length--;
 504     }
 505     actual[0] = '\0';
 506
 507     return &result;
 508 }
 509
 510 /* --------------------------------------------------------------------------------------------- */
 511
 512 static const char *
 513 str_utf8_term_form (const char *text)
 514 {
 515     static char result[BUF_MEDIUM * 6];
 516     const struct term_form *pre_form;
 517
 518     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 519     if (pre_form->compose)
 520     {
 521         char *composed;
 522
 523         composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 524         g_strlcpy (result, composed, sizeof (result));
 525         g_free (composed);
 526     }
 527     else
 528         g_strlcpy (result, pre_form->text, sizeof (result));
 529
 530     return result;
 531 }
 532
 533 /* --------------------------------------------------------------------------------------------- */
 534 /* utility function, that copies all characters from checked to actual */
 535
 536 static gboolean
 537 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
 538 {
 539     tool->compose = FALSE;
 540
 541     while (tool->checked[0] != '\0')
 542     {
 543         gunichar uni;
 544         size_t left;
 545
 546         uni = g_utf8_get_char (tool->checked);
 547         tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
 548         left = g_unichar_to_utf8 (uni, NULL);
 549         if (tool->remain <= left)
 550             return FALSE;
 551         left = g_unichar_to_utf8 (uni, tool->actual);
 552         tool->actual += left;
 553         tool->remain -= left;
 554         tool->checked = g_utf8_next_char (tool->checked);
 555     }
 556
 557     return TRUE;
 558 }
 559
 560 /* --------------------------------------------------------------------------------------------- */
 561 /* utility function, that copies characters from checked to actual until ident is
 562  * smaller than to_ident */
 563
 564 static gboolean
 565 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
 566 {
 567     tool->compose = FALSE;
 568
 569     while (tool->checked[0] != '\0')
 570     {
 571         gunichar uni;
 572         size_t left;
 573         int w = 0;
 574
 575         uni = g_utf8_get_char (tool->checked);
 576         if (str_unichar_iscombiningmark (uni))
 577             tool->compose = TRUE;
 578         else
 579         {
 580             w = 1;
 581             if (g_unichar_iswide (uni))
 582                 w++;
 583             if (tool->ident + w > to_ident)
 584                 return TRUE;
 585         }
 586
 587         left = g_unichar_to_utf8 (uni, NULL);
 588         if (tool->remain <= left)
 589             return FALSE;
 590         left = g_unichar_to_utf8 (uni, tool->actual);
 591         tool->actual += left;
 592         tool->remain -= left;
 593         tool->checked = g_utf8_next_char (tool->checked);
 594         tool->ident += w;
 595     }
 596
 597     return TRUE;
 598 }
 599
 600 /* --------------------------------------------------------------------------------------------- */
 601 /* utility function, adds count spaces to actual */
 602
 603 static int
 604 utf8_tool_insert_space (struct utf8_tool *tool, int count)
 605 {
 606     if (count <= 0)
 607         return 1;
 608     if (tool->remain <= (gsize) count)
 609         return 0;
 610
 611     memset (tool->actual, ' ', count);
 612     tool->actual += count;
 613     tool->remain -= count;
 614     return 1;
 615 }
 616
 617 /* --------------------------------------------------------------------------------------------- */
 618 /* utility function, adds one characters to actual */
 619
 620 static int
 621 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
 622 {
 623     if (tool->remain <= 1)
 624         return 0;
 625
 626     tool->actual[0] = ch;
 627     tool->actual++;
 628     tool->remain--;
 629     return 1;
 630 }
 631
 632 /* --------------------------------------------------------------------------------------------- */
 633 /* utility function, thah skips characters from checked until ident is greater or
 634  * equal to to_ident */
 635
 636 static gboolean
 637 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
 638 {
 639     gunichar uni;
 640
 641     while (to_ident > tool->ident && tool->checked[0] != '\0')
 642     {
 643         uni = g_utf8_get_char (tool->checked);
 644         if (!str_unichar_iscombiningmark (uni))
 645         {
 646             tool->ident++;
 647             if (g_unichar_iswide (uni))
 648                 tool->ident++;
 649         }
 650         tool->checked = g_utf8_next_char (tool->checked);
 651     }
 652
 653     uni = g_utf8_get_char (tool->checked);
 654     while (str_unichar_iscombiningmark (uni))
 655     {
 656         tool->checked = g_utf8_next_char (tool->checked);
 657         uni = g_utf8_get_char (tool->checked);
 658     }
 659
 660     return TRUE;
 661 }
 662
 663 /* --------------------------------------------------------------------------------------------- */
 664
 665 static void
 666 utf8_tool_compose (char *buffer, size_t size)
 667 {
 668     char *composed;
 669
 670     composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 671     g_strlcpy (buffer, composed, size);
 672     g_free (composed);
 673 }
 674
 675 /* --------------------------------------------------------------------------------------------- */
 676
 677 static const char *
 678 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
 679 {
 680     static char result[BUF_MEDIUM * 6];
 681     const struct term_form *pre_form;
 682     struct utf8_tool tool;
 683
 684     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 685     tool.checked = pre_form->text;
 686     tool.actual = result;
 687     tool.remain = sizeof (result);
 688     tool.compose = FALSE;
 689
 690     if (pre_form->width <= (gsize) width)
 691     {
 692         switch (HIDE_FIT (just_mode))
 693         {
 694         case J_CENTER_LEFT:
 695         case J_CENTER:
 696             tool.ident = (width - pre_form->width) / 2;
 697             break;
 698         case J_RIGHT:
 699             tool.ident = width - pre_form->width;
 700             break;
 701         default:
 702             tool.ident = 0;
 703             break;
 704         }
 705
 706         utf8_tool_insert_space (&tool, tool.ident);
 707         utf8_tool_copy_chars_to_end (&tool);
 708         utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
 709     }
 710     else if (IS_FIT (just_mode))
 711     {
 712         tool.ident = 0;
 713         utf8_tool_copy_chars_to (&tool, width / 2);
 714         utf8_tool_insert_char (&tool, '~');
 715
 716         tool.ident = 0;
 717         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 718         utf8_tool_copy_chars_to_end (&tool);
 719         utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
 720     }
 721     else
 722     {
 723         switch (HIDE_FIT (just_mode))
 724         {
 725         case J_CENTER:
 726             tool.ident = (width - pre_form->width) / 2;
 727             break;
 728         case J_RIGHT:
 729             tool.ident = width - pre_form->width;
 730             break;
 731         default:
 732             tool.ident = 0;
 733             break;
 734         }
 735
 736         utf8_tool_skip_chars_to (&tool, 0);
 737         utf8_tool_insert_space (&tool, tool.ident);
 738         utf8_tool_copy_chars_to (&tool, width);
 739         utf8_tool_insert_space (&tool, width - tool.ident);
 740     }
 741
 742     tool.actual[0] = '\0';
 743     if (tool.compose)
 744         utf8_tool_compose (result, sizeof (result));
 745     return result;
 746 }
 747
 748 /* --------------------------------------------------------------------------------------------- */
 749
 750 static const char *
 751 str_utf8_term_trim (const char *text, int width)
 752 {
 753     static char result[BUF_MEDIUM * 6];
 754     const struct term_form *pre_form;
 755     struct utf8_tool tool;
 756
 757     if (width < 1)
 758     {
 759         result[0] = '\0';
 760         return result;
 761     }
 762
 763     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 764
 765     tool.checked = pre_form->text;
 766     tool.actual = result;
 767     tool.remain = sizeof (result);
 768     tool.compose = FALSE;
 769
 770     if ((gsize) width >= pre_form->width)
 771         utf8_tool_copy_chars_to_end (&tool);
 772     else if (width <= 3)
 773     {
 774         memset (tool.actual, '.', width);
 775         tool.actual += width;
 776         tool.remain -= width;
 777     }
 778     else
 779     {
 780         memset (tool.actual, '.', 3);
 781         tool.actual += 3;
 782         tool.remain -= 3;
 783
 784         tool.ident = 0;
 785         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
 786         utf8_tool_copy_chars_to_end (&tool);
 787     }
 788
 789     tool.actual[0] = '\0';
 790     if (tool.compose)
 791         utf8_tool_compose (result, sizeof (result));
 792     return result;
 793 }
 794
 795 /* --------------------------------------------------------------------------------------------- */
 796
 797 static int
 798 str_utf8_term_width2 (const char *text, size_t length)
 799 {
 800     const struct term_form *result;
 801
 802     result = str_utf8_make_make_term_form (text, length);
 803     return result->width;
 804 }
 805
 806 /* --------------------------------------------------------------------------------------------- */
 807
 808 static int
 809 str_utf8_term_width1 (const char *text)
 810 {
 811     return str_utf8_term_width2 (text, (size_t) (-1));
 812 }
 813
 814 /* --------------------------------------------------------------------------------------------- */
 815
 816 static int
 817 str_utf8_term_char_width (const char *text)
 818 {
 819     gunichar uni;
 820
 821     uni = g_utf8_get_char_validated (text, -1);
 822     return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
 823 }
 824
 825 /* --------------------------------------------------------------------------------------------- */
 826
 827 static const char *
 828 str_utf8_term_substring (const char *text, int start, int width)
 829 {
 830     static char result[BUF_MEDIUM * 6];
 831     const struct term_form *pre_form;
 832     struct utf8_tool tool;
 833
 834     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 835
 836     tool.checked = pre_form->text;
 837     tool.actual = result;
 838     tool.remain = sizeof (result);
 839     tool.compose = FALSE;
 840
 841     tool.ident = -start;
 842     utf8_tool_skip_chars_to (&tool, 0);
 843     if (tool.ident < 0)
 844         tool.ident = 0;
 845     utf8_tool_insert_space (&tool, tool.ident);
 846
 847     utf8_tool_copy_chars_to (&tool, width);
 848     utf8_tool_insert_space (&tool, width - tool.ident);
 849
 850     tool.actual[0] = '\0';
 851     if (tool.compose)
 852         utf8_tool_compose (result, sizeof (result));
 853     return result;
 854 }
 855
 856 /* --------------------------------------------------------------------------------------------- */
 857
 858 static const char *
 859 str_utf8_trunc (const char *text, int width)
 860 {
 861     static char result[MC_MAXPATHLEN * 6 * 2];
 862     const struct term_form *pre_form;
 863     struct utf8_tool tool;
 864
 865     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 866
 867     tool.checked = pre_form->text;
 868     tool.actual = result;
 869     tool.remain = sizeof (result);
 870     tool.compose = FALSE;
 871
 872     if (pre_form->width <= (gsize) width)
 873         utf8_tool_copy_chars_to_end (&tool);
 874     else
 875     {
 876         tool.ident = 0;
 877         utf8_tool_copy_chars_to (&tool, width / 2);
 878         utf8_tool_insert_char (&tool, '~');
 879
 880         tool.ident = 0;
 881         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 882         utf8_tool_copy_chars_to_end (&tool);
 883     }
 884
 885     tool.actual[0] = '\0';
 886     if (tool.compose)
 887         utf8_tool_compose (result, sizeof (result));
 888     return result;
 889 }
 890
 891 /* --------------------------------------------------------------------------------------------- */
 892
 893 static int
 894 str_utf8_offset_to_pos (const char *text, size_t length)
 895 {
 896     if (str_utf8_is_valid_string (text))
 897         return g_utf8_offset_to_pointer (text, length) - text;
 898     else
 899     {
 900         int result;
 901         GString *buffer;
 902
 903         buffer = g_string_new (text);
 904         str_utf8_fix_string (buffer->str);
 905         result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
 906         g_string_free (buffer, TRUE);
 907         return result;
 908     }
 909 }
 910
 911 /* --------------------------------------------------------------------------------------------- */
 912
 913 static int
 914 str_utf8_column_to_pos (const char *text, size_t pos)
 915 {
 916     int result = 0;
 917     int width = 0;
 918
 919     while (text[0] != '\0')
 920     {
 921         gunichar uni;
 922
 923         uni = g_utf8_get_char_validated (text, 6);
 924         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 925         {
 926             if (g_unichar_isprint (uni))
 927             {
 928                 if (!str_unichar_iscombiningmark (uni))
 929                 {
 930                     width++;
 931                     if (g_unichar_iswide (uni))
 932                         width++;
 933                 }
 934             }
 935             else
 936             {
 937                 width++;
 938             }
 939             text = g_utf8_next_char (text);
 940         }
 941         else
 942         {
 943             text++;
 944             width++;
 945         }
 946
 947         if ((gsize) width > pos)
 948             return result;
 949
 950         result++;
 951     }
 952
 953     return result;
 954 }
 955
 956 /* --------------------------------------------------------------------------------------------- */
 957
 958 static char *
 959 str_utf8_create_search_needle (const char *needle, int case_sen)
 960 {
 961     char *fold, *result;
 962
 963     if (needle == NULL)
 964         return NULL;
 965
 966     if (case_sen)
 967         return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
 968
 969
 970     fold = g_utf8_casefold (needle, -1);
 971     result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
 972     g_free (fold);
 973     return result;
 974 }
 975
 976 /* --------------------------------------------------------------------------------------------- */
 977
 978 static void
 979 str_utf8_release_search_needle (char *needle, int case_sen)
 980 {
 981     (void) case_sen;
 982     g_free (needle);
 983 }
 984
 985 /* --------------------------------------------------------------------------------------------- */
 986
 987 static const char *
 988 str_utf8_search_first (const char *text, const char *search, int case_sen)
 989 {
 990     char *fold_text;
 991     char *deco_text;
 992     const char *match;
 993     const char *result = NULL;
 994     const char *m;
 995
 996     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 997     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 998
 999     match = deco_text;
1000     do
1001     {
1002         match = g_strstr_len (match, -1, search);
1003         if (match != NULL)
1004         {
1005             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1006                 !str_utf8_iscombiningmark (match + strlen (search)))
1007             {
1008                 result = text;
1009                 m = deco_text;
1010                 while (m < match)
1011                 {
1012                     str_utf8_cnext_noncomb_char (&m);
1013                     str_utf8_cnext_noncomb_char (&result);
1014                 }
1015             }
1016             else
1017                 str_utf8_cnext_char (&match);
1018         }
1019     }
1020     while (match != NULL && result == NULL);
1021
1022     g_free (deco_text);
1023     if (!case_sen)
1024         g_free (fold_text);
1025
1026     return result;
1027 }
1028
1029 /* --------------------------------------------------------------------------------------------- */
1030
1031 static const char *
1032 str_utf8_search_last (const char *text, const char *search, int case_sen)
1033 {
1034     char *fold_text;
1035     char *deco_text;
1036     char *match;
1037     const char *result = NULL;
1038     const char *m;
1039
1040     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
1041     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1042
1043     do
1044     {
1045         match = g_strrstr_len (deco_text, -1, search);
1046         if (match != NULL)
1047         {
1048             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1049                 !str_utf8_iscombiningmark (match + strlen (search)))
1050             {
1051                 result = text;
1052                 m = deco_text;
1053                 while (m < match)
1054                 {
1055                     str_utf8_cnext_noncomb_char (&m);
1056                     str_utf8_cnext_noncomb_char (&result);
1057                 }
1058             }
1059             else
1060                 match[0] = '\0';
1061         }
1062     }
1063     while (match != NULL && result == NULL);
1064
1065     g_free (deco_text);
1066     if (!case_sen)
1067         g_free (fold_text);
1068
1069     return result;
1070 }
1071
1072 /* --------------------------------------------------------------------------------------------- */
1073
1074 static char *
1075 str_utf8_normalize (const char *text)
1076 {
1077     GString *fixed;
1078     char *tmp;
1079     char *result;
1080     const char *start;
1081     const char *end;
1082
1083     /* g_utf8_normalize() is a heavyweight function, that converts UTF-8 into UCS-4,
1084      * does the normalization and then converts UCS-4 back into UTF-8.
1085      * Since file names are composed of ASCII characters in most cases, we can speed up
1086      * utf8 normalization by checking if the heavyweight Unicode normalization is actually
1087      * needed. Normalization of ASCII string is no-op.
1088      */
1089
1090     /* find out whether text is ASCII only */
1091     for (end = text; *end != '\0'; end++)
1092         if ((*end & 0x80) != 0)
1093         {
1094             /* found 2nd byte of utf8-encoded symbol */
1095             break;
1096         }
1097
1098     /* if text is ASCII-only, return copy, normalize otherwise */
1099     if (*end == '\0')
1100         return g_strndup (text, end - text);
1101
1102     fixed = g_string_sized_new (4);
1103
1104     start = text;
1105     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1106     {
1107         if (start != end)
1108         {
1109             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1110             g_string_append (fixed, tmp);
1111             g_free (tmp);
1112         }
1113         g_string_append_c (fixed, end[0]);
1114         start = end + 1;
1115     }
1116
1117     if (start == text)
1118     {
1119         result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1120         g_string_free (fixed, TRUE);
1121     }
1122     else
1123     {
1124         if (start[0] != '\0' && start != end)
1125         {
1126             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1127             g_string_append (fixed, tmp);
1128             g_free (tmp);
1129         }
1130         result = g_string_free (fixed, FALSE);
1131     }
1132
1133     return result;
1134 }
1135
1136 /* --------------------------------------------------------------------------------------------- */
1137
1138 static char *
1139 str_utf8_casefold_normalize (const char *text)
1140 {
1141     GString *fixed;
1142     char *tmp, *fold;
1143     char *result;
1144     const char *start;
1145     const char *end;
1146
1147     fixed = g_string_sized_new (4);
1148
1149     start = text;
1150     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1151     {
1152         if (start != end)
1153         {
1154             fold = g_utf8_casefold (start, end - start);
1155             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1156             g_string_append (fixed, tmp);
1157             g_free (tmp);
1158             g_free (fold);
1159         }
1160         g_string_append_c (fixed, end[0]);
1161         start = end + 1;
1162     }
1163
1164     if (start == text)
1165     {
1166         fold = g_utf8_casefold (text, -1);
1167         result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1168         g_free (fold);
1169         g_string_free (fixed, TRUE);
1170     }
1171     else
1172     {
1173         if (start[0] != '\0' && start != end)
1174         {
1175             fold = g_utf8_casefold (start, end - start);
1176             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1177             g_string_append (fixed, tmp);
1178             g_free (tmp);
1179             g_free (fold);
1180         }
1181         result = g_string_free (fixed, FALSE);
1182     }
1183
1184     return result;
1185 }
1186
1187 /* --------------------------------------------------------------------------------------------- */
1188
/**
 * Compare two strings after normalizing both with str_utf8_normalize().
 *
 * @return the strcmp() result for the two normalized strings
 */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_normalize (t1);
    char *norm2 = str_utf8_normalize (t2);
    int order;

    order = strcmp (norm1, norm2);

    /* the normalized copies are temporary */
    g_free (norm2);
    g_free (norm1);

    return order;
}
1205
1206 /* --------------------------------------------------------------------------------------------- */
1207
/**
 * Compare the common-length beginning of two strings after normalizing both
 * with str_utf8_normalize().
 *
 * Only as many bytes as the shorter normalized string has are compared, so
 * the result is 0 whenever one normalized string starts with the other.
 *
 * @return the strncmp() result for the two normalized strings
 */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_normalize (t1);
    char *norm2 = str_utf8_normalize (t2);
    size_t len1, len2, limit;
    int order;

    len1 = strlen (norm1);
    len2 = strlen (norm2);
    /* compare no further than the shorter string reaches */
    limit = (len1 < len2) ? len1 : len2;

    order = strncmp (norm1, norm2, limit);

    g_free (norm2);
    g_free (norm1);

    return order;
}
1227
1228 /* --------------------------------------------------------------------------------------------- */
1229
/**
 * Compare two strings ignoring case: both are passed through
 * str_utf8_casefold_normalize() first.
 *
 * @return the strcmp() result for the two folded and normalized strings
 */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *fold1 = str_utf8_casefold_normalize (t1);
    char *fold2 = str_utf8_casefold_normalize (t2);
    int order;

    order = strcmp (fold1, fold2);

    /* the folded copies are temporary */
    g_free (fold2);
    g_free (fold1);

    return order;
}
1246
1247 /* --------------------------------------------------------------------------------------------- */
1248
/**
 * Compare the common-length beginning of two strings ignoring case: both are
 * passed through str_utf8_casefold_normalize() first.
 *
 * Only as many bytes as the shorter processed string has are compared, so the
 * result is 0 whenever one processed string starts with the other.
 *
 * @return the strncmp() result for the two folded and normalized strings
 */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *fold1 = str_utf8_casefold_normalize (t1);
    char *fold2 = str_utf8_casefold_normalize (t2);
    size_t len1, len2, limit;
    int order;

    len1 = strlen (fold1);
    len2 = strlen (fold2);
    /* compare no further than the shorter string reaches */
    limit = (len1 < len2) ? len1 : len2;

    order = strncmp (fold1, fold2, limit);

    g_free (fold2);
    g_free (fold1);

    return order;
}
1268
1269 /* --------------------------------------------------------------------------------------------- */
1270
/**
 * Measure how much of @prefix matches the beginning of @text.
 *
 * Both strings are normalized with str_utf8_normalize() and then compared one
 * character (as delimited by str_utf8_cnext_char_safe()) at a time until the
 * first difference or the end of either string.
 *
 * @return number of bytes of the normalized prefix that matched
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text, *norm_prefix;
    const char *tcur, *pcur;
    int matched;

    norm_text = str_utf8_normalize (text);
    norm_prefix = str_utf8_normalize (prefix);

    tcur = norm_text;
    pcur = norm_prefix;

    while (tcur[0] != '\0' && pcur[0] != '\0')
    {
        const char *tnext = tcur;
        const char *pnext = pcur;

        str_utf8_cnext_char_safe (&tnext);
        str_utf8_cnext_char_safe (&pnext);

        /* two characters are equal only if they have the same byte length
         * and the same bytes */
        if ((tnext - tcur != pnext - pcur) || strncmp (tcur, pcur, tnext - tcur) != 0)
            break;

        tcur = tnext;
        pcur = pnext;
    }

    matched = pcur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1305
1306 /* --------------------------------------------------------------------------------------------- */
1307
/**
 * Measure how much of @prefix matches the beginning of @text, ignoring case.
 *
 * Both strings are passed through str_utf8_casefold_normalize() and then
 * compared one character (as delimited by str_utf8_cnext_char_safe()) at a
 * time until the first difference or the end of either string.
 *
 * @return number of bytes of the folded and normalized prefix that matched
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *fold_text, *fold_prefix;
    const char *tcur, *pcur;
    int matched;

    fold_text = str_utf8_casefold_normalize (text);
    fold_prefix = str_utf8_casefold_normalize (prefix);

    tcur = fold_text;
    pcur = fold_prefix;

    while (tcur[0] != '\0' && pcur[0] != '\0')
    {
        const char *tnext = tcur;
        const char *pnext = pcur;

        str_utf8_cnext_char_safe (&tnext);
        str_utf8_cnext_char_safe (&pnext);

        /* two characters are equal only if they have the same byte length
         * and the same bytes */
        if ((tnext - tcur != pnext - pcur) || strncmp (tcur, pcur, tnext - tcur) != 0)
            break;

        tcur = tnext;
        pcur = pnext;
    }

    matched = pcur - fold_prefix;

    g_free (fold_text);
    g_free (fold_prefix);

    return matched;
}
1342
1343 /* --------------------------------------------------------------------------------------------- */
1344
1345 static char *
1346 str_utf8_create_key_gen (const char *text, int case_sen,
1347                          gchar * (*keygen) (const gchar * text, gssize size))
1348 {
1349     char *result;
1350
1351     if (case_sen)
1352         result = str_utf8_normalize (text);
1353     else
1354     {
1355         gboolean dot;
1356         GString *fixed;
1357         const char *start, *end;
1358         char *fold, *key;
1359
1360         dot = text[0] == '.';
1361         fixed = g_string_sized_new (16);
1362
1363         if (!dot)
1364             start = text;
1365         else
1366         {
1367             start = text + 1;
1368             g_string_append_c (fixed, '.');
1369         }
1370
1371         while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1372         {
1373             if (start != end)
1374             {
1375                 fold = g_utf8_casefold (start, end - start);
1376                 key = keygen (fold, -1);
1377                 g_string_append (fixed, key);
1378                 g_free (key);
1379                 g_free (fold);
1380             }
1381             g_string_append_c (fixed, end[0]);
1382             start = end + 1;
1383         }
1384
1385         if (start == text)
1386         {
1387             fold = g_utf8_casefold (start, -1);
1388             result = keygen (fold, -1);
1389             g_free (fold);
1390             g_string_free (fixed, TRUE);
1391         }
1392         else if (dot && (start == text + 1))
1393         {
1394             fold = g_utf8_casefold (start, -1);
1395             key = keygen (fold, -1);
1396             g_string_append (fixed, key);
1397             g_free (key);
1398             g_free (fold);
1399             result = g_string_free (fixed, FALSE);
1400         }
1401         else
1402         {
1403             if (start[0] != '\0' && start != end)
1404             {
1405                 fold = g_utf8_casefold (start, end - start);
1406                 key = keygen (fold, -1);
1407                 g_string_append (fixed, key);
1408                 g_free (key);
1409                 g_free (fold);
1410             }
1411             result = g_string_free (fixed, FALSE);
1412         }
1413     }
1414     return result;
1415 }
1416
1417 /* --------------------------------------------------------------------------------------------- */
1418
1419 static char *
1420 str_utf8_create_key (const char *text, int case_sen)
1421 {
1422     return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1423 }
1424
1425 /* --------------------------------------------------------------------------------------------- */
1426
1427 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1428 static char *
1429 str_utf8_create_key_for_filename (const char *text, int case_sen)
1430 {
1431     return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);
1432 }
1433 #endif
1434
1435 /* --------------------------------------------------------------------------------------------- */
1436
/**
 * Compare two sort keys.
 *
 * @param t1       first key
 * @param t2       second key
 * @param case_sen ignored by this implementation
 *
 * @return the strcmp() result for the two keys
 */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    int order;

    order = strcmp (t1, t2);

    /* the keys are compared byte-wise whatever the case sensitivity is */
    (void) case_sen;

    return order;
}
1443
1444 /* --------------------------------------------------------------------------------------------- */
1445
/**
 * Release a sort key.
 *
 * @param key      key to release; it is handed to g_free()
 * @param case_sen ignored by this implementation
 */
static void
str_utf8_release_key (char *key, int case_sen)
{
    g_free (key);

    /* releasing does not depend on case sensitivity */
    (void) case_sen;
}
1452
1453 /* --------------------------------------------------------------------------------------------- */
1454 /*** public functions ****************************************************************************/
1455 /* --------------------------------------------------------------------------------------------- */
1456
1457 struct str_class
1458 str_utf8_init (void)
1459 {
1460     struct str_class result;
1461
1462     result.conv_gerror_message = str_utf8_conv_gerror_message;
1463     result.vfs_convert_to = str_utf8_vfs_convert_to;
1464     result.insert_replace_char = str_utf8_insert_replace_char;
1465     result.is_valid_string = str_utf8_is_valid_string;
1466     result.is_valid_char = str_utf8_is_valid_char;
1467     result.cnext_char = str_utf8_cnext_char;
1468     result.cprev_char = str_utf8_cprev_char;
1469     result.cnext_char_safe = str_utf8_cnext_char_safe;
1470     result.cprev_char_safe = str_utf8_cprev_char_safe;
1471     result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1472     result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1473     result.char_isspace = str_utf8_isspace;
1474     result.char_ispunct = str_utf8_ispunct;
1475     result.char_isalnum = str_utf8_isalnum;
1476     result.char_isdigit = str_utf8_isdigit;
1477     result.char_isprint = str_utf8_isprint;
1478     result.char_iscombiningmark = str_utf8_iscombiningmark;
1479     result.char_toupper = str_utf8_toupper;
1480     result.char_tolower = str_utf8_tolower;
1481     result.length = str_utf8_length;
1482     result.length2 = str_utf8_length2;
1483     result.length_noncomb = str_utf8_length_noncomb;
1484     result.fix_string = str_utf8_fix_string;
1485     result.term_form = str_utf8_term_form;
1486     result.fit_to_term = str_utf8_fit_to_term;
1487     result.term_trim = str_utf8_term_trim;
1488     result.term_width2 = str_utf8_term_width2;
1489     result.term_width1 = str_utf8_term_width1;
1490     result.term_char_width = str_utf8_term_char_width;
1491     result.term_substring = str_utf8_term_substring;
1492     result.trunc = str_utf8_trunc;
1493     result.offset_to_pos = str_utf8_offset_to_pos;
1494     result.column_to_pos = str_utf8_column_to_pos;
1495     result.create_search_needle = str_utf8_create_search_needle;
1496     result.release_search_needle = str_utf8_release_search_needle;
1497     result.search_first = str_utf8_search_first;
1498     result.search_last = str_utf8_search_last;
1499     result.compare = str_utf8_compare;
1500     result.ncompare = str_utf8_ncompare;
1501     result.casecmp = str_utf8_casecmp;
1502     result.ncasecmp = str_utf8_ncasecmp;
1503     result.prefix = str_utf8_prefix;
1504     result.caseprefix = str_utf8_caseprefix;
1505     result.create_key = str_utf8_create_key;
1506 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1507     /* case insensitive sort files in "a1 a2 a10" order */
1508     result.create_key_for_filename = str_utf8_create_key_for_filename;
1509 #else
1510     /* case insensitive sort files in "a1 a10 a2" order */
1511     result.create_key_for_filename = str_utf8_create_key;
1512 #endif
1513     result.key_collate = str_utf8_key_collate;
1514     result.release_key = str_utf8_release_key;
1515
1516     return result;
1517 }
1518
1519 /* --------------------------------------------------------------------------------------------- */