lib/strutil/strutilutf8.c

   1 /*
   2    UTF-8 strings utilities
   3
   4    Copyright (C) 2007-2016
   5    Free Software Foundation, Inc.
   6
   7    Written by:
   8    Rostislav Benes, 2007
   9
  10    This file is part of the Midnight Commander.
  11
  12    The Midnight Commander is free software: you can redistribute it
  13    and/or modify it under the terms of the GNU General Public License as
  14    published by the Free Software Foundation, either version 3 of the License,
  15    or (at your option) any later version.
  16
  17    The Midnight Commander is distributed in the hope that it will be useful,
  18    but WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20    GNU General Public License for more details.
  21
  22    You should have received a copy of the GNU General Public License
  23    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  24  */
  25
  26 #include <config.h>
  27
  28 #include <stdlib.h>
  29 #include <langinfo.h>
  30 #include <string.h>
  31
  32 #include "lib/global.h"
  33 #include "lib/strutil.h"
  34
/* UTF-8 handling in this file is built on the functions provided by GLib */
  36
  37 /*** global variables ****************************************************************************/
  38
  39 /*** file scope macro definitions ****************************************************************/
  40
  41 /*** file scope type declarations ****************************************************************/
  42
/* Working state shared by the utf8_tool_* helpers while they assemble a string
 * for the terminal. */
struct utf8_tool
{
    char *actual;               /* write position inside the output buffer */
    size_t remain;              /* bytes still available in the output buffer */
    const char *checked;        /* read position inside the source text */
    int ident;                  /* column counter, advanced by the width of copied/skipped characters */
    gboolean compose;           /* set when a combining mark was copied; callers then normalize the output */
};
  51
/* Result produced by str_utf8_make_make_term_form (). */
struct term_form
{
    char text[BUF_MEDIUM * 6];  /* NUL-terminated converted text */
    size_t width;               /* number of terminal columns counted for the text */
    gboolean compose;           /* TRUE when a combining mark was encountered */
};
  58
  59 /*** file scope variables ************************************************************************/
  60
/* UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER; written in place of invalid input bytes */
static const char replch[] = "\xEF\xBF\xBD";
  62
  63 /* --------------------------------------------------------------------------------------------- */
  64 /*** file scope functions ************************************************************************/
  65 /* --------------------------------------------------------------------------------------------- */
  66
  67 static gboolean
  68 str_unichar_iscombiningmark (gunichar uni)
  69 {
  70     GUnicodeType type;
  71
  72     type = g_unichar_type (uni);
  73     return (type == G_UNICODE_COMBINING_MARK)
  74         || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  75 }
  76
  77 /* --------------------------------------------------------------------------------------------- */
  78
  79 static void
  80 str_utf8_insert_replace_char (GString * buffer)
  81 {
  82     g_string_append (buffer, replch);
  83 }
  84
  85 /* --------------------------------------------------------------------------------------------- */
  86
  87 static int
  88 str_utf8_is_valid_string (const char *text)
  89 {
  90     return g_utf8_validate (text, -1, NULL);
  91 }
  92
  93 /* --------------------------------------------------------------------------------------------- */
  94
  95 static int
  96 str_utf8_is_valid_char (const char *ch, size_t size)
  97 {
  98     switch (g_utf8_get_char_validated (ch, size))
  99     {
 100     case (gunichar) (-2):
 101         return (-2);
 102     case (gunichar) (-1):
 103         return (-1);
 104     default:
 105         return 1;
 106     }
 107 }
 108
 109 /* --------------------------------------------------------------------------------------------- */
 110
 111 static void
 112 str_utf8_cnext_char (const char **text)
 113 {
 114     (*text) = g_utf8_next_char (*text);
 115 }
 116
 117 /* --------------------------------------------------------------------------------------------- */
 118
 119 static void
 120 str_utf8_cprev_char (const char **text)
 121 {
 122     (*text) = g_utf8_prev_char (*text);
 123 }
 124
 125 /* --------------------------------------------------------------------------------------------- */
 126
 127 static void
 128 str_utf8_cnext_char_safe (const char **text)
 129 {
 130     if (str_utf8_is_valid_char (*text, -1) == 1)
 131         (*text) = g_utf8_next_char (*text);
 132     else
 133         (*text)++;
 134 }
 135
 136 /* --------------------------------------------------------------------------------------------- */
 137
 138 static void
 139 str_utf8_cprev_char_safe (const char **text)
 140 {
 141     const char *result, *t;
 142
 143     result = g_utf8_prev_char (*text);
 144     t = result;
 145     str_utf8_cnext_char_safe (&t);
 146     if (t == *text)
 147         (*text) = result;
 148     else
 149         (*text)--;
 150 }
 151
 152 /* --------------------------------------------------------------------------------------------- */
 153
 154 static void
 155 str_utf8_fix_string (char *text)
 156 {
 157     while (text[0] != '\0')
 158     {
 159         gunichar uni;
 160
 161         uni = g_utf8_get_char_validated (text, -1);
 162         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 163             text = g_utf8_next_char (text);
 164         else
 165         {
 166             text[0] = '?';
 167             text++;
 168         }
 169     }
 170 }
 171
 172 /* --------------------------------------------------------------------------------------------- */
 173
 174 static int
 175 str_utf8_isspace (const char *text)
 176 {
 177     gunichar uni;
 178
 179     uni = g_utf8_get_char_validated (text, -1);
 180     return g_unichar_isspace (uni);
 181 }
 182
 183 /* --------------------------------------------------------------------------------------------- */
 184
 185 static int
 186 str_utf8_ispunct (const char *text)
 187 {
 188     gunichar uni;
 189
 190     uni = g_utf8_get_char_validated (text, -1);
 191     return g_unichar_ispunct (uni);
 192 }
 193
 194 /* --------------------------------------------------------------------------------------------- */
 195
 196 static int
 197 str_utf8_isalnum (const char *text)
 198 {
 199     gunichar uni;
 200
 201     uni = g_utf8_get_char_validated (text, -1);
 202     return g_unichar_isalnum (uni);
 203 }
 204
 205 /* --------------------------------------------------------------------------------------------- */
 206
 207 static int
 208 str_utf8_isdigit (const char *text)
 209 {
 210     gunichar uni;
 211
 212     uni = g_utf8_get_char_validated (text, -1);
 213     return g_unichar_isdigit (uni);
 214 }
 215
 216 /* --------------------------------------------------------------------------------------------- */
 217
 218 static int
 219 str_utf8_isprint (const char *ch)
 220 {
 221     gunichar uni;
 222
 223     uni = g_utf8_get_char_validated (ch, -1);
 224     return g_unichar_isprint (uni);
 225 }
 226
 227 /* --------------------------------------------------------------------------------------------- */
 228
 229 static gboolean
 230 str_utf8_iscombiningmark (const char *ch)
 231 {
 232     gunichar uni;
 233
 234     uni = g_utf8_get_char_validated (ch, -1);
 235     return str_unichar_iscombiningmark (uni);
 236 }
 237
 238 /* --------------------------------------------------------------------------------------------- */
 239
 240 static int
 241 str_utf8_cnext_noncomb_char (const char **text)
 242 {
 243     int count = 0;
 244
 245     while ((*text)[0] != '\0')
 246     {
 247         str_utf8_cnext_char_safe (text);
 248         count++;
 249         if (!str_utf8_iscombiningmark (*text))
 250             break;
 251     }
 252
 253     return count;
 254 }
 255
 256 /* --------------------------------------------------------------------------------------------- */
 257
 258 static int
 259 str_utf8_cprev_noncomb_char (const char **text, const char *begin)
 260 {
 261     int count = 0;
 262
 263     while ((*text) != begin)
 264     {
 265         str_utf8_cprev_char_safe (text);
 266         count++;
 267         if (!str_utf8_iscombiningmark (*text))
 268             break;
 269     }
 270
 271     return count;
 272 }
 273
 274 /* --------------------------------------------------------------------------------------------- */
 275
 276 static int
 277 str_utf8_toupper (const char *text, char **out, size_t * remain)
 278 {
 279     gunichar uni;
 280     size_t left;
 281
 282     uni = g_utf8_get_char_validated (text, -1);
 283     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 284         return 0;
 285
 286     uni = g_unichar_toupper (uni);
 287     left = g_unichar_to_utf8 (uni, NULL);
 288     if (left >= *remain)
 289         return 0;
 290
 291     left = g_unichar_to_utf8 (uni, *out);
 292     (*out) += left;
 293     (*remain) -= left;
 294     return 1;
 295 }
 296
 297 /* --------------------------------------------------------------------------------------------- */
 298
 299 static int
 300 str_utf8_tolower (const char *text, char **out, size_t * remain)
 301 {
 302     gunichar uni;
 303     size_t left;
 304
 305     uni = g_utf8_get_char_validated (text, -1);
 306     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 307         return 0;
 308
 309     uni = g_unichar_tolower (uni);
 310     left = g_unichar_to_utf8 (uni, NULL);
 311     if (left >= *remain)
 312         return 0;
 313
 314     left = g_unichar_to_utf8 (uni, *out);
 315     (*out) += left;
 316     (*remain) -= left;
 317     return 1;
 318 }
 319
 320 /* --------------------------------------------------------------------------------------------- */
 321
 322 static int
 323 str_utf8_length (const char *text)
 324 {
 325     int result = 0;
 326     const char *start;
 327     const char *end;
 328
 329     start = text;
 330     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 331     {
 332         if (start != end)
 333             result += g_utf8_strlen (start, end - start);
 334
 335         result++;
 336         start = end + 1;
 337     }
 338
 339     if (start == text)
 340         result = g_utf8_strlen (text, -1);
 341     else if (start[0] != '\0' && start != end)
 342         result += g_utf8_strlen (start, end - start);
 343
 344     return result;
 345 }
 346
 347 /* --------------------------------------------------------------------------------------------- */
 348
/* Count the characters of TEXT using SIZE as a byte budget.  The string may
 * contain invalid UTF-8: each byte rejected by g_utf8_validate () is counted
 * as one character as long as the budget is not exhausted. */
static int
str_utf8_length2 (const char *text, int size)
{
    int result = 0;
    const char *start;
    const char *end;

    start = text;
    /* each iteration handles one valid run followed by the offending byte at END */
    while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
    {
        if (start != end)
        {
            result += g_utf8_strlen (start, MIN (end - start, size));
            size -= end - start;
        }
        /* the invalid byte is counted only if the budget still covers it */
        result += (size > 0);
        size--;
        start = end + 1;
    }

    /* the loop body never ran: count directly, with SIZE passed on to g_utf8_strlen () */
    if (start == text)
        result = g_utf8_strlen (text, size);
    /* otherwise add the valid remainder after the last invalid byte */
    else if (start[0] != '\0' && start != end && size > 0)
        result += g_utf8_strlen (start, MIN (end - start, size));

    return result;
}
 376
 377 /* --------------------------------------------------------------------------------------------- */
 378
 379 static int
 380 str_utf8_length_noncomb (const char *text)
 381 {
 382     int result = 0;
 383     const char *t = text;
 384
 385     while (t[0] != '\0')
 386     {
 387         str_utf8_cnext_noncomb_char (&t);
 388         result++;
 389     }
 390
 391     return result;
 392 }
 393
 394 /* --------------------------------------------------------------------------------------------- */
 395
/* Disabled code, excluded from compilation by "#if 0".
 * It would skip one character of *STRING, reduce *LEFT by the number of
 * bytes skipped and append '?' to BUFFER. */
#if 0
static void
str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
{
    char *next;

    next = g_utf8_next_char (*string);
    (*left) -= next - (*string);
    (*string) = next;
    g_string_append_c (buffer, '?');
}
#endif
 408
 409 /* --------------------------------------------------------------------------------------------- */
 410
 411 static gchar *
 412 str_utf8_conv_gerror_message (GError * mcerror, const char *def_msg)
 413 {
 414     if (mcerror != NULL)
 415         return g_strdup (mcerror->message);
 416
 417     return g_strdup (def_msg != NULL ? def_msg : "");
 418 }
 419
 420 /* --------------------------------------------------------------------------------------------- */
 421
 422 static estr_t
 423 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
 424 {
 425     estr_t result = ESTR_SUCCESS;
 426
 427     if (coder == str_cnv_not_convert)
 428         g_string_append_len (buffer, string, size);
 429     else
 430         result = str_nconvert (coder, string, size, buffer);
 431
 432     return result;
 433 }
 434
 435 /* --------------------------------------------------------------------------------------------- */
 436 /* utility function, that makes string valid in utf8 and all characters printable
 437  * return width of string too */
 438
 439 static const struct term_form *
 440 str_utf8_make_make_term_form (const char *text, size_t length)
 441 {
 442     static struct term_form result;
 443     gunichar uni;
 444     size_t left;
 445     char *actual;
 446
 447     result.text[0] = '\0';
 448     result.width = 0;
 449     result.compose = FALSE;
 450     actual = result.text;
 451
 452     /* check if text start with combining character,
 453      * add space at begin in this case */
 454     if (length != 0 && text[0] != '\0')
 455     {
 456         uni = g_utf8_get_char_validated (text, -1);
 457         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
 458             && str_unichar_iscombiningmark (uni))
 459         {
 460             actual[0] = ' ';
 461             actual++;
 462             result.width++;
 463             result.compose = TRUE;
 464         }
 465     }
 466
 467     while (length != 0 && text[0] != '\0')
 468     {
 469         uni = g_utf8_get_char_validated (text, -1);
 470         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 471         {
 472             if (g_unichar_isprint (uni))
 473             {
 474                 left = g_unichar_to_utf8 (uni, actual);
 475                 actual += left;
 476                 if (str_unichar_iscombiningmark (uni))
 477                     result.compose = TRUE;
 478                 else
 479                 {
 480                     result.width++;
 481                     if (g_unichar_iswide (uni))
 482                         result.width++;
 483                 }
 484             }
 485             else
 486             {
 487                 actual[0] = '.';
 488                 actual++;
 489                 result.width++;
 490             }
 491             text = g_utf8_next_char (text);
 492         }
 493         else
 494         {
 495             text++;
 496             /*actual[0] = '?'; */
 497             memcpy (actual, replch, strlen (replch));
 498             actual += strlen (replch);
 499             result.width++;
 500         }
 501
 502         if (length != (size_t) (-1))
 503             length--;
 504     }
 505     actual[0] = '\0';
 506
 507     return &result;
 508 }
 509
 510 /* --------------------------------------------------------------------------------------------- */
 511
 512 static const char *
 513 str_utf8_term_form (const char *text)
 514 {
 515     static char result[BUF_MEDIUM * 6];
 516     const struct term_form *pre_form;
 517
 518     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 519     if (pre_form->compose)
 520     {
 521         char *composed;
 522
 523         composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 524         g_strlcpy (result, composed, sizeof (result));
 525         g_free (composed);
 526     }
 527     else
 528         g_strlcpy (result, pre_form->text, sizeof (result));
 529
 530     return result;
 531 }
 532
 533 /* --------------------------------------------------------------------------------------------- */
 534 /* utility function, that copies all characters from checked to actual */
 535
 536 static gboolean
 537 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
 538 {
 539     tool->compose = FALSE;
 540
 541     while (tool->checked[0] != '\0')
 542     {
 543         gunichar uni;
 544         size_t left;
 545
 546         uni = g_utf8_get_char (tool->checked);
 547         tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
 548         left = g_unichar_to_utf8 (uni, NULL);
 549         if (tool->remain <= left)
 550             return FALSE;
 551         left = g_unichar_to_utf8 (uni, tool->actual);
 552         tool->actual += left;
 553         tool->remain -= left;
 554         tool->checked = g_utf8_next_char (tool->checked);
 555     }
 556
 557     return TRUE;
 558 }
 559
 560 /* --------------------------------------------------------------------------------------------- */
 561 /* utility function, that copies characters from checked to actual until ident is
 562  * smaller than to_ident */
 563
 564 static gboolean
 565 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
 566 {
 567     tool->compose = FALSE;
 568
 569     while (tool->checked[0] != '\0')
 570     {
 571         gunichar uni;
 572         size_t left;
 573         int w = 0;
 574
 575         uni = g_utf8_get_char (tool->checked);
 576         if (str_unichar_iscombiningmark (uni))
 577             tool->compose = TRUE;
 578         else
 579         {
 580             w = 1;
 581             if (g_unichar_iswide (uni))
 582                 w++;
 583             if (tool->ident + w > to_ident)
 584                 return TRUE;
 585         }
 586
 587         left = g_unichar_to_utf8 (uni, NULL);
 588         if (tool->remain <= left)
 589             return FALSE;
 590         left = g_unichar_to_utf8 (uni, tool->actual);
 591         tool->actual += left;
 592         tool->remain -= left;
 593         tool->checked = g_utf8_next_char (tool->checked);
 594         tool->ident += w;
 595     }
 596
 597     return TRUE;
 598 }
 599
 600 /* --------------------------------------------------------------------------------------------- */
 601 /* utility function, adds count spaces to actual */
 602
 603 static int
 604 utf8_tool_insert_space (struct utf8_tool *tool, int count)
 605 {
 606     if (count <= 0)
 607         return 1;
 608     if (tool->remain <= (gsize) count)
 609         return 0;
 610
 611     memset (tool->actual, ' ', count);
 612     tool->actual += count;
 613     tool->remain -= count;
 614     return 1;
 615 }
 616
 617 /* --------------------------------------------------------------------------------------------- */
 618 /* utility function, adds one characters to actual */
 619
 620 static int
 621 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
 622 {
 623     if (tool->remain <= 1)
 624         return 0;
 625
 626     tool->actual[0] = ch;
 627     tool->actual++;
 628     tool->remain--;
 629     return 1;
 630 }
 631
 632 /* --------------------------------------------------------------------------------------------- */
 633 /* utility function, thah skips characters from checked until ident is greater or
 634  * equal to to_ident */
 635
 636 static gboolean
 637 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
 638 {
 639     gunichar uni;
 640
 641     while (to_ident > tool->ident && tool->checked[0] != '\0')
 642     {
 643         uni = g_utf8_get_char (tool->checked);
 644         if (!str_unichar_iscombiningmark (uni))
 645         {
 646             tool->ident++;
 647             if (g_unichar_iswide (uni))
 648                 tool->ident++;
 649         }
 650         tool->checked = g_utf8_next_char (tool->checked);
 651     }
 652
 653     uni = g_utf8_get_char (tool->checked);
 654     while (str_unichar_iscombiningmark (uni))
 655     {
 656         tool->checked = g_utf8_next_char (tool->checked);
 657         uni = g_utf8_get_char (tool->checked);
 658     }
 659
 660     return TRUE;
 661 }
 662
 663 /* --------------------------------------------------------------------------------------------- */
 664
 665 static void
 666 utf8_tool_compose (char *buffer, size_t size)
 667 {
 668     char *composed;
 669
 670     composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 671     g_strlcpy (buffer, composed, size);
 672     g_free (composed);
 673 }
 674
 675 /* --------------------------------------------------------------------------------------------- */
 676
 677 static const char *
 678 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
 679 {
 680     static char result[BUF_MEDIUM * 6];
 681     const struct term_form *pre_form;
 682     struct utf8_tool tool;
 683
 684     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 685     tool.checked = pre_form->text;
 686     tool.actual = result;
 687     tool.remain = sizeof (result);
 688     tool.compose = FALSE;
 689
 690     if (pre_form->width <= (gsize) width)
 691     {
 692         switch (HIDE_FIT (just_mode))
 693         {
 694         case J_CENTER_LEFT:
 695         case J_CENTER:
 696             tool.ident = (width - pre_form->width) / 2;
 697             break;
 698         case J_RIGHT:
 699             tool.ident = width - pre_form->width;
 700             break;
 701         default:
 702             tool.ident = 0;
 703             break;
 704         }
 705
 706         utf8_tool_insert_space (&tool, tool.ident);
 707         utf8_tool_copy_chars_to_end (&tool);
 708         utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
 709     }
 710     else if (IS_FIT (just_mode))
 711     {
 712         tool.ident = 0;
 713         utf8_tool_copy_chars_to (&tool, width / 2);
 714         utf8_tool_insert_char (&tool, '~');
 715
 716         tool.ident = 0;
 717         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 718         utf8_tool_copy_chars_to_end (&tool);
 719         utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
 720     }
 721     else
 722     {
 723         switch (HIDE_FIT (just_mode))
 724         {
 725         case J_CENTER:
 726             tool.ident = (width - pre_form->width) / 2;
 727             break;
 728         case J_RIGHT:
 729             tool.ident = width - pre_form->width;
 730             break;
 731         default:
 732             tool.ident = 0;
 733             break;
 734         }
 735
 736         utf8_tool_skip_chars_to (&tool, 0);
 737         utf8_tool_insert_space (&tool, tool.ident);
 738         utf8_tool_copy_chars_to (&tool, width);
 739         utf8_tool_insert_space (&tool, width - tool.ident);
 740     }
 741
 742     tool.actual[0] = '\0';
 743     if (tool.compose)
 744         utf8_tool_compose (result, sizeof (result));
 745     return result;
 746 }
 747
 748 /* --------------------------------------------------------------------------------------------- */
 749
 750 static const char *
 751 str_utf8_term_trim (const char *text, int width)
 752 {
 753     static char result[BUF_MEDIUM * 6];
 754     const struct term_form *pre_form;
 755     struct utf8_tool tool;
 756
 757     if (width < 1)
 758     {
 759         result[0] = '\0';
 760         return result;
 761     }
 762
 763     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 764
 765     tool.checked = pre_form->text;
 766     tool.actual = result;
 767     tool.remain = sizeof (result);
 768     tool.compose = FALSE;
 769
 770     if ((gsize) width >= pre_form->width)
 771         utf8_tool_copy_chars_to_end (&tool);
 772     else if (width <= 3)
 773     {
 774         memset (tool.actual, '.', width);
 775         tool.actual += width;
 776         tool.remain -= width;
 777     }
 778     else
 779     {
 780         memset (tool.actual, '.', 3);
 781         tool.actual += 3;
 782         tool.remain -= 3;
 783
 784         tool.ident = 0;
 785         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
 786         utf8_tool_copy_chars_to_end (&tool);
 787     }
 788
 789     tool.actual[0] = '\0';
 790     if (tool.compose)
 791         utf8_tool_compose (result, sizeof (result));
 792     return result;
 793 }
 794
 795 /* --------------------------------------------------------------------------------------------- */
 796
 797 static int
 798 str_utf8_term_width2 (const char *text, size_t length)
 799 {
 800     const struct term_form *result;
 801
 802     result = str_utf8_make_make_term_form (text, length);
 803     return result->width;
 804 }
 805
 806 /* --------------------------------------------------------------------------------------------- */
 807
 808 static int
 809 str_utf8_term_width1 (const char *text)
 810 {
 811     return str_utf8_term_width2 (text, (size_t) (-1));
 812 }
 813
 814 /* --------------------------------------------------------------------------------------------- */
 815
 816 static int
 817 str_utf8_term_char_width (const char *text)
 818 {
 819     gunichar uni;
 820
 821     uni = g_utf8_get_char_validated (text, -1);
 822     return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
 823 }
 824
 825 /* --------------------------------------------------------------------------------------------- */
 826
 827 static const char *
 828 str_utf8_term_substring (const char *text, int start, int width)
 829 {
 830     static char result[BUF_MEDIUM * 6];
 831     const struct term_form *pre_form;
 832     struct utf8_tool tool;
 833
 834     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 835
 836     tool.checked = pre_form->text;
 837     tool.actual = result;
 838     tool.remain = sizeof (result);
 839     tool.compose = FALSE;
 840
 841     tool.ident = -start;
 842     utf8_tool_skip_chars_to (&tool, 0);
 843     if (tool.ident < 0)
 844         tool.ident = 0;
 845     utf8_tool_insert_space (&tool, tool.ident);
 846
 847     utf8_tool_copy_chars_to (&tool, width);
 848     utf8_tool_insert_space (&tool, width - tool.ident);
 849
 850     tool.actual[0] = '\0';
 851     if (tool.compose)
 852         utf8_tool_compose (result, sizeof (result));
 853     return result;
 854 }
 855
 856 /* --------------------------------------------------------------------------------------------- */
 857
 858 static const char *
 859 str_utf8_trunc (const char *text, int width)
 860 {
 861     static char result[MC_MAXPATHLEN * 6 * 2];
 862     const struct term_form *pre_form;
 863     struct utf8_tool tool;
 864
 865     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 866
 867     tool.checked = pre_form->text;
 868     tool.actual = result;
 869     tool.remain = sizeof (result);
 870     tool.compose = FALSE;
 871
 872     if (pre_form->width <= (gsize) width)
 873         utf8_tool_copy_chars_to_end (&tool);
 874     else
 875     {
 876         tool.ident = 0;
 877         utf8_tool_copy_chars_to (&tool, width / 2);
 878         utf8_tool_insert_char (&tool, '~');
 879
 880         tool.ident = 0;
 881         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 882         utf8_tool_copy_chars_to_end (&tool);
 883     }
 884
 885     tool.actual[0] = '\0';
 886     if (tool.compose)
 887         utf8_tool_compose (result, sizeof (result));
 888     return result;
 889 }
 890
 891 /* --------------------------------------------------------------------------------------------- */
 892
 893 static int
 894 str_utf8_offset_to_pos (const char *text, size_t length)
 895 {
 896     if (str_utf8_is_valid_string (text))
 897         return g_utf8_offset_to_pointer (text, length) - text;
 898     else
 899     {
 900         int result;
 901         GString *buffer;
 902
 903         buffer = g_string_new (text);
 904         str_utf8_fix_string (buffer->str);
 905         result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
 906         g_string_free (buffer, TRUE);
 907         return result;
 908     }
 909 }
 910
 911 /* --------------------------------------------------------------------------------------------- */
 912
 913 static int
 914 str_utf8_column_to_pos (const char *text, size_t pos)
 915 {
 916     int result = 0;
 917     int width = 0;
 918
 919     while (text[0] != '\0')
 920     {
 921         gunichar uni;
 922
 923         uni = g_utf8_get_char_validated (text, 6);
 924         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 925         {
 926             if (g_unichar_isprint (uni))
 927             {
 928                 if (!str_unichar_iscombiningmark (uni))
 929                 {
 930                     width++;
 931                     if (g_unichar_iswide (uni))
 932                         width++;
 933                 }
 934             }
 935             else
 936             {
 937                 width++;
 938             }
 939             text = g_utf8_next_char (text);
 940         }
 941         else
 942         {
 943             text++;
 944             width++;
 945         }
 946
 947         if ((gsize) width > pos)
 948             return result;
 949
 950         result++;
 951     }
 952
 953     return result;
 954 }
 955
 956 /* --------------------------------------------------------------------------------------------- */
 957
 958 static char *
 959 str_utf8_create_search_needle (const char *needle, int case_sen)
 960 {
 961     char *fold, *result;
 962
 963     if (needle == NULL)
 964         return NULL;
 965
 966     if (case_sen)
 967         return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
 968
 969
 970     fold = g_utf8_casefold (needle, -1);
 971     result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
 972     g_free (fold);
 973     return result;
 974 }
 975
 976 /* --------------------------------------------------------------------------------------------- */
 977
 978 static void
 979 str_utf8_release_search_needle (char *needle, int case_sen)
 980 {
 981     (void) case_sen;
 982     g_free (needle);
 983 }
 984
 985 /* --------------------------------------------------------------------------------------------- */
 986
 987 static const char *
 988 str_utf8_search_first (const char *text, const char *search, int case_sen)
 989 {
 990     char *fold_text;
 991     char *deco_text;
 992     const char *match;
 993     const char *result = NULL;
 994     const char *m;
 995
 996     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 997     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 998
 999     match = deco_text;
1000     do
1001     {
1002         match = g_strstr_len (match, -1, search);
1003         if (match != NULL)
1004         {
1005             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1006                 !str_utf8_iscombiningmark (match + strlen (search)))
1007             {
1008                 result = text;
1009                 m = deco_text;
1010                 while (m < match)
1011                 {
1012                     str_utf8_cnext_noncomb_char (&m);
1013                     str_utf8_cnext_noncomb_char (&result);
1014                 }
1015             }
1016             else
1017                 str_utf8_cnext_char (&match);
1018         }
1019     }
1020     while (match != NULL && result == NULL);
1021
1022     g_free (deco_text);
1023     if (!case_sen)
1024         g_free (fold_text);
1025
1026     return result;
1027 }
1028
1029 /* --------------------------------------------------------------------------------------------- */
1030
1031 static const char *
1032 str_utf8_search_last (const char *text, const char *search, int case_sen)
1033 {
1034     char *fold_text;
1035     char *deco_text;
1036     char *match;
1037     const char *result = NULL;
1038     const char *m;
1039
1040     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
1041     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1042
1043     do
1044     {
1045         match = g_strrstr_len (deco_text, -1, search);
1046         if (match != NULL)
1047         {
1048             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1049                 !str_utf8_iscombiningmark (match + strlen (search)))
1050             {
1051                 result = text;
1052                 m = deco_text;
1053                 while (m < match)
1054                 {
1055                     str_utf8_cnext_noncomb_char (&m);
1056                     str_utf8_cnext_noncomb_char (&result);
1057                 }
1058             }
1059             else
1060                 match[0] = '\0';
1061         }
1062     }
1063     while (match != NULL && result == NULL);
1064
1065     g_free (deco_text);
1066     if (!case_sen)
1067         g_free (fold_text);
1068
1069     return result;
1070 }
1071
1072 /* --------------------------------------------------------------------------------------------- */
1073
1074 static char *
1075 str_utf8_normalize (const char *text)
1076 {
1077     GString *fixed;
1078     char *tmp;
1079     char *result;
1080     const char *start;
1081     const char *end;
1082
1083     fixed = g_string_sized_new (4);
1084
1085     start = text;
1086     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1087     {
1088         if (start != end)
1089         {
1090             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1091             g_string_append (fixed, tmp);
1092             g_free (tmp);
1093         }
1094         g_string_append_c (fixed, end[0]);
1095         start = end + 1;
1096     }
1097
1098     if (start == text)
1099     {
1100         result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1101         g_string_free (fixed, TRUE);
1102     }
1103     else
1104     {
1105         if (start[0] != '\0' && start != end)
1106         {
1107             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1108             g_string_append (fixed, tmp);
1109             g_free (tmp);
1110         }
1111         result = g_string_free (fixed, FALSE);
1112     }
1113
1114     return result;
1115 }
1116
1117 /* --------------------------------------------------------------------------------------------- */
1118
1119 static char *
1120 str_utf8_casefold_normalize (const char *text)
1121 {
1122     GString *fixed;
1123     char *tmp, *fold;
1124     char *result;
1125     const char *start;
1126     const char *end;
1127
1128     fixed = g_string_sized_new (4);
1129
1130     start = text;
1131     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1132     {
1133         if (start != end)
1134         {
1135             fold = g_utf8_casefold (start, end - start);
1136             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1137             g_string_append (fixed, tmp);
1138             g_free (tmp);
1139             g_free (fold);
1140         }
1141         g_string_append_c (fixed, end[0]);
1142         start = end + 1;
1143     }
1144
1145     if (start == text)
1146     {
1147         fold = g_utf8_casefold (text, -1);
1148         result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1149         g_free (fold);
1150         g_string_free (fixed, TRUE);
1151     }
1152     else
1153     {
1154         if (start[0] != '\0' && start != end)
1155         {
1156             fold = g_utf8_casefold (start, end - start);
1157             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1158             g_string_append (fixed, tmp);
1159             g_free (tmp);
1160             g_free (fold);
1161         }
1162         result = g_string_free (fixed, FALSE);
1163     }
1164
1165     return result;
1166 }
1167
1168 /* --------------------------------------------------------------------------------------------- */
1169
/**
 * Compare two strings byte-wise after normalizing both with str_utf8_normalize().
 *
 * @param t1 first string
 * @param t2 second string
 *
 * @return the strcmp() result for the two normalized strings
 */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_normalize (t1);
    char *norm2 = str_utf8_normalize (t2);
    const int order = strcmp (norm1, norm2);

    g_free (norm1);
    g_free (norm2);

    return order;
}
1186
1187 /* --------------------------------------------------------------------------------------------- */
1188
/**
 * Compare two strings after normalizing both with str_utf8_normalize(), looking
 * only at as many bytes as the shorter normalized string has.
 *
 * Consequently the result is 0 whenever one normalized string is a prefix of
 * the other.
 *
 * @param t1 first string
 * @param t2 second string
 *
 * @return the strncmp() result over the common length
 */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *norm1, *norm2;
    size_t len1, len2, span;
    int order;

    norm1 = str_utf8_normalize (t1);
    norm2 = str_utf8_normalize (t2);

    len1 = strlen (norm1);
    len2 = strlen (norm2);
    /* restrict the comparison to the shorter of the two */
    span = (len1 < len2) ? len1 : len2;
    order = strncmp (norm1, norm2, span);

    g_free (norm1);
    g_free (norm2);

    return order;
}
1208
1209 /* --------------------------------------------------------------------------------------------- */
1210
/**
 * Compare two strings byte-wise after transforming both with
 * str_utf8_casefold_normalize().
 *
 * @param t1 first string
 * @param t2 second string
 *
 * @return the strcmp() result for the two transformed strings
 */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *folded1 = str_utf8_casefold_normalize (t1);
    char *folded2 = str_utf8_casefold_normalize (t2);
    const int order = strcmp (folded1, folded2);

    g_free (folded1);
    g_free (folded2);

    return order;
}
1227
1228 /* --------------------------------------------------------------------------------------------- */
1229
/**
 * Compare two strings after transforming both with
 * str_utf8_casefold_normalize(), looking only at as many bytes as the shorter
 * transformed string has.
 *
 * Consequently the result is 0 whenever one transformed string is a prefix of
 * the other.
 *
 * @param t1 first string
 * @param t2 second string
 *
 * @return the strncmp() result over the common length
 */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *folded1, *folded2;
    size_t len1, len2, span;
    int order;

    folded1 = str_utf8_casefold_normalize (t1);
    folded2 = str_utf8_casefold_normalize (t2);

    len1 = strlen (folded1);
    len2 = strlen (folded2);
    /* restrict the comparison to the shorter of the two */
    span = (len1 < len2) ? len1 : len2;
    order = strncmp (folded1, folded2, span);

    g_free (folded1);
    g_free (folded2);

    return order;
}
1249
1250 /* --------------------------------------------------------------------------------------------- */
1251
/**
 * Measure how much of prefix matches the beginning of text.
 *
 * Both strings are normalized with str_utf8_normalize() and then compared one
 * character at a time (stepping with str_utf8_cnext_char_safe()) until either
 * string ends or the characters differ.
 *
 * @param text   string to test
 * @param prefix expected beginning
 *
 * @return number of bytes of the normalized prefix that matched
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *text_cur = norm_text;
    const char *prefix_cur = norm_prefix;
    const char *text_next = norm_text;
    const char *prefix_next = norm_prefix;
    int matched;

    while (text_cur[0] != '\0' && prefix_cur[0] != '\0')
    {
        size_t text_char_len, prefix_char_len;

        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        text_char_len = text_next - text_cur;
        prefix_char_len = prefix_next - prefix_cur;

        /* characters of different byte length or different content end the match */
        if (text_char_len != prefix_char_len
            || strncmp (text_cur, prefix_cur, text_char_len) != 0)
            break;

        text_cur = text_next;
        prefix_cur = prefix_next;
    }

    matched = prefix_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1286
1287 /* --------------------------------------------------------------------------------------------- */
1288
/**
 * Measure, ignoring case, how much of prefix matches the beginning of text.
 *
 * Both strings are transformed with str_utf8_casefold_normalize() and then
 * compared one character at a time (stepping with str_utf8_cnext_char_safe())
 * until either string ends or the characters differ.
 *
 * @param text   string to test
 * @param prefix expected beginning
 *
 * @return number of bytes of the transformed prefix that matched
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *folded_text = str_utf8_casefold_normalize (text);
    char *folded_prefix = str_utf8_casefold_normalize (prefix);
    const char *text_cur = folded_text;
    const char *prefix_cur = folded_prefix;
    const char *text_next = folded_text;
    const char *prefix_next = folded_prefix;
    int matched;

    while (text_cur[0] != '\0' && prefix_cur[0] != '\0')
    {
        size_t text_char_len, prefix_char_len;

        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        text_char_len = text_next - text_cur;
        prefix_char_len = prefix_next - prefix_cur;

        /* characters of different byte length or different content end the match */
        if (text_char_len != prefix_char_len
            || strncmp (text_cur, prefix_cur, text_char_len) != 0)
            break;

        text_cur = text_next;
        prefix_cur = prefix_next;
    }

    matched = prefix_cur - folded_prefix;

    g_free (folded_text);
    g_free (folded_prefix);

    return matched;
}
1323
1324 /* --------------------------------------------------------------------------------------------- */
1325
1326 static char *
1327 str_utf8_create_key_gen (const char *text, int case_sen,
1328                          gchar * (*keygen) (const gchar * text, gssize size))
1329 {
1330     char *result;
1331
1332     if (case_sen)
1333         result = str_utf8_normalize (text);
1334     else
1335     {
1336         gboolean dot;
1337         GString *fixed;
1338         const char *start, *end;
1339         char *fold, *key;
1340
1341         dot = text[0] == '.';
1342         fixed = g_string_sized_new (16);
1343
1344         if (!dot)
1345             start = text;
1346         else
1347         {
1348             start = text + 1;
1349             g_string_append_c (fixed, '.');
1350         }
1351
1352         while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1353         {
1354             if (start != end)
1355             {
1356                 fold = g_utf8_casefold (start, end - start);
1357                 key = keygen (fold, -1);
1358                 g_string_append (fixed, key);
1359                 g_free (key);
1360                 g_free (fold);
1361             }
1362             g_string_append_c (fixed, end[0]);
1363             start = end + 1;
1364         }
1365
1366         if (start == text)
1367         {
1368             fold = g_utf8_casefold (start, -1);
1369             result = keygen (fold, -1);
1370             g_free (fold);
1371             g_string_free (fixed, TRUE);
1372         }
1373         else if (dot && (start == text + 1))
1374         {
1375             fold = g_utf8_casefold (start, -1);
1376             key = keygen (fold, -1);
1377             g_string_append (fixed, key);
1378             g_free (key);
1379             g_free (fold);
1380             result = g_string_free (fixed, FALSE);
1381         }
1382         else
1383         {
1384             if (start[0] != '\0' && start != end)
1385             {
1386                 fold = g_utf8_casefold (start, end - start);
1387                 key = keygen (fold, -1);
1388                 g_string_append (fixed, key);
1389                 g_free (key);
1390                 g_free (fold);
1391             }
1392             result = g_string_free (fixed, FALSE);
1393         }
1394     }
1395     return result;
1396 }
1397
1398 /* --------------------------------------------------------------------------------------------- */
1399
1400 static char *
1401 str_utf8_create_key (const char *text, int case_sen)
1402 {
1403     return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1404 }
1405
1406 /* --------------------------------------------------------------------------------------------- */
1407
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/**
 * Create a comparison key using g_utf8_collate_key_for_filename() as the key
 * generator.  Compiled only when MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME is
 * defined.
 *
 * @param text     string to create the key for
 * @param case_sen non-zero for a case-sensitive key
 *
 * @return the key produced by str_utf8_create_key_gen()
 */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
1415
1416 /* --------------------------------------------------------------------------------------------- */
1417
/**
 * Compare two keys byte-wise.
 *
 * @param t1       first key
 * @param t2       second key
 * @param case_sen unused; the keys are compared with strcmp() regardless
 *
 * @return the strcmp() result for the two keys
 */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    const int order = strcmp (t1, t2);

    (void) case_sen;

    return order;
}
1424
1425 /* --------------------------------------------------------------------------------------------- */
1426
/**
 * Release a comparison key.
 *
 * @param key      buffer to release; it is handed to g_free()
 * @param case_sen unused, present only to match the release_key hook signature
 */
static void
str_utf8_release_key (char *key, int case_sen)
{
    g_free (key);

    /* the case-sensitivity flag has no influence on how the key is released */
    (void) case_sen;
}
1433
1434 /* --------------------------------------------------------------------------------------------- */
1435 /*** public functions ****************************************************************************/
1436 /* --------------------------------------------------------------------------------------------- */
1437
/**
 * Build the string-handling dispatch table for UTF-8.
 *
 * Every member assigned below is pointed at the matching str_utf8_* function
 * of this file.
 *
 * @return the filled-in struct str_class, returned by value
 */
struct str_class
str_utf8_init (void)
{
    struct str_class result;

    result.conv_gerror_message = str_utf8_conv_gerror_message;
    result.vfs_convert_to = str_utf8_vfs_convert_to;
    result.insert_replace_char = str_utf8_insert_replace_char;
    result.is_valid_string = str_utf8_is_valid_string;
    result.is_valid_char = str_utf8_is_valid_char;
    result.cnext_char = str_utf8_cnext_char;
    result.cprev_char = str_utf8_cprev_char;
    result.cnext_char_safe = str_utf8_cnext_char_safe;
    result.cprev_char_safe = str_utf8_cprev_char_safe;
    result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
    result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
    result.char_isspace = str_utf8_isspace;
    result.char_ispunct = str_utf8_ispunct;
    result.char_isalnum = str_utf8_isalnum;
    result.char_isdigit = str_utf8_isdigit;
    result.char_isprint = str_utf8_isprint;
    result.char_iscombiningmark = str_utf8_iscombiningmark;
    result.char_toupper = str_utf8_toupper;
    result.char_tolower = str_utf8_tolower;
    result.length = str_utf8_length;
    result.length2 = str_utf8_length2;
    result.length_noncomb = str_utf8_length_noncomb;
    result.fix_string = str_utf8_fix_string;
    result.term_form = str_utf8_term_form;
    result.fit_to_term = str_utf8_fit_to_term;
    result.term_trim = str_utf8_term_trim;
    result.term_width2 = str_utf8_term_width2;
    result.term_width1 = str_utf8_term_width1;
    result.term_char_width = str_utf8_term_char_width;
    result.term_substring = str_utf8_term_substring;
    result.trunc = str_utf8_trunc;
    result.offset_to_pos = str_utf8_offset_to_pos;
    result.column_to_pos = str_utf8_column_to_pos;

    /* searching */
    result.create_search_needle = str_utf8_create_search_needle;
    result.release_search_needle = str_utf8_release_search_needle;
    result.search_first = str_utf8_search_first;
    result.search_last = str_utf8_search_last;

    /* comparison and prefix matching */
    result.compare = str_utf8_compare;
    result.ncompare = str_utf8_ncompare;
    result.casecmp = str_utf8_casecmp;
    result.ncasecmp = str_utf8_ncasecmp;
    result.prefix = str_utf8_prefix;
    result.caseprefix = str_utf8_caseprefix;

    /* comparison keys; the file name variant is chosen at compile time */
    result.create_key = str_utf8_create_key;
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
    /* case insensitive sort files in "a1 a2 a10" order */
    result.create_key_for_filename = str_utf8_create_key_for_filename;
#else
    /* case insensitive sort files in "a1 a10 a2" order */
    result.create_key_for_filename = str_utf8_create_key;
#endif
    result.key_collate = str_utf8_key_collate;
    result.release_key = str_utf8_release_key;

    return result;
}
1499
1500 /* --------------------------------------------------------------------------------------------- */