lib/strutil/strutilutf8.c

   1 /*
   2    UTF-8 strings utilities
   3
   4    Copyright (C) 2007-2024
   5    Free Software Foundation, Inc.
   6
   7    Written by:
   8    Rostislav Benes, 2007
   9
  10    This file is part of the Midnight Commander.
  11
  12    The Midnight Commander is free software: you can redistribute it
  13    and/or modify it under the terms of the GNU General Public License as
  14    published by the Free Software Foundation, either version 3 of the License,
  15    or (at your option) any later version.
  16
  17    The Midnight Commander is distributed in the hope that it will be useful,
  18    but WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20    GNU General Public License for more details.
  21
  22    You should have received a copy of the GNU General Public License
  23    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  24  */
  25
  26 #include <config.h>
  27
  28 #include <stdlib.h>
  29 #include <langinfo.h>
  30 #include <limits.h>             /* MB_LEN_MAX */
  31 #include <string.h>
  32
  33 #include "lib/global.h"
  34 #include "lib/strutil.h"
  35
/* UTF-8 handling in this file is built on the GLib g_utf8_* and g_unichar_* functions */
  37
  38 /*** global variables ****************************************************************************/
  39
  40 /*** file scope macro definitions ****************************************************************/
  41
  42 /*** file scope type declarations ****************************************************************/
  43
/* Cursor state shared by the utf8_tool_* helpers while they assemble an output string */
struct utf8_tool
{
    /* write position inside the caller's output buffer */
    char *actual;
    /* bytes still available at 'actual'; the insert/copy helpers refuse a write
     * unless at least one byte would remain afterwards */
    size_t remain;
    /* read position inside the source text */
    const char *checked;
    /* column bookkeeping: advanced by the skip/copy_chars_to helpers for every
     * non-combining character; callers also use it to hold the left padding */
    int ident;
    /* set by the copy helpers when a combining mark has been copied */
    gboolean compose;
};
  52
/* Result of str_utf8_make_make_term_form(): a displayable copy of a string */
struct term_form
{
    /* NUL-terminated text after replacing invalid and non-printable input */
    char text[BUF_MEDIUM * MB_LEN_MAX];
    /* number of terminal columns occupied by 'text' */
    size_t width;
    /* TRUE when 'text' contains at least one combining mark */
    gboolean compose;
};
  59
  60 /*** forward declarations (file scope functions) *************************************************/
  61
  62 /*** file scope variables ************************************************************************/
  63
/* UTF-8 byte sequence of U+FFFD (REPLACEMENT CHARACTER), emitted for invalid input */
static const char replch[] = "\xEF\xBF\xBD";
  65
  66 /* --------------------------------------------------------------------------------------------- */
  67 /*** file scope functions ************************************************************************/
  68 /* --------------------------------------------------------------------------------------------- */
  69
  70 static gboolean
  71 str_unichar_iscombiningmark (gunichar uni)
  72 {
  73     GUnicodeType type;
  74
  75     type = g_unichar_type (uni);
  76     return (type == G_UNICODE_SPACING_MARK)
  77         || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  78 }
  79
  80 /* --------------------------------------------------------------------------------------------- */
  81
  82 static void
  83 str_utf8_insert_replace_char (GString *buffer)
  84 {
  85     g_string_append (buffer, replch);
  86 }
  87
  88 /* --------------------------------------------------------------------------------------------- */
  89
  90 static gboolean
  91 str_utf8_is_valid_string (const char *text)
  92 {
  93     return g_utf8_validate (text, -1, NULL);
  94 }
  95
  96 /* --------------------------------------------------------------------------------------------- */
  97
  98 static int
  99 str_utf8_is_valid_char (const char *ch, size_t size)
 100 {
 101     switch (g_utf8_get_char_validated (ch, size))
 102     {
 103     case (gunichar) (-2):
 104         return (-2);
 105     case (gunichar) (-1):
 106         return (-1);
 107     default:
 108         return 1;
 109     }
 110 }
 111
 112 /* --------------------------------------------------------------------------------------------- */
 113
 114 static void
 115 str_utf8_cnext_char (const char **text)
 116 {
 117     (*text) = g_utf8_next_char (*text);
 118 }
 119
 120 /* --------------------------------------------------------------------------------------------- */
 121
 122 static void
 123 str_utf8_cprev_char (const char **text)
 124 {
 125     (*text) = g_utf8_prev_char (*text);
 126 }
 127
 128 /* --------------------------------------------------------------------------------------------- */
 129
 130 static void
 131 str_utf8_cnext_char_safe (const char **text)
 132 {
 133     if (str_utf8_is_valid_char (*text, -1) == 1)
 134         (*text) = g_utf8_next_char (*text);
 135     else
 136         (*text)++;
 137 }
 138
 139 /* --------------------------------------------------------------------------------------------- */
 140
 141 static void
 142 str_utf8_cprev_char_safe (const char **text)
 143 {
 144     const char *result, *t;
 145
 146     result = g_utf8_prev_char (*text);
 147     t = result;
 148     str_utf8_cnext_char_safe (&t);
 149     if (t == *text)
 150         (*text) = result;
 151     else
 152         (*text)--;
 153 }
 154
 155 /* --------------------------------------------------------------------------------------------- */
 156
 157 static void
 158 str_utf8_fix_string (char *text)
 159 {
 160     while (text[0] != '\0')
 161     {
 162         gunichar uni;
 163
 164         uni = g_utf8_get_char_validated (text, -1);
 165         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 166             text = g_utf8_next_char (text);
 167         else
 168         {
 169             text[0] = '?';
 170             text++;
 171         }
 172     }
 173 }
 174
 175 /* --------------------------------------------------------------------------------------------- */
 176
 177 static gboolean
 178 str_utf8_isspace (const char *text)
 179 {
 180     gunichar uni;
 181
 182     uni = g_utf8_get_char_validated (text, -1);
 183     return g_unichar_isspace (uni);
 184 }
 185
 186 /* --------------------------------------------------------------------------------------------- */
 187
 188 static gboolean
 189 str_utf8_ispunct (const char *text)
 190 {
 191     gunichar uni;
 192
 193     uni = g_utf8_get_char_validated (text, -1);
 194     return g_unichar_ispunct (uni);
 195 }
 196
 197 /* --------------------------------------------------------------------------------------------- */
 198
 199 static gboolean
 200 str_utf8_isalnum (const char *text)
 201 {
 202     gunichar uni;
 203
 204     uni = g_utf8_get_char_validated (text, -1);
 205     return g_unichar_isalnum (uni);
 206 }
 207
 208 /* --------------------------------------------------------------------------------------------- */
 209
 210 static gboolean
 211 str_utf8_isdigit (const char *text)
 212 {
 213     gunichar uni;
 214
 215     uni = g_utf8_get_char_validated (text, -1);
 216     return g_unichar_isdigit (uni);
 217 }
 218
 219 /* --------------------------------------------------------------------------------------------- */
 220
 221 static gboolean
 222 str_utf8_isprint (const char *ch)
 223 {
 224     gunichar uni;
 225
 226     uni = g_utf8_get_char_validated (ch, -1);
 227     return g_unichar_isprint (uni);
 228 }
 229
 230 /* --------------------------------------------------------------------------------------------- */
 231
 232 static gboolean
 233 str_utf8_iscombiningmark (const char *ch)
 234 {
 235     gunichar uni;
 236
 237     uni = g_utf8_get_char_validated (ch, -1);
 238     return str_unichar_iscombiningmark (uni);
 239 }
 240
 241 /* --------------------------------------------------------------------------------------------- */
 242
 243 static int
 244 str_utf8_cnext_noncomb_char (const char **text)
 245 {
 246     int count = 0;
 247
 248     while ((*text)[0] != '\0')
 249     {
 250         str_utf8_cnext_char_safe (text);
 251         count++;
 252         if (!str_utf8_iscombiningmark (*text))
 253             break;
 254     }
 255
 256     return count;
 257 }
 258
 259 /* --------------------------------------------------------------------------------------------- */
 260
 261 static int
 262 str_utf8_cprev_noncomb_char (const char **text, const char *begin)
 263 {
 264     int count = 0;
 265
 266     while ((*text) != begin)
 267     {
 268         str_utf8_cprev_char_safe (text);
 269         count++;
 270         if (!str_utf8_iscombiningmark (*text))
 271             break;
 272     }
 273
 274     return count;
 275 }
 276
 277 /* --------------------------------------------------------------------------------------------- */
 278
 279 static gboolean
 280 str_utf8_toupper (const char *text, char **out, size_t *remain)
 281 {
 282     gunichar uni;
 283     size_t left;
 284
 285     uni = g_utf8_get_char_validated (text, -1);
 286     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 287         return FALSE;
 288
 289     uni = g_unichar_toupper (uni);
 290     left = g_unichar_to_utf8 (uni, NULL);
 291     if (left >= *remain)
 292         return FALSE;
 293
 294     left = g_unichar_to_utf8 (uni, *out);
 295     (*out) += left;
 296     (*remain) -= left;
 297     return TRUE;
 298 }
 299
 300 /* --------------------------------------------------------------------------------------------- */
 301
 302 static gboolean
 303 str_utf8_tolower (const char *text, char **out, size_t *remain)
 304 {
 305     gunichar uni;
 306     size_t left;
 307
 308     uni = g_utf8_get_char_validated (text, -1);
 309     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 310         return FALSE;
 311
 312     uni = g_unichar_tolower (uni);
 313     left = g_unichar_to_utf8 (uni, NULL);
 314     if (left >= *remain)
 315         return FALSE;
 316
 317     left = g_unichar_to_utf8 (uni, *out);
 318     (*out) += left;
 319     (*remain) -= left;
 320     return TRUE;
 321 }
 322
 323 /* --------------------------------------------------------------------------------------------- */
 324
 325 static int
 326 str_utf8_length (const char *text)
 327 {
 328     int result = 0;
 329     const char *start;
 330     const char *end;
 331
 332     start = text;
 333     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 334     {
 335         if (start != end)
 336             result += g_utf8_strlen (start, end - start);
 337
 338         result++;
 339         start = end + 1;
 340     }
 341
 342     if (start == text)
 343         result = g_utf8_strlen (text, -1);
 344     else if (start[0] != '\0' && start != end)
 345         result += g_utf8_strlen (start, end - start);
 346
 347     return result;
 348 }
 349
 350 /* --------------------------------------------------------------------------------------------- */
 351
/* Count the characters of 'text' like str_utf8_length(), but with 'size' used
 * as a byte budget: it is handed to g_utf8_strlen() as the maximum number of
 * bytes to examine and is decreased as the string is consumed.
 * Each byte that fails validation counts as one character. */
static int
str_utf8_length2 (const char *text, int size)
{
    int result = 0;
    const char *start;
    const char *end;

    start = text;
    /* each iteration consumes one valid run [start, end) plus the invalid byte at 'end' */
    while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
    {
        if (start != end)
        {
            result += g_utf8_strlen (start, MIN (end - start, size));
            size -= end - start;
        }
        /* count the invalid byte only if the budget was not used up by the run before it */
        result += (size > 0);
        size--;
        start = end + 1;
    }

    /* start == text means the loop body never ran: measure the string in one call */
    if (start == text)
        result = g_utf8_strlen (text, size);
    /* otherwise add the valid tail that follows the last invalid byte, if budget remains */
    else if (start[0] != '\0' && start != end && size > 0)
        result += g_utf8_strlen (start, MIN (end - start, size));

    return result;
}
 379
 380 /* --------------------------------------------------------------------------------------------- */
 381
 382 static int
 383 str_utf8_length_noncomb (const char *text)
 384 {
 385     int result = 0;
 386     const char *t = text;
 387
 388     while (t[0] != '\0')
 389     {
 390         str_utf8_cnext_noncomb_char (&t);
 391         result++;
 392     }
 393
 394     return result;
 395 }
 396
 397 /* --------------------------------------------------------------------------------------------- */
 398
/* Disabled helper (excluded from compilation by "#if 0"): skips one character
 * of *string, reduces *left by the number of bytes skipped and appends '?'
 * to 'buffer' in its place. */
#if 0
static void
str_utf8_questmark_sustb (char **string, size_t *left, GString *buffer)
{
    char *next;

    next = g_utf8_next_char (*string);
    (*left) -= next - (*string);
    (*string) = next;
    g_string_append_c (buffer, '?');
}
#endif
 411
 412 /* --------------------------------------------------------------------------------------------- */
 413
 414 static gchar *
 415 str_utf8_conv_gerror_message (GError *mcerror, const char *def_msg)
 416 {
 417     if (mcerror != NULL)
 418         return g_strdup (mcerror->message);
 419
 420     return g_strdup (def_msg != NULL ? def_msg : "");
 421 }
 422
 423 /* --------------------------------------------------------------------------------------------- */
 424
 425 static estr_t
 426 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString *buffer)
 427 {
 428     estr_t result = ESTR_SUCCESS;
 429
 430     if (coder == str_cnv_not_convert)
 431         g_string_append_len (buffer, string, size);
 432     else
 433         result = str_nconvert (coder, string, size, buffer);
 434
 435     return result;
 436 }
 437
 438 /* --------------------------------------------------------------------------------------------- */
 439 /* utility function, that makes string valid in utf8 and all characters printable
 440  * return width of string too */
 441
 442 static const struct term_form *
 443 str_utf8_make_make_term_form (const char *text, size_t length)
 444 {
 445     static struct term_form result;
 446     gunichar uni;
 447     size_t left;
 448     char *actual;
 449
 450     result.text[0] = '\0';
 451     result.width = 0;
 452     result.compose = FALSE;
 453     actual = result.text;
 454
 455     /* check if text start with combining character,
 456      * add space at begin in this case */
 457     if (length != 0 && text[0] != '\0')
 458     {
 459         uni = g_utf8_get_char_validated (text, -1);
 460         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
 461             && str_unichar_iscombiningmark (uni))
 462         {
 463             actual[0] = ' ';
 464             actual++;
 465             result.width++;
 466             result.compose = TRUE;
 467         }
 468     }
 469
 470     while (length != 0 && text[0] != '\0')
 471     {
 472         uni = g_utf8_get_char_validated (text, -1);
 473         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 474         {
 475             if (g_unichar_isprint (uni))
 476             {
 477                 left = g_unichar_to_utf8 (uni, actual);
 478                 actual += left;
 479                 if (str_unichar_iscombiningmark (uni))
 480                     result.compose = TRUE;
 481                 else
 482                 {
 483                     result.width++;
 484                     if (g_unichar_iswide (uni))
 485                         result.width++;
 486                 }
 487             }
 488             else
 489             {
 490                 actual[0] = '.';
 491                 actual++;
 492                 result.width++;
 493             }
 494             text = g_utf8_next_char (text);
 495         }
 496         else
 497         {
 498             text++;
 499             /*actual[0] = '?'; */
 500             memcpy (actual, replch, strlen (replch));
 501             actual += strlen (replch);
 502             result.width++;
 503         }
 504
 505         if (length != (size_t) (-1))
 506             length--;
 507     }
 508     actual[0] = '\0';
 509
 510     return &result;
 511 }
 512
 513 /* --------------------------------------------------------------------------------------------- */
 514
 515 static const char *
 516 str_utf8_term_form (const char *text)
 517 {
 518     static char result[BUF_MEDIUM * MB_LEN_MAX];
 519     const struct term_form *pre_form;
 520
 521     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 522     if (pre_form->compose)
 523     {
 524         char *composed;
 525
 526         composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 527         g_strlcpy (result, composed, sizeof (result));
 528         g_free (composed);
 529     }
 530     else
 531         g_strlcpy (result, pre_form->text, sizeof (result));
 532
 533     return result;
 534 }
 535
 536 /* --------------------------------------------------------------------------------------------- */
 537 /* utility function, that copies all characters from checked to actual */
 538
 539 static gboolean
 540 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
 541 {
 542     tool->compose = FALSE;
 543
 544     while (tool->checked[0] != '\0')
 545     {
 546         gunichar uni;
 547         size_t left;
 548
 549         uni = g_utf8_get_char (tool->checked);
 550         tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
 551         left = g_unichar_to_utf8 (uni, NULL);
 552         if (tool->remain <= left)
 553             return FALSE;
 554         left = g_unichar_to_utf8 (uni, tool->actual);
 555         tool->actual += left;
 556         tool->remain -= left;
 557         tool->checked = g_utf8_next_char (tool->checked);
 558     }
 559
 560     return TRUE;
 561 }
 562
 563 /* --------------------------------------------------------------------------------------------- */
 564 /* utility function, that copies characters from checked to actual until ident is
 565  * smaller than to_ident */
 566
 567 static gboolean
 568 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
 569 {
 570     tool->compose = FALSE;
 571
 572     while (tool->checked[0] != '\0')
 573     {
 574         gunichar uni;
 575         size_t left;
 576         int w = 0;
 577
 578         uni = g_utf8_get_char (tool->checked);
 579         if (str_unichar_iscombiningmark (uni))
 580             tool->compose = TRUE;
 581         else
 582         {
 583             w = 1;
 584             if (g_unichar_iswide (uni))
 585                 w++;
 586             if (tool->ident + w > to_ident)
 587                 return TRUE;
 588         }
 589
 590         left = g_unichar_to_utf8 (uni, NULL);
 591         if (tool->remain <= left)
 592             return FALSE;
 593         left = g_unichar_to_utf8 (uni, tool->actual);
 594         tool->actual += left;
 595         tool->remain -= left;
 596         tool->checked = g_utf8_next_char (tool->checked);
 597         tool->ident += w;
 598     }
 599
 600     return TRUE;
 601 }
 602
 603 /* --------------------------------------------------------------------------------------------- */
 604 /* utility function, adds count spaces to actual */
 605
 606 static int
 607 utf8_tool_insert_space (struct utf8_tool *tool, int count)
 608 {
 609     if (count <= 0)
 610         return 1;
 611     if (tool->remain <= (gsize) count)
 612         return 0;
 613
 614     memset (tool->actual, ' ', count);
 615     tool->actual += count;
 616     tool->remain -= count;
 617     return 1;
 618 }
 619
 620 /* --------------------------------------------------------------------------------------------- */
 621 /* utility function, adds one characters to actual */
 622
 623 static int
 624 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
 625 {
 626     if (tool->remain <= 1)
 627         return 0;
 628
 629     tool->actual[0] = ch;
 630     tool->actual++;
 631     tool->remain--;
 632     return 1;
 633 }
 634
 635 /* --------------------------------------------------------------------------------------------- */
 636 /* utility function, thah skips characters from checked until ident is greater or
 637  * equal to to_ident */
 638
 639 static gboolean
 640 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
 641 {
 642     gunichar uni;
 643
 644     while (to_ident > tool->ident && tool->checked[0] != '\0')
 645     {
 646         uni = g_utf8_get_char (tool->checked);
 647         if (!str_unichar_iscombiningmark (uni))
 648         {
 649             tool->ident++;
 650             if (g_unichar_iswide (uni))
 651                 tool->ident++;
 652         }
 653         tool->checked = g_utf8_next_char (tool->checked);
 654     }
 655
 656     uni = g_utf8_get_char (tool->checked);
 657     while (str_unichar_iscombiningmark (uni))
 658     {
 659         tool->checked = g_utf8_next_char (tool->checked);
 660         uni = g_utf8_get_char (tool->checked);
 661     }
 662
 663     return TRUE;
 664 }
 665
 666 /* --------------------------------------------------------------------------------------------- */
 667
 668 static void
 669 utf8_tool_compose (char *buffer, size_t size)
 670 {
 671     char *composed;
 672
 673     composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 674     g_strlcpy (buffer, composed, size);
 675     g_free (composed);
 676 }
 677
 678 /* --------------------------------------------------------------------------------------------- */
 679
 680 static const char *
 681 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
 682 {
 683     static char result[BUF_MEDIUM * MB_LEN_MAX];
 684     const struct term_form *pre_form;
 685     struct utf8_tool tool;
 686
 687     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 688     tool.checked = pre_form->text;
 689     tool.actual = result;
 690     tool.remain = sizeof (result);
 691     tool.compose = FALSE;
 692
 693     if (pre_form->width <= (gsize) width)
 694     {
 695         switch (HIDE_FIT (just_mode))
 696         {
 697         case J_CENTER_LEFT:
 698         case J_CENTER:
 699             tool.ident = (width - pre_form->width) / 2;
 700             break;
 701         case J_RIGHT:
 702             tool.ident = width - pre_form->width;
 703             break;
 704         default:
 705             tool.ident = 0;
 706             break;
 707         }
 708
 709         utf8_tool_insert_space (&tool, tool.ident);
 710         utf8_tool_copy_chars_to_end (&tool);
 711         utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
 712     }
 713     else if (IS_FIT (just_mode))
 714     {
 715         tool.ident = 0;
 716         utf8_tool_copy_chars_to (&tool, width / 2);
 717         utf8_tool_insert_char (&tool, '~');
 718
 719         tool.ident = 0;
 720         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 721         utf8_tool_copy_chars_to_end (&tool);
 722         utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
 723     }
 724     else
 725     {
 726         switch (HIDE_FIT (just_mode))
 727         {
 728         case J_CENTER:
 729             tool.ident = (width - pre_form->width) / 2;
 730             break;
 731         case J_RIGHT:
 732             tool.ident = width - pre_form->width;
 733             break;
 734         default:
 735             tool.ident = 0;
 736             break;
 737         }
 738
 739         utf8_tool_skip_chars_to (&tool, 0);
 740         utf8_tool_insert_space (&tool, tool.ident);
 741         utf8_tool_copy_chars_to (&tool, width);
 742         utf8_tool_insert_space (&tool, width - tool.ident);
 743     }
 744
 745     tool.actual[0] = '\0';
 746     if (tool.compose)
 747         utf8_tool_compose (result, sizeof (result));
 748     return result;
 749 }
 750
 751 /* --------------------------------------------------------------------------------------------- */
 752
 753 static const char *
 754 str_utf8_term_trim (const char *text, int width)
 755 {
 756     static char result[BUF_MEDIUM * MB_LEN_MAX];
 757     const struct term_form *pre_form;
 758     struct utf8_tool tool;
 759
 760     if (width < 1)
 761     {
 762         result[0] = '\0';
 763         return result;
 764     }
 765
 766     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 767
 768     tool.checked = pre_form->text;
 769     tool.actual = result;
 770     tool.remain = sizeof (result);
 771     tool.compose = FALSE;
 772
 773     if ((gsize) width >= pre_form->width)
 774         utf8_tool_copy_chars_to_end (&tool);
 775     else if (width <= 3)
 776     {
 777         memset (tool.actual, '.', width);
 778         tool.actual += width;
 779         tool.remain -= width;
 780     }
 781     else
 782     {
 783         memset (tool.actual, '.', 3);
 784         tool.actual += 3;
 785         tool.remain -= 3;
 786
 787         tool.ident = 0;
 788         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
 789         utf8_tool_copy_chars_to_end (&tool);
 790     }
 791
 792     tool.actual[0] = '\0';
 793     if (tool.compose)
 794         utf8_tool_compose (result, sizeof (result));
 795     return result;
 796 }
 797
 798 /* --------------------------------------------------------------------------------------------- */
 799
 800 static int
 801 str_utf8_term_width2 (const char *text, size_t length)
 802 {
 803     const struct term_form *result;
 804
 805     result = str_utf8_make_make_term_form (text, length);
 806     return result->width;
 807 }
 808
 809 /* --------------------------------------------------------------------------------------------- */
 810
 811 static int
 812 str_utf8_term_width1 (const char *text)
 813 {
 814     return str_utf8_term_width2 (text, (size_t) (-1));
 815 }
 816
 817 /* --------------------------------------------------------------------------------------------- */
 818
 819 static int
 820 str_utf8_term_char_width (const char *text)
 821 {
 822     gunichar uni;
 823
 824     uni = g_utf8_get_char_validated (text, -1);
 825     return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
 826 }
 827
 828 /* --------------------------------------------------------------------------------------------- */
 829
 830 static const char *
 831 str_utf8_term_substring (const char *text, int start, int width)
 832 {
 833     static char result[BUF_MEDIUM * MB_LEN_MAX];
 834     const struct term_form *pre_form;
 835     struct utf8_tool tool;
 836
 837     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 838
 839     tool.checked = pre_form->text;
 840     tool.actual = result;
 841     tool.remain = sizeof (result);
 842     tool.compose = FALSE;
 843
 844     tool.ident = -start;
 845     utf8_tool_skip_chars_to (&tool, 0);
 846     if (tool.ident < 0)
 847         tool.ident = 0;
 848     utf8_tool_insert_space (&tool, tool.ident);
 849
 850     utf8_tool_copy_chars_to (&tool, width);
 851     utf8_tool_insert_space (&tool, width - tool.ident);
 852
 853     tool.actual[0] = '\0';
 854     if (tool.compose)
 855         utf8_tool_compose (result, sizeof (result));
 856     return result;
 857 }
 858
 859 /* --------------------------------------------------------------------------------------------- */
 860
 861 static const char *
 862 str_utf8_trunc (const char *text, int width)
 863 {
 864     static char result[MC_MAXPATHLEN * MB_LEN_MAX * 2];
 865     const struct term_form *pre_form;
 866     struct utf8_tool tool;
 867
 868     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 869
 870     tool.checked = pre_form->text;
 871     tool.actual = result;
 872     tool.remain = sizeof (result);
 873     tool.compose = FALSE;
 874
 875     if (pre_form->width <= (gsize) width)
 876         utf8_tool_copy_chars_to_end (&tool);
 877     else
 878     {
 879         tool.ident = 0;
 880         utf8_tool_copy_chars_to (&tool, width / 2);
 881         utf8_tool_insert_char (&tool, '~');
 882
 883         tool.ident = 0;
 884         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 885         utf8_tool_copy_chars_to_end (&tool);
 886     }
 887
 888     tool.actual[0] = '\0';
 889     if (tool.compose)
 890         utf8_tool_compose (result, sizeof (result));
 891     return result;
 892 }
 893
 894 /* --------------------------------------------------------------------------------------------- */
 895
 896 static int
 897 str_utf8_offset_to_pos (const char *text, size_t length)
 898 {
 899     if (str_utf8_is_valid_string (text))
 900         return g_utf8_offset_to_pointer (text, length) - text;
 901     else
 902     {
 903         int result;
 904         char *buffer;
 905
 906         buffer = g_strdup (text);
 907         str_utf8_fix_string (buffer);
 908         result = g_utf8_offset_to_pointer (buffer, length) - buffer;
 909         g_free (buffer);
 910         return result;
 911     }
 912 }
 913
 914 /* --------------------------------------------------------------------------------------------- */
 915
 916 static int
 917 str_utf8_column_to_pos (const char *text, size_t pos)
 918 {
 919     int result = 0;
 920     int width = 0;
 921
 922     while (text[0] != '\0')
 923     {
 924         gunichar uni;
 925
 926         uni = g_utf8_get_char_validated (text, MB_LEN_MAX);
 927         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 928         {
 929             if (g_unichar_isprint (uni))
 930             {
 931                 if (!str_unichar_iscombiningmark (uni))
 932                 {
 933                     width++;
 934                     if (g_unichar_iswide (uni))
 935                         width++;
 936                 }
 937             }
 938             else
 939             {
 940                 width++;
 941             }
 942             text = g_utf8_next_char (text);
 943         }
 944         else
 945         {
 946             text++;
 947             width++;
 948         }
 949
 950         if ((gsize) width > pos)
 951             return result;
 952
 953         result++;
 954     }
 955
 956     return result;
 957 }
 958
 959 /* --------------------------------------------------------------------------------------------- */
 960
 961 static char *
 962 str_utf8_create_search_needle (const char *needle, gboolean case_sen)
 963 {
 964     char *fold, *result;
 965
 966     if (needle == NULL)
 967         return NULL;
 968
 969     if (case_sen)
 970         return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
 971
 972     fold = g_utf8_casefold (needle, -1);
 973     result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
 974     g_free (fold);
 975     return result;
 976 }
 977
 978 /* --------------------------------------------------------------------------------------------- */
 979
 980 static void
 981 str_utf8_release_search_needle (char *needle, gboolean case_sen)
 982 {
 983     (void) case_sen;
 984     g_free (needle);
 985 }
 986
 987 /* --------------------------------------------------------------------------------------------- */
 988
 989 static const char *
 990 str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
 991 {
 992     char *fold_text;
 993     char *deco_text;
 994     const char *match;
 995     const char *result = NULL;
 996     const char *m;
 997
 998     fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
 999     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1000
1001     match = deco_text;
1002     do
1003     {
1004         match = g_strstr_len (match, -1, search);
1005         if (match != NULL)
1006         {
1007             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1008                 !str_utf8_iscombiningmark (match + strlen (search)))
1009             {
1010                 result = text;
1011                 m = deco_text;
1012                 while (m < match)
1013                 {
1014                     str_utf8_cnext_noncomb_char (&m);
1015                     str_utf8_cnext_noncomb_char (&result);
1016                 }
1017             }
1018             else
1019                 str_utf8_cnext_char (&match);
1020         }
1021     }
1022     while (match != NULL && result == NULL);
1023
1024     g_free (deco_text);
1025     if (!case_sen)
1026         g_free (fold_text);
1027
1028     return result;
1029 }
1030
1031 /* --------------------------------------------------------------------------------------------- */
1032
1033 static const char *
1034 str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
1035 {
1036     char *fold_text;
1037     char *deco_text;
1038     char *match;
1039     const char *result = NULL;
1040     const char *m;
1041
1042     fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
1043     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1044
1045     do
1046     {
1047         match = g_strrstr_len (deco_text, -1, search);
1048         if (match != NULL)
1049         {
1050             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1051                 !str_utf8_iscombiningmark (match + strlen (search)))
1052             {
1053                 result = text;
1054                 m = deco_text;
1055                 while (m < match)
1056                 {
1057                     str_utf8_cnext_noncomb_char (&m);
1058                     str_utf8_cnext_noncomb_char (&result);
1059                 }
1060             }
1061             else
1062                 match[0] = '\0';
1063         }
1064     }
1065     while (match != NULL && result == NULL);
1066
1067     g_free (deco_text);
1068     if (!case_sen)
1069         g_free (fold_text);
1070
1071     return result;
1072 }
1073
1074 /* --------------------------------------------------------------------------------------------- */
1075
1076 static char *
1077 str_utf8_normalize (const char *text)
1078 {
1079     GString *fixed;
1080     char *tmp;
1081     char *result;
1082     const char *start;
1083     const char *end;
1084
1085     /* g_utf8_normalize() is a heavyweight function, that converts UTF-8 into UCS-4,
1086      * does the normalization and then converts UCS-4 back into UTF-8.
1087      * Since file names are composed of ASCII characters in most cases, we can speed up
1088      * utf8 normalization by checking if the heavyweight Unicode normalization is actually
1089      * needed. Normalization of ASCII string is no-op.
1090      */
1091
1092     /* find out whether text is ASCII only */
1093     for (end = text; *end != '\0'; end++)
1094         if ((*end & 0x80) != 0)
1095         {
1096             /* found 2nd byte of utf8-encoded symbol */
1097             break;
1098         }
1099
1100     /* if text is ASCII-only, return copy, normalize otherwise */
1101     if (*end == '\0')
1102         return g_strndup (text, end - text);
1103
1104     fixed = g_string_sized_new (4);
1105
1106     start = text;
1107     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1108     {
1109         if (start != end)
1110         {
1111             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1112             g_string_append (fixed, tmp);
1113             g_free (tmp);
1114         }
1115         g_string_append_c (fixed, end[0]);
1116         start = end + 1;
1117     }
1118
1119     if (start == text)
1120     {
1121         result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1122         g_string_free (fixed, TRUE);
1123     }
1124     else
1125     {
1126         if (start[0] != '\0' && start != end)
1127         {
1128             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1129             g_string_append (fixed, tmp);
1130             g_free (tmp);
1131         }
1132         result = g_string_free (fixed, FALSE);
1133     }
1134
1135     return result;
1136 }
1137
1138 /* --------------------------------------------------------------------------------------------- */
1139
1140 static char *
1141 str_utf8_casefold_normalize (const char *text)
1142 {
1143     GString *fixed;
1144     char *tmp, *fold;
1145     char *result;
1146     const char *start;
1147     const char *end;
1148
1149     fixed = g_string_sized_new (4);
1150
1151     start = text;
1152     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1153     {
1154         if (start != end)
1155         {
1156             fold = g_utf8_casefold (start, end - start);
1157             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1158             g_string_append (fixed, tmp);
1159             g_free (tmp);
1160             g_free (fold);
1161         }
1162         g_string_append_c (fixed, end[0]);
1163         start = end + 1;
1164     }
1165
1166     if (start == text)
1167     {
1168         fold = g_utf8_casefold (text, -1);
1169         result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1170         g_free (fold);
1171         g_string_free (fixed, TRUE);
1172     }
1173     else
1174     {
1175         if (start[0] != '\0' && start != end)
1176         {
1177             fold = g_utf8_casefold (start, end - start);
1178             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1179             g_string_append (fixed, tmp);
1180             g_free (tmp);
1181             g_free (fold);
1182         }
1183         result = g_string_free (fixed, FALSE);
1184     }
1185
1186     return result;
1187 }
1188
1189 /* --------------------------------------------------------------------------------------------- */
1190
/**
 * Case-sensitive comparison of two strings by their normalized forms.
 *
 * @return strcmp() result for the normalized copies of @t1 and @t2
 */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *lhs = str_utf8_normalize (t1);
    char *rhs = str_utf8_normalize (t2);
    const int order = strcmp (lhs, rhs);

    g_free (rhs);
    g_free (lhs);

    return order;
}
1207
1208 /* --------------------------------------------------------------------------------------------- */
1209
/**
 * Case-sensitive comparison limited to the length of the shorter string.
 *
 * Both strings are normalized and only the first min (len1, len2) bytes of the
 * normalized forms are compared, so the result is 0 whenever one normalized
 * string is a prefix of the other.
 *
 * @return strncmp() result over that common length
 */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *lhs, *rhs;
    size_t lhs_len, rhs_len, span;
    int order;

    lhs = str_utf8_normalize (t1);
    rhs = str_utf8_normalize (t2);

    lhs_len = strlen (lhs);
    rhs_len = strlen (rhs);
    span = lhs_len < rhs_len ? lhs_len : rhs_len;

    order = strncmp (lhs, rhs, span);

    g_free (rhs);
    g_free (lhs);

    return order;
}
1229
1230 /* --------------------------------------------------------------------------------------------- */
1231
/**
 * Case-insensitive comparison of two strings by their case-folded,
 * normalized forms.
 *
 * @return strcmp() result for the folded and normalized copies of @t1 and @t2
 */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *lhs = str_utf8_casefold_normalize (t1);
    char *rhs = str_utf8_casefold_normalize (t2);
    const int order = strcmp (lhs, rhs);

    g_free (rhs);
    g_free (lhs);

    return order;
}
1248
1249 /* --------------------------------------------------------------------------------------------- */
1250
/**
 * Case-insensitive comparison limited to the length of the shorter string.
 *
 * Both strings are case-folded and normalized, and only the first
 * min (len1, len2) bytes of the results are compared, so the result is 0
 * whenever one of them is a prefix of the other.
 *
 * @return strncmp() result over that common length
 */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *lhs, *rhs;
    size_t lhs_len, rhs_len, span;
    int order;

    lhs = str_utf8_casefold_normalize (t1);
    rhs = str_utf8_casefold_normalize (t2);

    lhs_len = strlen (lhs);
    rhs_len = strlen (rhs);
    span = lhs_len < rhs_len ? lhs_len : rhs_len;

    order = strncmp (lhs, rhs, span);

    g_free (rhs);
    g_free (lhs);

    return order;
}
1270
1271 /* --------------------------------------------------------------------------------------------- */
1272
/**
 * Measure how much of @prefix matches the beginning of @text (case-sensitive).
 *
 * Both strings are normalized and compared character by character.
 *
 * @return number of matching bytes, counted in the normalized copy of @prefix
 *         (not in @prefix itself)
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text, *norm_prefix;
    const char *tcur, *pcur;
    const char *tnext, *pnext;
    int matched;

    norm_text = str_utf8_normalize (text);
    norm_prefix = str_utf8_normalize (prefix);
    tcur = tnext = norm_text;
    pcur = pnext = norm_prefix;

    for (; tcur[0] != '\0' && pcur[0] != '\0'; tcur = tnext, pcur = pnext)
    {
        str_utf8_cnext_char_safe (&tnext);
        str_utf8_cnext_char_safe (&pnext);

        /* the characters differ if either their byte lengths or their bytes differ */
        if (tnext - tcur != pnext - pcur || strncmp (tcur, pcur, tnext - tcur) != 0)
            break;
    }

    /* pcur stays at the start of the first character that did not match */
    matched = pcur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1307
1308 /* --------------------------------------------------------------------------------------------- */
1309
/**
 * Measure how much of @prefix matches the beginning of @text (case-insensitive).
 *
 * Both strings are case-folded, normalized and compared character by character.
 *
 * @return number of matching bytes, counted in the folded and normalized copy
 *         of @prefix (not in @prefix itself)
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *norm_text, *norm_prefix;
    const char *tcur, *pcur;
    const char *tnext, *pnext;
    int matched;

    norm_text = str_utf8_casefold_normalize (text);
    norm_prefix = str_utf8_casefold_normalize (prefix);
    tcur = tnext = norm_text;
    pcur = pnext = norm_prefix;

    for (; tcur[0] != '\0' && pcur[0] != '\0'; tcur = tnext, pcur = pnext)
    {
        str_utf8_cnext_char_safe (&tnext);
        str_utf8_cnext_char_safe (&pnext);

        /* the characters differ if either their byte lengths or their bytes differ */
        if (tnext - tcur != pnext - pcur || strncmp (tcur, pcur, tnext - tcur) != 0)
            break;
    }

    /* pcur stays at the start of the first character that did not match */
    matched = pcur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1344
1345 /* --------------------------------------------------------------------------------------------- */
1346
1347 static char *
1348 str_utf8_create_key_gen (const char *text, gboolean case_sen,
1349                          gchar *(*keygen) (const gchar *text, gssize size))
1350 {
1351     char *result;
1352
1353     if (case_sen)
1354         result = str_utf8_normalize (text);
1355     else
1356     {
1357         gboolean dot;
1358         GString *fixed;
1359         const char *start, *end;
1360         char *fold, *key;
1361
1362         dot = text[0] == '.';
1363         fixed = g_string_sized_new (16);
1364
1365         if (!dot)
1366             start = text;
1367         else
1368         {
1369             start = text + 1;
1370             g_string_append_c (fixed, '.');
1371         }
1372
1373         while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1374         {
1375             if (start != end)
1376             {
1377                 fold = g_utf8_casefold (start, end - start);
1378                 key = keygen (fold, -1);
1379                 g_string_append (fixed, key);
1380                 g_free (key);
1381                 g_free (fold);
1382             }
1383             g_string_append_c (fixed, end[0]);
1384             start = end + 1;
1385         }
1386
1387         if (start == text)
1388         {
1389             fold = g_utf8_casefold (start, -1);
1390             result = keygen (fold, -1);
1391             g_free (fold);
1392             g_string_free (fixed, TRUE);
1393         }
1394         else if (dot && (start == text + 1))
1395         {
1396             fold = g_utf8_casefold (start, -1);
1397             key = keygen (fold, -1);
1398             g_string_append (fixed, key);
1399             g_free (key);
1400             g_free (fold);
1401             result = g_string_free (fixed, FALSE);
1402         }
1403         else
1404         {
1405             if (start[0] != '\0' && start != end)
1406             {
1407                 fold = g_utf8_casefold (start, end - start);
1408                 key = keygen (fold, -1);
1409                 g_string_append (fixed, key);
1410                 g_free (key);
1411                 g_free (fold);
1412             }
1413             result = g_string_free (fixed, FALSE);
1414         }
1415     }
1416     return result;
1417 }
1418
1419 /* --------------------------------------------------------------------------------------------- */
1420
1421 static char *
1422 str_utf8_create_key (const char *text, gboolean case_sen)
1423 {
1424     return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1425 }
1426
1427 /* --------------------------------------------------------------------------------------------- */
1428
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/**
 * Build a sort key for a file name using g_utf8_collate_key_for_filename()
 * as the key generator.
 *
 * @return newly allocated key
 */
static char *
str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
1436
1437 /* --------------------------------------------------------------------------------------------- */
1438
1439 static int
1440 str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
1441 {
1442     (void) case_sen;
1443     return strcmp (t1, t2);
1444 }
1445
1446 /* --------------------------------------------------------------------------------------------- */
1447
1448 static void
1449 str_utf8_release_key (char *key, gboolean case_sen)
1450 {
1451     (void) case_sen;
1452     g_free (key);
1453 }
1454
1455 /* --------------------------------------------------------------------------------------------- */
1456 /*** public functions ****************************************************************************/
1457 /* --------------------------------------------------------------------------------------------- */
1458
1459 struct str_class
1460 str_utf8_init (void)
1461 {
1462     struct str_class result;
1463
1464     result.conv_gerror_message = str_utf8_conv_gerror_message;
1465     result.vfs_convert_to = str_utf8_vfs_convert_to;
1466     result.insert_replace_char = str_utf8_insert_replace_char;
1467     result.is_valid_string = str_utf8_is_valid_string;
1468     result.is_valid_char = str_utf8_is_valid_char;
1469     result.cnext_char = str_utf8_cnext_char;
1470     result.cprev_char = str_utf8_cprev_char;
1471     result.cnext_char_safe = str_utf8_cnext_char_safe;
1472     result.cprev_char_safe = str_utf8_cprev_char_safe;
1473     result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1474     result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1475     result.char_isspace = str_utf8_isspace;
1476     result.char_ispunct = str_utf8_ispunct;
1477     result.char_isalnum = str_utf8_isalnum;
1478     result.char_isdigit = str_utf8_isdigit;
1479     result.char_isprint = str_utf8_isprint;
1480     result.char_iscombiningmark = str_utf8_iscombiningmark;
1481     result.char_toupper = str_utf8_toupper;
1482     result.char_tolower = str_utf8_tolower;
1483     result.length = str_utf8_length;
1484     result.length2 = str_utf8_length2;
1485     result.length_noncomb = str_utf8_length_noncomb;
1486     result.fix_string = str_utf8_fix_string;
1487     result.term_form = str_utf8_term_form;
1488     result.fit_to_term = str_utf8_fit_to_term;
1489     result.term_trim = str_utf8_term_trim;
1490     result.term_width2 = str_utf8_term_width2;
1491     result.term_width1 = str_utf8_term_width1;
1492     result.term_char_width = str_utf8_term_char_width;
1493     result.term_substring = str_utf8_term_substring;
1494     result.trunc = str_utf8_trunc;
1495     result.offset_to_pos = str_utf8_offset_to_pos;
1496     result.column_to_pos = str_utf8_column_to_pos;
1497     result.create_search_needle = str_utf8_create_search_needle;
1498     result.release_search_needle = str_utf8_release_search_needle;
1499     result.search_first = str_utf8_search_first;
1500     result.search_last = str_utf8_search_last;
1501     result.compare = str_utf8_compare;
1502     result.ncompare = str_utf8_ncompare;
1503     result.casecmp = str_utf8_casecmp;
1504     result.ncasecmp = str_utf8_ncasecmp;
1505     result.prefix = str_utf8_prefix;
1506     result.caseprefix = str_utf8_caseprefix;
1507     result.create_key = str_utf8_create_key;
1508 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1509     /* case insensitive sort files in "a1 a2 a10" order */
1510     result.create_key_for_filename = str_utf8_create_key_for_filename;
1511 #else
1512     /* case insensitive sort files in "a1 a10 a2" order */
1513     result.create_key_for_filename = str_utf8_create_key;
1514 #endif
1515     result.key_collate = str_utf8_key_collate;
1516     result.release_key = str_utf8_release_key;
1517
1518     return result;
1519 }
1520
1521 /* --------------------------------------------------------------------------------------------- */