glib/gunicollate.c

   1 /* gunicollate.c - Collation
   2  *
   3  *  Copyright 2001,2005 Red Hat, Inc.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2.1 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public License
  16  * along with this library; if not, see <http://www.gnu.org/licenses/>.
  17  */
  18
  19 #include "config.h"
  20
  21 #include <locale.h>
  22 #include <string.h>
  23 #ifdef __STDC_ISO_10646__
  24 #include <wchar.h>
  25 #endif
  26
  27 #ifdef HAVE_CARBON
  28 #include <CoreServices/CoreServices.h>
  29 #endif
  30
  31 #include "gmem.h"
  32 #include "gunicode.h"
  33 #include "gunicodeprivate.h"
  34 #include "gstring.h"
  35 #include "gstrfuncs.h"
  36 #include "gtestutils.h"
  37 #include "gcharset.h"
  38 #ifndef __STDC_ISO_10646__
  39 #include "gconvert.h"
  40 #endif
  41
  42
  43 #ifdef _MSC_VER
  44 /* Workaround for bug in MSVCR80.DLL */
  45 static gsize
  46 msc_strxfrm_wrapper (char       *string1,
  47                      const char *string2,
  48                      gsize       count)
  49 {
  50   if (!string1 || count <= 0)
  51     {
  52       char tmp;
  53
  54       return strxfrm (&tmp, string2, 1);
  55     }
  56   return strxfrm (string1, string2, count);
  57 }
  58 #define strxfrm msc_strxfrm_wrapper
  59 #endif
  60
  61 /**
  62  * g_utf8_collate:
  63  * @str1: a UTF-8 encoded string
  64  * @str2: a UTF-8 encoded string
  65  *
  66  * Compares two strings for ordering using the linguistically
  67  * correct rules for the [current locale][setlocale].
  68  * When sorting a large number of strings, it will be significantly
  69  * faster to obtain collation keys with g_utf8_collate_key() and
  70  * compare the keys with strcmp() when sorting instead of sorting
  71  * the original strings.
  72  *
  73  * Returns: < 0 if @str1 compares before @str2,
  74  *   0 if they compare equal, > 0 if @str1 compares after @str2.
  75  **/
  76 gint
  77 g_utf8_collate (const gchar *str1,
  78                 const gchar *str2)
  79 {
  80   gint result;
  81
  82 #ifdef HAVE_CARBON
  83
  84   UniChar *str1_utf16;
  85   UniChar *str2_utf16;
  86   glong len1;
  87   glong len2;
  88   SInt32 retval = 0;
  89
  90   g_return_val_if_fail (str1 != NULL, 0);
  91   g_return_val_if_fail (str2 != NULL, 0);
  92
  93   str1_utf16 = g_utf8_to_utf16 (str1, -1, NULL, &len1, NULL);
  94   str2_utf16 = g_utf8_to_utf16 (str2, -1, NULL, &len2, NULL);
  95
  96   UCCompareTextDefault (kUCCollateStandardOptions,
  97                         str1_utf16, len1, str2_utf16, len2,
  98                         NULL, &retval);
  99   result = retval;
 100
 101   g_free (str2_utf16);
 102   g_free (str1_utf16);
 103
 104 #elif defined(__STDC_ISO_10646__)
 105
 106   gunichar *str1_norm;
 107   gunichar *str2_norm;
 108
 109   g_return_val_if_fail (str1 != NULL, 0);
 110   g_return_val_if_fail (str2 != NULL, 0);
 111
 112   str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
 113   str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
 114
 115   result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
 116
 117   g_free (str1_norm);
 118   g_free (str2_norm);
 119
 120 #else /* !__STDC_ISO_10646__ */
 121
 122   const gchar *charset;
 123   gchar *str1_norm;
 124   gchar *str2_norm;
 125
 126   g_return_val_if_fail (str1 != NULL, 0);
 127   g_return_val_if_fail (str2 != NULL, 0);
 128
 129   str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
 130   str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
 131
 132   if (g_get_charset (&charset))
 133     {
 134       result = strcoll (str1_norm, str2_norm);
 135     }
 136   else
 137     {
 138       gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 139       gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 140
 141       if (str1_locale && str2_locale)
 142         result =  strcoll (str1_locale, str2_locale);
 143       else if (str1_locale)
 144         result = -1;
 145       else if (str2_locale)
 146         result = 1;
 147       else
 148         result = strcmp (str1_norm, str2_norm);
 149
 150       g_free (str1_locale);
 151       g_free (str2_locale);
 152     }
 153
 154   g_free (str1_norm);
 155   g_free (str2_norm);
 156
 157 #endif /* __STDC_ISO_10646__ */
 158
 159   return result;
 160 }
 161
 162 #if defined(__STDC_ISO_10646__)
 163 /* We need UTF-8 encoding of numbers to encode the weights if
 164  * we are using wcsxfrm. However, we aren't encoding Unicode
 165  * characters, so we can't simply use g_unichar_to_utf8.
 166  *
 167  * The following routine is taken (with modification) from GNU
 168  * libc's strxfrm routine:
 169  *
 170  * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
 171  * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
 172  */
 173 static inline int
 174 utf8_encode (char *buf, wchar_t val)
 175 {
 176   int retval;
 177
 178   if (val < 0x80)
 179     {
 180       if (buf)
 181         *buf++ = (char) val;
 182       retval = 1;
 183     }
 184   else
 185     {
 186       int step;
 187
 188       for (step = 2; step < 6; ++step)
 189         if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
 190           break;
 191       retval = step;
 192
 193       if (buf)
 194         {
 195           *buf = (unsigned char) (~0xff >> step);
 196           --step;
 197           do
 198             {
 199               buf[step] = 0x80 | (val & 0x3f);
 200               val >>= 6;
 201             }
 202           while (--step > 0);
 203           *buf |= val;
 204         }
 205     }
 206
 207   return retval;
 208 }
 209 #endif /* __STDC_ISO_10646__ */
 210
 211 #ifdef HAVE_CARBON
 212
 213 static gchar *
 214 collate_key_to_string (UCCollationValue *key,
 215                        gsize             key_len)
 216 {
 217   gchar *result;
 218   gsize result_len;
 219   long *lkey = (long *) key;
 220
 221   /* UCCollationValue format:
 222    *
 223    * UCCollateOptions (32/64 bits)
 224    * SizeInBytes      (32/64 bits)
 225    * Value            (8 bits arrey)
 226    *
 227    * UCCollateOptions: ordering option mask of the collator
 228    * used to create the key. Size changes on 32-bit / 64-bit
 229    * hosts. On 64-bits also the extra half-word seems to have
 230    * some extra (unknown) meaning.
 231    * SizeInBytes: size of the whole structure, in bytes
 232    * (including UCCollateOptions and SizeInBytes fields). Size
 233    * changes on 32-bit & 64-bit hosts.
 234    * Value: array of bytes containing the comparison weights.
 235    * Seems to have several sub-strings separated by \001 and \002
 236    * chars. Also, experience shows this is directly strcmp-able.
 237    */
 238
 239   result_len = lkey[1];
 240   result = g_malloc (result_len + 1);
 241   memcpy (result, &lkey[2], result_len);
 242   result[result_len] = '\0';
 243
 244   return result;
 245 }
 246
 247 static gchar *
 248 carbon_collate_key_with_collator (const gchar *str,
 249                                   gssize       len,
 250                                   CollatorRef  collator)
 251 {
 252   UniChar *str_utf16 = NULL;
 253   glong len_utf16;
 254   OSStatus ret;
 255   UCCollationValue staticbuf[512];
 256   UCCollationValue *freeme = NULL;
 257   UCCollationValue *buf;
 258   ItemCount buf_len;
 259   ItemCount key_len;
 260   ItemCount try_len;
 261   gchar *result = NULL;
 262
 263   str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL);
 264   try_len = len_utf16 * 5 + 2;
 265
 266   if (try_len <= sizeof staticbuf)
 267     {
 268       buf = staticbuf;
 269       buf_len = sizeof staticbuf;
 270     }
 271   else
 272     {
 273       freeme = g_new (UCCollationValue, try_len);
 274       buf = freeme;
 275       buf_len = try_len;
 276     }
 277
 278   ret = UCGetCollationKey (collator, str_utf16, len_utf16,
 279                            buf_len, &key_len, buf);
 280
 281   if (ret == kCollateBufferTooSmall)
 282     {
 283       freeme = g_renew (UCCollationValue, freeme, try_len * 2);
 284       buf = freeme;
 285       buf_len = try_len * 2;
 286       ret = UCGetCollationKey (collator, str_utf16, len_utf16,
 287                                buf_len, &key_len, buf);
 288     }
 289
 290   if (ret == 0)
 291     result = collate_key_to_string (buf, key_len);
 292   else
 293     result = g_strdup ("");
 294
 295   g_free (freeme);
 296   g_free (str_utf16);
 297   return result;
 298 }
 299
 300 static gchar *
 301 carbon_collate_key (const gchar *str,
 302                     gssize       len)
 303 {
 304   static CollatorRef collator;
 305
 306   if (G_UNLIKELY (!collator))
 307     {
 308       UCCreateCollator (NULL, 0, kUCCollateStandardOptions, &collator);
 309
 310       if (!collator)
 311         {
 312           static gboolean been_here;
 313           if (!been_here)
 314             g_warning ("%s: UCCreateCollator failed", G_STRLOC);
 315           been_here = TRUE;
 316           return g_strdup ("");
 317         }
 318     }
 319
 320   return carbon_collate_key_with_collator (str, len, collator);
 321 }
 322
 323 static gchar *
 324 carbon_collate_key_for_filename (const gchar *str,
 325                                  gssize       len)
 326 {
 327   static CollatorRef collator;
 328
 329   if (G_UNLIKELY (!collator))
 330     {
 331       /* http://developer.apple.com/qa/qa2004/qa1159.html */
 332       UCCreateCollator (NULL, 0,
 333                         kUCCollateComposeInsensitiveMask
 334                          | kUCCollateWidthInsensitiveMask
 335                          | kUCCollateCaseInsensitiveMask
 336                          | kUCCollateDigitsOverrideMask
 337                          | kUCCollateDigitsAsNumberMask
 338                          | kUCCollatePunctuationSignificantMask,
 339                         &collator);
 340
 341       if (!collator)
 342         {
 343           static gboolean been_here;
 344           if (!been_here)
 345             g_warning ("%s: UCCreateCollator failed", G_STRLOC);
 346           been_here = TRUE;
 347           return g_strdup ("");
 348         }
 349     }
 350
 351   return carbon_collate_key_with_collator (str, len, collator);
 352 }
 353
 354 #endif /* HAVE_CARBON */
 355
 356 /**
 357  * g_utf8_collate_key:
 358  * @str: a UTF-8 encoded string.
 359  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 360  *
 361  * Converts a string into a collation key that can be compared
 362  * with other collation keys produced by the same function using
 363  * strcmp().
 364  *
 365  * The results of comparing the collation keys of two strings
 366  * with strcmp() will always be the same as comparing the two
 367  * original keys with g_utf8_collate().
 368  *
 369  * Note that this function depends on the [current locale][setlocale].
 370  *
 371  * Returns: a newly allocated string. This string should
 372  *   be freed with g_free() when you are done with it.
 373  **/
 374 gchar *
 375 g_utf8_collate_key (const gchar *str,
 376                     gssize       len)
 377 {
 378   gchar *result;
 379
 380 #ifdef HAVE_CARBON
 381
 382   g_return_val_if_fail (str != NULL, NULL);
 383   result = carbon_collate_key (str, len);
 384
 385 #elif defined(__STDC_ISO_10646__)
 386
 387   gsize xfrm_len;
 388   gunichar *str_norm;
 389   wchar_t *result_wc;
 390   gsize i;
 391   gsize result_len = 0;
 392
 393   g_return_val_if_fail (str != NULL, NULL);
 394
 395   str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
 396
 397   xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
 398   result_wc = g_new (wchar_t, xfrm_len + 1);
 399   wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
 400
 401   for (i = 0; i < xfrm_len; i++)
 402     result_len += utf8_encode (NULL, result_wc[i]);
 403
 404   result = g_malloc (result_len + 1);
 405   result_len = 0;
 406   for (i = 0; i < xfrm_len; i++)
 407     result_len += utf8_encode (result + result_len, result_wc[i]);
 408
 409   result[result_len] = '\0';
 410
 411   g_free (result_wc);
 412   g_free (str_norm);
 413
 414   return result;
 415 #else /* !__STDC_ISO_10646__ */
 416
 417   gsize xfrm_len;
 418   const gchar *charset;
 419   gchar *str_norm;
 420
 421   g_return_val_if_fail (str != NULL, NULL);
 422
 423   str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
 424
 425   result = NULL;
 426
 427   if (g_get_charset (&charset))
 428     {
 429       xfrm_len = strxfrm (NULL, str_norm, 0);
 430       if (xfrm_len >= 0 && xfrm_len < G_MAXINT - 2)
 431         {
 432           result = g_malloc (xfrm_len + 1);
 433           strxfrm (result, str_norm, xfrm_len + 1);
 434         }
 435     }
 436   else
 437     {
 438       gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 439
 440       if (str_locale)
 441         {
 442           xfrm_len = strxfrm (NULL, str_locale, 0);
 443           if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2)
 444             {
 445               g_free (str_locale);
 446               str_locale = NULL;
 447             }
 448         }
 449       if (str_locale)
 450         {
 451           result = g_malloc (xfrm_len + 2);
 452           result[0] = 'A';
 453           strxfrm (result + 1, str_locale, xfrm_len + 1);
 454
 455           g_free (str_locale);
 456         }
 457     }
 458
 459   if (!result)
 460     {
 461       xfrm_len = strlen (str_norm);
 462       result = g_malloc (xfrm_len + 2);
 463       result[0] = 'B';
 464       memcpy (result + 1, str_norm, xfrm_len);
 465       result[xfrm_len+1] = '\0';
 466     }
 467
 468   g_free (str_norm);
 469 #endif /* __STDC_ISO_10646__ */
 470
 471   return result;
 472 }
 473
 474 /* This is a collation key that is very very likely to sort before any
 475  * collation key that libc strxfrm generates. We use this before any
 476  * special case (dot or number) to make sure that its sorted before
 477  * anything else.
 478  */
 479 #define COLLATION_SENTINEL "\1\1\1"
 480
 481 /**
 482  * g_utf8_collate_key_for_filename:
 483  * @str: a UTF-8 encoded string.
 484  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 485  *
 486  * Converts a string into a collation key that can be compared
 487  * with other collation keys produced by the same function using strcmp().
 488  *
 489  * In order to sort filenames correctly, this function treats the dot '.'
 490  * as a special case. Most dictionary orderings seem to consider it
 491  * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
 492  * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
 493  * would like to treat numbers intelligently so that "file1" "file10" "file5"
 494  * is sorted as "file1" "file5" "file10".
 495  *
 496  * Note that this function depends on the [current locale][setlocale].
 497  *
 498  * Returns: a newly allocated string. This string should
 499  *   be freed with g_free() when you are done with it.
 500  *
 501  * Since: 2.8
 502  */
 503 gchar *
 504 g_utf8_collate_key_for_filename (const gchar *str,
 505                                  gssize       len)
 506 {
 507 #ifndef HAVE_CARBON
 508   GString *result;
 509   GString *append;
 510   const gchar *p;
 511   const gchar *prev;
 512   const gchar *end;
 513   gchar *collate_key;
 514   gint digits;
 515   gint leading_zeros;
 516
 517   /*
 518    * How it works:
 519    *
 520    * Split the filename into collatable substrings which do
 521    * not contain [.0-9] and special-cased substrings. The collatable
 522    * substrings are run through the normal g_utf8_collate_key() and the
 523    * resulting keys are concatenated with keys generated from the
 524    * special-cased substrings.
 525    *
 526    * Special cases: Dots are handled by replacing them with '\1' which
 527    * implies that short dot-delimited substrings are before long ones,
 528    * e.g.
 529    *
 530    *   a\1a   (a.a)
 531    *   a-\1a  (a-.a)
 532    *   aa\1a  (aa.a)
 533    *
 534    * Numbers are handled by prepending to each number d-1 superdigits
 535    * where d = number of digits in the number and SUPERDIGIT is a
 536    * character with an integer value higher than any digit (for instance
 537    * ':'). This ensures that single-digit numbers are sorted before
 538    * double-digit numbers which in turn are sorted separately from
 539    * triple-digit numbers, etc. To avoid strange side-effects when
 540    * sorting strings that already contain SUPERDIGITs, a '\2'
 541    * is also prepended, like this
 542    *
 543    *   file\21      (file1)
 544    *   file\25      (file5)
 545    *   file\2:10    (file10)
 546    *   file\2:26    (file26)
 547    *   file\2::100  (file100)
 548    *   file:foo     (file:foo)
 549    *
 550    * This has the side-effect of sorting numbers before everything else (except
 551    * dots), but this is probably OK.
 552    *
 553    * Leading digits are ignored when doing the above. To discriminate
 554    * numbers which differ only in the number of leading digits, we append
 555    * the number of leading digits as a byte at the very end of the collation
 556    * key.
 557    *
 558    * To try avoid conflict with any collation key sequence generated by libc we
 559    * start each switch to a special cased part with a sentinel that hopefully
 560    * will sort before anything libc will generate.
 561    */
 562
 563   if (len < 0)
 564     len = strlen (str);
 565
 566   result = g_string_sized_new (len * 2);
 567   append = g_string_sized_new (0);
 568
 569   end = str + len;
 570
 571   /* No need to use utf8 functions, since we're only looking for ascii chars */
 572   for (prev = p = str; p < end; p++)
 573     {
 574       switch (*p)
 575         {
 576         case '.':
 577           if (prev != p)
 578             {
 579               collate_key = g_utf8_collate_key (prev, p - prev);
 580               g_string_append (result, collate_key);
 581               g_free (collate_key);
 582             }
 583
 584           g_string_append (result, COLLATION_SENTINEL "\1");
 585
 586           /* skip the dot */
 587           prev = p + 1;
 588           break;
 589
 590         case '0':
 591         case '1':
 592         case '2':
 593         case '3':
 594         case '4':
 595         case '5':
 596         case '6':
 597         case '7':
 598         case '8':
 599         case '9':
 600           if (prev != p)
 601             {
 602               collate_key = g_utf8_collate_key (prev, p - prev);
 603               g_string_append (result, collate_key);
 604               g_free (collate_key);
 605             }
 606
 607           g_string_append (result, COLLATION_SENTINEL "\2");
 608
 609           prev = p;
 610
 611           /* write d-1 colons */
 612           if (*p == '0')
 613             {
 614               leading_zeros = 1;
 615               digits = 0;
 616             }
 617           else
 618             {
 619               leading_zeros = 0;
 620               digits = 1;
 621             }
 622
 623           while (++p < end)
 624             {
 625               if (*p == '0' && !digits)
 626                 ++leading_zeros;
 627               else if (g_ascii_isdigit(*p))
 628                 ++digits;
 629               else
 630                 {
 631                   /* count an all-zero sequence as
 632                    * one digit plus leading zeros
 633                    */
 634                   if (!digits)
 635                     {
 636                       ++digits;
 637                       --leading_zeros;
 638                     }
 639                   break;
 640                 }
 641             }
 642
 643           while (digits > 1)
 644             {
 645               g_string_append_c (result, ':');
 646               --digits;
 647             }
 648
 649           if (leading_zeros > 0)
 650             {
 651               g_string_append_c (append, (char)leading_zeros);
 652               prev += leading_zeros;
 653             }
 654
 655           /* write the number itself */
 656           g_string_append_len (result, prev, p - prev);
 657
 658           prev = p;
 659           --p;    /* go one step back to avoid disturbing outer loop */
 660           break;
 661
 662         default:
 663           /* other characters just accumulate */
 664           break;
 665         }
 666     }
 667
 668   if (prev != p)
 669     {
 670       collate_key = g_utf8_collate_key (prev, p - prev);
 671       g_string_append (result, collate_key);
 672       g_free (collate_key);
 673     }
 674
 675   g_string_append (result, append->str);
 676   g_string_free (append, TRUE);
 677
 678   return g_string_free (result, FALSE);
 679 #else /* HAVE_CARBON */
 680   return carbon_collate_key_for_filename (str, len);
 681 #endif
 682 }