string/strcoll_l.c

   1 /* Copyright (C) 1995,96,97,2002, 2004 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20
  21 #include <assert.h>
  22 #include <langinfo.h>
  23 #include <locale.h>
  24 #include <stddef.h>
  25 #include <stdint.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28
  29 #ifndef STRING_TYPE
  30 # define STRING_TYPE char
  31 # define USTRING_TYPE unsigned char
  32 # define STRCOLL __strcoll_l
  33 # define STRCMP strcmp
  34 # define STRLEN strlen
  35 # define WEIGHT_H "../locale/weight.h"
  36 # define SUFFIX MB
  37 # define L(arg) arg
  38 #endif
  39
  40 #define CONCAT(a,b) CONCAT1(a,b)
  41 #define CONCAT1(a,b) a##b
  42
  43 #include "../locale/localeinfo.h"
  44
  45 int
  46 STRCOLL (s1, s2, l)
  47      const STRING_TYPE *s1;
  48      const STRING_TYPE *s2;
  49      __locale_t l;
  50 {
  51   struct locale_data *current = l->__locales[LC_COLLATE];
  52   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
  53   /* We don't assign the following values right away since it might be
  54      unnecessary in case there are no rules.  */
  55   const unsigned char *rulesets;
  56   const int32_t *table;
  57   const USTRING_TYPE *weights;
  58   const USTRING_TYPE *extra;
  59   const int32_t *indirect;
  60   uint_fast32_t pass;
  61   int result = 0;
  62   const USTRING_TYPE *us1;
  63   const USTRING_TYPE *us2;
  64   size_t s1len;
  65   size_t s2len;
  66   int32_t *idx1arr;
  67   int32_t *idx2arr;
  68   unsigned char *rule1arr;
  69   unsigned char *rule2arr;
  70   size_t idx1max;
  71   size_t idx2max;
  72   size_t idx1cnt;
  73   size_t idx2cnt;
  74   size_t idx1now;
  75   size_t idx2now;
  76   size_t backw1_stop;
  77   size_t backw2_stop;
  78   size_t backw1;
  79   size_t backw2;
  80   int val1;
  81   int val2;
  82   int position;
  83   int seq1len;
  84   int seq2len;
  85   int use_malloc;
  86
  87 #include WEIGHT_H
  88
  89   if (nrules == 0)
  90     return STRCMP (s1, s2);
  91
  92   rulesets = (const unsigned char *)
  93     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
  94   table = (const int32_t *)
  95     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
  96   weights = (const USTRING_TYPE *)
  97     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
  98   extra = (const USTRING_TYPE *)
  99     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 100   indirect = (const int32_t *)
 101     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 102   use_malloc = 0;
 103
 104   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
 105   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
 106   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
 107   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 108
 109   /* We need this a few times.  */
 110   s1len = STRLEN (s1);
 111   s2len = STRLEN (s2);
 112
 113   /* Catch empty strings.  */
 114   if (__builtin_expect (s1len == 0, 0) || __builtin_expect (s2len == 0, 0))
 115     return (s1len != 0) - (s2len != 0);
 116
 117   /* We need the elements of the strings as unsigned values since they
 118      are used as indeces.  */
 119   us1 = (const USTRING_TYPE *) s1;
 120   us2 = (const USTRING_TYPE *) s2;
 121
 122   /* Perform the first pass over the string and while doing this find
 123      and store the weights for each character.  Since we want this to
 124      be as fast as possible we are using `alloca' to store the temporary
 125      values.  But since there is no limit on the length of the string
 126      we have to use `malloc' if the string is too long.  We should be
 127      very conservative here.
 128
 129      Please note that the localedef programs makes sure that `position'
 130      is not used at the first level.  */
 131   if (! __libc_use_alloca (s1len + s2len))
 132     {
 133       idx1arr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
 134       idx2arr = &idx1arr[s1len];
 135       rule1arr = (unsigned char *) &idx2arr[s2len];
 136       rule2arr = &rule1arr[s1len];
 137
 138       if (idx1arr == NULL)
 139         /* No memory.  Well, go with the stack then.
 140
 141            XXX Once this implementation is stable we will handle this
 142            differently.  Instead of precomputing the indeces we will
 143            do this in time.  This means, though, that this happens for
 144            every pass again.  */
 145         goto try_stack;
 146       use_malloc = 1;
 147     }
 148   else
 149     {
 150     try_stack:
 151       idx1arr = (int32_t *) alloca (s1len * sizeof (int32_t));
 152       idx2arr = (int32_t *) alloca (s2len * sizeof (int32_t));
 153       rule1arr = (unsigned char *) alloca (s1len);
 154       rule2arr = (unsigned char *) alloca (s2len);
 155     }
 156
 157   idx1cnt = 0;
 158   idx2cnt = 0;
 159   idx1max = 0;
 160   idx2max = 0;
 161   idx1now = 0;
 162   idx2now = 0;
 163   backw1_stop = ~0ul;
 164   backw2_stop = ~0ul;
 165   backw1 = ~0ul;
 166   backw2 = ~0ul;
 167   seq1len = 0;
 168   seq2len = 0;
 169   position = rulesets[0] & sort_position;
 170   while (1)
 171     {
 172       val1 = 0;
 173       val2 = 0;
 174
 175       /* Get the next non-IGNOREd element for string `s1'.  */
 176       if (seq1len == 0)
 177         do
 178           {
 179             ++val1;
 180
 181             if (backw1_stop != ~0ul)
 182               {
 183                 /* The is something pushed.  */
 184                 if (backw1 == backw1_stop)
 185                   {
 186                     /* The last pushed character was handled.  Continue
 187                        with forward characters.  */
 188                     if (idx1cnt < idx1max)
 189                       idx1now = idx1cnt;
 190                     else
 191                       /* Nothing anymore.  The backward sequence ended with
 192                          the last sequence in the string.  Note that seq1len
 193                          is still zero.  */
 194                       break;
 195                   }
 196                 else
 197                   idx1now = --backw1;
 198               }
 199             else
 200               {
 201                 backw1_stop = idx1max;
 202
 203                 while (*us1 != L('\0'))
 204                   {
 205                     int32_t tmp = findidx (&us1);
 206                     rule1arr[idx1max] = tmp >> 24;
 207                     idx1arr[idx1max] = tmp & 0xffffff;
 208                     idx1cnt = idx1max++;
 209
 210                     if ((rulesets[rule1arr[idx1cnt] * nrules]
 211                          & sort_backward) == 0)
 212                       /* No more backward characters to push.  */
 213                       break;
 214                     ++idx1cnt;
 215                   }
 216
 217                 if (backw1_stop >= idx1cnt)
 218                   {
 219                     /* No sequence at all or just one.  */
 220                     if (idx1cnt == idx1max || backw1_stop > idx1cnt)
 221                       /* Note that seq1len is still zero.  */
 222                       break;
 223
 224                     backw1_stop = ~0ul;
 225                     idx1now = idx1cnt;
 226                   }
 227                 else
 228                   /* We pushed backward sequences.  */
 229                   idx1now = backw1 = idx1cnt - 1;
 230               }
 231           }
 232         while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
 233
 234       /* And the same for string `s2'.  */
 235       if (seq2len == 0)
 236         do
 237           {
 238             ++val2;
 239
 240             if (backw2_stop != ~0ul)
 241               {
 242                 /* The is something pushed.  */
 243                 if (backw2 == backw2_stop)
 244                   {
 245                     /* The last pushed character was handled.  Continue
 246                        with forward characters.  */
 247                     if (idx2cnt < idx2max)
 248                       idx2now = idx2cnt;
 249                     else
 250                       /* Nothing anymore.  The backward sequence ended with
 251                          the last sequence in the string.  Note that seq2len
 252                          is still zero.  */
 253                       break;
 254                   }
 255                 else
 256                   idx2now = --backw2;
 257               }
 258             else
 259               {
 260                 backw2_stop = idx2max;
 261
 262                 while (*us2 != L('\0'))
 263                   {
 264                     int32_t tmp = findidx (&us2);
 265                     rule2arr[idx2max] = tmp >> 24;
 266                     idx2arr[idx2max] = tmp & 0xffffff;
 267                     idx2cnt = idx2max++;
 268
 269                     if ((rulesets[rule2arr[idx2cnt] * nrules]
 270                          & sort_backward) == 0)
 271                       /* No more backward characters to push.  */
 272                       break;
 273                     ++idx2cnt;
 274                   }
 275
 276                 if (backw2_stop >= idx2cnt)
 277                   {
 278                     /* No sequence at all or just one.  */
 279                     if (idx2cnt == idx2max || backw2_stop > idx2cnt)
 280                       /* Note that seq1len is still zero.  */
 281                       break;
 282
 283                     backw2_stop = ~0ul;
 284                     idx2now = idx2cnt;
 285                   }
 286                 else
 287                   /* We pushed backward sequences.  */
 288                   idx2now = backw2 = idx2cnt - 1;
 289               }
 290           }
 291         while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
 292
 293       /* See whether any or both strings are empty.  */
 294       if (seq1len == 0 || seq2len == 0)
 295         {
 296           if (seq1len == seq2len)
 297             /* Both ended.  So far so good, both strings are equal at the
 298                first level.  */
 299             break;
 300
 301           /* This means one string is shorter than the other.  Find out
 302              which one and return an appropriate value.  */
 303           result = seq1len == 0 ? -1 : 1;
 304           goto free_and_return;
 305         }
 306
 307       /* Test for position if necessary.  */
 308       if (position && val1 != val2)
 309         {
 310           result = val1 - val2;
 311           goto free_and_return;
 312         }
 313
 314       /* Compare the two sequences.  */
 315       do
 316         {
 317           if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
 318             {
 319               /* The sequences differ.  */
 320               result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
 321               goto free_and_return;
 322             }
 323
 324           /* Increment the offsets.  */
 325           ++idx1arr[idx1now];
 326           ++idx2arr[idx2now];
 327
 328           --seq1len;
 329           --seq2len;
 330         }
 331       while (seq1len > 0 && seq2len > 0);
 332
 333       if (position && seq1len != seq2len)
 334         {
 335           result = seq1len - seq2len;
 336           goto free_and_return;
 337         }
 338     }
 339
 340   /* Now the remaining passes over the weights.  We now use the
 341      indeces we found before.  */
 342   for (pass = 1; pass < nrules; ++pass)
 343     {
 344       /* We assume that if a rule has defined `position' in one section
 345          this is true for all of them.  */
 346       idx1cnt = 0;
 347       idx2cnt = 0;
 348       backw1_stop = ~0ul;
 349       backw2_stop = ~0ul;
 350       backw1 = ~0ul;
 351       backw2 = ~0ul;
 352       position = rulesets[rule1arr[0] * nrules + pass] & sort_position;
 353
 354       while (1)
 355         {
 356           val1 = 0;
 357           val2 = 0;
 358
 359           /* Get the next non-IGNOREd element for string `s1'.  */
 360           if (seq1len == 0)
 361             do
 362               {
 363                 ++val1;
 364
 365                 if (backw1_stop != ~0ul)
 366                   {
 367                     /* The is something pushed.  */
 368                     if (backw1 == backw1_stop)
 369                       {
 370                         /* The last pushed character was handled.  Continue
 371                            with forward characters.  */
 372                         if (idx1cnt < idx1max)
 373                           idx1now = idx1cnt;
 374                         else
 375                           {
 376                             /* Nothing anymore.  The backward sequence
 377                                ended with the last sequence in the string.  */
 378                             idx1now = ~0ul;
 379                             break;
 380                           }
 381                       }
 382                     else
 383                       idx1now = --backw1;
 384                   }
 385                 else
 386                   {
 387                     backw1_stop = idx1cnt;
 388
 389                     while (idx1cnt < idx1max)
 390                       {
 391                         if ((rulesets[rule1arr[idx1cnt] * nrules + pass]
 392                              & sort_backward) == 0)
 393                           /* No more backward characters to push.  */
 394                           break;
 395                         ++idx1cnt;
 396                       }
 397
 398                     if (backw1_stop == idx1cnt)
 399                       {
 400                         /* No sequence at all or just one.  */
 401                         if (idx1cnt == idx1max)
 402                           /* Note that seq1len is still zero.  */
 403                           break;
 404
 405                         backw1_stop = ~0ul;
 406                         idx1now = idx1cnt++;
 407                       }
 408                     else
 409                       /* We pushed backward sequences.  */
 410                       idx1now = backw1 = idx1cnt - 1;
 411                   }
 412               }
 413             while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
 414
 415           /* And the same for string `s2'.  */
 416           if (seq2len == 0)
 417             do
 418               {
 419                 ++val2;
 420
 421                 if (backw2_stop != ~0ul)
 422                   {
 423                     /* The is something pushed.  */
 424                     if (backw2 == backw2_stop)
 425                       {
 426                         /* The last pushed character was handled.  Continue
 427                            with forward characters.  */
 428                         if (idx2cnt < idx2max)
 429                           idx2now = idx2cnt;
 430                         else
 431                           {
 432                             /* Nothing anymore.  The backward sequence
 433                                ended with the last sequence in the string.  */
 434                             idx2now = ~0ul;
 435                             break;
 436                           }
 437                       }
 438                     else
 439                       idx2now = --backw2;
 440                   }
 441                 else
 442                   {
 443                     backw2_stop = idx2cnt;
 444
 445                     while (idx2cnt < idx2max)
 446                       {
 447                         if ((rulesets[rule2arr[idx2cnt] * nrules + pass]
 448                              & sort_backward) == 0)
 449                           /* No more backward characters to push.  */
 450                           break;
 451                         ++idx2cnt;
 452                       }
 453
 454                     if (backw2_stop == idx2cnt)
 455                       {
 456                         /* No sequence at all or just one.  */
 457                         if (idx2cnt == idx2max)
 458                           /* Note that seq2len is still zero.  */
 459                           break;
 460
 461                         backw2_stop = ~0ul;
 462                         idx2now = idx2cnt++;
 463                       }
 464                     else
 465                       /* We pushed backward sequences.  */
 466                       idx2now = backw2 = idx2cnt - 1;
 467                   }
 468               }
 469             while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
 470
 471           /* See whether any or both strings are empty.  */
 472           if (seq1len == 0 || seq2len == 0)
 473             {
 474               if (seq1len == seq2len)
 475                 /* Both ended.  So far so good, both strings are equal
 476                    at this level.  */
 477                 break;
 478
 479               /* This means one string is shorter than the other.  Find out
 480                  which one and return an appropriate value.  */
 481               result = seq1len == 0 ? -1 : 1;
 482               goto free_and_return;
 483             }
 484
 485           /* Test for position if necessary.  */
 486           if (position && val1 != val2)
 487             {
 488               result = val1 - val2;
 489               goto free_and_return;
 490             }
 491
 492           /* Compare the two sequences.  */
 493           do
 494             {
 495               if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
 496                 {
 497                   /* The sequences differ.  */
 498                   result = (weights[idx1arr[idx1now]]
 499                             - weights[idx2arr[idx2now]]);
 500                   goto free_and_return;
 501                 }
 502
 503               /* Increment the offsets.  */
 504               ++idx1arr[idx1now];
 505               ++idx2arr[idx2now];
 506
 507               --seq1len;
 508               --seq2len;
 509             }
 510           while (seq1len > 0 && seq2len > 0);
 511
 512           if (position && seq1len != seq2len)
 513             {
 514               result = seq1len - seq2len;
 515               goto free_and_return;
 516             }
 517         }
 518     }
 519
 520   /* Free the memory if needed.  */
 521  free_and_return:
 522   if (use_malloc)
 523     free (idx1arr);
 524
 525   return result;
 526 }
 527 libc_hidden_def (STRCOLL)
 528
 529 #ifndef WIDE_CHAR_VERSION
 530 weak_alias (__strcoll_l, strcoll_l)
 531 #endif