string/strcoll_l.c

   1 /* Copyright (C) 1995-2014 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19
  20 #include <assert.h>
  21 #include <langinfo.h>
  22 #include <locale.h>
  23 #include <stddef.h>
  24 #include <stdint.h>
  25 #include <stdlib.h>
  26 #include <string.h>
  27 #include <sys/param.h>
  28
  29 #ifndef STRING_TYPE
     /* STRING_TYPE was not defined before this point: configure the `char'
        variant.  These macros parameterize the rest of this file: the
        element types, the name of the exported function, the plain
        comparison and length helpers used as fast paths, the header that
        is textually included inside the traversal functions, and the
        suffix that CONCAT pastes onto the _NL_COLLATE_* item names.  */
  30 # define STRING_TYPE char
  31 # define USTRING_TYPE unsigned char
  32 # define STRCOLL __strcoll_l
  33 # define STRCMP strcmp
  34 # define STRLEN strlen
  35 # define WEIGHT_H "../locale/weight.h"
  36 # define SUFFIX MB
  37 # define L(arg) arg
  38 #endif
  39
     /* Two-level token pasting, so that macro arguments such as SUFFIX are
        expanded before they are glued to the first argument.  */
  40 #define CONCAT(a,b) CONCAT1(a,b)
  41 #define CONCAT1(a,b) a##b
  42
  43 #include "../locale/localeinfo.h"
  44
  45 /* Track status while looking for sequences in a string.

     STRCOLL keeps one of these per input string.  The get_next_seq*
     functions copy the fields into locals, advance them and store them
     back; do_compare and do_compare_nocache consume LEN and VAL together
     with either IDXARR/IDXNOW (cached) or IDX (uncached).  */
  46 typedef struct
  47 {
  48   int len;                      /* Length of the current sequence.  */
                                   /* Zero makes get_next_seq* fetch the
                                      next sequence.  */
  49   size_t val;                   /* Position of the sequence relative to the
  50                                    previous non-ignored sequence.  */
  51   size_t idxnow;                /* Current index in sequences.  */
  52   size_t idxmax;                /* Maximum index in sequences.  */
  53   size_t idxcnt;                /* Current count of indices.  */
  54   size_t backw;                 /* Current Backward sequence index.  */
  55   size_t backw_stop;            /* Index where the backward sequences stop.  */
                                   /* ~0ul means that no backward sequences
                                      are pending.  */
  56   const USTRING_TYPE *us;       /* The string.  */
  57   int32_t *idxarr;              /* Array to cache weight indices.  */
                                   /* NULL selects the *_nocache code
                                      paths in STRCOLL.  */
  58   unsigned char *rulearr;       /* Array to cache rules.  */
  59   unsigned char rule;           /* Saved rule for the first sequence.  */
                                   /* Written by get_next_seq_nocache
                                      only.  */
  60   int32_t idx;                  /* Index to weight of the current sequence.  */
  61   int32_t save_idx;             /* Save looked up index of a forward
  62                                    sequence after the last backward
  63                                    sequence.  */
  64   const USTRING_TYPE *back_us;  /* Beginning of the backward sequence.  */
  65 } coll_seq;
  66
  67 /* Get next sequence.  The weight indices are cached, so we don't need to
  68    traverse the string.  */
  69 static void
  70 get_next_seq_cached (coll_seq *seq, int nrules, int pass,
  71                      const unsigned char *rulesets,
  72                      const USTRING_TYPE *weights)
  73 {
  74   size_t val = seq->val = 0;
  75   int len = seq->len;
  76   size_t backw_stop = seq->backw_stop;
  77   size_t backw = seq->backw;
  78   size_t idxcnt = seq->idxcnt;
  79   size_t idxmax = seq->idxmax;
  80   size_t idxnow = seq->idxnow;
  81   unsigned char *rulearr = seq->rulearr;
  82   int32_t *idxarr = seq->idxarr;
  83
  84   while (len == 0)
  85     {
  86       ++val;
  87       if (backw_stop != ~0ul)
  88         {
  89           /* There is something pushed.  */
  90           if (backw == backw_stop)
  91             {
  92               /* The last pushed character was handled.  Continue
  93                  with forward characters.  */
  94               if (idxcnt < idxmax)
  95                 {
  96                   idxnow = idxcnt;
  97                   backw_stop = ~0ul;
  98                 }
  99               else
 100                 {
 101                   /* Nothing any more.  The backward sequence
 102                      ended with the last sequence in the string.  */
 103                   idxnow = ~0ul;
 104                   break;
 105                 }
 106             }
 107           else
 108             idxnow = --backw;
 109         }
 110       else
 111         {
 112           backw_stop = idxcnt;
 113
 114           while (idxcnt < idxmax)
 115             {
 116               if ((rulesets[rulearr[idxcnt] * nrules + pass]
 117                    & sort_backward) == 0)
 118                 /* No more backward characters to push.  */
 119                 break;
 120               ++idxcnt;
 121             }
 122
 123           if (backw_stop == idxcnt)
 124             {
 125               /* No sequence at all or just one.  */
 126               if (idxcnt == idxmax)
 127                 /* Note that LEN is still zero.  */
 128                 break;
 129
 130               backw_stop = ~0ul;
 131               idxnow = idxcnt++;
 132             }
 133           else
 134             /* We pushed backward sequences.  */
 135             idxnow = backw = idxcnt - 1;
 136         }
 137       len = weights[idxarr[idxnow]++];
 138     }
 139
 140   /* Update the structure.  */
 141   seq->val = val;
 142   seq->len = len;
 143   seq->backw_stop = backw_stop;
 144   seq->backw = backw;
 145   seq->idxcnt = idxcnt;
 146   seq->idxnow = idxnow;
 147 }
 148
 149 /* Get next sequence.  Traverse the string as required.

     Advance SEQ to the next sequence whose weight length is non-zero.
     Elements are read from SEQ->us as needed; for each element read, its
     rule is stored in SEQ->rulearr and its weight index in SEQ->idxarr at
     slot SEQ->idxmax, which is then incremented.  Every loop iteration
     increments the step counter that ends up in SEQ->val.  On return
     SEQ->len is the length read from WEIGHTS for the selected sequence,
     or still zero if the loop was left through one of the `break's.

     Only the first entry of each rule set (rulesets[rule * nrules]) is
     checked for sort_backward; STRCOLL calls this function for pass 0
     only.  TABLE, EXTRA and INDIRECT are not referenced by the visible
     code; presumably the text included from WEIGHT_H uses them -- TODO
     confirm against that header.  */
 150 static void
 151 get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
 152               const USTRING_TYPE *weights, const int32_t *table,
 153               const USTRING_TYPE *extra, const int32_t *indirect)
 154 {
     /* findidx, used below, is not defined in this file; it is expected to
        come from this include.  */
 155 #include WEIGHT_H
 156   size_t val = seq->val = 0;
 157   int len = seq->len;
 158   size_t backw_stop = seq->backw_stop;
 159   size_t backw = seq->backw;
 160   size_t idxcnt = seq->idxcnt;
 161   size_t idxmax = seq->idxmax;
 162   size_t idxnow = seq->idxnow;
 163   unsigned char *rulearr = seq->rulearr;
 164   int32_t *idxarr = seq->idxarr;
 165   const USTRING_TYPE *us = seq->us;
 166
 167   while (len == 0)
 168     {
 169       ++val;
           /* ~0ul in BACKW_STOP means no backward sequences are pending.  */
 170       if (backw_stop != ~0ul)
 171         {
 172           /* There is something pushed.  */
 173           if (backw == backw_stop)
 174             {
 175               /* The last pushed character was handled.  Continue
 176                  with forward characters.  */
 177               if (idxcnt < idxmax)
 178                 {
 179                   idxnow = idxcnt;
 180                   backw_stop = ~0ul;
 181                 }
 182               else
 183                 /* Nothing any more.  The backward sequence ended with
 184                    the last sequence in the string.  Note that LEN
 185                    is still zero.  */
 186                 break;
 187             }
 188           else
 189             idxnow = --backw;
 190         }
 191       else
 192         {
               /* Remember the first slot filled by this scan.  */
 193           backw_stop = idxmax;
 194
 195           while (*us != L('\0'))
 196             {
                   /* Presumably findidx advances US past the element it
                      looked up -- TODO confirm in WEIGHT_H.  The result
                      is split into a rule (bits 24 and up) and a weight
                      index (low 24 bits).  */
 197               int32_t tmp = findidx (&us, -1);
 198               rulearr[idxmax] = tmp >> 24;
 199               idxarr[idxmax] = tmp & 0xffffff;
 200               idxcnt = idxmax++;
 201
 202               if ((rulesets[rulearr[idxcnt] * nrules]
 203                    & sort_backward) == 0)
 204                 /* No more backward characters to push.  */
 205                 break;
 206               ++idxcnt;
 207             }
 208
 209           if (backw_stop >= idxcnt)
 210             {
 211               /* No sequence at all or just one.  */
 212               if (idxcnt == idxmax || backw_stop > idxcnt)
 213                 /* Note that LEN is still zero.  */
 214                 break;
 215
 216               backw_stop = ~0ul;
 217               idxnow = idxcnt;
 218             }
 219           else
 220             /* We pushed backward sequences.  */
 221             idxnow = backw = idxcnt - 1;
 222         }
           /* Read the length stored at the cached weight index and advance
              the cached index past it.  */
 223       len = weights[idxarr[idxnow]++];
 224     }
 225
 226   /* Update the structure.  */
 227   seq->val = val;
 228   seq->len = len;
 229   seq->backw_stop = backw_stop;
 230   seq->backw = backw;
 231   seq->idxcnt = idxcnt;
 232   seq->idxmax = idxmax;
 233   seq->idxnow = idxnow;
 234   seq->us = us;
 235 }
 236
 237 /* Get next sequence.  Traverse the string as required.  This function does not
 238    set or use any index or rule cache.

     Only the weight index of the current sequence is tracked, in
     SEQ->idx.  A run of backward sequences is handled by remembering
     where the run started (SEQ->back_us) and re-scanning from there
     every time an earlier member of the run is needed.  PASS selects the
     rule-set entry tested for sort_backward and the number of weight
     levels that are skipped before the length is read.

     On return SEQ->len is the length read from WEIGHTS at level PASS, or
     still zero if the loop was left through one of the `break's, and
     SEQ->val is the number of loop iterations performed.  TABLE, EXTRA
     and INDIRECT are not referenced by the visible code; presumably the
     text included from WEIGHT_H uses them -- TODO confirm.  */
 239 static void
 240 get_next_seq_nocache (coll_seq *seq, int nrules, const unsigned char *rulesets,
 241                       const USTRING_TYPE *weights, const int32_t *table,
 242                       const USTRING_TYPE *extra, const int32_t *indirect,
 243                       int pass)
 244 {
     /* findidx, used below, is not defined in this file; it is expected to
        come from this include.  */
 245 #include WEIGHT_H
 246   size_t val = seq->val = 0;
 247   int len = seq->len;
 248   size_t backw_stop = seq->backw_stop;
 249   size_t backw = seq->backw;
 250   size_t idxcnt = seq->idxcnt;
 251   size_t idxmax = seq->idxmax;
 252   int32_t idx = seq->idx;
 253   const USTRING_TYPE *us = seq->us;
 254
 255   while (len == 0)
 256     {
 257       ++val;
           /* ~0ul in BACKW_STOP means no backward sequences are pending.  */
 258       if (backw_stop != ~0ul)
 259         {
 260           /* There is something pushed.  */
 261           if (backw == backw_stop)
 262             {
 263               /* The last pushed character was handled.  Continue
 264                  with forward characters.  */
 265               if (idxcnt < idxmax)
 266                 {
 267                   idx = seq->save_idx;
 268                   backw_stop = ~0ul;
 269                 }
 270               else
 271                 {
 272                   /* Nothing anymore.  The backward sequence ended with
 273                      the last sequence in the string.  Note that len is
 274                      still zero.  */
 275                   idx = 0;
 276                   break;
 277                 }
 278             }
 279           else
 280             {
 281               /* XXX Traverse BACKW sequences from the beginning of
 282                  BACKW_STOP to get the next sequence.  Is there a quicker way
 283                  to do this?  */
 284               size_t i = backw_stop;
 285               us = seq->back_us;
 286               while (i < backw)
 287                 {
 288                   int32_t tmp = findidx (&us, -1);
 289                   idx = tmp & 0xffffff;
 290                   i++;
 291                 }
 292               --backw;
                   /* Restore the read position stored in the structure
                      when the run was pushed.  */
 293               us = seq->us;
 294             }
 295         }
 296       else
 297         {
 298           backw_stop = idxmax;
 299           int32_t prev_idx = idx;
 300
 301           while (*us != L('\0'))
 302             {
                   /* The lookup result is split into a rule (bits 24 and
                      up) and a weight index (low 24 bits).  */
 303               int32_t tmp = findidx (&us, -1);
 304               unsigned char rule = tmp >> 24;
 305               prev_idx = idx;
 306               idx = tmp & 0xffffff;
 307               idxcnt = idxmax++;
 308
 309               /* Save the rule for the first sequence.  */
 310               if (__glibc_unlikely (idxcnt == 0))
 311                 seq->rule = rule;
 312
 313               if ((rulesets[rule * nrules + pass]
 314                    & sort_backward) == 0)
 315                 /* No more backward characters to push.  */
 316                 break;
 317               ++idxcnt;
 318             }
 319
 320           if (backw_stop >= idxcnt)
 321             {
 322               /* No sequence at all or just one.  */
 323               if (idxcnt == idxmax || backw_stop > idxcnt)
 324                 /* Note that len is still zero.  */
 325                 break;
 326
 327               backw_stop = ~0ul;
 328             }
 329           else
 330             {
 331               /* We pushed backward sequences.  If the stream ended with the
 332                  backward sequence, then we process the last sequence we
 333                  found.  Otherwise we process the sequence before the last
 334                  one since the last one was a forward sequence.  */
                   /* NOTE(review): SEQ->us is the position stored on an
                      earlier return.  If earlier iterations of this same
                      call already consumed elements, it may lie before
                      the first element of this run -- confirm that this
                      is intended.  */
 335               seq->back_us = seq->us;
 336               seq->us = us;
 337               backw = idxcnt;
 338               if (idxmax > idxcnt)
 339                 {
 340                   backw--;
 341                   seq->save_idx = idx;
 342                   idx = prev_idx;
 343                 }
 344               if (backw > backw_stop)
 345                 backw--;
 346             }
 347         }
 348
           /* Each level is stored as a length followed by that many
              weights; IDX is left on the first weight of level PASS.  */
 349       len = weights[idx++];
 350       /* Skip over indices of previous levels.  */
 351       for (int i = 0; i < pass; i++)
 352         {
 353           idx += len;
 354           len = weights[idx];
 355           idx++;
 356         }
 357     }
 358
 359   /* Update the structure.  */
 360   seq->val = val;
 361   seq->len = len;
 362   seq->backw_stop = backw_stop;
 363   seq->backw = backw;
 364   seq->idxcnt = idxcnt;
 365   seq->idxmax = idxmax;
 366   seq->us = us;
 367   seq->idx = idx;
 368 }
 369
 370 /* Compare two sequences.  This version does not use the index and rules
 371    cache.  */
 372 static int
 373 do_compare_nocache (coll_seq *seq1, coll_seq *seq2, int position,
 374                     const USTRING_TYPE *weights)
 375 {
 376   int seq1len = seq1->len;
 377   int seq2len = seq2->len;
 378   size_t val1 = seq1->val;
 379   size_t val2 = seq2->val;
 380   int idx1 = seq1->idx;
 381   int idx2 = seq2->idx;
 382   int result = 0;
 383
 384   /* Test for position if necessary.  */
 385   if (position && val1 != val2)
 386     {
 387       result = val1 > val2 ? 1 : -1;
 388       goto out;
 389     }
 390
 391   /* Compare the two sequences.  */
 392   do
 393     {
 394       if (weights[idx1] != weights[idx2])
 395         {
 396           /* The sequences differ.  */
 397           result = weights[idx1] - weights[idx2];
 398           goto out;
 399         }
 400
 401       /* Increment the offsets.  */
 402       ++idx1;
 403       ++idx2;
 404
 405       --seq1len;
 406       --seq2len;
 407     }
 408   while (seq1len > 0 && seq2len > 0);
 409
 410   if (position && seq1len != seq2len)
 411     result = seq1len - seq2len;
 412
 413 out:
 414   seq1->len = seq1len;
 415   seq2->len = seq2len;
 416   seq1->idx = idx1;
 417   seq2->idx = idx2;
 418   return result;
 419 }
 420
 421 /* Compare two sequences using the index cache.  */
 422 static int
 423 do_compare (coll_seq *seq1, coll_seq *seq2, int position,
 424             const USTRING_TYPE *weights)
 425 {
 426   int seq1len = seq1->len;
 427   int seq2len = seq2->len;
 428   size_t val1 = seq1->val;
 429   size_t val2 = seq2->val;
 430   int32_t *idx1arr = seq1->idxarr;
 431   int32_t *idx2arr = seq2->idxarr;
 432   int idx1now = seq1->idxnow;
 433   int idx2now = seq2->idxnow;
 434   int result = 0;
 435
 436   /* Test for position if necessary.  */
 437   if (position && val1 != val2)
 438     {
 439       result = val1 > val2 ? 1 : -1;
 440       goto out;
 441     }
 442
 443   /* Compare the two sequences.  */
 444   do
 445     {
 446       if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
 447         {
 448           /* The sequences differ.  */
 449           result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
 450           goto out;
 451         }
 452
 453       /* Increment the offsets.  */
 454       ++idx1arr[idx1now];
 455       ++idx2arr[idx2now];
 456
 457       --seq1len;
 458       --seq2len;
 459     }
 460   while (seq1len > 0 && seq2len > 0);
 461
 462   if (position && seq1len != seq2len)
 463     result = seq1len - seq2len;
 464
 465 out:
 466   seq1->len = seq1len;
 467   seq2->len = seq2len;
 468   return result;
 469 }
 470
 471 int
 472 STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
 473 {
 474   struct __locale_data *current = l->__locales[LC_COLLATE];
 475   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
 476   /* We don't assign the following values right away since it might be
 477      unnecessary in case there are no rules.  */
 478   const unsigned char *rulesets;
 479   const int32_t *table;
 480   const USTRING_TYPE *weights;
 481   const USTRING_TYPE *extra;
 482   const int32_t *indirect;
 483
 484   if (nrules == 0)
 485     return STRCMP (s1, s2);
 486
 487   rulesets = (const unsigned char *)
 488     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
 489   table = (const int32_t *)
 490     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
 491   weights = (const USTRING_TYPE *)
 492     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
 493   extra = (const USTRING_TYPE *)
 494     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 495   indirect = (const int32_t *)
 496     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 497
 498   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
 499   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
 500   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
 501   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 502
 503   /* We need this a few times.  */
 504   size_t s1len = STRLEN (s1);
 505   size_t s2len = STRLEN (s2);
 506
 507   /* Catch empty strings.  */
 508   if (__glibc_unlikely (s1len == 0) || __glibc_unlikely (s2len == 0))
 509     return (s1len != 0) - (s2len != 0);
 510
 511   /* Perform the first pass over the string and while doing this find
 512      and store the weights for each character.  Since we want this to
 513      be as fast as possible we are using `alloca' to store the temporary
 514      values.  But since there is no limit on the length of the string
 515      we have to use `malloc' if the string is too long.  We should be
 516      very conservative here.
 517
 518      Please note that the localedef programs makes sure that `position'
 519      is not used at the first level.  */
 520
 521   coll_seq seq1, seq2;
 522   bool use_malloc = false;
 523   int result = 0;
 524
 525   memset (&seq1, 0, sizeof (seq1));
 526   seq2 = seq1;
 527
 528   size_t size_max = SIZE_MAX / (sizeof (int32_t) + 1);
 529
 530   if (MIN (s1len, s2len) > size_max
 531       || MAX (s1len, s2len) > size_max - MIN (s1len, s2len))
 532     {
 533       /* If the strings are long enough to cause overflow in the size request,
 534          then skip the allocation and proceed with the non-cached routines.  */
 535     }
 536   else if (! __libc_use_alloca ((s1len + s2len) * (sizeof (int32_t) + 1)))
 537     {
 538       seq1.idxarr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
 539
 540       /* If we failed to allocate memory, we leave everything as NULL so that
 541          we use the nocache version of traversal and comparison functions.  */
 542       if (seq1.idxarr != NULL)
 543         {
 544           seq2.idxarr = &seq1.idxarr[s1len];
 545           seq1.rulearr = (unsigned char *) &seq2.idxarr[s2len];
 546           seq2.rulearr = &seq1.rulearr[s1len];
 547           use_malloc = true;
 548         }
 549     }
 550   else
 551     {
 552       seq1.idxarr = (int32_t *) alloca (s1len * sizeof (int32_t));
 553       seq2.idxarr = (int32_t *) alloca (s2len * sizeof (int32_t));
 554       seq1.rulearr = (unsigned char *) alloca (s1len);
 555       seq2.rulearr = (unsigned char *) alloca (s2len);
 556     }
 557
 558   int rule = 0;
 559
 560   /* Cache values in the first pass and if needed, use them in subsequent
 561      passes.  */
 562   for (int pass = 0; pass < nrules; ++pass)
 563     {
 564       seq1.idxcnt = 0;
 565       seq1.idx = 0;
 566       seq2.idx = 0;
 567       seq1.backw_stop = ~0ul;
 568       seq1.backw = ~0ul;
 569       seq2.idxcnt = 0;
 570       seq2.backw_stop = ~0ul;
 571       seq2.backw = ~0ul;
 572
 573       /* We need the elements of the strings as unsigned values since they
 574          are used as indices.  */
 575       seq1.us = (const USTRING_TYPE *) s1;
 576       seq2.us = (const USTRING_TYPE *) s2;
 577
 578       /* We assume that if a rule has defined `position' in one section
 579          this is true for all of them.  */
 580       int position = rulesets[rule * nrules + pass] & sort_position;
 581
 582       while (1)
 583         {
 584           if (__glibc_unlikely (seq1.idxarr == NULL))
 585             {
 586               get_next_seq_nocache (&seq1, nrules, rulesets, weights, table,
 587                                     extra, indirect, pass);
 588               get_next_seq_nocache (&seq2, nrules, rulesets, weights, table,
 589                                     extra, indirect, pass);
 590             }
 591           else if (pass == 0)
 592             {
 593               get_next_seq (&seq1, nrules, rulesets, weights, table, extra,
 594                             indirect);
 595               get_next_seq (&seq2, nrules, rulesets, weights, table, extra,
 596                             indirect);
 597             }
 598           else
 599             {
 600               get_next_seq_cached (&seq1, nrules, pass, rulesets, weights);
 601               get_next_seq_cached (&seq2, nrules, pass, rulesets, weights);
 602             }
 603
 604           /* See whether any or both strings are empty.  */
 605           if (seq1.len == 0 || seq2.len == 0)
 606             {
 607               if (seq1.len == seq2.len)
 608                 /* Both ended.  So far so good, both strings are equal
 609                    at this level.  */
 610                 break;
 611
 612               /* This means one string is shorter than the other.  Find out
 613                  which one and return an appropriate value.  */
 614               result = seq1.len == 0 ? -1 : 1;
 615               goto free_and_return;
 616             }
 617
 618           if (__glibc_unlikely (seq1.idxarr == NULL))
 619             result = do_compare_nocache (&seq1, &seq2, position, weights);
 620           else
 621             result = do_compare (&seq1, &seq2, position, weights);
 622           if (result != 0)
 623             goto free_and_return;
 624         }
 625
 626       if (__glibc_likely (seq1.rulearr != NULL))
 627         rule = seq1.rulearr[0];
 628       else
 629         rule = seq1.rule;
 630     }
 631
 632   /* Free the memory if needed.  */
 633  free_and_return:
 634   if (use_malloc)
 635     free (seq1.idxarr);
 636
 637   return result;
 638 }
 639 libc_hidden_def (STRCOLL)
 640
 641 #ifndef WIDE_CHAR_VERSION
     /* Unless WIDE_CHAR_VERSION is defined, also make __strcoll_l
        available as strcoll_l.  weak_alias is defined outside this file;
        presumably it emits a weak symbol alias -- TODO confirm.  */
 642 weak_alias (__strcoll_l, strcoll_l)
 643 #endif