string/strcoll_l.c

   1 /* Copyright (C) 1995-2015 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19
  20 #include <assert.h>
  21 #include <langinfo.h>
  22 #include <locale.h>
  23 #include <stddef.h>
  24 #include <stdint.h>
  25 #include <string.h>
  26 #include <sys/param.h>
  27
  28 #ifndef STRING_TYPE
  29 # define STRING_TYPE char
  30 # define USTRING_TYPE unsigned char
  31 # define STRCOLL __strcoll_l
  32 # define STRDIFF __strdiff
  33 # define STRCMP strcmp
  34 # define WEIGHT_H "../locale/weight.h"
  35 # define SUFFIX MB
  36 # define L(arg) arg
  37 #endif
  38
  39 #define CONCAT(a,b) CONCAT1(a,b)
  40 #define CONCAT1(a,b) a##b
  41
  42 #include "../locale/localeinfo.h"
  43 #include WEIGHT_H
  44
  45 #define MASK_UTF8_7BIT  (1 << 7)
  46 #define MASK_UTF8_START (3 << 6)
  47
  48 size_t
  49 STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
  50 {
  51   size_t n;
  52
  53   for (n = 0; *s != '\0' && *s++ == *t++; ++n)
  54     continue;
  55
  56   return n;
  57 }
  58
  59 /* Track status while looking for sequences in a string.  */
  60 typedef struct
  61 {
  62   int len;                      /* Length of the current sequence.  */
  63   size_t val;                   /* Position of the sequence relative to the
  64                                    previous non-ignored sequence.  */
  65   size_t idxmax;                /* Maximum index in sequences.  */
  66   size_t idxcnt;                /* Current count of indices.  */
  67   size_t backw;                 /* Current Backward sequence index.  */
  68   size_t backw_stop;            /* Index where the backward sequences stop.  */
  69   const USTRING_TYPE *us;       /* The string.  */
  70   unsigned char rule;           /* Saved rule for the first sequence.  */
  71   int32_t idx;                  /* Index to weight of the current sequence.  */
  72   int32_t save_idx;             /* Save looked up index of a forward
  73                                    sequence after the last backward
  74                                    sequence.  */
  75   const USTRING_TYPE *back_us;  /* Beginning of the backward sequence.  */
  76 } coll_seq;
  77
  78 /* Get next sequence.  Traverse the string as required.  */
  79 static __always_inline void
  80 get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
  81               const USTRING_TYPE *weights, const int32_t *table,
  82               const USTRING_TYPE *extra, const int32_t *indirect,
  83               int pass)
  84 {
  85   size_t val = seq->val = 0;
  86   int len = seq->len;
  87   size_t backw_stop = seq->backw_stop;
  88   size_t backw = seq->backw;
  89   size_t idxcnt = seq->idxcnt;
  90   size_t idxmax = seq->idxmax;
  91   int32_t idx = seq->idx;
  92   const USTRING_TYPE *us = seq->us;
  93
  94   while (len == 0)
  95     {
  96       ++val;
  97       if (backw_stop != ~0ul)
  98         {
  99           /* There is something pushed.  */
 100           if (backw == backw_stop)
 101             {
 102               /* The last pushed character was handled.  Continue
 103                  with forward characters.  */
 104               if (idxcnt < idxmax)
 105                 {
 106                   idx = seq->save_idx;
 107                   backw_stop = ~0ul;
 108                 }
 109               else
 110                 {
 111                   /* Nothing anymore.  The backward sequence ended with
 112                      the last sequence in the string.  Note that len is
 113                      still zero.  */
 114                   idx = 0;
 115                   break;
 116                 }
 117             }
 118           else
 119             {
 120               /* XXX Traverse BACKW sequences from the beginning of
 121                  BACKW_STOP to get the next sequence.  Is ther a quicker way
 122                  to do this?  */
 123               size_t i = backw_stop;
 124               us = seq->back_us;
 125               while (i < backw)
 126                 {
 127                   int32_t tmp = findidx (table, indirect, extra, &us, -1);
 128                   idx = tmp & 0xffffff;
 129                   i++;
 130                 }
 131               --backw;
 132               us = seq->us;
 133             }
 134         }
 135       else
 136         {
 137           backw_stop = idxmax;
 138           int32_t prev_idx = idx;
 139
 140           while (*us != L('\0'))
 141             {
 142               int32_t tmp = findidx (table, indirect, extra, &us, -1);
 143               unsigned char rule = tmp >> 24;
 144               prev_idx = idx;
 145               idx = tmp & 0xffffff;
 146               idxcnt = idxmax++;
 147
 148               /* Save the rule for the first sequence.  */
 149               if (__glibc_unlikely (idxcnt == 0))
 150                 seq->rule = rule;
 151
 152               if ((rulesets[rule * nrules + pass]
 153                    & sort_backward) == 0)
 154                 /* No more backward characters to push.  */
 155                 break;
 156               ++idxcnt;
 157             }
 158
 159           if (backw_stop >= idxcnt)
 160             {
 161               /* No sequence at all or just one.  */
 162               if (idxcnt == idxmax || backw_stop > idxcnt)
 163                 /* Note that len is still zero.  */
 164                 break;
 165
 166               backw_stop = ~0ul;
 167             }
 168           else
 169             {
 170               /* We pushed backward sequences.  If the stream ended with the
 171                  backward sequence, then we process the last sequence we
 172                  found.  Otherwise we process the sequence before the last
 173                  one since the last one was a forward sequence.  */
 174               seq->back_us = seq->us;
 175               seq->us = us;
 176               backw = idxcnt;
 177               if (idxmax > idxcnt)
 178                 {
 179                   backw--;
 180                   seq->save_idx = idx;
 181                   idx = prev_idx;
 182                 }
 183               if (backw > backw_stop)
 184                 backw--;
 185             }
 186         }
 187
 188       len = weights[idx++];
 189       /* Skip over indices of previous levels.  */
 190       for (int i = 0; i < pass; i++)
 191         {
 192           idx += len;
 193           len = weights[idx];
 194           idx++;
 195         }
 196     }
 197
 198   /* Update the structure.  */
 199   seq->val = val;
 200   seq->len = len;
 201   seq->backw_stop = backw_stop;
 202   seq->backw = backw;
 203   seq->idxcnt = idxcnt;
 204   seq->idxmax = idxmax;
 205   seq->us = us;
 206   seq->idx = idx;
 207 }
 208
 209 /* Compare two sequences.  */
 210 static __always_inline int
 211 do_compare (coll_seq *seq1, coll_seq *seq2, int position,
 212             const USTRING_TYPE *weights)
 213 {
 214   int seq1len = seq1->len;
 215   int seq2len = seq2->len;
 216   size_t val1 = seq1->val;
 217   size_t val2 = seq2->val;
 218   int idx1 = seq1->idx;
 219   int idx2 = seq2->idx;
 220   int result = 0;
 221
 222   /* Test for position if necessary.  */
 223   if (position && val1 != val2)
 224     {
 225       result = val1 > val2 ? 1 : -1;
 226       goto out;
 227     }
 228
 229   /* Compare the two sequences.  */
 230   do
 231     {
 232       if (weights[idx1] != weights[idx2])
 233         {
 234           /* The sequences differ.  */
 235           result = weights[idx1] - weights[idx2];
 236           goto out;
 237         }
 238
 239       /* Increment the offsets.  */
 240       ++idx1;
 241       ++idx2;
 242
 243       --seq1len;
 244       --seq2len;
 245     }
 246   while (seq1len > 0 && seq2len > 0);
 247
 248   if (position && seq1len != seq2len)
 249     result = seq1len - seq2len;
 250
 251 out:
 252   seq1->len = seq1len;
 253   seq2->len = seq2len;
 254   seq1->idx = idx1;
 255   seq2->idx = idx2;
 256   return result;
 257 }
 258
 259 int
 260 STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
 261 {
 262   struct __locale_data *current = l->__locales[LC_COLLATE];
 263   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
 264   /* We don't assign the following values right away since it might be
 265      unnecessary in case there are no rules.  */
 266   const unsigned char *rulesets;
 267   const int32_t *table;
 268   const USTRING_TYPE *weights;
 269   const USTRING_TYPE *extra;
 270   const int32_t *indirect;
 271
 272   /* In case there is no locale specific sort order (C / POSIX).  */
 273   if (nrules == 0)
 274     return STRCMP (s1, s2);
 275
 276   /* Fast forward to the position of the first difference.  Needs to be
 277      encoding aware as the byte-by-byte comparison can stop in the middle
 278      of a char sequence for multibyte encodings like UTF-8.  */
 279   uint_fast32_t encoding =
 280     current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
 281   if (encoding != __cet_other)
 282     {
 283       size_t diff = STRDIFF (s1, s2);
 284       if (diff > 0)
 285         {
 286           if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
 287             do
 288               diff--;
 289             while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
 290           s1 += diff;
 291           s2 += diff;
 292         }
 293     }
 294
 295   /* Catch empty strings.  */
 296   if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
 297     return (*s1 != '\0') - (*s2 != '\0');
 298
 299   rulesets = (const unsigned char *)
 300     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
 301   table = (const int32_t *)
 302     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
 303   weights = (const USTRING_TYPE *)
 304     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
 305   extra = (const USTRING_TYPE *)
 306     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 307   indirect = (const int32_t *)
 308     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 309
 310   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
 311   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
 312   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
 313   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 314
 315   int result = 0, rule = 0;
 316
 317   coll_seq seq1, seq2;
 318   seq1.len = 0;
 319   seq1.idxmax = 0;
 320   seq1.rule = 0;
 321   seq2.len = 0;
 322   seq2.idxmax = 0;
 323
 324   for (int pass = 0; pass < nrules; ++pass)
 325     {
 326       seq1.idxcnt = 0;
 327       seq1.idx = 0;
 328       seq2.idx = 0;
 329       seq1.backw_stop = ~0ul;
 330       seq1.backw = ~0ul;
 331       seq2.idxcnt = 0;
 332       seq2.backw_stop = ~0ul;
 333       seq2.backw = ~0ul;
 334
 335       /* We need the elements of the strings as unsigned values since they
 336          are used as indices.  */
 337       seq1.us = (const USTRING_TYPE *) s1;
 338       seq2.us = (const USTRING_TYPE *) s2;
 339
 340       /* We assume that if a rule has defined `position' in one section
 341          this is true for all of them.  Please note that the localedef programs
 342          makes sure that `position' is not used at the first level.  */
 343
 344       int position = rulesets[rule * nrules + pass] & sort_position;
 345
 346       while (1)
 347         {
 348           get_next_seq (&seq1, nrules, rulesets, weights, table,
 349                                     extra, indirect, pass);
 350           get_next_seq (&seq2, nrules, rulesets, weights, table,
 351                                     extra, indirect, pass);
 352           /* See whether any or both strings are empty.  */
 353           if (seq1.len == 0 || seq2.len == 0)
 354             {
 355               if (seq1.len == seq2.len)
 356                 {
 357                   /* Both strings ended and are equal at this level.  Do a
 358                      byte-level comparison to ensure that we don't waste time
 359                      going through multiple passes for totally equal strings
 360                      before proceeding to subsequent passes.  */
 361                   if (pass == 0 && encoding == __cet_other &&
 362                       STRCMP (s1, s2) == 0)
 363                     return result;
 364                   else
 365                     break;
 366                 }
 367
 368               /* This means one string is shorter than the other.  Find out
 369                  which one and return an appropriate value.  */
 370               return seq1.len == 0 ? -1 : 1;
 371             }
 372
 373           result = do_compare (&seq1, &seq2, position, weights);
 374           if (result != 0)
 375             return result;
 376         }
 377
 378       rule = seq1.rule;
 379     }
 380
 381   return result;
 382 }
 383 libc_hidden_def (STRCOLL)
 384
 385 #ifndef WIDE_CHAR_VERSION
 386 weak_alias (__strcoll_l, strcoll_l)
 387 #endif