string/strcoll_l.c

   1 /* Copyright (C) 1995-2015 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19
  20 #include <assert.h>
  21 #include <langinfo.h>
  22 #include <locale.h>
  23 #include <stddef.h>
  24 #include <stdint.h>
  25 #include <string.h>
  26 #include <sys/param.h>
  27
  28 #ifndef STRING_TYPE
  29 # define STRING_TYPE char
  30 # define USTRING_TYPE unsigned char
  31 # define STRCOLL __strcoll_l
  32 # define STRDIFF __strdiff
  33 # define STRCMP strcmp
  34 # define WEIGHT_H "../locale/weight.h"
  35 # define SUFFIX MB
  36 # define L(arg) arg
  37 #endif
  38
  39 #define CONCAT(a,b) CONCAT1(a,b)
  40 #define CONCAT1(a,b) a##b
  41
  42 #include "../locale/localeinfo.h"
  43 #include WEIGHT_H
  44
  45 #define MASK_UTF8_7BIT  (1 << 7)
  46 #define MASK_UTF8_START (3 << 6)
  47
  48 size_t
  49 STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
  50 {
  51   size_t n;
  52
  53   for (n = 0; *s != '\0' && *s++ == *t++; ++n)
  54     continue;
  55
  56   return n;
  57 }
  58
  59 /* Track status while looking for sequences in a string.  */
  60 typedef struct
  61 {
  62   int len;                      /* Length of the current sequence.  */
  63   size_t val;                   /* Position of the sequence relative to the
  64                                    previous non-ignored sequence.  */
  65   size_t idxnow;                /* Current index in sequences.  */
  66   size_t idxmax;                /* Maximum index in sequences.  */
  67   size_t idxcnt;                /* Current count of indices.  */
  68   size_t backw;                 /* Current Backward sequence index.  */
  69   size_t backw_stop;            /* Index where the backward sequences stop.  */
  70   const USTRING_TYPE *us;       /* The string.  */
  71   unsigned char rule;           /* Saved rule for the first sequence.  */
  72   int32_t idx;                  /* Index to weight of the current sequence.  */
  73   int32_t save_idx;             /* Save looked up index of a forward
  74                                    sequence after the last backward
  75                                    sequence.  */
  76   const USTRING_TYPE *back_us;  /* Beginning of the backward sequence.  */
  77 } coll_seq;
  78
  79 /* Get next sequence.  Traverse the string as required.  */
  80 static __always_inline void
  81 get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
  82               const USTRING_TYPE *weights, const int32_t *table,
  83               const USTRING_TYPE *extra, const int32_t *indirect,
  84               int pass)
  85 {
  86   size_t val = seq->val = 0;
  87   int len = seq->len;
  88   size_t backw_stop = seq->backw_stop;
  89   size_t backw = seq->backw;
  90   size_t idxcnt = seq->idxcnt;
  91   size_t idxmax = seq->idxmax;
  92   int32_t idx = seq->idx;
  93   const USTRING_TYPE *us = seq->us;
  94
  95   while (len == 0)
  96     {
  97       ++val;
  98       if (backw_stop != ~0ul)
  99         {
 100           /* There is something pushed.  */
 101           if (backw == backw_stop)
 102             {
 103               /* The last pushed character was handled.  Continue
 104                  with forward characters.  */
 105               if (idxcnt < idxmax)
 106                 {
 107                   idx = seq->save_idx;
 108                   backw_stop = ~0ul;
 109                 }
 110               else
 111                 {
 112                   /* Nothing anymore.  The backward sequence ended with
 113                      the last sequence in the string.  Note that len is
 114                      still zero.  */
 115                   idx = 0;
 116                   break;
 117                 }
 118             }
 119           else
 120             {
 121               /* XXX Traverse BACKW sequences from the beginning of
 122                  BACKW_STOP to get the next sequence.  Is ther a quicker way
 123                  to do this?  */
 124               size_t i = backw_stop;
 125               us = seq->back_us;
 126               while (i < backw)
 127                 {
 128                   int32_t tmp = findidx (table, indirect, extra, &us, -1);
 129                   idx = tmp & 0xffffff;
 130                   i++;
 131                 }
 132               --backw;
 133               us = seq->us;
 134             }
 135         }
 136       else
 137         {
 138           backw_stop = idxmax;
 139           int32_t prev_idx = idx;
 140
 141           while (*us != L('\0'))
 142             {
 143               int32_t tmp = findidx (table, indirect, extra, &us, -1);
 144               unsigned char rule = tmp >> 24;
 145               prev_idx = idx;
 146               idx = tmp & 0xffffff;
 147               idxcnt = idxmax++;
 148
 149               /* Save the rule for the first sequence.  */
 150               if (__glibc_unlikely (idxcnt == 0))
 151                 seq->rule = rule;
 152
 153               if ((rulesets[rule * nrules + pass]
 154                    & sort_backward) == 0)
 155                 /* No more backward characters to push.  */
 156                 break;
 157               ++idxcnt;
 158             }
 159
 160           if (backw_stop >= idxcnt)
 161             {
 162               /* No sequence at all or just one.  */
 163               if (idxcnt == idxmax || backw_stop > idxcnt)
 164                 /* Note that len is still zero.  */
 165                 break;
 166
 167               backw_stop = ~0ul;
 168             }
 169           else
 170             {
 171               /* We pushed backward sequences.  If the stream ended with the
 172                  backward sequence, then we process the last sequence we
 173                  found.  Otherwise we process the sequence before the last
 174                  one since the last one was a forward sequence.  */
 175               seq->back_us = seq->us;
 176               seq->us = us;
 177               backw = idxcnt;
 178               if (idxmax > idxcnt)
 179                 {
 180                   backw--;
 181                   seq->save_idx = idx;
 182                   idx = prev_idx;
 183                 }
 184               if (backw > backw_stop)
 185                 backw--;
 186             }
 187         }
 188
 189       len = weights[idx++];
 190       /* Skip over indices of previous levels.  */
 191       for (int i = 0; i < pass; i++)
 192         {
 193           idx += len;
 194           len = weights[idx];
 195           idx++;
 196         }
 197     }
 198
 199   /* Update the structure.  */
 200   seq->val = val;
 201   seq->len = len;
 202   seq->backw_stop = backw_stop;
 203   seq->backw = backw;
 204   seq->idxcnt = idxcnt;
 205   seq->idxmax = idxmax;
 206   seq->us = us;
 207   seq->idx = idx;
 208 }
 209
 210 /* Compare two sequences.  */
 211 static __always_inline int
 212 do_compare (coll_seq *seq1, coll_seq *seq2, int position,
 213             const USTRING_TYPE *weights)
 214 {
 215   int seq1len = seq1->len;
 216   int seq2len = seq2->len;
 217   size_t val1 = seq1->val;
 218   size_t val2 = seq2->val;
 219   int idx1 = seq1->idx;
 220   int idx2 = seq2->idx;
 221   int result = 0;
 222
 223   /* Test for position if necessary.  */
 224   if (position && val1 != val2)
 225     {
 226       result = val1 > val2 ? 1 : -1;
 227       goto out;
 228     }
 229
 230   /* Compare the two sequences.  */
 231   do
 232     {
 233       if (weights[idx1] != weights[idx2])
 234         {
 235           /* The sequences differ.  */
 236           result = weights[idx1] - weights[idx2];
 237           goto out;
 238         }
 239
 240       /* Increment the offsets.  */
 241       ++idx1;
 242       ++idx2;
 243
 244       --seq1len;
 245       --seq2len;
 246     }
 247   while (seq1len > 0 && seq2len > 0);
 248
 249   if (position && seq1len != seq2len)
 250     result = seq1len - seq2len;
 251
 252 out:
 253   seq1->len = seq1len;
 254   seq2->len = seq2len;
 255   seq1->idx = idx1;
 256   seq2->idx = idx2;
 257   return result;
 258 }
 259
 260 int
 261 STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
 262 {
 263   struct __locale_data *current = l->__locales[LC_COLLATE];
 264   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
 265   /* We don't assign the following values right away since it might be
 266      unnecessary in case there are no rules.  */
 267   const unsigned char *rulesets;
 268   const int32_t *table;
 269   const USTRING_TYPE *weights;
 270   const USTRING_TYPE *extra;
 271   const int32_t *indirect;
 272
 273   /* In case there is no locale specific sort order (C / POSIX).  */
 274   if (nrules == 0)
 275     return STRCMP (s1, s2);
 276
 277   /* Fast forward to the position of the first difference.  Needs to be
 278      encoding aware as the byte-by-byte comparison can stop in the middle
 279      of a char sequence for multibyte encodings like UTF-8.  */
 280   uint_fast32_t encoding =
 281     current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
 282   if (encoding != __cet_other)
 283     {
 284       size_t diff = STRDIFF (s1, s2);
 285       if (diff > 0)
 286         {
 287           if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
 288             do
 289               diff--;
 290             while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
 291           s1 += diff;
 292           s2 += diff;
 293         }
 294     }
 295
 296   /* Catch empty strings.  */
 297   if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
 298     return (*s1 != '\0') - (*s2 != '\0');
 299
 300   rulesets = (const unsigned char *)
 301     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
 302   table = (const int32_t *)
 303     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
 304   weights = (const USTRING_TYPE *)
 305     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
 306   extra = (const USTRING_TYPE *)
 307     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 308   indirect = (const int32_t *)
 309     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 310
 311   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
 312   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
 313   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
 314   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 315
 316   int result = 0, rule = 0;
 317
 318   coll_seq seq1, seq2;
 319   memset (&seq1, 0, sizeof (seq1));
 320   seq2 = seq1;
 321
 322   for (int pass = 0; pass < nrules; ++pass)
 323     {
 324       seq1.idxcnt = 0;
 325       seq1.idx = 0;
 326       seq2.idx = 0;
 327       seq1.backw_stop = ~0ul;
 328       seq1.backw = ~0ul;
 329       seq2.idxcnt = 0;
 330       seq2.backw_stop = ~0ul;
 331       seq2.backw = ~0ul;
 332
 333       /* We need the elements of the strings as unsigned values since they
 334          are used as indices.  */
 335       seq1.us = (const USTRING_TYPE *) s1;
 336       seq2.us = (const USTRING_TYPE *) s2;
 337
 338       /* We assume that if a rule has defined `position' in one section
 339          this is true for all of them.  Please note that the localedef programs
 340          makes sure that `position' is not used at the first level.  */
 341
 342       int position = rulesets[rule * nrules + pass] & sort_position;
 343
 344       while (1)
 345         {
 346           get_next_seq (&seq1, nrules, rulesets, weights, table,
 347                                     extra, indirect, pass);
 348           get_next_seq (&seq2, nrules, rulesets, weights, table,
 349                                     extra, indirect, pass);
 350           /* See whether any or both strings are empty.  */
 351           if (seq1.len == 0 || seq2.len == 0)
 352             {
 353               if (seq1.len == seq2.len)
 354                 {
 355                   /* Both strings ended and are equal at this level.  Do a
 356                      byte-level comparison to ensure that we don't waste time
 357                      going through multiple passes for totally equal strings
 358                      before proceeding to subsequent passes.  */
 359                   if (pass == 0 && encoding == __cet_other &&
 360                       STRCMP (s1, s2) == 0)
 361                     return result;
 362                   else
 363                     break;
 364                 }
 365
 366               /* This means one string is shorter than the other.  Find out
 367                  which one and return an appropriate value.  */
 368               return seq1.len == 0 ? -1 : 1;
 369             }
 370
 371           result = do_compare (&seq1, &seq2, position, weights);
 372           if (result != 0)
 373             return result;
 374         }
 375
 376       rule = seq1.rule;
 377     }
 378
 379   return result;
 380 }
 381 libc_hidden_def (STRCOLL)
 382
 383 #ifndef WIDE_CHAR_VERSION
 384 weak_alias (__strcoll_l, strcoll_l)
 385 #endif