string/strxfrm_l.c

   1 /* Copyright (C) 1995-2014 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <assert.h>
  20 #include <langinfo.h>
  21 #include <locale.h>
  22 #include <stddef.h>
  23 #include <stdint.h>
  24 #include <stdlib.h>
  25 #include <string.h>
  26 #include <sys/param.h>
  27
  28 #ifndef STRING_TYPE
  29 # define STRING_TYPE char
  30 # define USTRING_TYPE unsigned char
  31 # define STRXFRM __strxfrm_l
  32 # define STRCMP strcmp
  33 # define STRLEN strlen
  34 # define STPNCPY __stpncpy
  35 # define WEIGHT_H "../locale/weight.h"
  36 # define SUFFIX MB
  37 # define L(arg) arg
  38 #endif
  39
  40 #define CONCAT(a,b) CONCAT1(a,b)
  41 #define CONCAT1(a,b) a##b
  42
  43 #include "../locale/localeinfo.h"
  44
  45
  46 #ifndef WIDE_CHAR_VERSION
  47
  48 /* We need UTF-8 encoding of numbers.  */
  49 static int
  50 utf8_encode (char *buf, int val)
  51 {
  52   int retval;
  53
  54   if (val < 0x80)
  55     {
  56       *buf++ = (char) val;
  57       retval = 1;
  58     }
  59   else
  60     {
  61       int step;
  62
  63       for (step = 2; step < 6; ++step)
  64         if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
  65           break;
  66       retval = step;
  67
  68       *buf = (unsigned char) (~0xff >> step);
  69       --step;
  70       do
  71         {
  72           buf[step] = 0x80 | (val & 0x3f);
  73           val >>= 6;
  74         }
  75       while (--step > 0);
  76       *buf |= val;
  77     }
  78
  79   return retval;
  80 }
  81 #endif
  82
  83
  84 size_t
  85 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
  86 {
  87   struct __locale_data *current = l->__locales[LC_COLLATE];
  88   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
  89   /* We don't assign the following values right away since it might be
  90      unnecessary in case there are no rules.  */
  91   const unsigned char *rulesets;
  92   const int32_t *table;
  93   const USTRING_TYPE *weights;
  94   const USTRING_TYPE *extra;
  95   const int32_t *indirect;
  96   uint_fast32_t pass;
  97   size_t needed;
  98   size_t last_needed;
  99   const USTRING_TYPE *usrc;
 100   size_t srclen = STRLEN (src);
 101   int32_t *idxarr;
 102   unsigned char *rulearr;
 103   size_t idxmax;
 104   size_t idxcnt;
 105   int use_malloc;
 106
 107 #include WEIGHT_H
 108
 109   if (nrules == 0)
 110     {
 111       if (n != 0)
 112         STPNCPY (dest, src, MIN (srclen + 1, n));
 113
 114       return srclen;
 115     }
 116
 117   rulesets = (const unsigned char *)
 118     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
 119   table = (const int32_t *)
 120     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
 121   weights = (const USTRING_TYPE *)
 122     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
 123   extra = (const USTRING_TYPE *)
 124     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 125   indirect = (const int32_t *)
 126     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 127   use_malloc = 0;
 128
 129   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
 130   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
 131   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
 132   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 133
 134   /* Handle an empty string as a special case.  */
 135   if (srclen == 0)
 136     {
 137       if (n != 0)
 138         *dest = L('\0');
 139       return 0;
 140     }
 141
 142   /* We need the elements of the string as unsigned values since they
 143      are used as indeces.  */
 144   usrc = (const USTRING_TYPE *) src;
 145
 146   /* Perform the first pass over the string and while doing this find
 147      and store the weights for each character.  Since we want this to
 148      be as fast as possible we are using `alloca' to store the temporary
 149      values.  But since there is no limit on the length of the string
 150      we have to use `malloc' if the string is too long.  We should be
 151      very conservative here.  */
 152   if (! __libc_use_alloca ((srclen + 1) * (sizeof (int32_t) + 1)))
 153     {
 154       idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
 155       rulearr = (unsigned char *) &idxarr[srclen];
 156
 157       if (idxarr == NULL)
 158         /* No memory.  Well, go with the stack then.
 159
 160            XXX Once this implementation is stable we will handle this
 161            differently.  Instead of precomputing the indeces we will
 162            do this in time.  This means, though, that this happens for
 163            every pass again.  */
 164         goto try_stack;
 165       use_malloc = 1;
 166     }
 167   else
 168     {
 169     try_stack:
 170       idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
 171       rulearr = (unsigned char *) alloca (srclen + 1);
 172     }
 173
 174   idxmax = 0;
 175   do
 176     {
 177       int32_t tmp = findidx (&usrc, -1);
 178       rulearr[idxmax] = tmp >> 24;
 179       idxarr[idxmax] = tmp & 0xffffff;
 180
 181       ++idxmax;
 182     }
 183   while (*usrc != L('\0'));
 184
 185   /* This element is only read, the value never used but to determine
 186      another value which then is ignored.  */
 187   rulearr[idxmax] = '\0';
 188
 189   /* Now the passes over the weights.  We now use the indeces we found
 190      before.  */
 191   needed = 0;
 192   for (pass = 0; pass < nrules; ++pass)
 193     {
 194       size_t backw_stop = ~0ul;
 195       int rule = rulesets[rulearr[0] * nrules + pass];
 196       /* We assume that if a rule has defined `position' in one section
 197          this is true for all of them.  */
 198       int position = rule & sort_position;
 199
 200       last_needed = needed;
 201       if (position == 0)
 202         {
 203           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 204             {
 205               if ((rule & sort_forward) != 0)
 206                 {
 207                   size_t len;
 208
 209                   if (backw_stop != ~0ul)
 210                     {
 211                       /* Handle the pushed elements now.  */
 212                       size_t backw;
 213
 214                       for (backw = idxcnt; backw > backw_stop; )
 215                         {
 216                           --backw;
 217                           len = weights[idxarr[backw]++];
 218
 219                           if (needed + len < n)
 220                             while (len-- > 0)
 221                               dest[needed++] = weights[idxarr[backw]++];
 222                           else
 223                             {
 224                                 /* No more characters fit into the buffer.  */
 225                               needed += len;
 226                               idxarr[backw] += len;
 227                             }
 228                         }
 229
 230                       backw_stop = ~0ul;
 231                     }
 232
 233                   /* Now handle the forward element.  */
 234                   len = weights[idxarr[idxcnt]++];
 235                   if (needed + len < n)
 236                     while (len-- > 0)
 237                       dest[needed++] = weights[idxarr[idxcnt]++];
 238                   else
 239                     {
 240                       /* No more characters fit into the buffer.  */
 241                       needed += len;
 242                       idxarr[idxcnt] += len;
 243                     }
 244                 }
 245               else
 246                 {
 247                   /* Remember where the backwards series started.  */
 248                   if (backw_stop == ~0ul)
 249                     backw_stop = idxcnt;
 250                 }
 251
 252               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 253             }
 254
 255
 256           if (backw_stop != ~0ul)
 257             {
 258               /* Handle the pushed elements now.  */
 259               size_t backw;
 260
 261               backw = idxcnt;
 262               while (backw > backw_stop)
 263                 {
 264                   size_t len = weights[idxarr[--backw]++];
 265
 266                   if (needed + len < n)
 267                     while (len-- > 0)
 268                       dest[needed++] = weights[idxarr[backw]++];
 269                   else
 270                     {
 271                       /* No more characters fit into the buffer.  */
 272                       needed += len;
 273                       idxarr[backw] += len;
 274                     }
 275                 }
 276             }
 277         }
 278       else
 279         {
 280           int val = 1;
 281 #ifndef WIDE_CHAR_VERSION
 282           char buf[7];
 283           size_t buflen;
 284 #endif
 285           size_t i;
 286
 287           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 288             {
 289               if ((rule & sort_forward) != 0)
 290                 {
 291                   size_t len;
 292
 293                   if (backw_stop != ~0ul)
 294                     {
 295                      /* Handle the pushed elements now.  */
 296                       size_t backw;
 297
 298                       for (backw = idxcnt; backw > backw_stop; )
 299                         {
 300                           --backw;
 301                           len = weights[idxarr[backw]++];
 302                           if (len != 0)
 303                             {
 304 #ifdef WIDE_CHAR_VERSION
 305                               if (needed + 1 + len < n)
 306                                 {
 307                                   dest[needed] = val;
 308                                   for (i = 0; i < len; ++i)
 309                                     dest[needed + 1 + i] =
 310                                       weights[idxarr[backw] + i];
 311                                 }
 312                               needed += 1 + len;
 313 #else
 314                               buflen = utf8_encode (buf, val);
 315                               if (needed + buflen + len < n)
 316                                 {
 317                                   for (i = 0; i < buflen; ++i)
 318                                     dest[needed + i] = buf[i];
 319                                   for (i = 0; i < len; ++i)
 320                                     dest[needed + buflen + i] =
 321                                       weights[idxarr[backw] + i];
 322                                 }
 323                               needed += buflen + len;
 324 #endif
 325                               idxarr[backw] += len;
 326                               val = 1;
 327                             }
 328                           else
 329                             ++val;
 330                         }
 331
 332                       backw_stop = ~0ul;
 333                     }
 334
 335                   /* Now handle the forward element.  */
 336                   len = weights[idxarr[idxcnt]++];
 337                   if (len != 0)
 338                     {
 339 #ifdef WIDE_CHAR_VERSION
 340                       if (needed + 1+ len < n)
 341                         {
 342                           dest[needed] = val;
 343                           for (i = 0; i < len; ++i)
 344                             dest[needed + 1 + i] =
 345                               weights[idxarr[idxcnt] + i];
 346                         }
 347                       needed += 1 + len;
 348 #else
 349                       buflen = utf8_encode (buf, val);
 350                       if (needed + buflen + len < n)
 351                         {
 352                           for (i = 0; i < buflen; ++i)
 353                             dest[needed + i] = buf[i];
 354                           for (i = 0; i < len; ++i)
 355                             dest[needed + buflen + i] =
 356                               weights[idxarr[idxcnt] + i];
 357                         }
 358                       needed += buflen + len;
 359 #endif
 360                       idxarr[idxcnt] += len;
 361                       val = 1;
 362                     }
 363                   else
 364                     /* Note that we don't have to increment `idxarr[idxcnt]'
 365                        since the length is zero.  */
 366                     ++val;
 367                 }
 368               else
 369                 {
 370                   /* Remember where the backwards series started.  */
 371                   if (backw_stop == ~0ul)
 372                     backw_stop = idxcnt;
 373                 }
 374
 375               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 376             }
 377
 378           if (backw_stop != ~0ul)
 379             {
 380               /* Handle the pushed elements now.  */
 381               size_t backw;
 382
 383               backw = idxmax - 1;
 384               while (backw > backw_stop)
 385                 {
 386                   size_t len = weights[idxarr[--backw]++];
 387                   if (len != 0)
 388                     {
 389 #ifdef WIDE_CHAR_VERSION
 390                       if (needed + 1 + len < n)
 391                         {
 392                           dest[needed] = val;
 393                           for (i = 0; i < len; ++i)
 394                             dest[needed + 1 + i] =
 395                               weights[idxarr[backw] + i];
 396                         }
 397                       needed += 1 + len;
 398 #else
 399                       buflen = utf8_encode (buf, val);
 400                       if (needed + buflen + len < n)
 401                         {
 402                           for (i = 0; i < buflen; ++i)
 403                             dest[needed + i] = buf[i];
 404                           for (i = 0; i < len; ++i)
 405                             dest[needed + buflen + i] =
 406                               weights[idxarr[backw] + i];
 407                         }
 408                       needed += buflen + len;
 409 #endif
 410                       idxarr[backw] += len;
 411                       val = 1;
 412                     }
 413                   else
 414                     ++val;
 415                 }
 416             }
 417         }
 418
 419       /* Finally store the byte to separate the passes or terminate
 420          the string.  */
 421       if (needed < n)
 422         dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
 423       ++needed;
 424     }
 425
 426   /* This is a little optimization: many collation specifications have
 427      a `position' rule at the end and if no non-ignored character
 428      is found the last \1 byte is immediately followed by a \0 byte
 429      signalling this.  We can avoid the \1 byte(s).  */
 430   if (needed > 2 && needed == last_needed + 1)
 431     {
 432       /* Remove the \1 byte.  */
 433       if (--needed <= n)
 434         dest[needed - 1] = L('\0');
 435     }
 436
 437   /* Free the memory if needed.  */
 438   if (use_malloc)
 439     free (idxarr);
 440
 441   /* Return the number of bytes/words we need, but don't count the NUL
 442      byte/word at the end.  */
 443   return needed - 1;
 444 }
 445 libc_hidden_def (STRXFRM)
 446
 447 #ifndef WIDE_CHAR_VERSION
 448 weak_alias (__strxfrm_l, strxfrm_l)
 449 #endif