string/strxfrm_l.c

   1 /* Copyright (C) 1995-1997,2002,2004-2006,2010,2011
   2    Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <assert.h>
  21 #include <langinfo.h>
  22 #include <locale.h>
  23 #include <stddef.h>
  24 #include <stdint.h>
  25 #include <stdlib.h>
  26 #include <string.h>
  27 #include <sys/param.h>
  28
  29 #ifndef STRING_TYPE
  30 # define STRING_TYPE char
  31 # define USTRING_TYPE unsigned char
  32 # define STRXFRM __strxfrm_l
  33 # define STRCMP strcmp
  34 # define STRLEN strlen
  35 # define STPNCPY __stpncpy
  36 # define WEIGHT_H "../locale/weight.h"
  37 # define SUFFIX MB
  38 # define L(arg) arg
  39 #endif
  40
  41 #define CONCAT(a,b) CONCAT1(a,b)
  42 #define CONCAT1(a,b) a##b
  43
  44 #include "../locale/localeinfo.h"
  45
  46
  47 #ifndef WIDE_CHAR_VERSION
  48
  49 /* We need UTF-8 encoding of numbers.  */
  50 static int
  51 utf8_encode (char *buf, int val)
  52 {
  53   int retval;
  54
  55   if (val < 0x80)
  56     {
  57       *buf++ = (char) val;
  58       retval = 1;
  59     }
  60   else
  61     {
  62       int step;
  63
  64       for (step = 2; step < 6; ++step)
  65         if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
  66           break;
  67       retval = step;
  68
  69       *buf = (unsigned char) (~0xff >> step);
  70       --step;
  71       do
  72         {
  73           buf[step] = 0x80 | (val & 0x3f);
  74           val >>= 6;
  75         }
  76       while (--step > 0);
  77       *buf |= val;
  78     }
  79
  80   return retval;
  81 }
  82 #endif
  83
  84
  85 size_t
  86 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
  87 {
  88   struct __locale_data *current = l->__locales[LC_COLLATE];
  89   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
  90   /* We don't assign the following values right away since it might be
  91      unnecessary in case there are no rules.  */
  92   const unsigned char *rulesets;
  93   const int32_t *table;
  94   const USTRING_TYPE *weights;
  95   const USTRING_TYPE *extra;
  96   const int32_t *indirect;
  97   uint_fast32_t pass;
  98   size_t needed;
  99   size_t last_needed;
 100   const USTRING_TYPE *usrc;
 101   size_t srclen = STRLEN (src);
 102   int32_t *idxarr;
 103   unsigned char *rulearr;
 104   size_t idxmax;
 105   size_t idxcnt;
 106   int use_malloc;
 107
 108 #include WEIGHT_H
 109
 110   if (nrules == 0)
 111     {
 112       if (n != 0)
 113         STPNCPY (dest, src, MIN (srclen + 1, n));
 114
 115       return srclen;
 116     }
 117
 118   rulesets = (const unsigned char *)
 119     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
 120   table = (const int32_t *)
 121     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
 122   weights = (const USTRING_TYPE *)
 123     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
 124   extra = (const USTRING_TYPE *)
 125     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 126   indirect = (const int32_t *)
 127     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 128   use_malloc = 0;
 129
 130   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
 131   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
 132   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
 133   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 134
 135   /* Handle an empty string as a special case.  */
 136   if (srclen == 0)
 137     {
 138       if (n != 0)
 139         *dest = L('\0');
 140       return 0;
 141     }
 142
 143   /* We need the elements of the string as unsigned values since they
 144      are used as indeces.  */
 145   usrc = (const USTRING_TYPE *) src;
 146
 147   /* Perform the first pass over the string and while doing this find
 148      and store the weights for each character.  Since we want this to
 149      be as fast as possible we are using `alloca' to store the temporary
 150      values.  But since there is no limit on the length of the string
 151      we have to use `malloc' if the string is too long.  We should be
 152      very conservative here.  */
 153   if (! __libc_use_alloca ((srclen + 1) * (sizeof (int32_t) + 1)))
 154     {
 155       idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
 156       rulearr = (unsigned char *) &idxarr[srclen];
 157
 158       if (idxarr == NULL)
 159         /* No memory.  Well, go with the stack then.
 160
 161            XXX Once this implementation is stable we will handle this
 162            differently.  Instead of precomputing the indeces we will
 163            do this in time.  This means, though, that this happens for
 164            every pass again.  */
 165         goto try_stack;
 166       use_malloc = 1;
 167     }
 168   else
 169     {
 170     try_stack:
 171       idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
 172       rulearr = (unsigned char *) alloca (srclen + 1);
 173     }
 174
 175   idxmax = 0;
 176   do
 177     {
 178       int32_t tmp = findidx (&usrc, -1);
 179       rulearr[idxmax] = tmp >> 24;
 180       idxarr[idxmax] = tmp & 0xffffff;
 181
 182       ++idxmax;
 183     }
 184   while (*usrc != L('\0'));
 185
 186   /* This element is only read, the value never used but to determine
 187      another value which then is ignored.  */
 188   rulearr[idxmax] = '\0';
 189
 190   /* Now the passes over the weights.  We now use the indeces we found
 191      before.  */
 192   needed = 0;
 193   for (pass = 0; pass < nrules; ++pass)
 194     {
 195       size_t backw_stop = ~0ul;
 196       int rule = rulesets[rulearr[0] * nrules + pass];
 197       /* We assume that if a rule has defined `position' in one section
 198          this is true for all of them.  */
 199       int position = rule & sort_position;
 200
 201       last_needed = needed;
 202       if (position == 0)
 203         {
 204           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 205             {
 206               if ((rule & sort_forward) != 0)
 207                 {
 208                   size_t len;
 209
 210                   if (backw_stop != ~0ul)
 211                     {
 212                       /* Handle the pushed elements now.  */
 213                       size_t backw;
 214
 215                       for (backw = idxcnt; backw > backw_stop; )
 216                         {
 217                           --backw;
 218                           len = weights[idxarr[backw]++];
 219
 220                           if (needed + len < n)
 221                             while (len-- > 0)
 222                               dest[needed++] = weights[idxarr[backw]++];
 223                           else
 224                             {
 225                                 /* No more characters fit into the buffer.  */
 226                               needed += len;
 227                               idxarr[backw] += len;
 228                             }
 229                         }
 230
 231                       backw_stop = ~0ul;
 232                     }
 233
 234                   /* Now handle the forward element.  */
 235                   len = weights[idxarr[idxcnt]++];
 236                   if (needed + len < n)
 237                     while (len-- > 0)
 238                       dest[needed++] = weights[idxarr[idxcnt]++];
 239                   else
 240                     {
 241                       /* No more characters fit into the buffer.  */
 242                       needed += len;
 243                       idxarr[idxcnt] += len;
 244                     }
 245                 }
 246               else
 247                 {
 248                   /* Remember where the backwards series started.  */
 249                   if (backw_stop == ~0ul)
 250                     backw_stop = idxcnt;
 251                 }
 252
 253               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 254             }
 255
 256
 257           if (backw_stop != ~0ul)
 258             {
 259               /* Handle the pushed elements now.  */
 260               size_t backw;
 261
 262               backw = idxcnt;
 263               while (backw > backw_stop)
 264                 {
 265                   size_t len = weights[idxarr[--backw]++];
 266
 267                   if (needed + len < n)
 268                     while (len-- > 0)
 269                       dest[needed++] = weights[idxarr[backw]++];
 270                   else
 271                     {
 272                       /* No more characters fit into the buffer.  */
 273                       needed += len;
 274                       idxarr[backw] += len;
 275                     }
 276                 }
 277             }
 278         }
 279       else
 280         {
 281           int val = 1;
 282 #ifndef WIDE_CHAR_VERSION
 283           char buf[7];
 284           size_t buflen;
 285 #endif
 286           size_t i;
 287
 288           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 289             {
 290               if ((rule & sort_forward) != 0)
 291                 {
 292                   size_t len;
 293
 294                   if (backw_stop != ~0ul)
 295                     {
 296                      /* Handle the pushed elements now.  */
 297                       size_t backw;
 298
 299                       for (backw = idxcnt; backw > backw_stop; )
 300                         {
 301                           --backw;
 302                           len = weights[idxarr[backw]++];
 303                           if (len != 0)
 304                             {
 305 #ifdef WIDE_CHAR_VERSION
 306                               if (needed + 1 + len < n)
 307                                 {
 308                                   dest[needed] = val;
 309                                   for (i = 0; i < len; ++i)
 310                                     dest[needed + 1 + i] =
 311                                       weights[idxarr[backw] + i];
 312                                 }
 313                               needed += 1 + len;
 314 #else
 315                               buflen = utf8_encode (buf, val);
 316                               if (needed + buflen + len < n)
 317                                 {
 318                                   for (i = 0; i < buflen; ++i)
 319                                     dest[needed + i] = buf[i];
 320                                   for (i = 0; i < len; ++i)
 321                                     dest[needed + buflen + i] =
 322                                       weights[idxarr[backw] + i];
 323                                 }
 324                               needed += buflen + len;
 325 #endif
 326                               idxarr[backw] += len;
 327                               val = 1;
 328                             }
 329                           else
 330                             ++val;
 331                         }
 332
 333                       backw_stop = ~0ul;
 334                     }
 335
 336                   /* Now handle the forward element.  */
 337                   len = weights[idxarr[idxcnt]++];
 338                   if (len != 0)
 339                     {
 340 #ifdef WIDE_CHAR_VERSION
 341                       if (needed + 1+ len < n)
 342                         {
 343                           dest[needed] = val;
 344                           for (i = 0; i < len; ++i)
 345                             dest[needed + 1 + i] =
 346                               weights[idxarr[idxcnt] + i];
 347                         }
 348                       needed += 1 + len;
 349 #else
 350                       buflen = utf8_encode (buf, val);
 351                       if (needed + buflen + len < n)
 352                         {
 353                           for (i = 0; i < buflen; ++i)
 354                             dest[needed + i] = buf[i];
 355                           for (i = 0; i < len; ++i)
 356                             dest[needed + buflen + i] =
 357                               weights[idxarr[idxcnt] + i];
 358                         }
 359                       needed += buflen + len;
 360 #endif
 361                       idxarr[idxcnt] += len;
 362                       val = 1;
 363                     }
 364                   else
 365                     /* Note that we don't have to increment `idxarr[idxcnt]'
 366                        since the length is zero.  */
 367                     ++val;
 368                 }
 369               else
 370                 {
 371                   /* Remember where the backwards series started.  */
 372                   if (backw_stop == ~0ul)
 373                     backw_stop = idxcnt;
 374                 }
 375
 376               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 377             }
 378
 379           if (backw_stop != ~0ul)
 380             {
 381               /* Handle the pushed elements now.  */
 382               size_t backw;
 383
 384               backw = idxmax - 1;
 385               while (backw > backw_stop)
 386                 {
 387                   size_t len = weights[idxarr[--backw]++];
 388                   if (len != 0)
 389                     {
 390 #ifdef WIDE_CHAR_VERSION
 391                       if (needed + 1 + len < n)
 392                         {
 393                           dest[needed] = val;
 394                           for (i = 0; i < len; ++i)
 395                             dest[needed + 1 + i] =
 396                               weights[idxarr[backw] + i];
 397                         }
 398                       needed += 1 + len;
 399 #else
 400                       buflen = utf8_encode (buf, val);
 401                       if (needed + buflen + len < n)
 402                         {
 403                           for (i = 0; i < buflen; ++i)
 404                             dest[needed + i] = buf[i];
 405                           for (i = 0; i < len; ++i)
 406                             dest[needed + buflen + i] =
 407                               weights[idxarr[backw] + i];
 408                         }
 409                       needed += buflen + len;
 410 #endif
 411                       idxarr[backw] += len;
 412                       val = 1;
 413                     }
 414                   else
 415                     ++val;
 416                 }
 417             }
 418         }
 419
 420       /* Finally store the byte to separate the passes or terminate
 421          the string.  */
 422       if (needed < n)
 423         dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
 424       ++needed;
 425     }
 426
 427   /* This is a little optimization: many collation specifications have
 428      a `position' rule at the end and if no non-ignored character
 429      is found the last \1 byte is immediately followed by a \0 byte
 430      signalling this.  We can avoid the \1 byte(s).  */
 431   if (needed > 2 && needed == last_needed + 1)
 432     {
 433       /* Remove the \1 byte.  */
 434       if (--needed <= n)
 435         dest[needed - 1] = L('\0');
 436     }
 437
 438   /* Free the memory if needed.  */
 439   if (use_malloc)
 440     free (idxarr);
 441
 442   /* Return the number of bytes/words we need, but don't count the NUL
 443      byte/word at the end.  */
 444   return needed - 1;
 445 }
 446 libc_hidden_def (STRXFRM)
 447
 448 #ifndef WIDE_CHAR_VERSION
 449 weak_alias (__strxfrm_l, strxfrm_l)
 450 #endif