string/strxfrm_l.c

   1 /* Copyright (C) 1995,96,97,2002, 2004 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #include <assert.h>
  21 #include <langinfo.h>
  22 #include <locale.h>
  23 #include <stddef.h>
  24 #include <stdint.h>
  25 #include <stdlib.h>
  26 #include <string.h>
  27 #include <sys/param.h>
  28
  29 #ifndef STRING_TYPE
  30 # define STRING_TYPE char
  31 # define USTRING_TYPE unsigned char
  32 # define STRXFRM __strxfrm_l
  33 # define STRCMP strcmp
  34 # define STRLEN strlen
  35 # define STPNCPY __stpncpy
  36 # define WEIGHT_H "../locale/weight.h"
  37 # define SUFFIX MB
  38 # define L(arg) arg
  39 #endif
  40
  41 #define CONCAT(a,b) CONCAT1(a,b)
  42 #define CONCAT1(a,b) a##b
  43
  44 #include "../locale/localeinfo.h"
  45
  46
  47 #ifndef WIDE_CHAR_VERSION
  48
  49 /* We need UTF-8 encoding of numbers.  */
  50 static int
  51 utf8_encode (char *buf, int val)
  52 {
  53   int retval;
  54
  55   if (val < 0x80)
  56     {
  57       *buf++ = (char) val;
  58       retval = 1;
  59     }
  60   else
  61     {
  62       int step;
  63
  64       for (step = 2; step < 6; ++step)
  65         if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
  66           break;
  67       retval = step;
  68
  69       *buf = (unsigned char) (~0xff >> step);
  70       --step;
  71       do
  72         {
  73           buf[step] = 0x80 | (val & 0x3f);
  74           val >>= 6;
  75         }
  76       while (--step > 0);
  77       *buf |= val;
  78     }
  79
  80   return retval;
  81 }
  82 #endif
  83
  84
  85 size_t
  86 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
  87 {
  88   struct locale_data *current = l->__locales[LC_COLLATE];
  89   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
  90   /* We don't assign the following values right away since it might be
  91      unnecessary in case there are no rules.  */
  92   const unsigned char *rulesets;
  93   const int32_t *table;
  94   const USTRING_TYPE *weights;
  95   const USTRING_TYPE *extra;
  96   const int32_t *indirect;
  97   uint_fast32_t pass;
  98   size_t needed;
  99   const USTRING_TYPE *usrc;
 100   size_t srclen = STRLEN (src);
 101   int32_t *idxarr;
 102   unsigned char *rulearr;
 103   size_t idxmax;
 104   size_t idxcnt;
 105   int use_malloc;
 106
 107 #include WEIGHT_H
 108
 109   if (nrules == 0)
 110     {
 111       if (n != 0)
 112         STPNCPY (dest, src, MIN (srclen + 1, n));
 113
 114       return srclen;
 115     }
 116
 117   rulesets = (const unsigned char *)
 118     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
 119   table = (const int32_t *)
 120     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
 121   weights = (const USTRING_TYPE *)
 122     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
 123   extra = (const USTRING_TYPE *)
 124     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 125   indirect = (const int32_t *)
 126     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 127   use_malloc = 0;
 128
 129   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
 130   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
 131   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
 132   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 133
 134   /* Handle an empty string as a special case.  */
 135   if (srclen == 0)
 136     {
 137       if (n != 0)
 138         *dest = L('\0');
 139       return 0;
 140     }
 141
 142   /* We need the elements of the string as unsigned values since they
 143      are used as indeces.  */
 144   usrc = (const USTRING_TYPE *) src;
 145
 146   /* Perform the first pass over the string and while doing this find
 147      and store the weights for each character.  Since we want this to
 148      be as fast as possible we are using `alloca' to store the temporary
 149      values.  But since there is no limit on the length of the string
 150      we have to use `malloc' if the string is too long.  We should be
 151      very conservative here.  */
 152   if (! __libc_use_alloca (srclen))
 153     {
 154       idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
 155       rulearr = (unsigned char *) &idxarr[srclen];
 156
 157       if (idxarr == NULL)
 158         /* No memory.  Well, go with the stack then.
 159
 160            XXX Once this implementation is stable we will handle this
 161            differently.  Instead of precomputing the indeces we will
 162            do this in time.  This means, though, that this happens for
 163            every pass again.  */
 164         goto try_stack;
 165       use_malloc = 1;
 166     }
 167   else
 168     {
 169     try_stack:
 170       idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
 171       rulearr = (unsigned char *) alloca (srclen + 1);
 172     }
 173
 174   idxmax = 0;
 175   do
 176     {
 177       int32_t tmp = findidx (&usrc);
 178       rulearr[idxmax] = tmp >> 24;
 179       idxarr[idxmax] = tmp & 0xffffff;
 180
 181       ++idxmax;
 182     }
 183   while (*usrc != L('\0'));
 184
 185   /* This element is only read, the value never used but to determine
 186      another value which then is ignored.  */
 187   rulearr[idxmax] = '\0';
 188
 189   /* Now the passes over the weights.  We now use the indeces we found
 190      before.  */
 191   needed = 0;
 192   for (pass = 0; pass < nrules; ++pass)
 193     {
 194       size_t backw_stop = ~0ul;
 195       int rule = rulesets[rulearr[0] * nrules + pass];
 196       /* We assume that if a rule has defined `position' in one section
 197          this is true for all of them.  */
 198       int position = rule & sort_position;
 199
 200       if (position == 0)
 201         {
 202           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 203             {
 204               if ((rule & sort_forward) != 0)
 205                 {
 206                   size_t len;
 207
 208                   if (backw_stop != ~0ul)
 209                     {
 210                       /* Handle the pushed elements now.  */
 211                       size_t backw;
 212
 213                       for (backw = idxcnt - 1; backw >= backw_stop; --backw)
 214                         {
 215                           len = weights[idxarr[backw]++];
 216
 217                           if (needed + len < n)
 218                             while (len-- > 0)
 219                               dest[needed++] = weights[idxarr[backw]++];
 220                           else
 221                             {
 222                                 /* No more characters fit into the buffer.  */
 223                               needed += len;
 224                               idxarr[backw] += len;
 225                             }
 226                         }
 227
 228                       backw_stop = ~0ul;
 229                     }
 230
 231                   /* Now handle the forward element.  */
 232                   len = weights[idxarr[idxcnt]++];
 233                   if (needed + len < n)
 234                     while (len-- > 0)
 235                       dest[needed++] = weights[idxarr[idxcnt]++];
 236                   else
 237                     {
 238                       /* No more characters fit into the buffer.  */
 239                       needed += len;
 240                       idxarr[idxcnt] += len;
 241                     }
 242                 }
 243               else
 244                 {
 245                   /* Remember where the backwards series started.  */
 246                   if (backw_stop == ~0ul)
 247                     backw_stop = idxcnt;
 248                 }
 249
 250               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 251             }
 252
 253
 254           if (backw_stop != ~0ul)
 255             {
 256               /* Handle the pushed elements now.  */
 257               size_t backw;
 258
 259               backw = idxcnt;
 260               while (backw > backw_stop)
 261                 {
 262                   size_t len = weights[idxarr[--backw]++];
 263
 264                   if (needed + len < n)
 265                     while (len-- > 0)
 266                       dest[needed++] = weights[idxarr[backw]++];
 267                   else
 268                     {
 269                       /* No more characters fit into the buffer.  */
 270                       needed += len;
 271                       idxarr[backw] += len;
 272                     }
 273                 }
 274             }
 275         }
 276       else
 277         {
 278           int val = 1;
 279 #ifndef WIDE_CHAR_VERSION
 280           char buf[7];
 281           size_t buflen;
 282 #endif
 283           size_t i;
 284
 285           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 286             {
 287               if ((rule & sort_forward) != 0)
 288                 {
 289                   size_t len;
 290
 291                   if (backw_stop != ~0ul)
 292                     {
 293                      /* Handle the pushed elements now.  */
 294                       size_t backw;
 295
 296                       for (backw = idxcnt - 1; backw >= backw_stop; --backw)
 297                         {
 298                           len = weights[idxarr[backw]++];
 299                           if (len != 0)
 300                             {
 301 #ifdef WIDE_CHAR_VERSION
 302                               if (needed + 1 + len < n)
 303                                 {
 304                                   dest[needed] = val;
 305                                   for (i = 0; i < len; ++i)
 306                                     dest[needed + 1 + i] =
 307                                       weights[idxarr[backw] + i];
 308                                 }
 309                               needed += 1 + len;
 310 #else
 311                               buflen = utf8_encode (buf, val);
 312                               if (needed + buflen + len < n)
 313                                 {
 314                                   for (i = 0; i < buflen; ++i)
 315                                     dest[needed + i] = buf[i];
 316                                   for (i = 0; i < len; ++i)
 317                                     dest[needed + buflen + i] =
 318                                       weights[idxarr[backw] + i];
 319                                 }
 320                               needed += buflen + len;
 321 #endif
 322                               idxarr[backw] += len;
 323                               val = 1;
 324                             }
 325                           else
 326                             ++val;
 327                         }
 328
 329                       backw_stop = ~0ul;
 330                     }
 331
 332                   /* Now handle the forward element.  */
 333                   len = weights[idxarr[idxcnt]++];
 334                   if (len != 0)
 335                     {
 336 #ifdef WIDE_CHAR_VERSION
 337                       if (needed + 1+ len < n)
 338                         {
 339                           dest[needed] = val;
 340                           for (i = 0; i < len; ++i)
 341                             dest[needed + 1 + i] =
 342                               weights[idxarr[idxcnt] + i];
 343                         }
 344                       needed += 1 + len;
 345 #else
 346                       buflen = utf8_encode (buf, val);
 347                       if (needed + buflen + len < n)
 348                         {
 349                           for (i = 0; i < buflen; ++i)
 350                             dest[needed + i] = buf[i];
 351                           for (i = 0; i < len; ++i)
 352                             dest[needed + buflen + i] =
 353                               weights[idxarr[idxcnt] + i];
 354                         }
 355                       needed += buflen + len;
 356 #endif
 357                       idxarr[idxcnt] += len;
 358                       val = 1;
 359                     }
 360                   else
 361                     /* Note that we don't have to increment `idxarr[idxcnt]'
 362                        since the length is zero.  */
 363                     ++val;
 364                 }
 365               else
 366                 {
 367                   /* Remember where the backwards series started.  */
 368                   if (backw_stop == ~0ul)
 369                     backw_stop = idxcnt;
 370                 }
 371
 372               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 373             }
 374
 375           if (backw_stop != ~0ul)
 376             {
 377               /* Handle the pushed elements now.  */
 378               size_t backw;
 379
 380               backw = idxmax - 1;
 381               while (backw > backw_stop)
 382                 {
 383                   size_t len = weights[idxarr[--backw]++];
 384                   if (len != 0)
 385                     {
 386 #ifdef WIDE_CHAR_VERSION
 387                       if (needed + 1 + len < n)
 388                         {
 389                           dest[needed] = val;
 390                           for (i = 0; i < len; ++i)
 391                             dest[needed + 1 + i] =
 392                               weights[idxarr[backw] + i];
 393                         }
 394                       needed += 1 + len;
 395 #else
 396                       buflen = utf8_encode (buf, val);
 397                       if (needed + buflen + len < n)
 398                         {
 399                           for (i = 0; i < buflen; ++i)
 400                             dest[needed + i] = buf[i];
 401                           for (i = 0; i < len; ++i)
 402                             dest[needed + buflen + i] =
 403                               weights[idxarr[backw] + i];
 404                         }
 405                       needed += buflen + len;
 406 #endif
 407                       idxarr[backw] += len;
 408                       val = 1;
 409                     }
 410                   else
 411                     ++val;
 412                 }
 413             }
 414         }
 415
 416       /* Finally store the byte to separate the passes or terminate
 417          the string.  */
 418       if (needed < n)
 419         dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
 420       ++needed;
 421     }
 422
 423   /* This is a little optimization: many collation specifications have
 424      a `position' rule at the end and if no non-ignored character
 425      is found the last \1 byte is immediately followed by a \0 byte
 426      signalling this.  We can avoid the \1 byte(s).  */
 427   if (needed <= n && needed > 2 && dest[needed - 2] == L('\1'))
 428     {
 429       /* Remove the \1 byte.  */
 430       --needed;
 431       dest[needed - 1] = L('\0');
 432     }
 433
 434   /* Free the memory if needed.  */
 435   if (use_malloc)
 436     free (idxarr);
 437
 438   /* Return the number of bytes/words we need, but don't count the NUL
 439      byte/word at the end.  */
 440   return needed - 1;
 441 }
 442 libc_hidden_def (STRXFRM)
 443
 444 #ifndef WIDE_CHAR_VERSION
 445 weak_alias (__strxfrm_l, strxfrm_l)
 446 #endif