string/strxfrm_l.c

   1 /* Copyright (C) 1995,96,97,2002, 2004, 2005 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #include <assert.h>
  21 #include <langinfo.h>
  22 #include <locale.h>
  23 #include <stddef.h>
  24 #include <stdint.h>
  25 #include <stdlib.h>
  26 #include <string.h>
  27 #include <sys/param.h>
  28
  29 #ifndef STRING_TYPE
  30 # define STRING_TYPE char
  31 # define USTRING_TYPE unsigned char
  32 # define STRXFRM __strxfrm_l
  33 # define STRCMP strcmp
  34 # define STRLEN strlen
  35 # define STPNCPY __stpncpy
  36 # define WEIGHT_H "../locale/weight.h"
  37 # define SUFFIX MB
  38 # define L(arg) arg
  39 #endif
  40
  41 #define CONCAT(a,b) CONCAT1(a,b)
  42 #define CONCAT1(a,b) a##b
  43
  44 #include "../locale/localeinfo.h"
  45
  46
  47 #ifndef WIDE_CHAR_VERSION
  48
  49 /* We need UTF-8 encoding of numbers.  */
  50 static int
  51 utf8_encode (char *buf, int val)
  52 {
  53   int retval;
  54
  55   if (val < 0x80)
  56     {
  57       *buf++ = (char) val;
  58       retval = 1;
  59     }
  60   else
  61     {
  62       int step;
  63
  64       for (step = 2; step < 6; ++step)
  65         if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
  66           break;
  67       retval = step;
  68
  69       *buf = (unsigned char) (~0xff >> step);
  70       --step;
  71       do
  72         {
  73           buf[step] = 0x80 | (val & 0x3f);
  74           val >>= 6;
  75         }
  76       while (--step > 0);
  77       *buf |= val;
  78     }
  79
  80   return retval;
  81 }
  82 #endif
  83
  84
  85 size_t
  86 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
  87 {
  88   struct locale_data *current = l->__locales[LC_COLLATE];
  89   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
  90   /* We don't assign the following values right away since it might be
  91      unnecessary in case there are no rules.  */
  92   const unsigned char *rulesets;
  93   const int32_t *table;
  94   const USTRING_TYPE *weights;
  95   const USTRING_TYPE *extra;
  96   const int32_t *indirect;
  97   uint_fast32_t pass;
  98   size_t needed;
  99   const USTRING_TYPE *usrc;
 100   size_t srclen = STRLEN (src);
 101   int32_t *idxarr;
 102   unsigned char *rulearr;
 103   size_t idxmax;
 104   size_t idxcnt;
 105   int use_malloc;
 106
 107 #include WEIGHT_H
 108
 109   if (nrules == 0)
 110     {
 111       if (n != 0)
 112         STPNCPY (dest, src, MIN (srclen + 1, n));
 113
 114       return srclen;
 115     }
 116
 117   rulesets = (const unsigned char *)
 118     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
 119   table = (const int32_t *)
 120     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
 121   weights = (const USTRING_TYPE *)
 122     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
 123   extra = (const USTRING_TYPE *)
 124     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 125   indirect = (const int32_t *)
 126     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 127   use_malloc = 0;
 128
 129   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
 130   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
 131   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
 132   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 133
 134   /* Handle an empty string as a special case.  */
 135   if (srclen == 0)
 136     {
 137       if (n != 0)
 138         *dest = L('\0');
 139       return 0;
 140     }
 141
 142   /* We need the elements of the string as unsigned values since they
 143      are used as indeces.  */
 144   usrc = (const USTRING_TYPE *) src;
 145
 146   /* Perform the first pass over the string and while doing this find
 147      and store the weights for each character.  Since we want this to
 148      be as fast as possible we are using `alloca' to store the temporary
 149      values.  But since there is no limit on the length of the string
 150      we have to use `malloc' if the string is too long.  We should be
 151      very conservative here.  */
 152   if (! __libc_use_alloca (srclen))
 153     {
 154       idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
 155       rulearr = (unsigned char *) &idxarr[srclen];
 156
 157       if (idxarr == NULL)
 158         /* No memory.  Well, go with the stack then.
 159
 160            XXX Once this implementation is stable we will handle this
 161            differently.  Instead of precomputing the indeces we will
 162            do this in time.  This means, though, that this happens for
 163            every pass again.  */
 164         goto try_stack;
 165       use_malloc = 1;
 166     }
 167   else
 168     {
 169     try_stack:
 170       idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
 171       rulearr = (unsigned char *) alloca (srclen + 1);
 172     }
 173
 174   idxmax = 0;
 175   do
 176     {
 177       int32_t tmp = findidx (&usrc);
 178       rulearr[idxmax] = tmp >> 24;
 179       idxarr[idxmax] = tmp & 0xffffff;
 180
 181       ++idxmax;
 182     }
 183   while (*usrc != L('\0'));
 184
 185   /* This element is only read, the value never used but to determine
 186      another value which then is ignored.  */
 187   rulearr[idxmax] = '\0';
 188
 189   /* Now the passes over the weights.  We now use the indeces we found
 190      before.  */
 191   needed = 0;
 192   for (pass = 0; pass < nrules; ++pass)
 193     {
 194       size_t backw_stop = ~0ul;
 195       int rule = rulesets[rulearr[0] * nrules + pass];
 196       /* We assume that if a rule has defined `position' in one section
 197          this is true for all of them.  */
 198       int position = rule & sort_position;
 199
 200       if (position == 0)
 201         {
 202           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 203             {
 204               if ((rule & sort_forward) != 0)
 205                 {
 206                   size_t len;
 207
 208                   if (backw_stop != ~0ul)
 209                     {
 210                       /* Handle the pushed elements now.  */
 211                       size_t backw;
 212
 213                       for (backw = idxcnt; backw > backw_stop; )
 214                         {
 215                           --backw;
 216                           len = weights[idxarr[backw]++];
 217
 218                           if (needed + len < n)
 219                             while (len-- > 0)
 220                               dest[needed++] = weights[idxarr[backw]++];
 221                           else
 222                             {
 223                                 /* No more characters fit into the buffer.  */
 224                               needed += len;
 225                               idxarr[backw] += len;
 226                             }
 227                         }
 228
 229                       backw_stop = ~0ul;
 230                     }
 231
 232                   /* Now handle the forward element.  */
 233                   len = weights[idxarr[idxcnt]++];
 234                   if (needed + len < n)
 235                     while (len-- > 0)
 236                       dest[needed++] = weights[idxarr[idxcnt]++];
 237                   else
 238                     {
 239                       /* No more characters fit into the buffer.  */
 240                       needed += len;
 241                       idxarr[idxcnt] += len;
 242                     }
 243                 }
 244               else
 245                 {
 246                   /* Remember where the backwards series started.  */
 247                   if (backw_stop == ~0ul)
 248                     backw_stop = idxcnt;
 249                 }
 250
 251               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 252             }
 253
 254
 255           if (backw_stop != ~0ul)
 256             {
 257               /* Handle the pushed elements now.  */
 258               size_t backw;
 259
 260               backw = idxcnt;
 261               while (backw > backw_stop)
 262                 {
 263                   size_t len = weights[idxarr[--backw]++];
 264
 265                   if (needed + len < n)
 266                     while (len-- > 0)
 267                       dest[needed++] = weights[idxarr[backw]++];
 268                   else
 269                     {
 270                       /* No more characters fit into the buffer.  */
 271                       needed += len;
 272                       idxarr[backw] += len;
 273                     }
 274                 }
 275             }
 276         }
 277       else
 278         {
 279           int val = 1;
 280 #ifndef WIDE_CHAR_VERSION
 281           char buf[7];
 282           size_t buflen;
 283 #endif
 284           size_t i;
 285
 286           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 287             {
 288               if ((rule & sort_forward) != 0)
 289                 {
 290                   size_t len;
 291
 292                   if (backw_stop != ~0ul)
 293                     {
 294                      /* Handle the pushed elements now.  */
 295                       size_t backw;
 296
 297                       for (backw = idxcnt; backw > backw_stop; )
 298                         {
 299                           --backw;
 300                           len = weights[idxarr[backw]++];
 301                           if (len != 0)
 302                             {
 303 #ifdef WIDE_CHAR_VERSION
 304                               if (needed + 1 + len < n)
 305                                 {
 306                                   dest[needed] = val;
 307                                   for (i = 0; i < len; ++i)
 308                                     dest[needed + 1 + i] =
 309                                       weights[idxarr[backw] + i];
 310                                 }
 311                               needed += 1 + len;
 312 #else
 313                               buflen = utf8_encode (buf, val);
 314                               if (needed + buflen + len < n)
 315                                 {
 316                                   for (i = 0; i < buflen; ++i)
 317                                     dest[needed + i] = buf[i];
 318                                   for (i = 0; i < len; ++i)
 319                                     dest[needed + buflen + i] =
 320                                       weights[idxarr[backw] + i];
 321                                 }
 322                               needed += buflen + len;
 323 #endif
 324                               idxarr[backw] += len;
 325                               val = 1;
 326                             }
 327                           else
 328                             ++val;
 329                         }
 330
 331                       backw_stop = ~0ul;
 332                     }
 333
 334                   /* Now handle the forward element.  */
 335                   len = weights[idxarr[idxcnt]++];
 336                   if (len != 0)
 337                     {
 338 #ifdef WIDE_CHAR_VERSION
 339                       if (needed + 1+ len < n)
 340                         {
 341                           dest[needed] = val;
 342                           for (i = 0; i < len; ++i)
 343                             dest[needed + 1 + i] =
 344                               weights[idxarr[idxcnt] + i];
 345                         }
 346                       needed += 1 + len;
 347 #else
 348                       buflen = utf8_encode (buf, val);
 349                       if (needed + buflen + len < n)
 350                         {
 351                           for (i = 0; i < buflen; ++i)
 352                             dest[needed + i] = buf[i];
 353                           for (i = 0; i < len; ++i)
 354                             dest[needed + buflen + i] =
 355                               weights[idxarr[idxcnt] + i];
 356                         }
 357                       needed += buflen + len;
 358 #endif
 359                       idxarr[idxcnt] += len;
 360                       val = 1;
 361                     }
 362                   else
 363                     /* Note that we don't have to increment `idxarr[idxcnt]'
 364                        since the length is zero.  */
 365                     ++val;
 366                 }
 367               else
 368                 {
 369                   /* Remember where the backwards series started.  */
 370                   if (backw_stop == ~0ul)
 371                     backw_stop = idxcnt;
 372                 }
 373
 374               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 375             }
 376
 377           if (backw_stop != ~0ul)
 378             {
 379               /* Handle the pushed elements now.  */
 380               size_t backw;
 381
 382               backw = idxmax - 1;
 383               while (backw > backw_stop)
 384                 {
 385                   size_t len = weights[idxarr[--backw]++];
 386                   if (len != 0)
 387                     {
 388 #ifdef WIDE_CHAR_VERSION
 389                       if (needed + 1 + len < n)
 390                         {
 391                           dest[needed] = val;
 392                           for (i = 0; i < len; ++i)
 393                             dest[needed + 1 + i] =
 394                               weights[idxarr[backw] + i];
 395                         }
 396                       needed += 1 + len;
 397 #else
 398                       buflen = utf8_encode (buf, val);
 399                       if (needed + buflen + len < n)
 400                         {
 401                           for (i = 0; i < buflen; ++i)
 402                             dest[needed + i] = buf[i];
 403                           for (i = 0; i < len; ++i)
 404                             dest[needed + buflen + i] =
 405                               weights[idxarr[backw] + i];
 406                         }
 407                       needed += buflen + len;
 408 #endif
 409                       idxarr[backw] += len;
 410                       val = 1;
 411                     }
 412                   else
 413                     ++val;
 414                 }
 415             }
 416         }
 417
 418       /* Finally store the byte to separate the passes or terminate
 419          the string.  */
 420       if (needed < n)
 421         dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
 422       ++needed;
 423     }
 424
 425   /* This is a little optimization: many collation specifications have
 426      a `position' rule at the end and if no non-ignored character
 427      is found the last \1 byte is immediately followed by a \0 byte
 428      signalling this.  We can avoid the \1 byte(s).  */
 429   if (needed <= n && needed > 2 && dest[needed - 2] == L('\1'))
 430     {
 431       /* Remove the \1 byte.  */
 432       --needed;
 433       dest[needed - 1] = L('\0');
 434     }
 435
 436   /* Free the memory if needed.  */
 437   if (use_malloc)
 438     free (idxarr);
 439
 440   /* Return the number of bytes/words we need, but don't count the NUL
 441      byte/word at the end.  */
 442   return needed - 1;
 443 }
 444 libc_hidden_def (STRXFRM)
 445
 446 #ifndef WIDE_CHAR_VERSION
 447 weak_alias (__strxfrm_l, strxfrm_l)
 448 #endif