string/strxfrm.c

   1 /* Copyright (C) 1995-1999,2000,2001,2002 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #include <assert.h>
  21 #include <langinfo.h>
  22 #include <locale.h>
  23 #include <stddef.h>
  24 #include <stdint.h>
  25 #include <stdlib.h>
  26 #include <string.h>
  27 #include <sys/param.h>
  28
  29 #ifndef STRING_TYPE
  30 # define STRING_TYPE char
  31 # define USTRING_TYPE unsigned char
  32 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
  33 #  define STRXFRM __strxfrm_l
  34 # else
  35 #  define STRXFRM strxfrm
  36 # endif
  37 # define STRCMP strcmp
  38 # define STRLEN strlen
  39 # define STPNCPY __stpncpy
  40 # define WEIGHT_H "../locale/weight.h"
  41 # define SUFFIX MB
  42 # define L(arg) arg
  43 #endif
  44
  45 #define CONCAT(a,b) CONCAT1(a,b)
  46 #define CONCAT1(a,b) a##b
  47
  48 #include "../locale/localeinfo.h"
  49
  50
  51 #ifndef WIDE_CHAR_VERSION
  52
  53 /* We need UTF-8 encoding of numbers.  */
  54 static inline int
  55 utf8_encode (char *buf, int val)
  56 {
  57   int retval;
  58
  59   if (val < 0x80)
  60     {
  61       *buf++ = (char) val;
  62       retval = 1;
  63     }
  64   else
  65     {
  66       int step;
  67
  68       for (step = 2; step < 6; ++step)
  69         if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
  70           break;
  71       retval = step;
  72
  73       *buf = (unsigned char) (~0xff >> step);
  74       --step;
  75       do
  76         {
  77           buf[step] = 0x80 | (val & 0x3f);
  78           val >>= 6;
  79         }
  80       while (--step > 0);
  81       *buf |= val;
  82     }
  83
  84   return retval;
  85 }
  86 #endif
  87
  88
  89 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
  90 size_t
  91 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
  92 #else
  93 size_t
  94 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
  95 #endif
  96 {
  97 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
  98   struct locale_data *current = l->__locales[LC_COLLATE];
  99   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
 100 #else
 101   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
 102 #endif
 103   /* We don't assign the following values right away since it might be
 104      unnecessary in case there are no rules.  */
 105   const unsigned char *rulesets;
 106   const int32_t *table;
 107   const USTRING_TYPE *weights;
 108   const USTRING_TYPE *extra;
 109   const int32_t *indirect;
 110   uint_fast32_t pass;
 111   size_t needed;
 112   const USTRING_TYPE *usrc;
 113   size_t srclen = STRLEN (src);
 114   int32_t *idxarr;
 115   unsigned char *rulearr;
 116   size_t idxmax;
 117   size_t idxcnt;
 118   int use_malloc;
 119
 120 #include WEIGHT_H
 121
 122   if (nrules == 0)
 123     {
 124       if (n != 0)
 125         STPNCPY (dest, src, MIN (srclen + 1, n));
 126
 127       return srclen;
 128     }
 129
 130 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
 131   rulesets = (const unsigned char *)
 132     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
 133   table = (const int32_t *)
 134     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
 135   weights = (const USTRING_TYPE *)
 136     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
 137   extra = (const USTRING_TYPE *)
 138     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 139   indirect = (const int32_t *)
 140     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 141 #else
 142   rulesets = (const unsigned char *)
 143     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
 144   table = (const int32_t *)
 145     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_TABLE,SUFFIX));
 146   weights = (const USTRING_TYPE *)
 147     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_WEIGHT,SUFFIX));
 148   extra = (const USTRING_TYPE *)
 149     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_EXTRA,SUFFIX));
 150   indirect = (const int32_t *)
 151     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_INDIRECT,SUFFIX));
 152 #endif
 153   use_malloc = 0;
 154
 155   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
 156   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
 157   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
 158   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 159
 160   /* Handle an empty string as a special case.  */
 161   if (srclen == 0)
 162     {
 163       if (n != 0)
 164         *dest = L('\0');
 165       return 0;
 166     }
 167
 168   /* We need the elements of the string as unsigned values since they
 169      are used as indeces.  */
 170   usrc = (const USTRING_TYPE *) src;
 171
 172   /* Perform the first pass over the string and while doing this find
 173      and store the weights for each character.  Since we want this to
 174      be as fast as possible we are using `alloca' to store the temporary
 175      values.  But since there is no limit on the length of the string
 176      we have to use `malloc' if the string is too long.  We should be
 177      very conservative here.  */
 178   if (srclen >= 16384)
 179     {
 180       idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
 181       rulearr = (unsigned char *) &idxarr[srclen];
 182
 183       if (idxarr == NULL)
 184         /* No memory.  Well, go with the stack then.
 185
 186            XXX Once this implementation is stable we will handle this
 187            differently.  Instead of precomputing the indeces we will
 188            do this in time.  This means, though, that this happens for
 189            every pass again.  */
 190         goto try_stack;
 191       use_malloc = 1;
 192     }
 193   else
 194     {
 195     try_stack:
 196       idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
 197       rulearr = (unsigned char *) alloca (srclen + 1);
 198     }
 199   /* This element is only read, the value never used but to determine
 200      another value which then is ignored.  */
 201   rulearr[srclen] = '\0';
 202
 203   idxmax = 0;
 204   do
 205     {
 206       int32_t tmp = findidx (&usrc);
 207       rulearr[idxmax] = tmp >> 24;
 208       idxarr[idxmax] = tmp & 0xffffff;
 209
 210       ++idxmax;
 211     }
 212   while (*usrc != L('\0'));
 213
 214   /* Now the passes over the weights.  We now use the indeces we found
 215      before.  */
 216   needed = 0;
 217   for (pass = 0; pass < nrules; ++pass)
 218     {
 219       size_t backw_stop = ~0ul;
 220       int rule = rulesets[rulearr[0] * nrules + pass];
 221       /* We assume that if a rule has defined `position' in one section
 222          this is true for all of them.  */
 223       int position = rule & sort_position;
 224
 225       if (position == 0)
 226         {
 227           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 228             {
 229               if ((rule & sort_forward) != 0)
 230                 {
 231                   size_t len;
 232
 233                   if (backw_stop != ~0ul)
 234                     {
 235                       /* Handle the pushed elements now.  */
 236                       size_t backw;
 237
 238                       for (backw = idxcnt - 1; backw >= backw_stop; --backw)
 239                         {
 240                           len = weights[idxarr[backw]++];
 241
 242                           if (needed + len < n)
 243                             while (len-- > 0)
 244                               dest[needed++] = weights[idxarr[backw]++];
 245                           else
 246                             {
 247                                 /* No more characters fit into the buffer.  */
 248                               needed += len;
 249                               idxarr[backw] += len;
 250                             }
 251                         }
 252
 253                       backw_stop = ~0ul;
 254                     }
 255
 256                   /* Now handle the forward element.  */
 257                   len = weights[idxarr[idxcnt]++];
 258                   if (needed + len < n)
 259                     while (len-- > 0)
 260                       dest[needed++] = weights[idxarr[idxcnt]++];
 261                   else
 262                     {
 263                       /* No more characters fit into the buffer.  */
 264                       needed += len;
 265                       idxarr[idxcnt] += len;
 266                     }
 267                 }
 268               else
 269                 {
 270                   /* Remember where the backwards series started.  */
 271                   if (backw_stop == ~0ul)
 272                     backw_stop = idxcnt;
 273                 }
 274
 275               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 276             }
 277
 278
 279           if (backw_stop != ~0ul)
 280             {
 281               /* Handle the pushed elements now.  */
 282               size_t backw;
 283
 284               backw = idxcnt;
 285               while (backw > backw_stop)
 286                 {
 287                   size_t len = weights[idxarr[--backw]++];
 288
 289                   if (needed + len < n)
 290                     while (len-- > 0)
 291                       dest[needed++] = weights[idxarr[backw]++];
 292                   else
 293                     {
 294                       /* No more characters fit into the buffer.  */
 295                       needed += len;
 296                       idxarr[backw] += len;
 297                     }
 298                 }
 299             }
 300         }
 301       else
 302         {
 303           int val = 1;
 304 #ifndef WIDE_CHAR_VERSION
 305           char buf[7];
 306           size_t buflen;
 307 #endif
 308           size_t i;
 309
 310           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 311             {
 312               if ((rule & sort_forward) != 0)
 313                 {
 314                   size_t len;
 315
 316                   if (backw_stop != ~0ul)
 317                     {
 318                      /* Handle the pushed elements now.  */
 319                       size_t backw;
 320
 321                       for (backw = idxcnt - 1; backw >= backw_stop; --backw)
 322                         {
 323                           len = weights[idxarr[backw]++];
 324                           if (len != 0)
 325                             {
 326 #ifdef WIDE_CHAR_VERSION
 327                               if (needed + 1 + len < n)
 328                                 {
 329                                   dest[needed] = val;
 330                                   for (i = 0; i < len; ++i)
 331                                     dest[needed + 1 + i] =
 332                                       weights[idxarr[backw] + i];
 333                                 }
 334                               needed += 1 + len;
 335 #else
 336                               buflen = utf8_encode (buf, val);
 337                               if (needed + buflen + len < n)
 338                                 {
 339                                   for (i = 0; i < buflen; ++i)
 340                                     dest[needed + i] = buf[i];
 341                                   for (i = 0; i < len; ++i)
 342                                     dest[needed + buflen + i] =
 343                                       weights[idxarr[backw] + i];
 344                                 }
 345                               needed += buflen + len;
 346 #endif
 347                               idxarr[backw] += len;
 348                               val = 1;
 349                             }
 350                           else
 351                             ++val;
 352                         }
 353
 354                       backw_stop = ~0ul;
 355                     }
 356
 357                   /* Now handle the forward element.  */
 358                   len = weights[idxarr[idxcnt]++];
 359                   if (len != 0)
 360                     {
 361 #ifdef WIDE_CHAR_VERSION
 362                       if (needed + 1+ len < n)
 363                         {
 364                           dest[needed] = val;
 365                           for (i = 0; i < len; ++i)
 366                             dest[needed + 1 + i] =
 367                               weights[idxarr[idxcnt] + i];
 368                         }
 369                       needed += 1 + len;
 370 #else
 371                       buflen = utf8_encode (buf, val);
 372                       if (needed + buflen + len < n)
 373                         {
 374                           for (i = 0; i < buflen; ++i)
 375                             dest[needed + i] = buf[i];
 376                           for (i = 0; i < len; ++i)
 377                             dest[needed + buflen + i] =
 378                               weights[idxarr[idxcnt] + i];
 379                         }
 380                       needed += buflen + len;
 381 #endif
 382                       idxarr[idxcnt] += len;
 383                       val = 1;
 384                     }
 385                   else
 386                     /* Note that we don't have to increment `idxarr[idxcnt]'
 387                        since the length is zero.  */
 388                     ++val;
 389                 }
 390               else
 391                 {
 392                   /* Remember where the backwards series started.  */
 393                   if (backw_stop == ~0ul)
 394                     backw_stop = idxcnt;
 395                 }
 396
 397               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 398             }
 399
 400           if (backw_stop != ~0ul)
 401             {
 402               /* Handle the pushed elements now.  */
 403               size_t backw;
 404
 405               backw = idxmax - 1;
 406               while (backw > backw_stop)
 407                 {
 408                   size_t len = weights[idxarr[--backw]++];
 409                   if (len != 0)
 410                     {
 411 #ifdef WIDE_CHAR_VERSION
 412                       if (needed + 1 + len < n)
 413                         {
 414                           dest[needed] = val;
 415                           for (i = 0; i < len; ++i)
 416                             dest[needed + 1 + i] =
 417                               weights[idxarr[backw] + i];
 418                         }
 419                       needed += 1 + len;
 420 #else
 421                       buflen = utf8_encode (buf, val);
 422                       if (needed + buflen + len < n)
 423                         {
 424                           for (i = 0; i < buflen; ++i)
 425                             dest[needed + i] = buf[i];
 426                           for (i = 0; i < len; ++i)
 427                             dest[needed + buflen + i] =
 428                               weights[idxarr[backw] + i];
 429                         }
 430                       needed += buflen + len;
 431 #endif
 432                       idxarr[backw] += len;
 433                       val = 1;
 434                     }
 435                   else
 436                     ++val;
 437                 }
 438             }
 439         }
 440
 441       /* Finally store the byte to separate the passes or terminate
 442          the string.  */
 443       if (needed < n)
 444         dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
 445       ++needed;
 446     }
 447
 448   /* This is a little optimization: many collation specifications have
 449      a `position' rule at the end and if no non-ignored character
 450      is found the last \1 byte is immediately followed by a \0 byte
 451      signalling this.  We can avoid the \1 byte(s).  */
 452   if (needed <= n && needed > 2 && dest[needed - 2] == L('\1'))
 453     {
 454       /* Remove the \1 byte.  */
 455       --needed;
 456       dest[needed - 1] = L('\0');
 457     }
 458
 459   /* Free the memory if needed.  */
 460   if (use_malloc)
 461     free (idxarr);
 462
 463   /* Return the number of bytes/words we need, but don't count the NUL
 464      byte/word at the end.  */
 465   return needed - 1;
 466 }