string/strxfrm.c

   1 /* Copyright (C) 1995-1999,2000,2001,2002,2003 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #include <assert.h>
  21 #include <langinfo.h>
  22 #include <locale.h>
  23 #include <stddef.h>
  24 #include <stdint.h>
  25 #include <stdlib.h>
  26 #include <string.h>
  27 #include <sys/param.h>
  28
  29 #ifndef STRING_TYPE
  30 # define STRING_TYPE char
  31 # define USTRING_TYPE unsigned char
  32 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
  33 #  define STRXFRM __strxfrm_l
  34 # else
  35 #  define STRXFRM strxfrm
  36 # endif
  37 # define STRCMP strcmp
  38 # define STRLEN strlen
  39 # define STPNCPY __stpncpy
  40 # define WEIGHT_H "../locale/weight.h"
  41 # define SUFFIX MB
  42 # define L(arg) arg
  43 #endif
  44
  45 #define CONCAT(a,b) CONCAT1(a,b)
  46 #define CONCAT1(a,b) a##b
  47
  48 #include "../locale/localeinfo.h"
  49
  50
  51 #ifndef WIDE_CHAR_VERSION
  52
  53 /* We need UTF-8 encoding of numbers.  */
  54 static inline int
  55 utf8_encode (char *buf, int val)
  56 {
  57   int retval;
  58
  59   if (val < 0x80)
  60     {
  61       *buf++ = (char) val;
  62       retval = 1;
  63     }
  64   else
  65     {
  66       int step;
  67
  68       for (step = 2; step < 6; ++step)
  69         if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
  70           break;
  71       retval = step;
  72
  73       *buf = (unsigned char) (~0xff >> step);
  74       --step;
  75       do
  76         {
  77           buf[step] = 0x80 | (val & 0x3f);
  78           val >>= 6;
  79         }
  80       while (--step > 0);
  81       *buf |= val;
  82     }
  83
  84   return retval;
  85 }
  86 #endif
  87
  88
  89 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
  90 size_t
  91 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
  92 #else
  93 size_t
  94 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
  95 #endif
  96 {
  97 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
  98   struct locale_data *current = l->__locales[LC_COLLATE];
  99   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
 100 #else
 101   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
 102 #endif
 103   /* We don't assign the following values right away since it might be
 104      unnecessary in case there are no rules.  */
 105   const unsigned char *rulesets;
 106   const int32_t *table;
 107   const USTRING_TYPE *weights;
 108   const USTRING_TYPE *extra;
 109   const int32_t *indirect;
 110   uint_fast32_t pass;
 111   size_t needed;
 112   const USTRING_TYPE *usrc;
 113   size_t srclen = STRLEN (src);
 114   int32_t *idxarr;
 115   unsigned char *rulearr;
 116   size_t idxmax;
 117   size_t idxcnt;
 118   int use_malloc;
 119
 120 #include WEIGHT_H
 121
 122   if (nrules == 0)
 123     {
 124       if (n != 0)
 125         STPNCPY (dest, src, MIN (srclen + 1, n));
 126
 127       return srclen;
 128     }
 129
 130 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
 131   rulesets = (const unsigned char *)
 132     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
 133   table = (const int32_t *)
 134     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
 135   weights = (const USTRING_TYPE *)
 136     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
 137   extra = (const USTRING_TYPE *)
 138     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 139   indirect = (const int32_t *)
 140     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 141 #else
 142   rulesets = (const unsigned char *)
 143     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
 144   table = (const int32_t *)
 145     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_TABLE,SUFFIX));
 146   weights = (const USTRING_TYPE *)
 147     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_WEIGHT,SUFFIX));
 148   extra = (const USTRING_TYPE *)
 149     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_EXTRA,SUFFIX));
 150   indirect = (const int32_t *)
 151     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_INDIRECT,SUFFIX));
 152 #endif
 153   use_malloc = 0;
 154
 155   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
 156   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
 157   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
 158   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 159
 160   /* Handle an empty string as a special case.  */
 161   if (srclen == 0)
 162     {
 163       if (n != 0)
 164         *dest = L('\0');
 165       return 0;
 166     }
 167
 168   /* We need the elements of the string as unsigned values since they
 169      are used as indeces.  */
 170   usrc = (const USTRING_TYPE *) src;
 171
 172   /* Perform the first pass over the string and while doing this find
 173      and store the weights for each character.  Since we want this to
 174      be as fast as possible we are using `alloca' to store the temporary
 175      values.  But since there is no limit on the length of the string
 176      we have to use `malloc' if the string is too long.  We should be
 177      very conservative here.  */
 178   if (! __libc_use_alloca (srclen))
 179     {
 180       idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
 181       rulearr = (unsigned char *) &idxarr[srclen];
 182
 183       if (idxarr == NULL)
 184         /* No memory.  Well, go with the stack then.
 185
 186            XXX Once this implementation is stable we will handle this
 187            differently.  Instead of precomputing the indeces we will
 188            do this in time.  This means, though, that this happens for
 189            every pass again.  */
 190         goto try_stack;
 191       use_malloc = 1;
 192     }
 193   else
 194     {
 195     try_stack:
 196       idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
 197       rulearr = (unsigned char *) alloca (srclen + 1);
 198     }
 199
 200   idxmax = 0;
 201   do
 202     {
 203       int32_t tmp = findidx (&usrc);
 204       rulearr[idxmax] = tmp >> 24;
 205       idxarr[idxmax] = tmp & 0xffffff;
 206
 207       ++idxmax;
 208     }
 209   while (*usrc != L('\0'));
 210
 211   /* This element is only read, the value never used but to determine
 212      another value which then is ignored.  */
 213   rulearr[idxmax] = '\0';
 214
 215   /* Now the passes over the weights.  We now use the indeces we found
 216      before.  */
 217   needed = 0;
 218   for (pass = 0; pass < nrules; ++pass)
 219     {
 220       size_t backw_stop = ~0ul;
 221       int rule = rulesets[rulearr[0] * nrules + pass];
 222       /* We assume that if a rule has defined `position' in one section
 223          this is true for all of them.  */
 224       int position = rule & sort_position;
 225
 226       if (position == 0)
 227         {
 228           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 229             {
 230               if ((rule & sort_forward) != 0)
 231                 {
 232                   size_t len;
 233
 234                   if (backw_stop != ~0ul)
 235                     {
 236                       /* Handle the pushed elements now.  */
 237                       size_t backw;
 238
 239                       for (backw = idxcnt - 1; backw >= backw_stop; --backw)
 240                         {
 241                           len = weights[idxarr[backw]++];
 242
 243                           if (needed + len < n)
 244                             while (len-- > 0)
 245                               dest[needed++] = weights[idxarr[backw]++];
 246                           else
 247                             {
 248                                 /* No more characters fit into the buffer.  */
 249                               needed += len;
 250                               idxarr[backw] += len;
 251                             }
 252                         }
 253
 254                       backw_stop = ~0ul;
 255                     }
 256
 257                   /* Now handle the forward element.  */
 258                   len = weights[idxarr[idxcnt]++];
 259                   if (needed + len < n)
 260                     while (len-- > 0)
 261                       dest[needed++] = weights[idxarr[idxcnt]++];
 262                   else
 263                     {
 264                       /* No more characters fit into the buffer.  */
 265                       needed += len;
 266                       idxarr[idxcnt] += len;
 267                     }
 268                 }
 269               else
 270                 {
 271                   /* Remember where the backwards series started.  */
 272                   if (backw_stop == ~0ul)
 273                     backw_stop = idxcnt;
 274                 }
 275
 276               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 277             }
 278
 279
 280           if (backw_stop != ~0ul)
 281             {
 282               /* Handle the pushed elements now.  */
 283               size_t backw;
 284
 285               backw = idxcnt;
 286               while (backw > backw_stop)
 287                 {
 288                   size_t len = weights[idxarr[--backw]++];
 289
 290                   if (needed + len < n)
 291                     while (len-- > 0)
 292                       dest[needed++] = weights[idxarr[backw]++];
 293                   else
 294                     {
 295                       /* No more characters fit into the buffer.  */
 296                       needed += len;
 297                       idxarr[backw] += len;
 298                     }
 299                 }
 300             }
 301         }
 302       else
 303         {
 304           int val = 1;
 305 #ifndef WIDE_CHAR_VERSION
 306           char buf[7];
 307           size_t buflen;
 308 #endif
 309           size_t i;
 310
 311           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 312             {
 313               if ((rule & sort_forward) != 0)
 314                 {
 315                   size_t len;
 316
 317                   if (backw_stop != ~0ul)
 318                     {
 319                      /* Handle the pushed elements now.  */
 320                       size_t backw;
 321
 322                       for (backw = idxcnt - 1; backw >= backw_stop; --backw)
 323                         {
 324                           len = weights[idxarr[backw]++];
 325                           if (len != 0)
 326                             {
 327 #ifdef WIDE_CHAR_VERSION
 328                               if (needed + 1 + len < n)
 329                                 {
 330                                   dest[needed] = val;
 331                                   for (i = 0; i < len; ++i)
 332                                     dest[needed + 1 + i] =
 333                                       weights[idxarr[backw] + i];
 334                                 }
 335                               needed += 1 + len;
 336 #else
 337                               buflen = utf8_encode (buf, val);
 338                               if (needed + buflen + len < n)
 339                                 {
 340                                   for (i = 0; i < buflen; ++i)
 341                                     dest[needed + i] = buf[i];
 342                                   for (i = 0; i < len; ++i)
 343                                     dest[needed + buflen + i] =
 344                                       weights[idxarr[backw] + i];
 345                                 }
 346                               needed += buflen + len;
 347 #endif
 348                               idxarr[backw] += len;
 349                               val = 1;
 350                             }
 351                           else
 352                             ++val;
 353                         }
 354
 355                       backw_stop = ~0ul;
 356                     }
 357
 358                   /* Now handle the forward element.  */
 359                   len = weights[idxarr[idxcnt]++];
 360                   if (len != 0)
 361                     {
 362 #ifdef WIDE_CHAR_VERSION
 363                       if (needed + 1+ len < n)
 364                         {
 365                           dest[needed] = val;
 366                           for (i = 0; i < len; ++i)
 367                             dest[needed + 1 + i] =
 368                               weights[idxarr[idxcnt] + i];
 369                         }
 370                       needed += 1 + len;
 371 #else
 372                       buflen = utf8_encode (buf, val);
 373                       if (needed + buflen + len < n)
 374                         {
 375                           for (i = 0; i < buflen; ++i)
 376                             dest[needed + i] = buf[i];
 377                           for (i = 0; i < len; ++i)
 378                             dest[needed + buflen + i] =
 379                               weights[idxarr[idxcnt] + i];
 380                         }
 381                       needed += buflen + len;
 382 #endif
 383                       idxarr[idxcnt] += len;
 384                       val = 1;
 385                     }
 386                   else
 387                     /* Note that we don't have to increment `idxarr[idxcnt]'
 388                        since the length is zero.  */
 389                     ++val;
 390                 }
 391               else
 392                 {
 393                   /* Remember where the backwards series started.  */
 394                   if (backw_stop == ~0ul)
 395                     backw_stop = idxcnt;
 396                 }
 397
 398               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 399             }
 400
 401           if (backw_stop != ~0ul)
 402             {
 403               /* Handle the pushed elements now.  */
 404               size_t backw;
 405
 406               backw = idxmax - 1;
 407               while (backw > backw_stop)
 408                 {
 409                   size_t len = weights[idxarr[--backw]++];
 410                   if (len != 0)
 411                     {
 412 #ifdef WIDE_CHAR_VERSION
 413                       if (needed + 1 + len < n)
 414                         {
 415                           dest[needed] = val;
 416                           for (i = 0; i < len; ++i)
 417                             dest[needed + 1 + i] =
 418                               weights[idxarr[backw] + i];
 419                         }
 420                       needed += 1 + len;
 421 #else
 422                       buflen = utf8_encode (buf, val);
 423                       if (needed + buflen + len < n)
 424                         {
 425                           for (i = 0; i < buflen; ++i)
 426                             dest[needed + i] = buf[i];
 427                           for (i = 0; i < len; ++i)
 428                             dest[needed + buflen + i] =
 429                               weights[idxarr[backw] + i];
 430                         }
 431                       needed += buflen + len;
 432 #endif
 433                       idxarr[backw] += len;
 434                       val = 1;
 435                     }
 436                   else
 437                     ++val;
 438                 }
 439             }
 440         }
 441
 442       /* Finally store the byte to separate the passes or terminate
 443          the string.  */
 444       if (needed < n)
 445         dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
 446       ++needed;
 447     }
 448
 449   /* This is a little optimization: many collation specifications have
 450      a `position' rule at the end and if no non-ignored character
 451      is found the last \1 byte is immediately followed by a \0 byte
 452      signalling this.  We can avoid the \1 byte(s).  */
 453   if (needed <= n && needed > 2 && dest[needed - 2] == L('\1'))
 454     {
 455       /* Remove the \1 byte.  */
 456       --needed;
 457       dest[needed - 1] = L('\0');
 458     }
 459
 460   /* Free the memory if needed.  */
 461   if (use_malloc)
 462     free (idxarr);
 463
 464   /* Return the number of bytes/words we need, but don't count the NUL
 465      byte/word at the end.  */
 466   return needed - 1;
 467 }