string/strxfrm.c

   1 /* Copyright (C) 1995-1999, 2000 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Library General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 #include <langinfo.h>
  21 #include <stddef.h>
  22 #include <stdint.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25
  26 #ifndef STRING_TYPE
  27 # define STRING_TYPE char
  28 # define USTRING_TYPE unsigned char
  29 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
  30 #  define STRXFRM __strxfrm_l
  31 # else
  32 #  define STRXFRM strxfrm
  33 # endif
  34 # define STRCMP strcmp
  35 # define STRLEN strlen
  36 # define STPNCPY __stpncpy
  37 # define WEIGHT_H "../locale/weight.h"
  38 # define SUFFIX MB
  39 # define L(arg) arg
  40 #endif
  41
  42 #define CONCAT(a,b) CONCAT1(a,b)
  43 #define CONCAT1(a,b) a##b
  44
  45 #include "../locale/localeinfo.h"
  46
  47
  48 #ifndef WIDE_CHAR_VERSION
  49 /* These are definitions used by some of the functions for handling
  50    UTF-8 encoding below.  */
  51 static const uint32_t encoding_mask[] =
  52 {
  53   ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
  54 };
  55
  56 static const unsigned char encoding_byte[] =
  57 {
  58   0xc0, 0xe0, 0xf0, 0xf8, 0xfc
  59 };
  60
  61
  62 /* We need UTF-8 encoding of numbers.  */
  63 static inline int
  64 utf8_encode (char *buf, int val)
  65 {
  66   char *startp = buf;
  67   int retval;
  68
  69   if (val < 0x80)
  70     {
  71       *buf++ = (char) val;
  72       retval = 1;
  73     }
  74   else
  75     {
  76       int step;
  77
  78       for (step = 2; step < 6; ++step)
  79         if ((val & encoding_mask[step - 2]) == 0)
  80           break;
  81       retval = step;
  82
  83       *buf = encoding_byte[step - 2];
  84       --step;
  85       do
  86         {
  87           buf[step] = 0x80 | (val & 0x3f);
  88           val >>= 6;
  89         }
  90       while (--step > 0);
  91       *buf |= val;
  92     }
  93
  94   return buf - startp;
  95 }
  96 #endif
  97
  98
  99 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
 100 size_t
 101 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
 102 #else
 103 size_t
 104 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 105 #endif
 106 {
 107 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
 108   struct locale_data *current = l->__locales[LC_COLLATE];
 109   uint_fast32_t nrules = *((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
 110 #else
 111   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
 112 #endif
 113   /* We don't assign the following values right away since it might be
 114      unnecessary in case there are no rules.  */
 115   const unsigned char *rulesets;
 116   const int32_t *table;
 117   const USTRING_TYPE *weights;
 118   const USTRING_TYPE *extra;
 119   const int32_t *indirect;
 120   uint_fast32_t pass;
 121   size_t needed;
 122   const USTRING_TYPE *usrc;
 123   size_t srclen = STRLEN (src);
 124   int32_t *idxarr;
 125   unsigned char *rulearr;
 126   size_t idxmax;
 127   size_t idxcnt;
 128   int use_malloc;
 129 #ifdef WIDE_CHAR_VERSION
 130   size_t size;
 131   size_t layers;
 132   const wint_t *names;
 133 #endif
 134
 135 #include WEIGHT_H
 136
 137   if (nrules == 0)
 138     {
 139       if (n != 0)
 140         STPNCPY (dest, src, n);
 141
 142       return srclen;
 143     }
 144
 145 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
 146   rulesets = (const unsigned char *)
 147     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
 148   table = (const int32_t *)
 149     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
 150   weights = (const USTRING_TYPE *)
 151     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
 152   extra = (const USTRING_TYPE *)
 153     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 154   indirect = (const int32_t *)
 155     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 156 # ifdef WIDE_CHAR_VERSION
 157   names = (const wint_t *)
 158     current->values[_NL_ITEM_INDEX (_NL_COLLATE_NAMES)].string;
 159   size = current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].word;
 160   layers = current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].word;
 161 # endif
 162 #else
 163   rulesets = (const unsigned char *)
 164     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
 165   table = (const int32_t *)
 166     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_TABLE,SUFFIX));
 167   weights = (const USTRING_TYPE *)
 168     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_WEIGHT,SUFFIX));
 169   extra = (const USTRING_TYPE *)
 170     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_EXTRA,SUFFIX));
 171   indirect = (const int32_t *)
 172     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_INDIRECT,SUFFIX));
 173 # ifdef WIDE_CHAR_VERSION
 174   names = (const wint_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_NAMES);
 175   size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_SIZE);
 176   layers = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_LAYERS);
 177 # endif
 178 #endif
 179   use_malloc = 0;
 180
 181   /* Handle an empty string as a special case.  */
 182   if (srclen == 0)
 183     {
 184       if (n != 0)
 185         *dest = L('\0');
 186       return 1;
 187     }
 188
 189   /* We need the elements of the string as unsigned values since they
 190      are used as indeces.  */
 191   usrc = (const USTRING_TYPE *) src;
 192
 193   /* Perform the first pass over the string and while doing this find
 194      and store the weights for each character.  Since we want this to
 195      be as fast as possible we are using `alloca' to store the temporary
 196      values.  But since there is no limit on the length of the string
 197      we have to use `malloc' if the string is too long.  We should be
 198      very conservative here.  */
 199   if (srclen >= 16384)
 200     {
 201       idxarr = (int32_t *) malloc (srclen * (sizeof (int32_t) + 1));
 202       rulearr = (unsigned char *) &idxarr[srclen];
 203
 204       if (idxarr == NULL)
 205         /* No memory.  Well, go with the stack then.
 206
 207            XXX Once this implementation is stable we will handle this
 208            differently.  Instead of precomputing the indeces we will
 209            do this in time.  This means, though, that this happens for
 210            every pass again.  */
 211         goto try_stack;
 212       use_malloc = 1;
 213     }
 214   else
 215     {
 216     try_stack:
 217       idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
 218       rulearr = (unsigned char *) alloca (srclen);
 219     }
 220
 221   idxmax = 0;
 222   do
 223     {
 224       int32_t tmp = findidx (&usrc);
 225       rulearr[idxmax] = tmp >> 24;
 226       idxarr[idxmax] = tmp & 0xffffff;
 227
 228       ++idxmax;
 229     }
 230   while (*usrc != L('\0'));
 231
 232   /* Now the passes over the weights.  We now use the indeces we found
 233      before.  */
 234   needed = 0;
 235   for (pass = 0; pass < nrules; ++pass)
 236     {
 237       size_t backw_stop = ~0ul;
 238       int rule = rulesets[rulearr[0] * nrules + pass];
 239       /* We assume that if a rule has defined `position' in one section
 240          this is true for all of them.  */
 241       int position = rule & sort_position;
 242
 243       if (position == 0)
 244         {
 245           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 246             {
 247               if ((rule & sort_forward) != 0)
 248                 {
 249                   size_t len;
 250
 251                   if (backw_stop != ~0ul)
 252                     {
 253                       /* Handle the pushed elements now.  */
 254                       size_t backw;
 255
 256                       for (backw = idxcnt - 1; backw >= backw_stop; --backw)
 257                         {
 258                           len = weights[idxarr[backw]++];
 259
 260                           if (needed + len < n)
 261                             while (len-- > 0)
 262                               dest[needed++] = weights[idxarr[backw]++];
 263                           else
 264                             {
 265                                 /* No more characters fit into the buffer.  */
 266                               needed += len;
 267                               idxarr[backw] += len;
 268                             }
 269                         }
 270
 271                       backw_stop = ~0ul;
 272                     }
 273
 274                   /* Now handle the forward element.  */
 275                   len = weights[idxarr[idxcnt]++];
 276                   if (needed + len < n)
 277                     while (len-- > 0)
 278                       dest[needed++] = weights[idxarr[idxcnt]++];
 279                   else
 280                     {
 281                       /* No more characters fit into the buffer.  */
 282                       needed += len;
 283                       idxarr[idxcnt] += len;
 284                     }
 285                 }
 286               else
 287                 {
 288                   /* Remember where the backwards series started.  */
 289                   if (backw_stop == ~0ul)
 290                     backw_stop = idxcnt;
 291                 }
 292
 293               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 294             }
 295
 296
 297           if (backw_stop != ~0ul)
 298             {
 299               /* Handle the pushed elements now.  */
 300               size_t backw;
 301
 302               backw = idxcnt;
 303               while (backw > backw_stop)
 304                 {
 305                   size_t len = weights[idxarr[--backw]++];
 306
 307                   if (needed + len < n)
 308                     while (len-- > 0)
 309                       dest[needed++] = weights[idxarr[backw]++];
 310                   else
 311                     {
 312                       /* No more characters fit into the buffer.  */
 313                       needed += len;
 314                       idxarr[backw] += len;
 315                     }
 316                 }
 317             }
 318         }
 319       else
 320         {
 321           int val = 1;
 322 #ifndef WIDE_CHAR_VERSION
 323           char buf[7];
 324           size_t buflen;
 325 #endif
 326           size_t i;
 327
 328           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 329             {
 330               if ((rule & sort_forward) != 0)
 331                 {
 332                   size_t len;
 333
 334                   if (backw_stop != ~0ul)
 335                     {
 336                      /* Handle the pushed elements now.  */
 337                       size_t backw;
 338
 339                       for (backw = idxcnt - 1; backw >= backw_stop; --backw)
 340                         {
 341                           len = weights[idxarr[backw]++];
 342                           if (len != 0)
 343                             {
 344 #ifdef WIDE_CHAR_VERSION
 345                               if (needed + 1 + len < n)
 346                                 {
 347                                   dest[needed] = val;
 348                                   for (i = 0; i < len; ++i)
 349                                     dest[needed + 1 + i] =
 350                                       weights[idxarr[backw] + i];
 351                                 }
 352                               needed += 1 + len;
 353 #else
 354                               buflen = utf8_encode (buf, val);
 355                               if (needed + buflen + len < n)
 356                                 {
 357                                   for (i = 0; i < buflen; ++i)
 358                                     dest[needed + i] = buf[i];
 359                                   for (i = 0; i < len; ++i)
 360                                     dest[needed + buflen + i] =
 361                                       weights[idxarr[backw] + i];
 362                                 }
 363                               needed += buflen + len;
 364 #endif
 365                               idxarr[backw] += len;
 366                               val = 1;
 367                             }
 368                           else
 369                             ++val;
 370                         }
 371
 372                       backw_stop = ~0ul;
 373                     }
 374
 375                   /* Now handle the forward element.  */
 376                   len = weights[idxarr[idxcnt]++];
 377                   if (len != 0)
 378                     {
 379 #ifdef WIDE_CHAR_VERSION
 380                       if (needed + 1+ len < n)
 381                         {
 382                           dest[needed] = val;
 383                           for (i = 0; i < len; ++i)
 384                             dest[needed + 1 + i] =
 385                               weights[idxarr[idxcnt] + i];
 386                         }
 387                       needed += 1 + len;
 388 #else
 389                       buflen = utf8_encode (buf, val);
 390                       if (needed + buflen + len < n)
 391                         {
 392                           for (i = 0; i < buflen; ++i)
 393                             dest[needed + i] = buf[i];
 394                           for (i = 0; i < len; ++i)
 395                             dest[needed + buflen + i] =
 396                               weights[idxarr[idxcnt] + i];
 397                         }
 398                       needed += buflen + len;
 399 #endif
 400                       idxarr[idxcnt] += len;
 401                       val = 1;
 402                     }
 403                   else
 404                     /* Note that we don't have to increment `idxarr[idxcnt]'
 405                        since the length is zero.  */
 406                     ++val;
 407                 }
 408               else
 409                 {
 410                   /* Remember where the backwards series started.  */
 411                   if (backw_stop == ~0ul)
 412                     backw_stop = idxcnt;
 413                 }
 414
 415               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 416             }
 417
 418           if (backw_stop != ~0)
 419             {
 420               /* Handle the pushed elements now.  */
 421               size_t backw;
 422
 423               backw = idxmax - 1;
 424               while (backw > backw_stop)
 425                 {
 426                   size_t len = weights[idxarr[--backw]++];
 427                   if (len != 0)
 428                     {
 429 #ifdef WIDE_CHAR_VERSION
 430                       if (needed + 1 + len < n)
 431                         {
 432                           dest[needed] = val;
 433                           for (i = 0; i < len; ++i)
 434                             dest[needed + 1 + i] =
 435                               weights[idxarr[backw] + i];
 436                         }
 437                       needed += 1 + len;
 438 #else
 439                       buflen = utf8_encode (buf, val);
 440                       if (needed + buflen + len < n)
 441                         {
 442                           for (i = 0; i < buflen; ++i)
 443                             dest[needed + i] = buf[i];
 444                           for (i = 0; i < len; ++i)
 445                             dest[needed + buflen + i] =
 446                               weights[idxarr[backw] + i];
 447                         }
 448                       needed += buflen + len;
 449 #endif
 450                       idxarr[backw] += len;
 451                       val = 1;
 452                     }
 453                   else
 454                     ++val;
 455                 }
 456             }
 457         }
 458
 459       /* Finally store the byte to separate the passes or terminate
 460          the string.  */
 461       if (needed < n)
 462         dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
 463       ++needed;
 464     }
 465
 466   /* This is a little optimization: many collation specifications have
 467      a `position' rule at the end and if no non-ignored character
 468      is found the last \1 byte is immediately followed by a \0 byte
 469      signalling this.  We can avoid the \1 byte(s).  */
 470   if (needed <= n && needed > 2 && dest[needed - 2] == L('\1'))
 471     {
 472       /* Remove the \1 byte.  */
 473       --needed;
 474       dest[needed - 1] = L('\0');
 475     }
 476
 477   /* Free the memory if needed.  */
 478   if (use_malloc)
 479     free (idxarr);
 480
 481   /* Return the number of bytes/words we need, but don't count the NUL
 482      byte/word at the end.  */
 483   return needed - 1;
 484 }