string/strxfrm.c

   1 /* Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Library General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 #include <stddef.h>
  21 #include <stdlib.h>
  22 #include <string.h>
  23
  24 #ifndef WIDE_VERSION
  25 # define STRING_TYPE char
  26 # define USTRING_TYPE unsigned char
  27 # define L_(Ch) Ch
  28 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
  29 #  define STRXFRM __strxfrm_l
  30 # else
  31 #  define STRXFRM strxfrm
  32 # endif
  33 # define STRLEN strlen
  34 # define STPNCPY __stpncpy
  35 #endif
  36
  37 /* Include the shared helper functions.  `strxfrm'/`wcsxfrm' also use
  38    these functions.  */
  39 #include "../locale/weight.h"
  40
  41
  42 #ifndef WIDE_VERSION
  43 /* Write 32 bit value UTF-8 encoded but only if enough space is left.  */
  44 static __inline size_t
  45 print_val (u_int32_t value, char *dest, size_t max, size_t act)
  46 {
  47   char tmp[6];
  48   int idx = 0;
  49
  50   if (value < 0x80)
  51     tmp[idx++] = (char) value;
  52   else
  53     {
  54       tmp[idx++] = '\x80' + (char) (value & 0x3f);
  55       value >>= 6;
  56
  57       if (value < 0x20)
  58         tmp[idx++] = '\xc0' + (char) value;
  59       else
  60         {
  61           tmp[idx++] = '\x80' + (char) (value & 0x3f);
  62           value >>= 6;
  63
  64           if (value < 0x10)
  65             tmp[idx++] = '\xe0' + (char) value;
  66           else
  67             {
  68               tmp[idx++] = '\x80' + (char) (value & 0x3f);
  69               value >>= 6;
  70
  71               if (value < 0x08)
  72                 tmp[idx++] = '\xf0' + (char) value;
  73               else
  74                 {
  75                   tmp[idx++] = '\x80' + (char) (value & 0x3f);
  76                   value >>= 6;
  77
  78                   if (value < 0x04)
  79                     tmp[idx++] = '\xf8' + (char) value;
  80                   else
  81                     {
  82                       tmp[idx++] = '\x80' + (char) (value & 0x3f);
  83                       tmp[idx++] = '\xfc' + (char) (value >> 6);
  84                     }
  85                 }
  86             }
  87         }
  88     }
  89
  90   while (idx-- > 0)
  91     {
  92       if (act < max)
  93         dest[act] = tmp[idx];
  94       ++act;
  95     }
  96
  97   return act;
  98 }
  99 #else
 100 static __inline size_t
 101 print_val (u_int32_t value, wchar_t *dest, size_t max, size_t act)
 102 {
 103   /* We cannot really assume wchar_t is 32 bits wide.  But it is for
 104      GCC and so we don't do much optimization for the other case.  */
 105   if (sizeof (wchar_t) == 4)
 106     {
 107       if (act < max)
 108         dest[act] = (wchar_t) value;
 109       ++act;
 110     }
 111   else
 112     {
 113       wchar_t tmp[3];
 114       size_t idx = 0;
 115
 116       if (value < 0x8000)
 117         tmp[idx++] = (wchar_t) act;
 118       else
 119         {
 120           tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff));
 121           value >>= 14;
 122           if (value < 0x2000)
 123             tmp[idx++] = (wchar_t) (0xc000 + value);
 124           else
 125             {
 126               tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff));
 127               value >>= 14;
 128               tmp[idx++] = (wchar_t) (0xe000 + value);
 129             }
 130         }
 131       while (idx-- > 0)
 132         {
 133           if (act < max)
 134             dest[act] = tmp[idx];
 135           ++act;
 136         }
 137     }
 138   return act;
 139 }
 140 #endif
 141
 142
 143 /* Transform SRC into a form such that the result of strcmp
 144    on two strings that have been transformed by strxfrm is
 145    the same as the result of strcoll on the two strings before
 146    their transformation.  The transformed string is put in at
 147    most N characters of DEST and its length is returned.  */
 148 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
 149 size_t
 150 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
 151 #else
 152 size_t
 153 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 154 #endif
 155 {
 156 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
 157   struct locale_data *current = l->__locales[LC_COLLATE];
 158 # if BYTE_ORDER == BIG_ENDIAN
 159   const u_int32_t *collate_table = (const u_int32_t *)
 160     current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].string;
 161   const u_int32_t *collate_extra = (const u_int32_t *)
 162     current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].string;
 163 # elif BYTE_ORDER == LITTLE_ENDIAN
 164   const u_int32_t *collate_table = (const u_int32_t *)
 165     current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].string;
 166   const u_int32_t *collate_extra = (const u_int32_t *)
 167     current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].string;
 168 # else
 169 #  error bizarre byte order
 170 # endif
 171 #endif
 172   weight_t *forw = NULL;
 173   weight_t *backw = NULL;
 174   size_t pass;
 175   size_t written;
 176
 177   /* If the current locale does not specify locale data we use normal
 178      8-bit string comparison.  */
 179   if (collate_nrules == 0)
 180     {
 181       if (n != 0)
 182         STPNCPY (dest, src, n);
 183
 184       return STRLEN (src);
 185     }
 186
 187   /* Get full information about the string.  This means we get
 188      information for all passes in a special data structure.  */
 189   get_string (src, forw, backw);
 190
 191   /* Now we have all the information.  In at most the given number of
 192      passes we can finally decide about the order.  */
 193   written = 0;
 194   for (pass = 0; pass < collate_nrules; ++pass)
 195     {
 196       int forward = (collate_rules[pass] & sort_forward) != 0;
 197       const weight_t *run = forward ? forw : backw;
 198       int idx = forward ? 0 : run->data[pass].number - 1;
 199
 200       while (1)
 201         {
 202           int ignore = 0;
 203           u_int32_t w = 0;
 204
 205           /* Here we have to check for IGNORE entries.  If these are
 206              found we count them and go on with he next value.  */
 207           while (run != NULL
 208                  && ((w = run->data[pass].value[idx])
 209                      == (u_int32_t) IGNORE_CHAR))
 210             {
 211               ++ignore;
 212               if ((forward && ++idx >= run->data[pass].number)
 213                   || (!forward && --idx < 0))
 214                 {
 215                   weight_t *nextp = forward ? run->next : run->prev;
 216                   if (nextp == NULL)
 217                     {
 218                       w = 0;
 219                       /* No more non-INGOREd elements means lowest
 220                          possible value.  */
 221                       ignore = -1;
 222                     }
 223                   else
 224                     idx = forward ? 0 : nextp->data[pass].number - 1;
 225                   run = nextp;
 226                 }
 227             }
 228
 229           /* Stop if all characters are processed.  */
 230           if (run == NULL)
 231             break;
 232
 233           /* Now we have information of the number of ignored weights
 234              and the value of the next weight.  We have to add 2
 235              because 0 means EOS and 1 is the intermediate string end.  */
 236           if ((collate_rules[pass] & sort_position) != 0)
 237             written = print_val (ignore + 2, dest, n, written);
 238
 239           if (w != 0)
 240             written = print_val (w, dest, n, written);
 241
 242           /* We have to increment the index counters.  */
 243           if ((forward && ++idx >= run->data[pass].number)
 244               || (!forward && --idx < 0))
 245             if (forward)
 246               {
 247                 run = run->next;
 248                 idx = 0;
 249               }
 250             else
 251               {
 252                 run = run->prev;
 253                 if (run != NULL)
 254                   idx = run->data[pass].number - 1;
 255               }
 256         }
 257
 258       /* Write marker for end of word.  */
 259       if (pass + 1 < collate_nrules)
 260         written = print_val (1, dest, n, written);
 261     }
 262
 263   /* Terminate string.  */
 264   if (written < n)
 265     dest[written] = L_('\0');
 266
 267   /* Return length without counting the terminating '\0'.  */
 268   return written;
 269 }