contrib/grep/src/searchutils.c

   1 /* searchutils.c - helper subroutines for grep's matchers.
   2    Copyright 1992, 1998, 2000, 2007, 2009-2014 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 3, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
  17    02110-1301, USA.  */
  18
  19 #include <config.h>
  20 #include <assert.h>
  21 #include "search.h"
  22
/* Number of distinct values an unsigned char can take.  Used in this
   file to size tables that are indexed by a single byte value.  */
#define NCHAR (UCHAR_MAX + 1)

/* Per-byte table of mbrlen results, indexed by the byte's value as an
   unsigned char.  Filled in by build_mbclen_cache and consulted by
   mb_goback.  */
static size_t mbclen_cache[NCHAR];
  26
  27 void
  28 kwsinit (kwset_t *kwset)
  29 {
  30   static char trans[NCHAR];
  31   int i;
  32
  33   if (match_icase && MB_CUR_MAX == 1)
  34     {
  35       for (i = 0; i < NCHAR; ++i)
  36         trans[i] = toupper (i);
  37
  38       *kwset = kwsalloc (trans);
  39     }
  40   else
  41     *kwset = kwsalloc (NULL);
  42
  43   if (!*kwset)
  44     xalloc_die ();
  45 }
  46
  47 /* Convert BEG, an *N-byte string, to uppercase, and write the
  48    NUL-terminated result into malloc'd storage.  Upon success, set *N
  49    to the length (in bytes) of the resulting string (not including the
  50    trailing NUL byte), and return a pointer to the uppercase string.
  51    Upon memory allocation failure, exit.  *N must be positive.
  52
  53    Although this function returns a pointer to malloc'd storage,
  54    the caller must not free it, since this function retains a pointer
  55    to the buffer and reuses it on any subsequent call.  As a consequence,
  56    this function is not thread-safe.
  57
  58    When each character in the uppercase result string has the same length
  59    as the corresponding character in the input string, set *LEN_MAP_P
  60    to NULL.  Otherwise, set it to a malloc'd buffer (like the returned
  61    buffer, this must not be freed by caller) of the same length as the
  62    result string.  (*LEN_MAP_P)[J] is the change in byte-length of the
  63    character in BEG that formed byte J of the result as it was converted to
  64    uppercase.  It is usually zero.  For lowercase Turkish dotless I it
  65    is -1, since the lowercase input occupies two bytes, while the
  66    uppercase output occupies only one byte.  For lowercase I in the
  67    tr_TR.utf8 locale, it is 1 because the uppercase Turkish dotted I
  68    is one byte longer than the original.  When that happens, we have two
  69    or more slots in *LEN_MAP_P for each such character.  We store the
  70    difference in the first one and 0's in any remaining slots.
  71
  72    This map is used by the caller to convert offset,length pairs that
  73    reference the uppercase result to numbers that refer to the matched
  74    part of the original buffer.  */
  75
  76 char *
  77 mbtoupper (const char *beg, size_t *n, mb_len_map_t **len_map_p)
  78 {
  79   static char *out;
  80   static mb_len_map_t *len_map;
  81   static size_t outalloc;
  82   size_t outlen, mb_cur_max;
  83   mbstate_t is, os;
  84   const char *end;
  85   char *p;
  86   mb_len_map_t *m;
  87   bool lengths_differ = false;
  88
  89   if (*n > outalloc || outalloc == 0)
  90     {
  91       outalloc = MAX (1, *n);
  92       out = xrealloc (out, outalloc);
  93       len_map = xrealloc (len_map, outalloc);
  94     }
  95
  96   /* appease clang-2.6 */
  97   assert (out);
  98   assert (len_map);
  99   if (*n == 0)
 100     return out;
 101
 102   memset (&is, 0, sizeof (is));
 103   memset (&os, 0, sizeof (os));
 104   end = beg + *n;
 105
 106   mb_cur_max = MB_CUR_MAX;
 107   p = out;
 108   m = len_map;
 109   outlen = 0;
 110   while (beg < end)
 111     {
 112       wchar_t wc;
 113       size_t mbclen = mbrtowc (&wc, beg, end - beg, &is);
 114 #ifdef __CYGWIN__
 115       /* Handle a UTF-8 sequence for a character beyond the base plane.
 116          Cygwin's wchar_t is UTF-16, as in the underlying OS.  This
 117          results in surrogate pairs which need some extra attention.  */
 118       wint_t wci = 0;
 119       if (mbclen == 3 && (wc & 0xdc00) == 0xd800)
 120         {
 121           /* We got the start of a 4 byte UTF-8 sequence.  This is returned
 122              as a UTF-16 surrogate pair.  The first call to mbrtowc returned 3
 123              and wc has been set to a high surrogate value, now we're going
 124              to fetch the matching low surrogate.  This second call to mbrtowc
 125              is supposed to return 1 to complete the 4 byte UTF-8 sequence.  */
 126           wchar_t wc_2;
 127           size_t mbclen_2 = mbrtowc (&wc_2, beg + mbclen, end - beg - mbclen,
 128                                      &is);
 129           if (mbclen_2 == 1 && (wc_2 & 0xdc00) == 0xdc00)
 130             {
 131               /* Match.  Convert this to a 4 byte wint_t which constitutes
 132                  a 32-bit UTF-32 value.  */
 133               wci = ( (((wint_t) (wc - 0xd800)) << 10)
 134                      | ((wint_t) (wc_2 - 0xdc00)))
 135                     + 0x10000;
 136               ++mbclen;
 137             }
 138           else
 139             {
 140               /* Invalid UTF-8 sequence.  */
 141               mbclen = (size_t) -1;
 142             }
 143         }
 144 #endif
 145       if (outlen + mb_cur_max >= outalloc)
 146         {
 147           size_t dm = m - len_map;
 148           out = x2nrealloc (out, &outalloc, 1);
 149           len_map = xrealloc (len_map, outalloc);
 150           p = out + outlen;
 151           m = len_map + dm;
 152         }
 153
 154       if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
 155         {
 156           /* An invalid sequence, or a truncated multi-octet character.
 157              We treat it as a single-octet character.  */
 158           *m++ = 0;
 159           *p++ = *beg++;
 160           outlen++;
 161           memset (&is, 0, sizeof (is));
 162           memset (&os, 0, sizeof (os));
 163         }
 164       else
 165         {
 166           size_t ombclen;
 167           beg += mbclen;
 168 #ifdef __CYGWIN__
 169           /* Handle Unicode characters beyond the base plane.  */
 170           if (mbclen == 4)
 171             {
 172               /* towupper, taking wint_t (4 bytes), handles UCS-4 values.  */
 173               wci = towupper (wci);
 174               if (wci >= 0x10000)
 175                 {
 176                   wci -= 0x10000;
 177                   wc = (wci >> 10) | 0xd800;
 178                   /* No need to check the return value.  When reading the
 179                      high surrogate, the return value will be 0 and only the
 180                      mbstate indicates that we're in the middle of reading a
 181                      surrogate pair.  The next wcrtomb call reading the low
 182                      surrogate will then return 4 and reset the mbstate.  */
 183                   wcrtomb (p, wc, &os);
 184                   wc = (wci & 0x3ff) | 0xdc00;
 185                 }
 186               else
 187                 {
 188                   wc = (wchar_t) wci;
 189                 }
 190               ombclen = wcrtomb (p, wc, &os);
 191             }
 192           else
 193 #endif
 194           ombclen = wcrtomb (p, towupper (wc), &os);
 195           *m = mbclen - ombclen;
 196           memset (m + 1, 0, ombclen - 1);
 197           m += ombclen;
 198           p += ombclen;
 199           outlen += ombclen;
 200           lengths_differ |= (mbclen != ombclen);
 201         }
 202     }
 203
 204   *len_map_p = lengths_differ ? len_map : NULL;
 205   *n = p - out;
 206   *p = 0;
 207   return out;
 208 }
 209
 210 /* Initialize a cache of mbrlen values for each of its 1-byte inputs.  */
 211 void
 212 build_mbclen_cache (void)
 213 {
 214   int i;
 215
 216   for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
 217     {
 218       char c = i;
 219       unsigned char uc = i;
 220       mbstate_t mbs = { 0 };
 221       mbclen_cache[uc] = mbrlen (&c, 1, &mbs);
 222     }
 223 }
 224
 225 /* In the buffer *MB_START, return the number of bytes needed to go
 226    back from CUR to the previous boundary, where a "boundary" is the
 227    start of a multibyte character or is an error-encoding byte.  The
 228    buffer ends at END (i.e., one past the address of the buffer's last
 229    byte).  If CUR is already at a boundary, return 0.  If *MB_START is
 230    greater than or equal to CUR, return the negative value CUR - *MB_START.
 231
 232    When returning zero, set *MB_START to CUR.  When returning a
 233    positive value, set *MB_START to the next boundary after CUR, or to
 234    END if there is no such boundary.  When returning a negative value,
 235    leave *MB_START alone.  */
 236 ptrdiff_t
 237 mb_goback (char const **mb_start, char const *cur, char const *end)
 238 {
 239   const char *p = *mb_start;
 240   const char *p0 = p;
 241   mbstate_t cur_state;
 242
 243   memset (&cur_state, 0, sizeof cur_state);
 244
 245   while (p < cur)
 246     {
 247       size_t mbclen = mbclen_cache[to_uchar (*p)];
 248
 249       if (mbclen == (size_t) -2)
 250         mbclen = mbrlen (p, end - p, &cur_state);
 251
 252       if (! (0 < mbclen && mbclen < (size_t) -2))
 253         {
 254           /* An invalid sequence, or a truncated multibyte character, or
 255              a null wide character.  Treat it as a single byte character.  */
 256           mbclen = 1;
 257           memset (&cur_state, 0, sizeof cur_state);
 258         }
 259       p0 = p;
 260       p += mbclen;
 261     }
 262
 263   *mb_start = p;
 264   return p == cur ? 0 : cur - p0;
 265 }
 266
 267 /* In the buffer BUF, return the wide character that is encoded just
 268    before CUR.  The buffer ends at END.  Return WEOF if there is no
 269    wide character just before CUR.  */
 270 wint_t
 271 mb_prev_wc (char const *buf, char const *cur, char const *end)
 272 {
 273   if (cur == buf)
 274     return WEOF;
 275   char const *p = buf;
 276   cur--;
 277   cur -= mb_goback (&p, cur, end);
 278   return mb_next_wc (cur, end);
 279 }
 280
 281 /* Return the wide character that is encoded at CUR.  The buffer ends
 282    at END.  Return WEOF if there is no wide character encoded at CUR.  */
 283 wint_t
 284 mb_next_wc (char const *cur, char const *end)
 285 {
 286   wchar_t wc;
 287   mbstate_t mbs = { 0 };
 288   return mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2 ? wc : WEOF;
 289 }