kjs/regexp.cpp

   1 // -*- c-basic-offset: 2 -*-
   2 /*
   3  *  This file is part of the KDE libraries
   4  *  Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org)
   5  *  Copyright (C) 2003,2004 Apple Computer, Inc.
   6  *  Copyright (C) 2006      Maksim Orlovich (maksim@kde.org)
   7  *
   8  *  This library is free software; you can redistribute it and/or
   9  *  modify it under the terms of the GNU Lesser General Public
  10  *  License as published by the Free Software Foundation; either
  11  *  version 2 of the License, or (at your option) any later version.
  12  *
  13  *  This library is distributed in the hope that it will be useful,
  14  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  *  Lesser General Public License for more details.
  17  *
  18  *  You should have received a copy of the GNU Lesser General Public
  19  *  License along with this library; if not, write to the Free Software
  20  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  21  *
  22  */
  23
  24 #include "regexp.h"
  25 #include <config.h>
  26 #include "lexer.h"
  27
  28 #include <assert.h>
  29 #include <stdio.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32 #include <wtf/Vector.h>
  33 using WTF::Vector;
  34
  35 // GCC cstring uses these automatically, but not all implementations do.
  36 using std::strlen;
  37 using std::strcpy;
  38 using std::strncpy;
  39 using std::memset;
  40 using std::memcpy;
  41
  42 namespace KJS {
  43
// Process-wide cache of whether libpcre was built with UTF-8 support.
// It starts out as Unknown; the RegExp constructor probes pcre_config()
// the first time it runs and stores Supported or Unsupported here.
#ifdef PCRE_CONFIG_UTF8
RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
#endif
  47
  48 // JS regexps can contain Unicode escape sequences (\uxxxx) which
  49 // are rather uncommon elsewhere. As our regexp libs don't understand
  50 // them we do the unescaping ourselves internally.
  51 // Also make sure to expand out any nulls as pcre_compile
  52 // expects null termination..
  53 static UString sanitizePattern(const UString &p)
  54 {
  55   UString newPattern;
  56
  57   const char* const nil = "\\x00";
  58   if (p.find("\\u") >= 0 || p.find(KJS::UChar('\0')) >= 0) {
  59     bool escape = false;
  60     for (int i = 0; i < p.size(); ++i) {
  61       UChar c = p[i];
  62       if (escape) {
  63         escape = false;
  64         // we only care about \u
  65         if (c == 'u') {
  66           // standard unicode escape sequence looks like \uxxxx but
  67           // other browsers also accept less then 4 hex digits
  68           unsigned short u = 0;
  69           int j = 0;
  70           for (j = 0; j < 4; ++j) {
  71             if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
  72               u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
  73               ++i;
  74             } else {
  75               // sequence incomplete. restore index.
  76               // TODO: cleaner way to propagate warning
  77               fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
  78               i -= j;
  79               break;
  80             }
  81           }
  82           if (j < 4) {
  83             // sequence was incomplete. treat \u as u which IE always
  84             // and FF sometimes does.
  85              newPattern.append(UString('u'));
  86           } else {
  87             c = UChar(u);
  88             switch (u) {
  89             case 0:
  90               // Make sure to encode 0, to avoid terminating the string
  91                newPattern += UString(nil);
  92               break;
  93             case '^':
  94             case '$':
  95             case '\\':
  96             case '.':
  97             case '*':
  98             case '+':
  99             case '?':
 100             case '(': case ')':
 101             case '{': case '}':
 102             case '[': case ']':
 103             case '|':
 104               // escape pattern characters have to remain escaped
 105                newPattern.append(UString('\\'));
 106               // intentional fallthrough
 107             default:
 108               newPattern += UString(&c, 1);
 109               break;
 110             }
 111           }
 112           continue;
 113         }
 114         newPattern += UString('\\');
 115         newPattern += UString(&c, 1);
 116       } else {
 117         if (c == '\\')
 118           escape = true;
 119         else if (c == '\0')
 120           newPattern += UString(nil);
 121         else
 122           newPattern += UString(&c, 1);
 123       }
 124     }
 125     return newPattern;
 126   } else {
 127     return p;
 128   }
 129 }
 130
 131 RegExp::RegExp(const UString &p, char flags)
 132   : _pat(p), _flags(flags), _valid(true), _numSubPatterns(0), _buffer(0), _originalPos(0)
 133 {
 134   // Determine whether libpcre has unicode support if need be..
 135 #ifdef PCRE_CONFIG_UTF8
 136   if (utf8Support == Unknown) {
 137     int supported;
 138     pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
 139     utf8Support = supported ? Supported : Unsupported;
 140   }
 141 #endif
 142
 143   UString intern = sanitizePattern(p);
 144
 145 #ifdef HAVE_PCREPOSIX
 146
 147   int options = 0;
 148   // Note: the Global flag is already handled by RegExpProtoFunc::execute.
 149   // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute.
 150   if (flags & IgnoreCase)
 151     options |= PCRE_CASELESS;
 152   if (flags & Multiline)
 153     options |= PCRE_MULTILINE;
 154
 155   if (utf8Support == Supported)
 156     options |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
 157
 158   const char *errorMessage;
 159   int errorOffset;
 160
 161   // Fill our buffer with an encoded version, whether utf-8, or,
 162   // if PCRE is incapable, truncated.
 163   prepareMatch(intern);
 164   _regex = pcre_compile(_buffer, options, &errorMessage, &errorOffset, NULL);
 165   doneMatch(); //Cleanup buffers
 166   if (!_regex) {
 167 #ifndef NDEBUG
 168     fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
 169 #endif
 170     _valid = false;
 171     return;
 172   }
 173
 174 #ifdef PCRE_INFO_CAPTURECOUNT
 175   // Get number of subpatterns that will be returned.
 176   pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
 177 #endif
 178
 179 #else /* HAVE_PCREPOSIX */
 180
 181   int regflags = 0;
 182 #ifdef REG_EXTENDED
 183   regflags |= REG_EXTENDED;
 184 #endif
 185 #ifdef REG_ICASE
 186   if ( flags & IgnoreCase )
 187     regflags |= REG_ICASE;
 188 #endif
 189
 190   //NOTE: Multiline is not feasible with POSIX regex.
 191   //if ( f & Multiline )
 192   //    ;
 193   // Note: the Global flag is already handled by RegExpProtoFunc::execute
 194
 195   int errorCode = regcomp(&_regex, intern.ascii(), regflags);
 196   if (errorCode != 0) {
 197 #ifndef NDEBUG
 198     char errorMessage[80];
 199     regerror(errorCode, &_regex, errorMessage, sizeof errorMessage);
 200     fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
 201 #endif
 202     _valid = false;
 203   }
 204 #endif
 205 }
 206
 207 RegExp::~RegExp()
 208 {
 209   doneMatch(); // Be 100% sure buffers are freed
 210 #ifdef HAVE_PCREPOSIX
 211   pcre_free(_regex);
 212 #else
 213   /* TODO: is this really okay after an error ? */
 214   regfree(&_regex);
 215 #endif
 216 }
 217
 218 void RegExp::prepareUtf8(const UString& s)
 219 {
 220   // Allocate a buffer big enough to hold all the characters plus \0
 221   const int length = s.size();
 222   _buffer = new char[length * 3 + 1];
 223
 224   // Also create buffer for positions. We need one extra character in there,
 225   // even past the \0 since the non-empty handling may jump one past the end
 226   _originalPos = new int[length * 3 + 2];
 227
 228   // Convert to runs of 8-bit characters, and generate indices
 229   // Note that we do NOT combine surrogate pairs here, as
 230   // regexps operate on them as separate characters
 231   char *p      = _buffer;
 232   int  *posOut = _originalPos;
 233   const UChar *d = s.data();
 234   for (int i = 0; i != length; ++i) {
 235     unsigned short c = d[i].unicode();
 236
 237     int sequenceLen;
 238     if (c < 0x80) {
 239       *p++ = (char)c;
 240       sequenceLen = 1;
 241     } else if (c < 0x800) {
 242       *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
 243       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
 244       sequenceLen = 2;
 245     } else {
 246       *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
 247       *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
 248       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
 249       sequenceLen = 3;
 250     }
 251
 252     while (sequenceLen > 0) {
 253       *posOut = i;
 254       ++posOut;
 255       --sequenceLen;
 256     }
 257   }
 258
 259   _bufferSize = p - _buffer;
 260
 261   *p++ = '\0';
 262
 263   // Record positions for \0, and the fictional character after that.
 264   *posOut     = length;
 265   *(posOut+1) = length+1;
 266 }
 267
 268 void RegExp::prepareASCII (const UString& s)
 269 {
 270   _originalPos = 0;
 271
 272   // Best-effort attempt to get something done
 273   // when we don't have utf 8 available -- use
 274   // truncated version, and pray for the best
 275   CString truncated = s.cstring();
 276   _buffer = new char[truncated.size() + 1];
 277   memcpy(_buffer, truncated.c_str(), truncated.size());
 278   _buffer[truncated.size()] = '\0'; // For _compile use
 279   _bufferSize = truncated.size();
 280 }
 281
 282 void RegExp::prepareMatch(const UString &s)
 283 {
 284   delete[] _originalPos; // Just to be sure..
 285   delete[] _buffer;
 286 #ifdef PCRE_CONFIG_UTF8
 287   if (utf8Support == Supported)
 288     prepareUtf8(s);
 289   else
 290 #endif
 291     prepareASCII(s);
 292
 293 #ifndef NDEBUG
 294   _originalS = s;
 295 #endif
 296 }
 297
 298 void RegExp::doneMatch()
 299 {
 300   delete[] _originalPos; _originalPos = 0;
 301   delete[] _buffer;      _buffer      = 0;
 302 }
 303
 304 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
 305 {
 306 #ifndef NDEBUG
 307   assert(s.data() == _originalS.data()); // Make sure prepareMatch got called right..
 308 #endif
 309
 310   if (i < 0)
 311     i = 0;
 312   int dummyPos;
 313   if (!pos)
 314     pos = &dummyPos;
 315   *pos = -1;
 316   if (ovector)
 317     *ovector = 0;
 318
 319   if (i > s.size() || s.isNull())
 320     return UString::null();
 321
 322 #ifdef HAVE_PCREPOSIX
 323
 324   if (!_regex)
 325     return UString::null();
 326
 327   // Set up the offset vector for the result.
 328   // First 2/3 used for result, the last third used by PCRE.
 329   int *offsetVector;
 330   int offsetVectorSize;
 331   int fixedSizeOffsetVector[3];
 332   if (!ovector) {
 333     offsetVectorSize = 3;
 334     offsetVector = fixedSizeOffsetVector;
 335   } else {
 336     offsetVectorSize = (_numSubPatterns + 1) * 3;
 337     offsetVector = new int [offsetVectorSize];
 338   }
 339
 340   int startPos;
 341   if (utf8Support == Supported) {
 342     startPos = i;
 343     while (_originalPos[startPos] < i)
 344       ++startPos;
 345   } else {
 346     startPos = i;
 347   }
 348
 349   int baseFlags = utf8Support == Supported ? PCRE_NO_UTF8_CHECK : 0;
 350   const int numMatches = pcre_exec(_regex, NULL, _buffer, _bufferSize, startPos, baseFlags, offsetVector, offsetVectorSize);
 351
 352   //Now go through and patch up the offsetVector
 353   if (utf8Support == Supported)
 354     for (int c = 0; c < 2 * numMatches; ++c)
 355       if (offsetVector[c] != -1)
 356         offsetVector[c] = _originalPos[offsetVector[c]];
 357
 358   if (numMatches < 0) {
 359 #ifndef NDEBUG
 360     if (numMatches != PCRE_ERROR_NOMATCH)
 361       fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
 362 #endif
 363     if (offsetVector != fixedSizeOffsetVector)
 364       delete [] offsetVector;
 365     return UString::null();
 366   }
 367
 368   *pos = offsetVector[0];
 369   if (ovector)
 370     *ovector = offsetVector;
 371   return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
 372
 373 #else
 374
 375   if (!_valid)
 376     return UString::null();
 377
 378   const unsigned maxMatch = 10;
 379   regmatch_t rmatch[maxMatch];
 380
 381   char *str = strdup(s.ascii()); // TODO: why ???
 382   if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) {
 383     free(str);
 384     return UString::null();
 385   }
 386   free(str);
 387
 388   if (!ovector) {
 389     *pos = rmatch[0].rm_so + i;
 390     return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
 391   }
 392
 393   // map rmatch array to ovector used in PCRE case
 394   _numSubPatterns = 0;
 395   for(unsigned j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
 396       _numSubPatterns++;
 397   int ovecsize = (_numSubPatterns+1)*3; // see above
 398   *ovector = new int[ovecsize];
 399   for (unsigned j = 0; j < _numSubPatterns + 1; j++) {
 400     if (j>maxMatch)
 401       break;
 402     (*ovector)[2*j] = rmatch[j].rm_so + i;
 403     (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
 404   }
 405
 406   *pos = (*ovector)[0];
 407   return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
 408
 409 #endif
 410 }
 411
 412 } // namespace KJS