kjs/regexp.cpp

   1 // -*- c-basic-offset: 2 -*-
   2 /*
   3  *  This file is part of the KDE libraries
   4  *  Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org)
   5  *  Copyright (C) 2003,2004 Apple Computer, Inc.
   6  *  Copyright (C) 2006      Maksim Orlovich (maksim@kde.org)
   7  *  Copyright (C) 2007      Sune Vuorela (debian@pusling.com)
   8  *
   9  *  This library is free software; you can redistribute it and/or
  10  *  modify it under the terms of the GNU Lesser General Public
  11  *  License as published by the Free Software Foundation; either
  12  *  version 2 of the License, or (at your option) any later version.
  13  *
  14  *  This library is distributed in the hope that it will be useful,
  15  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  *  Lesser General Public License for more details.
  18  *
  19  *  You should have received a copy of the GNU Lesser General Public
  20  *  License along with this library; if not, write to the Free Software
  21  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  22  *
  23  */
  24
  25 #include "regexp.h"
  26 #include <config.h>
  27 #include "lexer.h"
  28
  29 #include <assert.h>
  30 #include <stdio.h>
  31 #include <stdlib.h>
  32 #include <string.h>
  33 #include <wtf/Vector.h>
  34 using WTF::Vector;
  35
  36 // GCC cstring uses these automatically, but not all implementations do.
  37 using std::strlen;
  38 using std::strcpy;
  39 using std::strncpy;
  40 using std::memset;
  41 using std::memcpy;
  42
  43 namespace KJS {
  44
  45 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
  46
  47 // JS regexps can contain Unicode escape sequences (\uxxxx) which
  48 // are rather uncommon elsewhere. As our regexp libs don't understand
  49 // them we do the unescaping ourselves internally.
  50 // Also make sure to expand out any nulls as pcre_compile
  51 // expects null termination..
  52 static UString sanitizePattern(const UString &p)
  53 {
  54   UString newPattern;
  55
  56   const char* const nil = "\\x00";
  57   if (p.find("\\u") >= 0 || p.find(KJS::UChar('\0')) >= 0) {
  58     bool escape = false;
  59     for (int i = 0; i < p.size(); ++i) {
  60       UChar c = p[i];
  61       if (escape) {
  62         escape = false;
  63         // we only care about \u
  64         if (c == 'u') {
  65           // standard unicode escape sequence looks like \uxxxx but
  66           // other browsers also accept less than 4 hex digits
  67           unsigned short u = 0;
  68           int j = 0;
  69           for (j = 0; j < 4; ++j) {
  70             if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
  71               u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
  72               ++i;
  73             } else {
  74               // sequence incomplete. restore index.
  75               // TODO: cleaner way to propagate warning
  76               fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
  77               i -= j;
  78               break;
  79             }
  80           }
  81           if (j < 4) {
  82             // sequence was incomplete. treat \u as u which IE always
  83             // and FF sometimes does.
  84              newPattern.append(UString('u'));
  85           } else {
  86             c = UChar(u);
  87             switch (u) {
  88             case 0:
  89               // Make sure to encode 0, to avoid terminating the string
  90                newPattern += UString(nil);
  91               break;
  92             case '^':
  93             case '$':
  94             case '\\':
  95             case '.':
  96             case '*':
  97             case '+':
  98             case '?':
  99             case '(': case ')':
 100             case '{': case '}':
 101             case '[': case ']':
 102             case '|':
 103               // escape pattern characters have to remain escaped
 104                newPattern.append(UString('\\'));
 105               // intentional fallthrough
 106             default:
 107               newPattern += UString(&c, 1);
 108               break;
 109             }
 110           }
 111           continue;
 112         }
 113         newPattern += UString('\\');
 114         newPattern += UString(&c, 1);
 115       } else {
 116         if (c == '\\')
 117           escape = true;
 118         else if (c == '\0')
 119           newPattern += UString(nil);
 120         else
 121           newPattern += UString(&c, 1);
 122       }
 123     }
 124     return newPattern;
 125   } else {
 126     return p;
 127   }
 128 }
 129
 130 RegExp::RegExp(const UString &p, char flags)
 131   : _pat(p), _flags(flags), _valid(true), _numSubPatterns(0), _buffer(0), _originalPos(0)
 132 {
 133 #ifdef HAVE_PCREPOSIX
 134   // Determine whether libpcre has unicode support if need be..
 135   if (utf8Support == Unknown) {
 136     int supported;
 137     pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
 138     utf8Support = supported ? Supported : Unsupported;
 139   }
 140 #endif
 141
 142   UString intern = sanitizePattern(p);
 143
 144 #ifdef HAVE_PCREPOSIX
 145   int options = 0;
 146
 147   // we are close but not 100% the same as Perl
 148 #ifdef PCRE_JAVASCRIPT_COMPAT // introduced in PCRE 7.7
 149   options |= PCRE_JAVASCRIPT_COMPAT;
 150 #endif
 151
 152   // Note: the Global flag is already handled by RegExpProtoFunc::execute.
 153   // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute.
 154   if (flags & IgnoreCase)
 155     options |= PCRE_CASELESS;
 156   if (flags & Multiline)
 157     options |= PCRE_MULTILINE;
 158
 159   if (utf8Support == Supported)
 160     options |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
 161
 162   const char *errorMessage;
 163   int errorOffset;
 164
 165   // Fill our buffer with an encoded version, whether utf-8, or,
 166   // if PCRE is incapable, truncated.
 167   prepareMatch(intern);
 168   _regex = pcre_compile(_buffer, options, &errorMessage, &errorOffset, NULL);
 169   doneMatch(); //Cleanup buffers
 170   if (!_regex) {
 171 #ifndef NDEBUG
 172     fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
 173 #endif
 174     _valid = false;
 175     return;
 176   }
 177
 178 #ifdef PCRE_INFO_CAPTURECOUNT
 179   // Get number of subpatterns that will be returned.
 180   pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
 181 #endif
 182
 183 #else /* HAVE_PCREPOSIX */
 184
 185   int regflags = 0;
 186 #ifdef REG_EXTENDED
 187   regflags |= REG_EXTENDED;
 188 #endif
 189 #ifdef REG_ICASE
 190   if ( flags & IgnoreCase )
 191     regflags |= REG_ICASE;
 192 #endif
 193
 194   //NOTE: Multiline is not feasible with POSIX regex.
 195   //if ( f & Multiline )
 196   //    ;
 197   // Note: the Global flag is already handled by RegExpProtoFunc::execute
 198
 199   int errorCode = regcomp(&_regex, intern.ascii(), regflags);
 200   if (errorCode != 0) {
 201 #ifndef NDEBUG
 202     char errorMessage[80];
 203     regerror(errorCode, &_regex, errorMessage, sizeof errorMessage);
 204     fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
 205 #endif
 206     _valid = false;
 207   }
 208 #endif
 209 }
 210
 211 RegExp::~RegExp()
 212 {
 213   doneMatch(); // Be 100% sure buffers are freed
 214 #ifdef HAVE_PCREPOSIX
 215   pcre_free(_regex);
 216 #else
 217   /* TODO: is this really okay after an error ? */
 218   regfree(&_regex);
 219 #endif
 220 }
 221
 222 void RegExp::prepareUtf8(const UString& s)
 223 {
 224   // Allocate a buffer big enough to hold all the characters plus \0
 225   const int length = s.size();
 226   _buffer = new char[length * 3 + 1];
 227
 228   // Also create buffer for positions. We need one extra character in there,
 229   // even past the \0 since the non-empty handling may jump one past the end
 230   _originalPos = new int[length * 3 + 2];
 231
 232   // Convert to runs of 8-bit characters, and generate indices
 233   // Note that we do NOT combine surrogate pairs here, as
 234   // regexps operate on them as separate characters
 235   char *p      = _buffer;
 236   int  *posOut = _originalPos;
 237   const UChar *d = s.data();
 238   for (int i = 0; i != length; ++i) {
 239     unsigned short c = d[i].unicode();
 240
 241     int sequenceLen;
 242     if (c < 0x80) {
 243       *p++ = (char)c;
 244       sequenceLen = 1;
 245     } else if (c < 0x800) {
 246       *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
 247       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
 248       sequenceLen = 2;
 249     } else {
 250       *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
 251       *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
 252       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
 253       sequenceLen = 3;
 254     }
 255
 256     while (sequenceLen > 0) {
 257       *posOut = i;
 258       ++posOut;
 259       --sequenceLen;
 260     }
 261   }
 262
 263   _bufferSize = p - _buffer;
 264
 265   *p++ = '\0';
 266
 267   // Record positions for \0, and the fictional character after that.
 268   *posOut     = length;
 269   *(posOut+1) = length+1;
 270 }
 271
 272 void RegExp::prepareASCII (const UString& s)
 273 {
 274   _originalPos = 0;
 275
 276   // Best-effort attempt to get something done
 277   // when we don't have utf 8 available -- use
 278   // truncated version, and pray for the best
 279   CString truncated = s.cstring();
 280   _buffer = new char[truncated.size() + 1];
 281   memcpy(_buffer, truncated.c_str(), truncated.size());
 282   _buffer[truncated.size()] = '\0'; // For _compile use
 283   _bufferSize = truncated.size();
 284 }
 285
 286 void RegExp::prepareMatch(const UString &s)
 287 {
 288   delete[] _originalPos; // Just to be sure..
 289   delete[] _buffer;
 290   if (utf8Support == Supported)
 291     prepareUtf8(s);
 292   else
 293     prepareASCII(s);
 294
 295 #ifndef NDEBUG
 296   _originalS = s;
 297 #endif
 298 }
 299
 300 void RegExp::doneMatch()
 301 {
 302   delete[] _originalPos; _originalPos = 0;
 303   delete[] _buffer;      _buffer      = 0;
 304 }
 305
 306 UString RegExp::match(const UString &s, bool *error, int i, int *pos, int **ovector)
 307 {
 308 #ifndef NDEBUG
 309   assert(s.data() == _originalS.data()); // Make sure prepareMatch got called right..
 310 #endif
 311
 312   if (i < 0)
 313     i = 0;
 314   int dummyPos;
 315   if (!pos)
 316     pos = &dummyPos;
 317   *pos = -1;
 318   if (ovector)
 319     *ovector = 0;
 320
 321   if (i > s.size() || s.isNull())
 322     return UString::null();
 323
 324 #ifdef HAVE_PCREPOSIX
 325
 326   if (!_regex)
 327     return UString::null();
 328
 329   // Set up the offset vector for the result.
 330   // First 2/3 used for result, the last third used by PCRE.
 331   int *offsetVector;
 332   int offsetVectorSize;
 333   int fixedSizeOffsetVector[3];
 334   if (!ovector) {
 335     offsetVectorSize = 3;
 336     offsetVector = fixedSizeOffsetVector;
 337   } else {
 338     offsetVectorSize = (_numSubPatterns + 1) * 3;
 339     offsetVector = new int [offsetVectorSize];
 340   }
 341
 342   int startPos;
 343   if (utf8Support == Supported) {
 344     startPos = i;
 345     while (_originalPos[startPos] < i)
 346       ++startPos;
 347   } else {
 348     startPos = i;
 349   }
 350
 351   int baseFlags = utf8Support == Supported ? PCRE_NO_UTF8_CHECK : 0;
 352
 353   // See if we have to limit stack space...
 354   *error = false;
 355   int stackGlutton = 0;
 356   pcre_config(PCRE_CONFIG_STACKRECURSE, (void*)&stackGlutton);
 357   pcre_extra limits;
 358   if (stackGlutton) {
 359     limits.flags = PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 360     // libPCRE docs claim that it munches about 500 bytes per recursion.
 361     // The crash in #160792 actually showed pcre 7.4 using about 1300 bytes
 362     // (and I've measured 800 in an another instance)
 363     // So the usual 8MiB rlimit on Linux produces about 6452 frames.
 364     // We go somewhat conservative, and use about 2/3rds of that, for 4300
 365     // especially since we're not exactly light on the stack, either
 366     // ### TODO: get some build system help to use getrlimit.
 367     limits.match_limit_recursion = 4300;
 368   }
 369
 370   const int numMatches = pcre_exec(_regex, stackGlutton ? &limits : 0, _buffer, _bufferSize, startPos, baseFlags, offsetVector, offsetVectorSize);
 371
 372   //Now go through and patch up the offsetVector
 373   if (utf8Support == Supported)
 374     for (int c = 0; c < 2 * numMatches; ++c)
 375       if (offsetVector[c] != -1)
 376         offsetVector[c] = _originalPos[offsetVector[c]];
 377
 378   if (numMatches < 0) {
 379 #ifndef NDEBUG
 380     if (numMatches != PCRE_ERROR_NOMATCH)
 381       fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
 382 #endif
 383     if (offsetVector != fixedSizeOffsetVector)
 384       delete [] offsetVector;
 385     if (numMatches == PCRE_ERROR_MATCHLIMIT || numMatches == PCRE_ERROR_RECURSIONLIMIT)
 386       *error = true;
 387     return UString::null();
 388   }
 389
 390   *pos = offsetVector[0];
 391   if (ovector)
 392     *ovector = offsetVector;
 393   return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
 394
 395 #else
 396
 397   if (!_valid)
 398     return UString::null();
 399
 400   const unsigned maxMatch = 10;
 401   regmatch_t rmatch[maxMatch];
 402
 403   char *str = strdup(s.ascii()); // TODO: why ???
 404   if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) {
 405     free(str);
 406     return UString::null();
 407   }
 408   free(str);
 409
 410   if (!ovector) {
 411     *pos = rmatch[0].rm_so + i;
 412     return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
 413   }
 414
 415   // map rmatch array to ovector used in PCRE case
 416   _numSubPatterns = 0;
 417   for(unsigned j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
 418       _numSubPatterns++;
 419   int ovecsize = (_numSubPatterns+1)*3; // see above
 420   *ovector = new int[ovecsize];
 421   for (unsigned j = 0; j < _numSubPatterns + 1; j++) {
 422     if (j>maxMatch)
 423       break;
 424     (*ovector)[2*j] = rmatch[j].rm_so + i;
 425     (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
 426   }
 427
 428   *pos = (*ovector)[0];
 429   return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
 430
 431 #endif
 432 }
 433
 434 } // namespace KJS