fix tricky regression noticed by Vyacheslav Tokarev on Google Reader.
[kdelibs.git] / kjs / regexp.cpp
bloba59fde05f924101ef6f8f2ce9efd4a64ac305470
1 // -*- c-basic-offset: 2 -*-
2 /*
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org)
5 * Copyright (C) 2003,2004 Apple Computer, Inc.
6 * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
7 * Copyright (C) 2007 Sune Vuorela (debian@pusling.com)
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "regexp.h"
26 #include <config.h>
27 #include "lexer.h"
29 #include <assert.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <wtf/Vector.h>
34 using WTF::Vector;
36 // GCC cstring uses these automatically, but not all implementations do.
37 using std::strlen;
38 using std::strcpy;
39 using std::strncpy;
40 using std::memset;
41 using std::memcpy;
43 namespace KJS {
45 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
47 // JS regexps can contain Unicode escape sequences (\uxxxx) which
48 // are rather uncommon elsewhere. As our regexp libs don't understand
49 // them we do the unescaping ourselves internally.
50 // Also make sure to expand out any nulls as pcre_compile
51 // expects null termination..
52 static UString sanitizePattern(const UString &p)
54 UString newPattern;
56 const char* const nil = "\\x00";
57 if (p.find("\\u") >= 0 || p.find(KJS::UChar('\0')) >= 0) {
58 bool escape = false;
59 for (int i = 0; i < p.size(); ++i) {
60 UChar c = p[i];
61 if (escape) {
62 escape = false;
63 // we only care about \u
64 if (c == 'u') {
65 // standard unicode escape sequence looks like \uxxxx but
66 // other browsers also accept less than 4 hex digits
67 unsigned short u = 0;
68 int j = 0;
69 for (j = 0; j < 4; ++j) {
70 if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
71 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
72 ++i;
73 } else {
74 // sequence incomplete. restore index.
75 // TODO: cleaner way to propagate warning
76 fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
77 i -= j;
78 break;
81 if (j < 4) {
82 // sequence was incomplete. treat \u as u which IE always
83 // and FF sometimes does.
84 newPattern.append(UString('u'));
85 } else {
86 c = UChar(u);
87 switch (u) {
88 case 0:
89 // Make sure to encode 0, to avoid terminating the string
90 newPattern += UString(nil);
91 break;
92 case '^':
93 case '$':
94 case '\\':
95 case '.':
96 case '*':
97 case '+':
98 case '?':
99 case '(': case ')':
100 case '{': case '}':
101 case '[': case ']':
102 case '|':
103 // escape pattern characters have to remain escaped
104 newPattern.append(UString('\\'));
105 // intentional fallthrough
106 default:
107 newPattern += UString(&c, 1);
108 break;
111 continue;
113 newPattern += UString('\\');
114 newPattern += UString(&c, 1);
115 } else {
116 if (c == '\\')
117 escape = true;
118 else if (c == '\0')
119 newPattern += UString(nil);
120 else
121 newPattern += UString(&c, 1);
124 return newPattern;
125 } else {
126 return p;
130 RegExp::RegExp(const UString &p, char flags)
131 : _pat(p), _flags(flags), _valid(true), _numSubPatterns(0), _buffer(0), _originalPos(0)
133 #ifdef HAVE_PCREPOSIX
134 // Determine whether libpcre has unicode support if need be..
135 if (utf8Support == Unknown) {
136 int supported;
137 pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
138 utf8Support = supported ? Supported : Unsupported;
140 #endif
142 UString intern = sanitizePattern(p);
144 #ifdef HAVE_PCREPOSIX
145 int options = 0;
147 // we are close but not 100% the same as Perl
148 #ifdef PCRE_JAVASCRIPT_COMPAT // introduced in PCRE 7.7
149 options |= PCRE_JAVASCRIPT_COMPAT;
150 #endif
152 // Note: the Global flag is already handled by RegExpProtoFunc::execute.
153 // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute.
154 if (flags & IgnoreCase)
155 options |= PCRE_CASELESS;
156 if (flags & Multiline)
157 options |= PCRE_MULTILINE;
159 if (utf8Support == Supported)
160 options |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
162 const char *errorMessage;
163 int errorOffset;
165 // Fill our buffer with an encoded version, whether utf-8, or,
166 // if PCRE is incapable, truncated.
167 prepareMatch(intern);
168 _regex = pcre_compile(_buffer, options, &errorMessage, &errorOffset, NULL);
169 doneMatch(); //Cleanup buffers
170 if (!_regex) {
171 #ifndef NDEBUG
172 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
173 #endif
174 _valid = false;
175 return;
178 #ifdef PCRE_INFO_CAPTURECOUNT
179 // Get number of subpatterns that will be returned.
180 pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
181 #endif
183 #else /* HAVE_PCREPOSIX */
185 int regflags = 0;
186 #ifdef REG_EXTENDED
187 regflags |= REG_EXTENDED;
188 #endif
189 #ifdef REG_ICASE
190 if ( flags & IgnoreCase )
191 regflags |= REG_ICASE;
192 #endif
194 //NOTE: Multiline is not feasible with POSIX regex.
195 //if ( f & Multiline )
196 // ;
197 // Note: the Global flag is already handled by RegExpProtoFunc::execute
199 int errorCode = regcomp(&_regex, intern.ascii(), regflags);
200 if (errorCode != 0) {
201 #ifndef NDEBUG
202 char errorMessage[80];
203 regerror(errorCode, &_regex, errorMessage, sizeof errorMessage);
204 fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
205 #endif
206 _valid = false;
208 #endif
211 RegExp::~RegExp()
213 doneMatch(); // Be 100% sure buffers are freed
214 #ifdef HAVE_PCREPOSIX
215 pcre_free(_regex);
216 #else
217 /* TODO: is this really okay after an error ? */
218 regfree(&_regex);
219 #endif
222 void RegExp::prepareUtf8(const UString& s)
224 // Allocate a buffer big enough to hold all the characters plus \0
225 const int length = s.size();
226 _buffer = new char[length * 3 + 1];
228 // Also create buffer for positions. We need one extra character in there,
229 // even past the \0 since the non-empty handling may jump one past the end
230 _originalPos = new int[length * 3 + 2];
232 // Convert to runs of 8-bit characters, and generate indices
233 // Note that we do NOT combine surrogate pairs here, as
234 // regexps operate on them as separate characters
235 char *p = _buffer;
236 int *posOut = _originalPos;
237 const UChar *d = s.data();
238 for (int i = 0; i != length; ++i) {
239 unsigned short c = d[i].unicode();
241 int sequenceLen;
242 if (c < 0x80) {
243 *p++ = (char)c;
244 sequenceLen = 1;
245 } else if (c < 0x800) {
246 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
247 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
248 sequenceLen = 2;
249 } else {
250 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
251 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
252 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
253 sequenceLen = 3;
256 while (sequenceLen > 0) {
257 *posOut = i;
258 ++posOut;
259 --sequenceLen;
263 _bufferSize = p - _buffer;
265 *p++ = '\0';
267 // Record positions for \0, and the fictional character after that.
268 *posOut = length;
269 *(posOut+1) = length+1;
272 void RegExp::prepareASCII (const UString& s)
274 _originalPos = 0;
276 // Best-effort attempt to get something done
277 // when we don't have utf 8 available -- use
278 // truncated version, and pray for the best
279 CString truncated = s.cstring();
280 _buffer = new char[truncated.size() + 1];
281 memcpy(_buffer, truncated.c_str(), truncated.size());
282 _buffer[truncated.size()] = '\0'; // For _compile use
283 _bufferSize = truncated.size();
286 void RegExp::prepareMatch(const UString &s)
288 delete[] _originalPos; // Just to be sure..
289 delete[] _buffer;
290 if (utf8Support == Supported)
291 prepareUtf8(s);
292 else
293 prepareASCII(s);
295 #ifndef NDEBUG
296 _originalS = s;
297 #endif
300 void RegExp::doneMatch()
302 delete[] _originalPos; _originalPos = 0;
303 delete[] _buffer; _buffer = 0;
306 UString RegExp::match(const UString &s, bool *error, int i, int *pos, int **ovector)
308 #ifndef NDEBUG
309 assert(s.data() == _originalS.data()); // Make sure prepareMatch got called right..
310 #endif
312 if (i < 0)
313 i = 0;
314 int dummyPos;
315 if (!pos)
316 pos = &dummyPos;
317 *pos = -1;
318 if (ovector)
319 *ovector = 0;
321 if (i > s.size() || s.isNull())
322 return UString::null();
324 #ifdef HAVE_PCREPOSIX
326 if (!_regex)
327 return UString::null();
329 // Set up the offset vector for the result.
330 // First 2/3 used for result, the last third used by PCRE.
331 int *offsetVector;
332 int offsetVectorSize;
333 int fixedSizeOffsetVector[3];
334 if (!ovector) {
335 offsetVectorSize = 3;
336 offsetVector = fixedSizeOffsetVector;
337 } else {
338 offsetVectorSize = (_numSubPatterns + 1) * 3;
339 offsetVector = new int [offsetVectorSize];
342 int startPos;
343 if (utf8Support == Supported) {
344 startPos = i;
345 while (_originalPos[startPos] < i)
346 ++startPos;
347 } else {
348 startPos = i;
351 int baseFlags = utf8Support == Supported ? PCRE_NO_UTF8_CHECK : 0;
353 // See if we have to limit stack space...
354 *error = false;
355 int stackGlutton = 0;
356 pcre_config(PCRE_CONFIG_STACKRECURSE, (void*)&stackGlutton);
357 pcre_extra limits;
358 if (stackGlutton) {
359 limits.flags = PCRE_EXTRA_MATCH_LIMIT_RECURSION;
360 // libPCRE docs claim that it munches about 500 bytes per recursion.
361 // The crash in #160792 actually showed pcre 7.4 using about 1300 bytes
362 // (and I've measured 800 in an another instance)
363 // So the usual 8MiB rlimit on Linux produces about 6452 frames.
364 // We go somewhat conservative, and use about 2/3rds of that, for 4300
365 // especially since we're not exactly light on the stack, either
366 // ### TODO: get some build system help to use getrlimit.
367 limits.match_limit_recursion = 4300;
370 const int numMatches = pcre_exec(_regex, stackGlutton ? &limits : 0, _buffer, _bufferSize, startPos, baseFlags, offsetVector, offsetVectorSize);
372 //Now go through and patch up the offsetVector
373 if (utf8Support == Supported)
374 for (int c = 0; c < 2 * numMatches; ++c)
375 if (offsetVector[c] != -1)
376 offsetVector[c] = _originalPos[offsetVector[c]];
378 if (numMatches < 0) {
379 #ifndef NDEBUG
380 if (numMatches != PCRE_ERROR_NOMATCH)
381 fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
382 #endif
383 if (offsetVector != fixedSizeOffsetVector)
384 delete [] offsetVector;
385 if (numMatches == PCRE_ERROR_MATCHLIMIT || numMatches == PCRE_ERROR_RECURSIONLIMIT)
386 *error = true;
387 return UString::null();
390 *pos = offsetVector[0];
391 if (ovector)
392 *ovector = offsetVector;
393 return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
395 #else
397 if (!_valid)
398 return UString::null();
400 const unsigned maxMatch = 10;
401 regmatch_t rmatch[maxMatch];
403 char *str = strdup(s.ascii()); // TODO: why ???
404 if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) {
405 free(str);
406 return UString::null();
408 free(str);
410 if (!ovector) {
411 *pos = rmatch[0].rm_so + i;
412 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
415 // map rmatch array to ovector used in PCRE case
416 _numSubPatterns = 0;
417 for(unsigned j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
418 _numSubPatterns++;
419 int ovecsize = (_numSubPatterns+1)*3; // see above
420 *ovector = new int[ovecsize];
421 for (unsigned j = 0; j < _numSubPatterns + 1; j++) {
422 if (j>maxMatch)
423 break;
424 (*ovector)[2*j] = rmatch[j].rm_so + i;
425 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
428 *pos = (*ovector)[0];
429 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
431 #endif
434 } // namespace KJS