also propagate updateContents() calls to parent frame
[kdelibs.git] / kjs / regexp.cpp
blob6f960e0ee94fe3afbba8057f8e734d5431686256
1 // -*- c-basic-offset: 2 -*-
2 /*
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org)
5 * Copyright (C) 2003,2004 Apple Computer, Inc.
6 * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "regexp.h"
25 #include <config.h>
26 #include "lexer.h"
28 #include <assert.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <wtf/Vector.h>
33 using WTF::Vector;
35 // GCC cstring uses these automatically, but not all implementations do.
36 using std::strlen;
37 using std::strcpy;
38 using std::strncpy;
39 using std::memset;
40 using std::memcpy;
42 namespace KJS {
// Cached answer to "does the linked libpcre support UTF-8?". It starts out as
// Unknown and is filled in by the RegExp constructor the first time it runs.
#if defined(PCRE_CONFIG_UTF8)
RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
#endif
48 // JS regexps can contain Unicode escape sequences (\uxxxx) which
49 // are rather uncommon elsewhere. As our regexp libs don't understand
50 // them we do the unescaping ourselves internally.
51 // Also make sure to expand out any nulls as pcre_compile
52 // expects null termination..
53 static UString sanitizePattern(const UString &p)
55 UString newPattern;
57 const char* const nil = "\\x00";
58 if (p.find("\\u") >= 0 || p.find(KJS::UChar('\0')) >= 0) {
59 bool escape = false;
60 for (int i = 0; i < p.size(); ++i) {
61 UChar c = p[i];
62 if (escape) {
63 escape = false;
64 // we only care about \u
65 if (c == 'u') {
66 // standard unicode escape sequence looks like \uxxxx but
67 // other browsers also accept less then 4 hex digits
68 unsigned short u = 0;
69 int j = 0;
70 for (j = 0; j < 4; ++j) {
71 if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
72 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
73 ++i;
74 } else {
75 // sequence incomplete. restore index.
76 // TODO: cleaner way to propagate warning
77 fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
78 i -= j;
79 break;
82 if (j < 4) {
83 // sequence was incomplete. treat \u as u which IE always
84 // and FF sometimes does.
85 newPattern.append(UString('u'));
86 } else {
87 c = UChar(u);
88 switch (u) {
89 case 0:
90 // Make sure to encode 0, to avoid terminating the string
91 newPattern += UString(nil);
92 break;
93 case '^':
94 case '$':
95 case '\\':
96 case '.':
97 case '*':
98 case '+':
99 case '?':
100 case '(': case ')':
101 case '{': case '}':
102 case '[': case ']':
103 case '|':
104 // escape pattern characters have to remain escaped
105 newPattern.append(UString('\\'));
106 // intentional fallthrough
107 default:
108 newPattern += UString(&c, 1);
109 break;
112 continue;
114 newPattern += UString('\\');
115 newPattern += UString(&c, 1);
116 } else {
117 if (c == '\\')
118 escape = true;
119 else if (c == '\0')
120 newPattern += UString(nil);
121 else
122 newPattern += UString(&c, 1);
125 return newPattern;
126 } else {
127 return p;
131 RegExp::RegExp(const UString &p, char flags)
132 : _pat(p), _flags(flags), _valid(true), _numSubPatterns(0), _buffer(0), _originalPos(0)
134 // Determine whether libpcre has unicode support if need be..
135 #ifdef PCRE_CONFIG_UTF8
136 if (utf8Support == Unknown) {
137 int supported;
138 pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
139 utf8Support = supported ? Supported : Unsupported;
141 #endif
143 UString intern = sanitizePattern(p);
145 #ifdef HAVE_PCREPOSIX
147 int options = 0;
148 // Note: the Global flag is already handled by RegExpProtoFunc::execute.
149 // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute.
150 if (flags & IgnoreCase)
151 options |= PCRE_CASELESS;
152 if (flags & Multiline)
153 options |= PCRE_MULTILINE;
155 if (utf8Support == Supported)
156 options |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
158 const char *errorMessage;
159 int errorOffset;
161 // Fill our buffer with an encoded version, whether utf-8, or,
162 // if PCRE is incapable, truncated.
163 prepareMatch(intern);
164 _regex = pcre_compile(_buffer, options, &errorMessage, &errorOffset, NULL);
165 doneMatch(); //Cleanup buffers
166 if (!_regex) {
167 #ifndef NDEBUG
168 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
169 #endif
170 _valid = false;
171 return;
174 #ifdef PCRE_INFO_CAPTURECOUNT
175 // Get number of subpatterns that will be returned.
176 pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
177 #endif
179 #else /* HAVE_PCREPOSIX */
181 int regflags = 0;
182 #ifdef REG_EXTENDED
183 regflags |= REG_EXTENDED;
184 #endif
185 #ifdef REG_ICASE
186 if ( flags & IgnoreCase )
187 regflags |= REG_ICASE;
188 #endif
190 //NOTE: Multiline is not feasible with POSIX regex.
191 //if ( f & Multiline )
192 // ;
193 // Note: the Global flag is already handled by RegExpProtoFunc::execute
195 int errorCode = regcomp(&_regex, intern.ascii(), regflags);
196 if (errorCode != 0) {
197 #ifndef NDEBUG
198 char errorMessage[80];
199 regerror(errorCode, &_regex, errorMessage, sizeof errorMessage);
200 fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
201 #endif
202 _valid = false;
204 #endif
207 RegExp::~RegExp()
209 doneMatch(); // Be 100% sure buffers are freed
210 #ifdef HAVE_PCREPOSIX
211 pcre_free(_regex);
212 #else
213 /* TODO: is this really okay after an error ? */
214 regfree(&_regex);
215 #endif
218 void RegExp::prepareUtf8(const UString& s)
220 // Allocate a buffer big enough to hold all the characters plus \0
221 const int length = s.size();
222 _buffer = new char[length * 3 + 1];
224 // Also create buffer for positions. We need one extra character in there,
225 // even past the \0 since the non-empty handling may jump one past the end
226 _originalPos = new int[length * 3 + 2];
228 // Convert to runs of 8-bit characters, and generate indices
229 // Note that we do NOT combine surrogate pairs here, as
230 // regexps operate on them as separate characters
231 char *p = _buffer;
232 int *posOut = _originalPos;
233 const UChar *d = s.data();
234 for (int i = 0; i != length; ++i) {
235 unsigned short c = d[i].unicode();
237 int sequenceLen;
238 if (c < 0x80) {
239 *p++ = (char)c;
240 sequenceLen = 1;
241 } else if (c < 0x800) {
242 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
243 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
244 sequenceLen = 2;
245 } else {
246 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
247 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
248 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
249 sequenceLen = 3;
252 while (sequenceLen > 0) {
253 *posOut = i;
254 ++posOut;
255 --sequenceLen;
259 _bufferSize = p - _buffer;
261 *p++ = '\0';
263 // Record positions for \0, and the fictional character after that.
264 *posOut = length;
265 *(posOut+1) = length+1;
268 void RegExp::prepareASCII (const UString& s)
270 _originalPos = 0;
272 // Best-effort attempt to get something done
273 // when we don't have utf 8 available -- use
274 // truncated version, and pray for the best
275 CString truncated = s.cstring();
276 _buffer = new char[truncated.size() + 1];
277 memcpy(_buffer, truncated.c_str(), truncated.size());
278 _buffer[truncated.size()] = '\0'; // For _compile use
279 _bufferSize = truncated.size();
282 void RegExp::prepareMatch(const UString &s)
284 delete[] _originalPos; // Just to be sure..
285 delete[] _buffer;
286 #ifdef PCRE_CONFIG_UTF8
287 if (utf8Support == Supported)
288 prepareUtf8(s);
289 else
290 #endif
291 prepareASCII(s);
293 #ifndef NDEBUG
294 _originalS = s;
295 #endif
298 void RegExp::doneMatch()
300 delete[] _originalPos; _originalPos = 0;
301 delete[] _buffer; _buffer = 0;
304 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
306 #ifndef NDEBUG
307 assert(s.data() == _originalS.data()); // Make sure prepareMatch got called right..
308 #endif
310 if (i < 0)
311 i = 0;
312 int dummyPos;
313 if (!pos)
314 pos = &dummyPos;
315 *pos = -1;
316 if (ovector)
317 *ovector = 0;
319 if (i > s.size() || s.isNull())
320 return UString::null();
322 #ifdef HAVE_PCREPOSIX
324 if (!_regex)
325 return UString::null();
327 // Set up the offset vector for the result.
328 // First 2/3 used for result, the last third used by PCRE.
329 int *offsetVector;
330 int offsetVectorSize;
331 int fixedSizeOffsetVector[3];
332 if (!ovector) {
333 offsetVectorSize = 3;
334 offsetVector = fixedSizeOffsetVector;
335 } else {
336 offsetVectorSize = (_numSubPatterns + 1) * 3;
337 offsetVector = new int [offsetVectorSize];
340 int startPos;
341 if (utf8Support == Supported) {
342 startPos = i;
343 while (_originalPos[startPos] < i)
344 ++startPos;
345 } else {
346 startPos = i;
349 int baseFlags = utf8Support == Supported ? PCRE_NO_UTF8_CHECK : 0;
350 const int numMatches = pcre_exec(_regex, NULL, _buffer, _bufferSize, startPos, baseFlags, offsetVector, offsetVectorSize);
352 //Now go through and patch up the offsetVector
353 if (utf8Support == Supported)
354 for (int c = 0; c < 2 * numMatches; ++c)
355 if (offsetVector[c] != -1)
356 offsetVector[c] = _originalPos[offsetVector[c]];
358 if (numMatches < 0) {
359 #ifndef NDEBUG
360 if (numMatches != PCRE_ERROR_NOMATCH)
361 fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
362 #endif
363 if (offsetVector != fixedSizeOffsetVector)
364 delete [] offsetVector;
365 return UString::null();
368 *pos = offsetVector[0];
369 if (ovector)
370 *ovector = offsetVector;
371 return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
373 #else
375 if (!_valid)
376 return UString::null();
378 const unsigned maxMatch = 10;
379 regmatch_t rmatch[maxMatch];
381 char *str = strdup(s.ascii()); // TODO: why ???
382 if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) {
383 free(str);
384 return UString::null();
386 free(str);
388 if (!ovector) {
389 *pos = rmatch[0].rm_so + i;
390 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
393 // map rmatch array to ovector used in PCRE case
394 _numSubPatterns = 0;
395 for(unsigned j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
396 _numSubPatterns++;
397 int ovecsize = (_numSubPatterns+1)*3; // see above
398 *ovector = new int[ovecsize];
399 for (unsigned j = 0; j < _numSubPatterns + 1; j++) {
400 if (j>maxMatch)
401 break;
402 (*ovector)[2*j] = rmatch[j].rm_so + i;
403 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
406 *pos = (*ovector)[0];
407 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
409 #endif
412 } // namespace KJS