1 // -*- c-basic-offset: 2 -*-
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org)
5 * Copyright (C) 2003,2004 Apple Computer, Inc.
6 * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include <wtf/Vector.h>
35 // GCC cstring uses these automatically, but not all implementations do.
// Definition of the static RegExp::utf8Support member, compiled only when
// PCRE_CONFIG_UTF8 is defined. It starts out as Unknown; the RegExp
// constructor replaces it with Supported or Unsupported after asking
// pcre_config() the first time it finds the value still Unknown.
44 #ifdef PCRE_CONFIG_UTF8
45 RegExp::UTF8SupportState
RegExp::utf8Support
= RegExp::Unknown
;
48 // JS regexps can contain Unicode escape sequences (\uxxxx) which
49 // are rather uncommon elsewhere. As our regexp libs don't understand
50 // them we do the unescaping ourselves internally.
51 // Also make sure to expand out any nulls as pcre_compile
52 // expects null termination..
//
// Parameter p is the pattern as written in the script. The rewritten text is
// accumulated in newPattern.
// NOTE(review): the declaration of newPattern and the return statement are
// not visible in this extract; presumably newPattern (or p unchanged when no
// rewrite is needed) is what gets returned -- confirm against the full file.
53 static UString
sanitizePattern(const UString
&p
)
// Text emitted in place of a NUL character: a backslash followed by x00, so
// that the compiled pattern never contains a raw terminator.
57 const char* const nil
= "\\x00";
// The character-by-character rewrite only runs when the pattern contains a
// backslash-u escape or an embedded NUL character.
58 if (p
.find("\\u") >= 0 || p
.find(KJS::UChar('\0')) >= 0) {
60 for (int i
= 0; i
< p
.size(); ++i
) {
64 // we only care about \u
66 // standard unicode escape sequence looks like \uxxxx but
67 // other browsers also accept fewer than 4 hex digits
// Look at up to four characters following the current index. Each hex digit
// found is folded into u four bits at a time; j is the number of digits
// accepted so far and is what the diagnostic below reports.
70 for (j
= 0; j
< 4; ++j
) {
71 if (i
+ 1 < p
.size() && Lexer::isHexDigit(p
[i
+ 1].unicode())) {
72 u
= (u
<< 4) + Lexer::convertHex(p
[i
+ 1].unicode());
75 // sequence incomplete. restore index.
76 // TODO: cleaner way to propagate warning
77 fprintf(stderr
, "KJS: saw %d digit \\u sequence.\n", j
);
83 // sequence was incomplete. treat \u as u which IE always
84 // and FF sometimes does.
85 newPattern
.append(UString('u'));
90 // Make sure to encode 0, to avoid terminating the string
91 newPattern
+= UString(nil
);
104 // escape pattern characters have to remain escaped
105 newPattern
.append(UString('\\'));
106 // intentional fallthrough
108 newPattern
+= UString(&c
, 1);
// Re-emit a backslash followed by the current character c.
// NOTE(review): the condition selecting this path is not visible here;
// presumably it is the escape that was not followed by u -- confirm.
114 newPattern
+= UString('\\');
115 newPattern
+= UString(&c
, 1);
// Emit the NUL replacement text, or otherwise copy c through unchanged.
// NOTE(review): the test that chooses between the two is not visible here.
120 newPattern
+= UString(nil
);
122 newPattern
+= UString(&c
, 1);
// Constructs a regular expression from pattern p and the flag bits in flags
// (IgnoreCase and Multiline are the bits examined below).
// The members start as: _valid true, _numSubPatterns 0, and both _buffer and
// _originalPos null. The pattern is passed through sanitizePattern() and then
// compiled with either PCRE or POSIX regcomp, depending on HAVE_PCREPOSIX.
// Compilation failures are reported on stderr.
// NOTE(review): the declarations of options, regflags, errorOffset and
// supported, and the statements that react to a failed compile, are not
// visible in this extract.
131 RegExp::RegExp(const UString
&p
, char flags
)
132 : _pat(p
), _flags(flags
), _valid(true), _numSubPatterns(0), _buffer(0), _originalPos(0)
134 // Determine whether libpcre has unicode support if need be..
135 #ifdef PCRE_CONFIG_UTF8
// The probe runs only while the shared static is still Unknown, so the
// answer from pcre_config() is cached for later RegExp objects.
136 if (utf8Support
== Unknown
) {
138 pcre_config(PCRE_CONFIG_UTF8
, (void*)&supported
);
139 utf8Support
= supported
? Supported
: Unsupported
;
// intern is the sanitized form of the pattern; it is what gets compiled.
143 UString intern
= sanitizePattern(p
);
145 #ifdef HAVE_PCREPOSIX
// PCRE path: translate the JS flag bits into PCRE compile options.
148 // Note: the Global flag is already handled by RegExpProtoFunc::execute.
149 // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute.
150 if (flags
& IgnoreCase
)
151 options
|= PCRE_CASELESS
;
152 if (flags
& Multiline
)
153 options
|= PCRE_MULTILINE
;
// With UTF-8 support the pattern is handed over as UTF-8, and PCRE is told
// not to re-validate the encoding (PCRE_NO_UTF8_CHECK).
155 if (utf8Support
== Supported
)
156 options
|= (PCRE_UTF8
| PCRE_NO_UTF8_CHECK
);
158 const char *errorMessage
;
161 // Fill our buffer with an encoded version, whether utf-8, or,
162 // if PCRE is incapable, truncated.
// prepareMatch() is reused here purely to encode the pattern into _buffer;
// doneMatch() releases that buffer again right after compilation.
163 prepareMatch(intern
);
164 _regex
= pcre_compile(_buffer
, options
, &errorMessage
, &errorOffset
, NULL
);
165 doneMatch(); //Cleanup buffers
// Diagnostic for a failed pcre_compile().
// NOTE(review): the guarding condition is not visible in this extract.
168 fprintf(stderr
, "KJS: pcre_compile() failed with '%s'\n", errorMessage
);
174 #ifdef PCRE_INFO_CAPTURECOUNT
175 // Get number of subpatterns that will be returned.
176 pcre_fullinfo(_regex
, NULL
, PCRE_INFO_CAPTURECOUNT
, &_numSubPatterns
);
179 #else /* HAVE_PCREPOSIX */
// POSIX path: build the regcomp() flag word.
183 regflags
|= REG_EXTENDED
;
186 if ( flags
& IgnoreCase
)
187 regflags
|= REG_ICASE
;
190 //NOTE: Multiline is not feasible with POSIX regex.
191 //if ( f & Multiline )
193 // Note: the Global flag is already handled by RegExpProtoFunc::execute
// regcomp() receives the pattern through intern.ascii(); a non-zero result
// is turned into a message with regerror() and printed.
195 int errorCode
= regcomp(&_regex
, intern
.ascii(), regflags
);
196 if (errorCode
!= 0) {
198 char errorMessage
[80];
199 regerror(errorCode
, &_regex
, errorMessage
, sizeof errorMessage
);
200 fprintf(stderr
, "KJS: regcomp failed with '%s'\n", errorMessage
);
// NOTE(review): the header of the function containing these lines is not
// visible in this extract; presumably this is the RegExp destructor, which
// releases the match buffers and then the compiled regex -- confirm.
209 doneMatch(); // Be 100% sure buffers are freed
210 #ifdef HAVE_PCREPOSIX
213 /* TODO: is this really okay after an error ? */
// Encodes the 16-bit code units of s as UTF-8 into a freshly allocated
// _buffer and allocates the parallel _originalPos array. RegExp::match()
// later indexes _originalPos with byte offsets into _buffer to get back
// positions in the original string.
// Both arrays are allocated with new[] here and released by doneMatch().
218 void RegExp::prepareUtf8(const UString
& s
)
220 // Allocate a buffer big enough to hold all the characters plus \0
// One 16-bit unit expands to at most three bytes in the branches below,
// hence length * 3, plus one byte for the terminator.
221 const int length
= s
.size();
222 _buffer
= new char[length
* 3 + 1];
224 // Also create buffer for positions. We need one extra character in there,
225 // even past the \0 since the non-empty handling may jump one past the end
226 _originalPos
= new int[length
* 3 + 2];
228 // Convert to runs of 8-bit characters, and generate indices
229 // Note that we do NOT combine surrogate pairs here, as
230 // regexps operate on them as separate characters
// posOut is the write cursor into _originalPos; p (declared on a line not
// visible in this extract) is the write cursor into _buffer.
232 int *posOut
= _originalPos
;
233 const UChar
*d
= s
.data();
234 for (int i
= 0; i
!= length
; ++i
) {
235 unsigned short c
= d
[i
].unicode();
// NOTE(review): the first branch of this chain is not visible in this
// extract; presumably it emits values that fit in a single byte -- confirm.
241 } else if (c
< 0x800) {
242 *p
++ = (char)((c
>> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
243 *p
++ = (char)((c
| 0x80) & 0xBF); // next 6 bits, with high bit set
246 *p
++ = (char)((c
>> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
247 *p
++ = (char)(((c
>> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
248 *p
++ = (char)((c
| 0x80) & 0xBF); // next 6 bits, with high bit set
// NOTE(review): the loop body is not visible; presumably it writes one
// _originalPos entry per byte just emitted for this character -- confirm.
252 while (sequenceLen
> 0) {
// Number of bytes written to _buffer; match() passes it to pcre_exec().
259 _bufferSize
= p
- _buffer
;
263 // Record positions for \0, and the fictional character after that.
265 *(posOut
+1) = length
+1;
// Fallback encoder: copies the 8-bit form of s (s.cstring()) into a freshly
// allocated, NUL-terminated _buffer and records its length in _bufferSize.
// Nothing is assigned to _originalPos in the lines visible here.
268 void RegExp::prepareASCII (const UString
& s
)
272 // Best-effort attempt to get something done
273 // when we don't have utf 8 available -- use
274 // truncated version, and pray for the best
275 CString truncated
= s
.cstring();
// One extra byte is allocated for the terminator written below.
276 _buffer
= new char[truncated
.size() + 1];
277 memcpy(_buffer
, truncated
.c_str(), truncated
.size());
278 _buffer
[truncated
.size()] = '\0'; // For _compile use
// The terminator is not counted in _bufferSize.
279 _bufferSize
= truncated
.size();
// Prepares the buffers for matching against (or compiling) the string s.
// The constructor calls this before pcre_compile(), and match() asserts that
// it was called with the same string it is asked to search.
// NOTE(review): the statements selected by the UTF-8 test are not visible in
// this extract; presumably prepareUtf8(s) is used when UTF-8 is supported and
// prepareASCII(s) otherwise -- confirm against the full file.
282 void RegExp::prepareMatch(const UString
&s
)
284 delete[] _originalPos
; // Just to be sure..
286 #ifdef PCRE_CONFIG_UTF8
287 if (utf8Support
== Supported
)
// Releases the buffers set up for matching: both arrays are freed with
// delete[] and the pointers reset to null. Because the pointers are nulled,
// calling this again is harmless (delete[] on a null pointer does nothing).
298 void RegExp::doneMatch()
300 delete[] _originalPos
; _originalPos
= 0;
301 delete[] _buffer
; _buffer
= 0;
// Runs the compiled expression against s, starting at index i.
//
// s        subject string; must be the string last given to prepareMatch()
//          (checked by the assert below).
// i        index in s at which the search starts.
// pos      receives the start index of the match.
// ovector  receives a pointer to the vector of match offsets.
//
// Returns the matched substring of s, or UString::null() when i is past the
// end of s, s is null, or nothing matched.
// NOTE(review): several guarding conditions are not visible in this extract,
// including the one choosing between the fixed three-int vector and the heap
// vector, and any null checks on pos and ovector. A vector allocated with
// new[] and handed out through *ovector presumably has to be freed by the
// caller with delete[] -- confirm against the full file.
304 UString
RegExp::match(const UString
&s
, int i
, int *pos
, int **ovector
)
307 assert(s
.data() == _originalS
.data()); // Make sure prepareMatch got called right..
319 if (i
> s
.size() || s
.isNull())
320 return UString::null();
322 #ifdef HAVE_PCREPOSIX
// PCRE path.
325 return UString::null();
327 // Set up the offset vector for the result.
328 // First 2/3 used for result, the last third used by PCRE.
330 int offsetVectorSize
;
331 int fixedSizeOffsetVector
[3];
// Either the three-int stack array is used (room for the overall match
// only) or a heap array with three ints per group, counting the whole match
// as one group.
333 offsetVectorSize
= 3;
334 offsetVector
= fixedSizeOffsetVector
;
336 offsetVectorSize
= (_numSubPatterns
+ 1) * 3;
337 offsetVector
= new int [offsetVectorSize
];
// In UTF-8 mode i is an index into s, but PCRE needs a byte offset into
// _buffer: scan forward over _originalPos until the entry for startPos is no
// longer below i.
341 if (utf8Support
== Supported
) {
343 while (_originalPos
[startPos
] < i
)
349 int baseFlags
= utf8Support
== Supported
? PCRE_NO_UTF8_CHECK
: 0;
350 const int numMatches
= pcre_exec(_regex
, NULL
, _buffer
, _bufferSize
, startPos
, baseFlags
, offsetVector
, offsetVectorSize
);
// PCRE reported byte offsets into _buffer; map each one that is set (not -1)
// back to a position in s through _originalPos.
352 //Now go through and patch up the offsetVector
353 if (utf8Support
== Supported
)
354 for (int c
= 0; c
< 2 * numMatches
; ++c
)
355 if (offsetVector
[c
] != -1)
356 offsetVector
[c
] = _originalPos
[offsetVector
[c
]];
// A negative result means no match or an error; only results other than
// PCRE_ERROR_NOMATCH are logged.
358 if (numMatches
< 0) {
360 if (numMatches
!= PCRE_ERROR_NOMATCH
)
361 fprintf(stderr
, "KJS: pcre_exec() failed with result %d\n", numMatches
);
// Only the heap-allocated vector needs freeing; the fixed one is a local
// array.
363 if (offsetVector
!= fixedSizeOffsetVector
)
364 delete [] offsetVector
;
365 return UString::null();
// Success: entries 0 and 1 are the start and end of the overall match.
368 *pos
= offsetVector
[0];
370 *ovector
= offsetVector
;
371 return s
.substr(offsetVector
[0], offsetVector
[1] - offsetVector
[0]);
// From here on the code uses the POSIX regex API (regexec and regmatch_t).
// NOTE(review): the preprocessor line separating it from the PCRE path is
// not visible in this extract.
376 return UString::null();
378 const unsigned maxMatch
= 10;
379 regmatch_t rmatch
[maxMatch
];
// regexec() is run on a strdup copy of s.ascii(), offset by i, so every
// offset it reports is relative to i and gets i added back below.
// NOTE(review): no release of str is visible in this extract -- confirm the
// copy is freed on every path.
381 char *str
= strdup(s
.ascii()); // TODO: why ???
382 if (regexec(&_regex
, str
+ i
, maxMatch
, rmatch
, 0)) {
384 return UString::null();
389 *pos
= rmatch
[0].rm_so
+ i
;
390 return s
.substr(rmatch
[0].rm_so
+ i
, rmatch
[0].rm_eo
- rmatch
[0].rm_so
);
393 // map rmatch array to ovector used in PCRE case
// NOTE(review): the body of this loop is not visible; presumably it counts
// the groups that matched into _numSubPatterns -- confirm.
395 for(unsigned j
= 1; j
< maxMatch
&& rmatch
[j
].rm_so
>= 0 ; j
++)
397 int ovecsize
= (_numSubPatterns
+1)*3; // see above
398 *ovector
= new int[ovecsize
];
399 for (unsigned j
= 0; j
< _numSubPatterns
+ 1; j
++) {
402 (*ovector
)[2*j
] = rmatch
[j
].rm_so
+ i
;
403 (*ovector
)[2*j
+1] = rmatch
[j
].rm_eo
+ i
;
406 *pos
= (*ovector
)[0];
407 return s
.substr((*ovector
)[0], (*ovector
)[1] - (*ovector
)[0]);