tgupdate: merge pcreposix-compat base into pcreposix-compat
[pcreposix-compat.git] / pcrecpp.cc
blobce289afafb822452f8852b8dafaac0bda9449de9
1 // Copyright (c) 2010, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 // Author: Sanjay Ghemawat
32 #ifdef HAVE_CONFIG_H
33 #include "config.h"
34 #endif
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <ctype.h>
39 #include <limits.h> /* for SHRT_MIN, USHRT_MAX, etc */
40 #include <string.h> /* for memcpy */
41 #include <assert.h>
42 #include <errno.h>
43 #include <string>
44 #include <algorithm>
46 #include "pcrecpp_internal.h"
47 #include "pcre.h"
48 #include "pcrecpp.h"
49 #include "pcre_stringpiece.h"
52 namespace pcrecpp {
54 // Maximum number of args we can set
55 static const int kMaxArgs = 16;
56 static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace
58 // Special object that stands-in for no argument
59 Arg RE::no_arg((void*)NULL);
61 // This is for ABI compatibility with old versions of pcre (pre-7.6),
62 // which defined a global no_arg variable instead of putting it in the
63 // RE class. This works on GCC >= 3, at least. It definitely works
64 // for ELF, but may not for other object formats (Mach-O, for
65 // instance, does not support aliases.) We could probably have a more
66 // inclusive test if we ever needed it. (Note that not only the
67 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
68 // gnu-specific.)
69 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) \
70 && !defined(__INTEL_COMPILER) && !defined(__LCC__)
71 # define ULP_AS_STRING(x) ULP_AS_STRING_INTERNAL(x)
72 # define ULP_AS_STRING_INTERNAL(x) #x
73 # define USER_LABEL_PREFIX_STR ULP_AS_STRING(__USER_LABEL_PREFIX__)
74 extern Arg no_arg
75 __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
76 #endif
78 // If a regular expression has no error, its error_ field points here
79 static const string empty_string;
81 // If the user doesn't ask for any options, we just use this one
82 static RE_Options default_options;
84 // Specials for the start of patterns. See comments where start_options is used
85 // below. (PH June 2018)
86 static const char *start_options[] = {
87 "(*UTF8)",
88 "(*UTF)",
89 "(*UCP)",
90 "(*NO_START_OPT)",
91 "(*NO_AUTO_POSSESS)",
92 "(*LIMIT_RECURSION=",
93 "(*LIMIT_MATCH=",
94 "(*CRLF)",
95 "(*LF)",
96 "(*CR)",
97 "(*BSR_UNICODE)",
98 "(*BSR_ANYCRLF)",
99 "(*ANYCRLF)",
100 "(*ANY)",
101 "" };
103 void RE::Init(const string& pat, const RE_Options* options) {
104 pattern_ = pat;
105 if (options == NULL) {
106 options_ = default_options;
107 } else {
108 options_ = *options;
110 error_ = &empty_string;
111 re_full_ = NULL;
112 re_partial_ = NULL;
114 re_partial_ = Compile(UNANCHORED);
115 if (re_partial_ != NULL) {
116 re_full_ = Compile(ANCHOR_BOTH);
120 void RE::Cleanup() {
121 if (re_full_ != NULL) (*pcre_free)(re_full_);
122 if (re_partial_ != NULL) (*pcre_free)(re_partial_);
123 if (error_ != &empty_string) delete error_;
127 RE::~RE() {
128 Cleanup();
132 pcre* RE::Compile(Anchor anchor) {
133 // First, convert RE_Options into pcre options
134 int pcre_options = 0;
135 pcre_options = options_.all_options();
137 // Special treatment for anchoring. This is needed because at
138 // runtime pcre only provides an option for anchoring at the
139 // beginning of a string (unless you use offset).
141 // There are three types of anchoring we want:
142 // UNANCHORED Compile the original pattern, and use
143 // a pcre unanchored match.
144 // ANCHOR_START Compile the original pattern, and use
145 // a pcre anchored match.
146 // ANCHOR_BOTH Tack a "\z" to the end of the original pattern
147 // and use a pcre anchored match.
149 const char* compile_error;
150 int eoffset;
151 pcre* re;
152 if (anchor != ANCHOR_BOTH) {
153 re = pcre_compile(pattern_.c_str(), pcre_options,
154 &compile_error, &eoffset, NULL);
155 } else {
156 // Tack a '\z' at the end of RE. Parenthesize it first so that
157 // the '\z' applies to all top-level alternatives in the regexp.
159 /* When this code was written (for PCRE 6.0) it was enough just to
160 parenthesize the entire pattern. Unfortunately, when the feature of
161 starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns,
162 this code was never updated. This bug was not noticed till 2018, long after
163 PCRE became obsolescent and its maintainer no longer around. Since PCRE is
164 frozen, I have added a hack to check for all the existing "start of
165 pattern" specials - knowing that no new ones will ever be added. I am not a
166 C++ programmer, so the code style is no doubt crude. It is also
167 inefficient, but is only run when the pattern starts with "(*".
168 PH June 2018. */
170 string wrapped = "";
172 if (pattern_.c_str()[0] == '(' && pattern_.c_str()[1] == '*') {
173 int kk, klen, kmat;
174 for (;;) { // Loop for any number of leading items
176 for (kk = 0; start_options[kk][0] != 0; kk++) {
177 klen = strlen(start_options[kk]);
178 kmat = strncmp(pattern_.c_str(), start_options[kk], klen);
179 if (kmat >= 0) break;
181 if (kmat != 0) break; // Not found
183 // If the item ended in "=" we must copy digits up to ")".
185 if (start_options[kk][klen-1] == '=') {
186 while (isdigit(pattern_.c_str()[klen])) klen++;
187 if (pattern_.c_str()[klen] != ')') break; // Syntax error
188 klen++;
191 // Move the item from the pattern to the start of the wrapped string.
193 wrapped += pattern_.substr(0, klen);
194 pattern_.erase(0, klen);
198 // Wrap the rest of the pattern.
200 wrapped += "(?:"; // A non-counting grouping operator
201 wrapped += pattern_;
202 wrapped += ")\\z";
203 re = pcre_compile(wrapped.c_str(), pcre_options,
204 &compile_error, &eoffset, NULL);
206 if (re == NULL) {
207 if (error_ == &empty_string) error_ = new string(compile_error);
209 return re;
212 /***** Matching interfaces *****/
214 bool RE::FullMatch(const StringPiece& text,
215 const Arg& ptr1,
216 const Arg& ptr2,
217 const Arg& ptr3,
218 const Arg& ptr4,
219 const Arg& ptr5,
220 const Arg& ptr6,
221 const Arg& ptr7,
222 const Arg& ptr8,
223 const Arg& ptr9,
224 const Arg& ptr10,
225 const Arg& ptr11,
226 const Arg& ptr12,
227 const Arg& ptr13,
228 const Arg& ptr14,
229 const Arg& ptr15,
230 const Arg& ptr16) const {
231 const Arg* args[kMaxArgs];
232 int n = 0;
233 if (&ptr1 == &no_arg) { goto done; } args[n++] = &ptr1;
234 if (&ptr2 == &no_arg) { goto done; } args[n++] = &ptr2;
235 if (&ptr3 == &no_arg) { goto done; } args[n++] = &ptr3;
236 if (&ptr4 == &no_arg) { goto done; } args[n++] = &ptr4;
237 if (&ptr5 == &no_arg) { goto done; } args[n++] = &ptr5;
238 if (&ptr6 == &no_arg) { goto done; } args[n++] = &ptr6;
239 if (&ptr7 == &no_arg) { goto done; } args[n++] = &ptr7;
240 if (&ptr8 == &no_arg) { goto done; } args[n++] = &ptr8;
241 if (&ptr9 == &no_arg) { goto done; } args[n++] = &ptr9;
242 if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
243 if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
244 if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
245 if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
246 if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
247 if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
248 if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
249 done:
251 int consumed;
252 int vec[kVecSize];
253 return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
256 bool RE::PartialMatch(const StringPiece& text,
257 const Arg& ptr1,
258 const Arg& ptr2,
259 const Arg& ptr3,
260 const Arg& ptr4,
261 const Arg& ptr5,
262 const Arg& ptr6,
263 const Arg& ptr7,
264 const Arg& ptr8,
265 const Arg& ptr9,
266 const Arg& ptr10,
267 const Arg& ptr11,
268 const Arg& ptr12,
269 const Arg& ptr13,
270 const Arg& ptr14,
271 const Arg& ptr15,
272 const Arg& ptr16) const {
273 const Arg* args[kMaxArgs];
274 int n = 0;
275 if (&ptr1 == &no_arg) { goto done; } args[n++] = &ptr1;
276 if (&ptr2 == &no_arg) { goto done; } args[n++] = &ptr2;
277 if (&ptr3 == &no_arg) { goto done; } args[n++] = &ptr3;
278 if (&ptr4 == &no_arg) { goto done; } args[n++] = &ptr4;
279 if (&ptr5 == &no_arg) { goto done; } args[n++] = &ptr5;
280 if (&ptr6 == &no_arg) { goto done; } args[n++] = &ptr6;
281 if (&ptr7 == &no_arg) { goto done; } args[n++] = &ptr7;
282 if (&ptr8 == &no_arg) { goto done; } args[n++] = &ptr8;
283 if (&ptr9 == &no_arg) { goto done; } args[n++] = &ptr9;
284 if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
285 if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
286 if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
287 if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
288 if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
289 if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
290 if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
291 done:
293 int consumed;
294 int vec[kVecSize];
295 return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
298 bool RE::Consume(StringPiece* input,
299 const Arg& ptr1,
300 const Arg& ptr2,
301 const Arg& ptr3,
302 const Arg& ptr4,
303 const Arg& ptr5,
304 const Arg& ptr6,
305 const Arg& ptr7,
306 const Arg& ptr8,
307 const Arg& ptr9,
308 const Arg& ptr10,
309 const Arg& ptr11,
310 const Arg& ptr12,
311 const Arg& ptr13,
312 const Arg& ptr14,
313 const Arg& ptr15,
314 const Arg& ptr16) const {
315 const Arg* args[kMaxArgs];
316 int n = 0;
317 if (&ptr1 == &no_arg) { goto done; } args[n++] = &ptr1;
318 if (&ptr2 == &no_arg) { goto done; } args[n++] = &ptr2;
319 if (&ptr3 == &no_arg) { goto done; } args[n++] = &ptr3;
320 if (&ptr4 == &no_arg) { goto done; } args[n++] = &ptr4;
321 if (&ptr5 == &no_arg) { goto done; } args[n++] = &ptr5;
322 if (&ptr6 == &no_arg) { goto done; } args[n++] = &ptr6;
323 if (&ptr7 == &no_arg) { goto done; } args[n++] = &ptr7;
324 if (&ptr8 == &no_arg) { goto done; } args[n++] = &ptr8;
325 if (&ptr9 == &no_arg) { goto done; } args[n++] = &ptr9;
326 if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
327 if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
328 if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
329 if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
330 if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
331 if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
332 if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
333 done:
335 int consumed;
336 int vec[kVecSize];
337 if (DoMatchImpl(*input, ANCHOR_START, &consumed,
338 args, n, vec, kVecSize)) {
339 input->remove_prefix(consumed);
340 return true;
341 } else {
342 return false;
346 bool RE::FindAndConsume(StringPiece* input,
347 const Arg& ptr1,
348 const Arg& ptr2,
349 const Arg& ptr3,
350 const Arg& ptr4,
351 const Arg& ptr5,
352 const Arg& ptr6,
353 const Arg& ptr7,
354 const Arg& ptr8,
355 const Arg& ptr9,
356 const Arg& ptr10,
357 const Arg& ptr11,
358 const Arg& ptr12,
359 const Arg& ptr13,
360 const Arg& ptr14,
361 const Arg& ptr15,
362 const Arg& ptr16) const {
363 const Arg* args[kMaxArgs];
364 int n = 0;
365 if (&ptr1 == &no_arg) { goto done; } args[n++] = &ptr1;
366 if (&ptr2 == &no_arg) { goto done; } args[n++] = &ptr2;
367 if (&ptr3 == &no_arg) { goto done; } args[n++] = &ptr3;
368 if (&ptr4 == &no_arg) { goto done; } args[n++] = &ptr4;
369 if (&ptr5 == &no_arg) { goto done; } args[n++] = &ptr5;
370 if (&ptr6 == &no_arg) { goto done; } args[n++] = &ptr6;
371 if (&ptr7 == &no_arg) { goto done; } args[n++] = &ptr7;
372 if (&ptr8 == &no_arg) { goto done; } args[n++] = &ptr8;
373 if (&ptr9 == &no_arg) { goto done; } args[n++] = &ptr9;
374 if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
375 if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
376 if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
377 if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
378 if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
379 if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
380 if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
381 done:
383 int consumed;
384 int vec[kVecSize];
385 if (DoMatchImpl(*input, UNANCHORED, &consumed,
386 args, n, vec, kVecSize)) {
387 input->remove_prefix(consumed);
388 return true;
389 } else {
390 return false;
394 bool RE::Replace(const StringPiece& rewrite,
395 string *str) const {
396 int vec[kVecSize];
397 int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
398 if (matches == 0)
399 return false;
401 string s;
402 if (!Rewrite(&s, rewrite, *str, vec, matches))
403 return false;
405 assert(vec[0] >= 0);
406 assert(vec[1] >= 0);
407 str->replace(vec[0], vec[1] - vec[0], s);
408 return true;
411 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
412 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
413 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
415 static int NewlineMode(int pcre_options) {
416 // TODO: if we can make it threadsafe, cache this var
417 int newline_mode = 0;
418 /* if (newline_mode) return newline_mode; */ // do this once it's cached
419 if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
420 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
421 newline_mode = (pcre_options &
422 (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
423 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
424 } else {
425 int newline;
426 pcre_config(PCRE_CONFIG_NEWLINE, &newline);
427 if (newline == 10)
428 newline_mode = PCRE_NEWLINE_LF;
429 else if (newline == 13)
430 newline_mode = PCRE_NEWLINE_CR;
431 else if (newline == 3338)
432 newline_mode = PCRE_NEWLINE_CRLF;
433 else if (newline == -1)
434 newline_mode = PCRE_NEWLINE_ANY;
435 else if (newline == -2)
436 newline_mode = PCRE_NEWLINE_ANYCRLF;
437 else
438 assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
440 return newline_mode;
443 int RE::GlobalReplace(const StringPiece& rewrite,
444 string *str) const {
445 int count = 0;
446 int vec[kVecSize];
447 string out;
448 int start = 0;
449 bool last_match_was_empty_string = false;
451 while (start <= static_cast<int>(str->length())) {
452 // If the previous match was for the empty string, we shouldn't
453 // just match again: we'll match in the same way and get an
454 // infinite loop. Instead, we do the match in a special way:
455 // anchored -- to force another try at the same position --
456 // and with a flag saying that this time, ignore empty matches.
457 // If this special match returns, that means there's a non-empty
458 // match at this position as well, and we can continue. If not,
459 // we do what perl does, and just advance by one.
460 // Notice that perl prints '@@@' for this;
461 // perl -le '$_ = "aa"; s/b*|aa/@/g; print'
462 int matches;
463 if (last_match_was_empty_string) {
464 matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
465 if (matches <= 0) {
466 int matchend = start + 1; // advance one character.
467 // If the current char is CR and we're in CRLF mode, skip LF too.
468 // Note it's better to call pcre_fullinfo() than to examine
469 // all_options(), since options_ could have changed bewteen
470 // compile-time and now, but this is simpler and safe enough.
471 // Modified by PH to add ANY and ANYCRLF.
472 if (matchend < static_cast<int>(str->length()) &&
473 (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
474 (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
475 NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
476 NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
477 matchend++;
479 // We also need to advance more than one char if we're in utf8 mode.
480 #ifdef SUPPORT_UTF
481 if (options_.utf8()) {
482 while (matchend < static_cast<int>(str->length()) &&
483 ((*str)[matchend] & 0xc0) == 0x80)
484 matchend++;
486 #endif
487 if (start < static_cast<int>(str->length()))
488 out.append(*str, start, matchend - start);
489 start = matchend;
490 last_match_was_empty_string = false;
491 continue;
493 } else {
494 matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
495 if (matches <= 0)
496 break;
498 int matchstart = vec[0], matchend = vec[1];
499 assert(matchstart >= start);
500 assert(matchend >= matchstart);
501 out.append(*str, start, matchstart - start);
502 Rewrite(&out, rewrite, *str, vec, matches);
503 start = matchend;
504 count++;
505 last_match_was_empty_string = (matchstart == matchend);
508 if (count == 0)
509 return 0;
511 if (start < static_cast<int>(str->length()))
512 out.append(*str, start, str->length() - start);
513 swap(out, *str);
514 return count;
517 bool RE::Extract(const StringPiece& rewrite,
518 const StringPiece& text,
519 string *out) const {
520 int vec[kVecSize];
521 int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
522 if (matches == 0)
523 return false;
524 out->erase();
525 return Rewrite(out, rewrite, text, vec, matches);
528 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
529 string result;
531 // Escape any ascii character not in [A-Za-z_0-9].
533 // Note that it's legal to escape a character even if it has no
534 // special meaning in a regular expression -- so this function does
535 // that. (This also makes it identical to the perl function of the
536 // same name; see `perldoc -f quotemeta`.) The one exception is
537 // escaping NUL: rather than doing backslash + NUL, like perl does,
538 // we do '\0', because pcre itself doesn't take embedded NUL chars.
539 for (int ii = 0; ii < unquoted.size(); ++ii) {
540 // Note that using 'isalnum' here raises the benchmark time from
541 // 32ns to 58ns:
542 if (unquoted[ii] == '\0') {
543 result += "\\0";
544 } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
545 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
546 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
547 unquoted[ii] != '_' &&
548 // If this is the part of a UTF8 or Latin1 character, we need
549 // to copy this byte without escaping. Experimentally this is
550 // what works correctly with the regexp library.
551 !(unquoted[ii] & 128)) {
552 result += '\\';
553 result += unquoted[ii];
554 } else {
555 result += unquoted[ii];
559 return result;
562 /***** Actual matching and rewriting code *****/
564 int RE::TryMatch(const StringPiece& text,
565 int startpos,
566 Anchor anchor,
567 bool empty_ok,
568 int *vec,
569 int vecsize) const {
570 pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
571 if (re == NULL) {
572 //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
573 return 0;
576 pcre_extra extra = { 0, 0, 0, 0, 0, 0, 0, 0 };
577 if (options_.match_limit() > 0) {
578 extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
579 extra.match_limit = options_.match_limit();
581 if (options_.match_limit_recursion() > 0) {
582 extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
583 extra.match_limit_recursion = options_.match_limit_recursion();
586 // int options = 0;
587 // Changed by PH as a result of bugzilla #1288
588 int options = (options_.all_options() & PCRE_NO_UTF8_CHECK);
590 if (anchor != UNANCHORED)
591 options |= PCRE_ANCHORED;
592 if (!empty_ok)
593 options |= PCRE_NOTEMPTY;
595 int rc = pcre_exec(re, // The regular expression object
596 &extra,
597 (text.data() == NULL) ? "" : text.data(),
598 text.size(),
599 startpos,
600 options,
601 vec,
602 vecsize);
604 // Handle errors
605 if (rc == PCRE_ERROR_NOMATCH) {
606 return 0;
607 } else if (rc < 0) {
608 //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
609 // re, pattern_.c_str());
610 return 0;
611 } else if (rc == 0) {
612 // pcre_exec() returns 0 as a special case when the number of
613 // capturing subpatterns exceeds the size of the vector.
614 // When this happens, there is a match and the output vector
615 // is filled, but we miss out on the positions of the extra subpatterns.
616 rc = vecsize / 2;
619 return rc;
622 bool RE::DoMatchImpl(const StringPiece& text,
623 Anchor anchor,
624 int* consumed,
625 const Arg* const* args,
626 int n,
627 int* vec,
628 int vecsize) const {
629 assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
630 int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
631 assert(matches >= 0); // TryMatch never returns negatives
632 if (matches == 0)
633 return false;
635 *consumed = vec[1];
637 if (n == 0 || args == NULL) {
638 // We are not interested in results
639 return true;
642 if (NumberOfCapturingGroups() < n) {
643 // RE has fewer capturing groups than number of arg pointers passed in
644 return false;
647 // If we got here, we must have matched the whole pattern.
648 // We do not need (can not do) any more checks on the value of 'matches' here
649 // -- see the comment for TryMatch.
650 for (int i = 0; i < n; i++) {
651 const int start = vec[2*(i+1)];
652 const int limit = vec[2*(i+1)+1];
653 if (!args[i]->Parse(text.data() + start, limit-start)) {
654 // TODO: Should we indicate what the error was?
655 return false;
659 return true;
662 bool RE::DoMatch(const StringPiece& text,
663 Anchor anchor,
664 int* consumed,
665 const Arg* const args[],
666 int n) const {
667 assert(n >= 0);
668 size_t const vecsize = (1 + n) * 3; // results + PCRE workspace
669 // (as for kVecSize)
670 int space[21]; // use stack allocation for small vecsize (common case)
671 int* vec = vecsize <= 21 ? space : new int[vecsize];
672 bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
673 if (vec != space) delete [] vec;
674 return retval;
677 bool RE::Rewrite(string *out, const StringPiece &rewrite,
678 const StringPiece &text, int *vec, int veclen) const {
679 for (const char *s = rewrite.data(), *end = s + rewrite.size();
680 s < end; s++) {
681 int c = *s;
682 if (c == '\\') {
683 c = *++s;
684 if (isdigit(c)) {
685 int n = (c - '0');
686 if (n >= veclen) {
687 //fprintf(stderr, requested group %d in regexp %.*s\n",
688 // n, rewrite.size(), rewrite.data());
689 return false;
691 int start = vec[2 * n];
692 if (start >= 0)
693 out->append(text.data() + start, vec[2 * n + 1] - start);
694 } else if (c == '\\') {
695 *out += '\\';
696 } else {
697 //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
698 // rewrite.size(), rewrite.data());
699 return false;
701 } else {
702 *out += c;
705 return true;
708 // Return the number of capturing subpatterns, or -1 if the
709 // regexp wasn't valid on construction.
710 int RE::NumberOfCapturingGroups() const {
711 if (re_partial_ == NULL) return -1;
713 int result;
714 int pcre_retval = pcre_fullinfo(re_partial_, // The regular expression object
715 NULL, // We did not study the pattern
716 PCRE_INFO_CAPTURECOUNT,
717 &result);
718 assert(pcre_retval == 0);
719 return result;
722 /***** Parsers for various types *****/
724 bool Arg::parse_null(const char* str, int n, void* dest) {
725 (void)str;
726 (void)n;
727 // We fail if somebody asked us to store into a non-NULL void* pointer
728 return (dest == NULL);
731 bool Arg::parse_string(const char* str, int n, void* dest) {
732 if (dest == NULL) return true;
733 reinterpret_cast<string*>(dest)->assign(str, n);
734 return true;
737 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
738 if (dest == NULL) return true;
739 reinterpret_cast<StringPiece*>(dest)->set(str, n);
740 return true;
743 bool Arg::parse_char(const char* str, int n, void* dest) {
744 if (n != 1) return false;
745 if (dest == NULL) return true;
746 *(reinterpret_cast<char*>(dest)) = str[0];
747 return true;
750 bool Arg::parse_uchar(const char* str, int n, void* dest) {
751 if (n != 1) return false;
752 if (dest == NULL) return true;
753 *(reinterpret_cast<unsigned char*>(dest)) = str[0];
754 return true;
757 // Largest number spec that we are willing to parse
758 static const int kMaxNumberLength = 32;
760 // REQUIRES "buf" must have length at least kMaxNumberLength+1
761 // REQUIRES "n > 0"
762 // Copies "str" into "buf" and null-terminates if necessary.
763 // Returns one of:
764 // a. "str" if no termination is needed
765 // b. "buf" if the string was copied and null-terminated
766 // c. "" if the input was invalid and has no hope of being parsed
767 static const char* TerminateNumber(char* buf, const char* str, int n) {
768 if ((n > 0) && isspace(*str)) {
769 // We are less forgiving than the strtoxxx() routines and do not
770 // allow leading spaces.
771 return "";
774 // See if the character right after the input text may potentially
775 // look like a digit.
776 if (isdigit(str[n]) ||
777 ((str[n] >= 'a') && (str[n] <= 'f')) ||
778 ((str[n] >= 'A') && (str[n] <= 'F'))) {
779 if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
780 memcpy(buf, str, n);
781 buf[n] = '\0';
782 return buf;
783 } else {
784 // We can parse right out of the supplied string, so return it.
785 return str;
789 bool Arg::parse_long_radix(const char* str,
790 int n,
791 void* dest,
792 int radix) {
793 if (n == 0) return false;
794 char buf[kMaxNumberLength+1];
795 str = TerminateNumber(buf, str, n);
796 char* end;
797 errno = 0;
798 long r = strtol(str, &end, radix);
799 if (end != str + n) return false; // Leftover junk
800 if (errno) return false;
801 if (dest == NULL) return true;
802 *(reinterpret_cast<long*>(dest)) = r;
803 return true;
806 bool Arg::parse_ulong_radix(const char* str,
807 int n,
808 void* dest,
809 int radix) {
810 if (n == 0) return false;
811 char buf[kMaxNumberLength+1];
812 str = TerminateNumber(buf, str, n);
813 if (str[0] == '-') return false; // strtoul() on a negative number?!
814 char* end;
815 errno = 0;
816 unsigned long r = strtoul(str, &end, radix);
817 if (end != str + n) return false; // Leftover junk
818 if (errno) return false;
819 if (dest == NULL) return true;
820 *(reinterpret_cast<unsigned long*>(dest)) = r;
821 return true;
824 bool Arg::parse_short_radix(const char* str,
825 int n,
826 void* dest,
827 int radix) {
828 long r;
829 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
830 if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range
831 if (dest == NULL) return true;
832 *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
833 return true;
836 bool Arg::parse_ushort_radix(const char* str,
837 int n,
838 void* dest,
839 int radix) {
840 unsigned long r;
841 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
842 if (r > USHRT_MAX) return false; // Out of range
843 if (dest == NULL) return true;
844 *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
845 return true;
848 bool Arg::parse_int_radix(const char* str,
849 int n,
850 void* dest,
851 int radix) {
852 long r;
853 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
854 if (r < INT_MIN || r > INT_MAX) return false; // Out of range
855 if (dest == NULL) return true;
856 *(reinterpret_cast<int*>(dest)) = r;
857 return true;
860 bool Arg::parse_uint_radix(const char* str,
861 int n,
862 void* dest,
863 int radix) {
864 unsigned long r;
865 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
866 if (r > UINT_MAX) return false; // Out of range
867 if (dest == NULL) return true;
868 *(reinterpret_cast<unsigned int*>(dest)) = r;
869 return true;
872 bool Arg::parse_longlong_radix(const char* str,
873 int n,
874 void* dest,
875 int radix) {
876 #ifndef HAVE_LONG_LONG
877 return false;
878 #else
879 if (n == 0) return false;
880 char buf[kMaxNumberLength+1];
881 str = TerminateNumber(buf, str, n);
882 char* end;
883 errno = 0;
884 #if defined HAVE_STRTOQ
885 long long r = strtoq(str, &end, radix);
886 #elif defined HAVE_STRTOLL
887 long long r = strtoll(str, &end, radix);
888 #elif defined HAVE__STRTOI64
889 long long r = _strtoi64(str, &end, radix);
890 #elif defined HAVE_STRTOIMAX
891 long long r = strtoimax(str, &end, radix);
892 #else
893 #error parse_longlong_radix: cannot convert input to a long-long
894 #endif
895 if (end != str + n) return false; // Leftover junk
896 if (errno) return false;
897 if (dest == NULL) return true;
898 *(reinterpret_cast<long long*>(dest)) = r;
899 return true;
900 #endif /* HAVE_LONG_LONG */
903 bool Arg::parse_ulonglong_radix(const char* str,
904 int n,
905 void* dest,
906 int radix) {
907 #ifndef HAVE_UNSIGNED_LONG_LONG
908 return false;
909 #else
910 if (n == 0) return false;
911 char buf[kMaxNumberLength+1];
912 str = TerminateNumber(buf, str, n);
913 if (str[0] == '-') return false; // strtoull() on a negative number?!
914 char* end;
915 errno = 0;
916 #if defined HAVE_STRTOQ
917 unsigned long long r = strtouq(str, &end, radix);
918 #elif defined HAVE_STRTOLL
919 unsigned long long r = strtoull(str, &end, radix);
920 #elif defined HAVE__STRTOI64
921 unsigned long long r = _strtoui64(str, &end, radix);
922 #elif defined HAVE_STRTOIMAX
923 unsigned long long r = strtoumax(str, &end, radix);
924 #else
925 #error parse_ulonglong_radix: cannot convert input to a long-long
926 #endif
927 if (end != str + n) return false; // Leftover junk
928 if (errno) return false;
929 if (dest == NULL) return true;
930 *(reinterpret_cast<unsigned long long*>(dest)) = r;
931 return true;
932 #endif /* HAVE_UNSIGNED_LONG_LONG */
935 bool Arg::parse_double(const char* str, int n, void* dest) {
936 if (n == 0) return false;
937 static const int kMaxLength = 200;
938 char buf[kMaxLength];
939 if (n >= kMaxLength) return false;
940 memcpy(buf, str, n);
941 buf[n] = '\0';
942 errno = 0;
943 char* end;
944 double r = strtod(buf, &end);
945 if (end != buf + n) return false; // Leftover junk
946 if (errno) return false;
947 if (dest == NULL) return true;
948 *(reinterpret_cast<double*>(dest)) = r;
949 return true;
952 bool Arg::parse_float(const char* str, int n, void* dest) {
953 double r;
954 if (!parse_double(str, n, &r)) return false;
955 if (dest == NULL) return true;
956 *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
957 return true;
961 #define DEFINE_INTEGER_PARSERS(name) \
962 bool Arg::parse_##name(const char* str, int n, void* dest) { \
963 return parse_##name##_radix(str, n, dest, 10); \
965 bool Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
966 return parse_##name##_radix(str, n, dest, 16); \
968 bool Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
969 return parse_##name##_radix(str, n, dest, 8); \
971 bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
972 return parse_##name##_radix(str, n, dest, 0); \
975 DEFINE_INTEGER_PARSERS(short) /* */
976 DEFINE_INTEGER_PARSERS(ushort) /* */
977 DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */
978 DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */
979 DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */
980 DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */
981 DEFINE_INTEGER_PARSERS(longlong) /* */
982 DEFINE_INTEGER_PARSERS(ulonglong) /* */
984 #undef DEFINE_INTEGER_PARSERS
986 } // namespace pcrecpp