1 // Copyright (c) 2010, Google Inc.
2 // All rights reserved.
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 // Author: Sanjay Ghemawat
39 #include <limits.h> /* for SHRT_MIN, USHRT_MAX, etc */
40 #include <string.h> /* for memcpy */
46 #include "pcrecpp_internal.h"
49 #include "pcre_stringpiece.h"
54 // Maximum number of args we can set
// FullMatch/PartialMatch/Consume/FindAndConsume below each accept
// ptr1..ptr16 and gather the supplied ones into an args[kMaxArgs] array.
55 static const int kMaxArgs
= 16;
// Length of the int vector those entry points pass to DoMatchImpl: three
// ints for the overall match plus three for each possible argument.  This is
// exactly the bound DoMatchImpl asserts, (1 + n) * 3 <= vecsize, when
// n == kMaxArgs.
56 static const int kVecSize
= (1 + kMaxArgs
) * 3; // results + PCRE workspace
58 // Special object that stands-in for no argument
// The matching entry points below compare the address of each ptrN
// parameter against &no_arg; the first parameter that is this object marks
// the end of the caller-supplied argument list.
59 Arg
RE::no_arg((void*)NULL
);
61 // This is for ABI compatibility with old versions of pcre (pre-7.6),
62 // which defined a global no_arg variable instead of putting it in the
63 // RE class. This works on GCC >= 3, at least. It definitely works
64 // for ELF, but may not for other object formats (Mach-O, for
65 // instance, does not support aliases.) We could probably have a more
66 // inclusive test if we ever needed it. (Note that not only the
67 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are gnu-specific.)
69 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) \
70 && !defined(__INTEL_COMPILER) && !defined(__LCC__)
71 # define ULP_AS_STRING(x) ULP_AS_STRING_INTERNAL(x)
72 # define ULP_AS_STRING_INTERNAL(x) #x
73 # define USER_LABEL_PREFIX_STR ULP_AS_STRING(__USER_LABEL_PREFIX__)
75 __attribute__((alias(USER_LABEL_PREFIX_STR
"_ZN7pcrecpp2RE6no_argE")));
78 // If a regular expression has no error, its error_ field points here
// Init() stores &empty_string in error_.  Compile() replaces it with a
// heap-allocated message only while error_ still equals &empty_string, and
// the code that frees re_full_/re_partial_ deletes error_ only when it no
// longer points at this object, so this sentinel itself is never deleted.
79 static const string empty_string
;
81 // If the user doesn't ask for any options, we just use this one
// Init() copies this default-constructed RE_Options into options_ when it
// is called with a NULL options pointer.
82 static RE_Options default_options
;
84 // Specials for the start of patterns. See comments where start_options is used
85 // below. (PH June 2018)
86 static const char *start_options
[] = {
103 void RE::Init(const string
& pat
, const RE_Options
* options
) {
105 if (options
== NULL
) {
106 options_
= default_options
;
110 error_
= &empty_string
;
114 re_partial_
= Compile(UNANCHORED
);
115 if (re_partial_
!= NULL
) {
116 re_full_
= Compile(ANCHOR_BOTH
);
121 if (re_full_
!= NULL
) (*pcre_free
)(re_full_
);
122 if (re_partial_
!= NULL
) (*pcre_free
)(re_partial_
);
123 if (error_
!= &empty_string
) delete error_
;
132 pcre
* RE::Compile(Anchor anchor
) {
133 // First, convert RE_Options into pcre options
134 int pcre_options
= 0;
135 pcre_options
= options_
.all_options();
137 // Special treatment for anchoring. This is needed because at
138 // runtime pcre only provides an option for anchoring at the
139 // beginning of a string (unless you use offset).
141 // There are three types of anchoring we want:
142 // UNANCHORED Compile the original pattern, and use
143 // a pcre unanchored match.
144 // ANCHOR_START Compile the original pattern, and use
145 // a pcre anchored match.
146 // ANCHOR_BOTH Tack a "\z" to the end of the original pattern
147 // and use a pcre anchored match.
149 const char* compile_error
;
152 if (anchor
!= ANCHOR_BOTH
) {
153 re
= pcre_compile(pattern_
.c_str(), pcre_options
,
154 &compile_error
, &eoffset
, NULL
);
156 // Tack a '\z' at the end of RE. Parenthesize it first so that
157 // the '\z' applies to all top-level alternatives in the regexp.
159 /* When this code was written (for PCRE 6.0) it was enough just to
160 parenthesize the entire pattern. Unfortunately, when the feature of
161 starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns,
162 this code was never updated. This bug was not noticed till 2018, long after
163 PCRE became obsolescent and its maintainer no longer around. Since PCRE is
164 frozen, I have added a hack to check for all the existing "start of
165 pattern" specials - knowing that no new ones will ever be added. I am not a
166 C++ programmer, so the code style is no doubt crude. It is also
167 inefficient, but is only run when the pattern starts with "(*".
172 if (pattern_
.c_str()[0] == '(' && pattern_
.c_str()[1] == '*') {
174 for (;;) { // Loop for any number of leading items
176 for (kk
= 0; start_options
[kk
][0] != 0; kk
++) {
177 klen
= strlen(start_options
[kk
]);
178 kmat
= strncmp(pattern_
.c_str(), start_options
[kk
], klen
);
179 if (kmat
>= 0) break;
181 if (kmat
!= 0) break; // Not found
183 // If the item ended in "=" we must copy digits up to ")".
185 if (start_options
[kk
][klen
-1] == '=') {
186 while (isdigit(pattern_
.c_str()[klen
])) klen
++;
187 if (pattern_
.c_str()[klen
] != ')') break; // Syntax error
191 // Move the item from the pattern to the start of the wrapped string.
193 wrapped
+= pattern_
.substr(0, klen
);
194 pattern_
.erase(0, klen
);
198 // Wrap the rest of the pattern.
200 wrapped
+= "(?:"; // A non-counting grouping operator
203 re
= pcre_compile(wrapped
.c_str(), pcre_options
,
204 &compile_error
, &eoffset
, NULL
);
207 if (error_
== &empty_string
) error_
= new string(compile_error
);
212 /***** Matching interfaces *****/
214 bool RE::FullMatch(const StringPiece
& text
,
230 const Arg
& ptr16
) const {
231 const Arg
* args
[kMaxArgs
];
233 if (&ptr1
== &no_arg
) { goto done
; } args
[n
++] = &ptr1
;
234 if (&ptr2
== &no_arg
) { goto done
; } args
[n
++] = &ptr2
;
235 if (&ptr3
== &no_arg
) { goto done
; } args
[n
++] = &ptr3
;
236 if (&ptr4
== &no_arg
) { goto done
; } args
[n
++] = &ptr4
;
237 if (&ptr5
== &no_arg
) { goto done
; } args
[n
++] = &ptr5
;
238 if (&ptr6
== &no_arg
) { goto done
; } args
[n
++] = &ptr6
;
239 if (&ptr7
== &no_arg
) { goto done
; } args
[n
++] = &ptr7
;
240 if (&ptr8
== &no_arg
) { goto done
; } args
[n
++] = &ptr8
;
241 if (&ptr9
== &no_arg
) { goto done
; } args
[n
++] = &ptr9
;
242 if (&ptr10
== &no_arg
) { goto done
; } args
[n
++] = &ptr10
;
243 if (&ptr11
== &no_arg
) { goto done
; } args
[n
++] = &ptr11
;
244 if (&ptr12
== &no_arg
) { goto done
; } args
[n
++] = &ptr12
;
245 if (&ptr13
== &no_arg
) { goto done
; } args
[n
++] = &ptr13
;
246 if (&ptr14
== &no_arg
) { goto done
; } args
[n
++] = &ptr14
;
247 if (&ptr15
== &no_arg
) { goto done
; } args
[n
++] = &ptr15
;
248 if (&ptr16
== &no_arg
) { goto done
; } args
[n
++] = &ptr16
;
253 return DoMatchImpl(text
, ANCHOR_BOTH
, &consumed
, args
, n
, vec
, kVecSize
);
256 bool RE::PartialMatch(const StringPiece
& text
,
272 const Arg
& ptr16
) const {
273 const Arg
* args
[kMaxArgs
];
275 if (&ptr1
== &no_arg
) { goto done
; } args
[n
++] = &ptr1
;
276 if (&ptr2
== &no_arg
) { goto done
; } args
[n
++] = &ptr2
;
277 if (&ptr3
== &no_arg
) { goto done
; } args
[n
++] = &ptr3
;
278 if (&ptr4
== &no_arg
) { goto done
; } args
[n
++] = &ptr4
;
279 if (&ptr5
== &no_arg
) { goto done
; } args
[n
++] = &ptr5
;
280 if (&ptr6
== &no_arg
) { goto done
; } args
[n
++] = &ptr6
;
281 if (&ptr7
== &no_arg
) { goto done
; } args
[n
++] = &ptr7
;
282 if (&ptr8
== &no_arg
) { goto done
; } args
[n
++] = &ptr8
;
283 if (&ptr9
== &no_arg
) { goto done
; } args
[n
++] = &ptr9
;
284 if (&ptr10
== &no_arg
) { goto done
; } args
[n
++] = &ptr10
;
285 if (&ptr11
== &no_arg
) { goto done
; } args
[n
++] = &ptr11
;
286 if (&ptr12
== &no_arg
) { goto done
; } args
[n
++] = &ptr12
;
287 if (&ptr13
== &no_arg
) { goto done
; } args
[n
++] = &ptr13
;
288 if (&ptr14
== &no_arg
) { goto done
; } args
[n
++] = &ptr14
;
289 if (&ptr15
== &no_arg
) { goto done
; } args
[n
++] = &ptr15
;
290 if (&ptr16
== &no_arg
) { goto done
; } args
[n
++] = &ptr16
;
295 return DoMatchImpl(text
, UNANCHORED
, &consumed
, args
, n
, vec
, kVecSize
);
298 bool RE::Consume(StringPiece
* input
,
314 const Arg
& ptr16
) const {
315 const Arg
* args
[kMaxArgs
];
317 if (&ptr1
== &no_arg
) { goto done
; } args
[n
++] = &ptr1
;
318 if (&ptr2
== &no_arg
) { goto done
; } args
[n
++] = &ptr2
;
319 if (&ptr3
== &no_arg
) { goto done
; } args
[n
++] = &ptr3
;
320 if (&ptr4
== &no_arg
) { goto done
; } args
[n
++] = &ptr4
;
321 if (&ptr5
== &no_arg
) { goto done
; } args
[n
++] = &ptr5
;
322 if (&ptr6
== &no_arg
) { goto done
; } args
[n
++] = &ptr6
;
323 if (&ptr7
== &no_arg
) { goto done
; } args
[n
++] = &ptr7
;
324 if (&ptr8
== &no_arg
) { goto done
; } args
[n
++] = &ptr8
;
325 if (&ptr9
== &no_arg
) { goto done
; } args
[n
++] = &ptr9
;
326 if (&ptr10
== &no_arg
) { goto done
; } args
[n
++] = &ptr10
;
327 if (&ptr11
== &no_arg
) { goto done
; } args
[n
++] = &ptr11
;
328 if (&ptr12
== &no_arg
) { goto done
; } args
[n
++] = &ptr12
;
329 if (&ptr13
== &no_arg
) { goto done
; } args
[n
++] = &ptr13
;
330 if (&ptr14
== &no_arg
) { goto done
; } args
[n
++] = &ptr14
;
331 if (&ptr15
== &no_arg
) { goto done
; } args
[n
++] = &ptr15
;
332 if (&ptr16
== &no_arg
) { goto done
; } args
[n
++] = &ptr16
;
337 if (DoMatchImpl(*input
, ANCHOR_START
, &consumed
,
338 args
, n
, vec
, kVecSize
)) {
339 input
->remove_prefix(consumed
);
346 bool RE::FindAndConsume(StringPiece
* input
,
362 const Arg
& ptr16
) const {
363 const Arg
* args
[kMaxArgs
];
365 if (&ptr1
== &no_arg
) { goto done
; } args
[n
++] = &ptr1
;
366 if (&ptr2
== &no_arg
) { goto done
; } args
[n
++] = &ptr2
;
367 if (&ptr3
== &no_arg
) { goto done
; } args
[n
++] = &ptr3
;
368 if (&ptr4
== &no_arg
) { goto done
; } args
[n
++] = &ptr4
;
369 if (&ptr5
== &no_arg
) { goto done
; } args
[n
++] = &ptr5
;
370 if (&ptr6
== &no_arg
) { goto done
; } args
[n
++] = &ptr6
;
371 if (&ptr7
== &no_arg
) { goto done
; } args
[n
++] = &ptr7
;
372 if (&ptr8
== &no_arg
) { goto done
; } args
[n
++] = &ptr8
;
373 if (&ptr9
== &no_arg
) { goto done
; } args
[n
++] = &ptr9
;
374 if (&ptr10
== &no_arg
) { goto done
; } args
[n
++] = &ptr10
;
375 if (&ptr11
== &no_arg
) { goto done
; } args
[n
++] = &ptr11
;
376 if (&ptr12
== &no_arg
) { goto done
; } args
[n
++] = &ptr12
;
377 if (&ptr13
== &no_arg
) { goto done
; } args
[n
++] = &ptr13
;
378 if (&ptr14
== &no_arg
) { goto done
; } args
[n
++] = &ptr14
;
379 if (&ptr15
== &no_arg
) { goto done
; } args
[n
++] = &ptr15
;
380 if (&ptr16
== &no_arg
) { goto done
; } args
[n
++] = &ptr16
;
385 if (DoMatchImpl(*input
, UNANCHORED
, &consumed
,
386 args
, n
, vec
, kVecSize
)) {
387 input
->remove_prefix(consumed
);
394 bool RE::Replace(const StringPiece
& rewrite
,
397 int matches
= TryMatch(*str
, 0, UNANCHORED
, true, vec
, kVecSize
);
402 if (!Rewrite(&s
, rewrite
, *str
, vec
, matches
))
407 str
->replace(vec
[0], vec
[1] - vec
[0], s
);
411 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
412 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
413 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
415 static int NewlineMode(int pcre_options
) {
416 // TODO: if we can make it threadsafe, cache this var
417 int newline_mode
= 0;
418 /* if (newline_mode) return newline_mode; */ // do this once it's cached
419 if (pcre_options
& (PCRE_NEWLINE_CRLF
|PCRE_NEWLINE_CR
|PCRE_NEWLINE_LF
|
420 PCRE_NEWLINE_ANY
|PCRE_NEWLINE_ANYCRLF
)) {
421 newline_mode
= (pcre_options
&
422 (PCRE_NEWLINE_CRLF
|PCRE_NEWLINE_CR
|PCRE_NEWLINE_LF
|
423 PCRE_NEWLINE_ANY
|PCRE_NEWLINE_ANYCRLF
));
426 pcre_config(PCRE_CONFIG_NEWLINE
, &newline
);
428 newline_mode
= PCRE_NEWLINE_LF
;
429 else if (newline
== 13)
430 newline_mode
= PCRE_NEWLINE_CR
;
431 else if (newline
== 3338)
432 newline_mode
= PCRE_NEWLINE_CRLF
;
433 else if (newline
== -1)
434 newline_mode
= PCRE_NEWLINE_ANY
;
435 else if (newline
== -2)
436 newline_mode
= PCRE_NEWLINE_ANYCRLF
;
438 assert(NULL
== "Unexpected return value from pcre_config(NEWLINE)");
443 int RE::GlobalReplace(const StringPiece
& rewrite
,
449 bool last_match_was_empty_string
= false;
451 while (start
<= static_cast<int>(str
->length())) {
452 // If the previous match was for the empty string, we shouldn't
453 // just match again: we'll match in the same way and get an
454 // infinite loop. Instead, we do the match in a special way:
455 // anchored -- to force another try at the same position --
456 // and with a flag saying that this time, ignore empty matches.
457 // If this special match returns, that means there's a non-empty
458 // match at this position as well, and we can continue. If not,
459 // we do what perl does, and just advance by one.
460 // Notice that perl prints '@@@' for this;
461 // perl -le '$_ = "aa"; s/b*|aa/@/g; print'
463 if (last_match_was_empty_string
) {
464 matches
= TryMatch(*str
, start
, ANCHOR_START
, false, vec
, kVecSize
);
466 int matchend
= start
+ 1; // advance one character.
467 // If the current char is CR and we're in CRLF mode, skip LF too.
468 // Note it's better to call pcre_fullinfo() than to examine
469 // all_options(), since options_ could have changed between
470 // compile-time and now, but this is simpler and safe enough.
471 // Modified by PH to add ANY and ANYCRLF.
472 if (matchend
< static_cast<int>(str
->length()) &&
473 (*str
)[start
] == '\r' && (*str
)[matchend
] == '\n' &&
474 (NewlineMode(options_
.all_options()) == PCRE_NEWLINE_CRLF
||
475 NewlineMode(options_
.all_options()) == PCRE_NEWLINE_ANY
||
476 NewlineMode(options_
.all_options()) == PCRE_NEWLINE_ANYCRLF
)) {
479 // We also need to advance more than one char if we're in utf8 mode.
481 if (options_
.utf8()) {
482 while (matchend
< static_cast<int>(str
->length()) &&
483 ((*str
)[matchend
] & 0xc0) == 0x80)
487 if (start
< static_cast<int>(str
->length()))
488 out
.append(*str
, start
, matchend
- start
);
490 last_match_was_empty_string
= false;
494 matches
= TryMatch(*str
, start
, UNANCHORED
, true, vec
, kVecSize
);
498 int matchstart
= vec
[0], matchend
= vec
[1];
499 assert(matchstart
>= start
);
500 assert(matchend
>= matchstart
);
501 out
.append(*str
, start
, matchstart
- start
);
502 Rewrite(&out
, rewrite
, *str
, vec
, matches
);
505 last_match_was_empty_string
= (matchstart
== matchend
);
511 if (start
< static_cast<int>(str
->length()))
512 out
.append(*str
, start
, str
->length() - start
);
517 bool RE::Extract(const StringPiece
& rewrite
,
518 const StringPiece
& text
,
521 int matches
= TryMatch(text
, 0, UNANCHORED
, true, vec
, kVecSize
);
525 return Rewrite(out
, rewrite
, text
, vec
, matches
);
528 /*static*/ string
RE::QuoteMeta(const StringPiece
& unquoted
) {
531 // Escape any ascii character not in [A-Za-z_0-9].
533 // Note that it's legal to escape a character even if it has no
534 // special meaning in a regular expression -- so this function does
535 // that. (This also makes it identical to the perl function of the
536 // same name; see `perldoc -f quotemeta`.) The one exception is
537 // escaping NUL: rather than doing backslash + NUL, like perl does,
538 // we do '\0', because pcre itself doesn't take embedded NUL chars.
539 for (int ii
= 0; ii
< unquoted
.size(); ++ii
) {
540 // Note that using 'isalnum' here raises the benchmark time from
542 if (unquoted
[ii
] == '\0') {
544 } else if ((unquoted
[ii
] < 'a' || unquoted
[ii
] > 'z') &&
545 (unquoted
[ii
] < 'A' || unquoted
[ii
] > 'Z') &&
546 (unquoted
[ii
] < '0' || unquoted
[ii
] > '9') &&
547 unquoted
[ii
] != '_' &&
548 // If this is the part of a UTF8 or Latin1 character, we need
549 // to copy this byte without escaping. Experimentally this is
550 // what works correctly with the regexp library.
551 !(unquoted
[ii
] & 128)) {
553 result
+= unquoted
[ii
];
555 result
+= unquoted
[ii
];
562 /***** Actual matching and rewriting code *****/
564 int RE::TryMatch(const StringPiece
& text
,
570 pcre
* re
= (anchor
== ANCHOR_BOTH
) ? re_full_
: re_partial_
;
572 //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
576 pcre_extra extra
= { 0, 0, 0, 0, 0, 0, 0, 0 };
577 if (options_
.match_limit() > 0) {
578 extra
.flags
|= PCRE_EXTRA_MATCH_LIMIT
;
579 extra
.match_limit
= options_
.match_limit();
581 if (options_
.match_limit_recursion() > 0) {
582 extra
.flags
|= PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
583 extra
.match_limit_recursion
= options_
.match_limit_recursion();
587 // Changed by PH as a result of bugzilla #1288
588 int options
= (options_
.all_options() & PCRE_NO_UTF8_CHECK
);
590 if (anchor
!= UNANCHORED
)
591 options
|= PCRE_ANCHORED
;
593 options
|= PCRE_NOTEMPTY
;
595 int rc
= pcre_exec(re
, // The regular expression object
597 (text
.data() == NULL
) ? "" : text
.data(),
605 if (rc
== PCRE_ERROR_NOMATCH
) {
608 //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
609 // re, pattern_.c_str());
611 } else if (rc
== 0) {
612 // pcre_exec() returns 0 as a special case when the number of
613 // capturing subpatterns exceeds the size of the vector.
614 // When this happens, there is a match and the output vector
615 // is filled, but we miss out on the positions of the extra subpatterns.
622 bool RE::DoMatchImpl(const StringPiece
& text
,
625 const Arg
* const* args
,
629 assert((1 + n
) * 3 <= vecsize
); // results + PCRE workspace
630 int matches
= TryMatch(text
, 0, anchor
, true, vec
, vecsize
);
631 assert(matches
>= 0); // TryMatch never returns negatives
637 if (n
== 0 || args
== NULL
) {
638 // We are not interested in results
642 if (NumberOfCapturingGroups() < n
) {
643 // RE has fewer capturing groups than number of arg pointers passed in
647 // If we got here, we must have matched the whole pattern.
648 // We do not need (can not do) any more checks on the value of 'matches' here
649 // -- see the comment for TryMatch.
650 for (int i
= 0; i
< n
; i
++) {
651 const int start
= vec
[2*(i
+1)];
652 const int limit
= vec
[2*(i
+1)+1];
653 if (!args
[i
]->Parse(text
.data() + start
, limit
-start
)) {
654 // TODO: Should we indicate what the error was?
662 bool RE::DoMatch(const StringPiece
& text
,
665 const Arg
* const args
[],
668 size_t const vecsize
= (1 + n
) * 3; // results + PCRE workspace
670 int space
[21]; // use stack allocation for small vecsize (common case)
671 int* vec
= vecsize
<= 21 ? space
: new int[vecsize
];
672 bool retval
= DoMatchImpl(text
, anchor
, consumed
, args
, n
, vec
, (int)vecsize
);
673 if (vec
!= space
) delete [] vec
;
677 bool RE::Rewrite(string
*out
, const StringPiece
&rewrite
,
678 const StringPiece
&text
, int *vec
, int veclen
) const {
679 for (const char *s
= rewrite
.data(), *end
= s
+ rewrite
.size();
687 //fprintf(stderr, requested group %d in regexp %.*s\n",
688 // n, rewrite.size(), rewrite.data());
691 int start
= vec
[2 * n
];
693 out
->append(text
.data() + start
, vec
[2 * n
+ 1] - start
);
694 } else if (c
== '\\') {
697 //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
698 // rewrite.size(), rewrite.data());
708 // Return the number of capturing subpatterns, or -1 if the
709 // regexp wasn't valid on construction.
710 int RE::NumberOfCapturingGroups() const {
711 if (re_partial_
== NULL
) return -1;
714 int pcre_retval
= pcre_fullinfo(re_partial_
, // The regular expression object
715 NULL
, // We did not study the pattern
716 PCRE_INFO_CAPTURECOUNT
,
718 assert(pcre_retval
== 0);
722 /***** Parsers for various types *****/
724 bool Arg::parse_null(const char* str
, int n
, void* dest
) {
727 // We fail if somebody asked us to store into a non-NULL void* pointer
728 return (dest
== NULL
);
731 bool Arg::parse_string(const char* str
, int n
, void* dest
) {
732 if (dest
== NULL
) return true;
733 reinterpret_cast<string
*>(dest
)->assign(str
, n
);
737 bool Arg::parse_stringpiece(const char* str
, int n
, void* dest
) {
738 if (dest
== NULL
) return true;
739 reinterpret_cast<StringPiece
*>(dest
)->set(str
, n
);
743 bool Arg::parse_char(const char* str
, int n
, void* dest
) {
744 if (n
!= 1) return false;
745 if (dest
== NULL
) return true;
746 *(reinterpret_cast<char*>(dest
)) = str
[0];
750 bool Arg::parse_uchar(const char* str
, int n
, void* dest
) {
751 if (n
!= 1) return false;
752 if (dest
== NULL
) return true;
753 *(reinterpret_cast<unsigned char*>(dest
)) = str
[0];
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// Prepares the n-byte number text at "str" for the strtoxxx() family, which
// needs a terminator that is not itself part of a number.
//
// REQUIRES "buf" must have length at least kMaxNumberLength+1
// REQUIRES "str[n]" must be readable: the byte just past the number text is
//          inspected to decide whether a private, terminated copy is needed.
//
// Copies "str" into "buf" and null-terminates if necessary.
// Returns one of:
//      a. "str" if no termination is needed
//      b. "buf" if the string was copied and null-terminated
//      c. ""    if the input was invalid and has no hope of being parsed
static const char* TerminateNumber(char* buf, const char* str, int n) {
  // The <ctype.h> classifiers require an argument that is representable as
  // unsigned char (or EOF).  A plain char with the high bit set is negative
  // where char is signed, so convert before classifying.
  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  // See if the character right after the input text may potentially
  // look like a digit.  If it does, strtoxxx() would run past the end of the
  // matched text, so the text has to be copied and terminated.
  const unsigned char next = static_cast<unsigned char>(str[n]);
  if (isdigit(next) ||
      ((next >= 'a') && (next <= 'f')) ||
      ((next >= 'A') && (next <= 'F'))) {
    if (n > kMaxNumberLength) return "";  // Input too big to be a valid number
    memcpy(buf, str, n);
    buf[n] = '\0';
    return buf;
  }

  // We can parse right out of the supplied string, so return it.
  return str;
}
789 bool Arg::parse_long_radix(const char* str
,
793 if (n
== 0) return false;
794 char buf
[kMaxNumberLength
+1];
795 str
= TerminateNumber(buf
, str
, n
);
798 long r
= strtol(str
, &end
, radix
);
799 if (end
!= str
+ n
) return false; // Leftover junk
800 if (errno
) return false;
801 if (dest
== NULL
) return true;
802 *(reinterpret_cast<long*>(dest
)) = r
;
806 bool Arg::parse_ulong_radix(const char* str
,
810 if (n
== 0) return false;
811 char buf
[kMaxNumberLength
+1];
812 str
= TerminateNumber(buf
, str
, n
);
813 if (str
[0] == '-') return false; // strtoul() on a negative number?!
816 unsigned long r
= strtoul(str
, &end
, radix
);
817 if (end
!= str
+ n
) return false; // Leftover junk
818 if (errno
) return false;
819 if (dest
== NULL
) return true;
820 *(reinterpret_cast<unsigned long*>(dest
)) = r
;
824 bool Arg::parse_short_radix(const char* str
,
829 if (!parse_long_radix(str
, n
, &r
, radix
)) return false; // Could not parse
830 if (r
< SHRT_MIN
|| r
> SHRT_MAX
) return false; // Out of range
831 if (dest
== NULL
) return true;
832 *(reinterpret_cast<short*>(dest
)) = static_cast<short>(r
);
836 bool Arg::parse_ushort_radix(const char* str
,
841 if (!parse_ulong_radix(str
, n
, &r
, radix
)) return false; // Could not parse
842 if (r
> USHRT_MAX
) return false; // Out of range
843 if (dest
== NULL
) return true;
844 *(reinterpret_cast<unsigned short*>(dest
)) = static_cast<unsigned short>(r
);
848 bool Arg::parse_int_radix(const char* str
,
853 if (!parse_long_radix(str
, n
, &r
, radix
)) return false; // Could not parse
854 if (r
< INT_MIN
|| r
> INT_MAX
) return false; // Out of range
855 if (dest
== NULL
) return true;
856 *(reinterpret_cast<int*>(dest
)) = r
;
860 bool Arg::parse_uint_radix(const char* str
,
865 if (!parse_ulong_radix(str
, n
, &r
, radix
)) return false; // Could not parse
866 if (r
> UINT_MAX
) return false; // Out of range
867 if (dest
== NULL
) return true;
868 *(reinterpret_cast<unsigned int*>(dest
)) = r
;
872 bool Arg::parse_longlong_radix(const char* str
,
876 #ifndef HAVE_LONG_LONG
879 if (n
== 0) return false;
880 char buf
[kMaxNumberLength
+1];
881 str
= TerminateNumber(buf
, str
, n
);
884 #if defined HAVE_STRTOQ
885 long long r
= strtoq(str
, &end
, radix
);
886 #elif defined HAVE_STRTOLL
887 long long r
= strtoll(str
, &end
, radix
);
888 #elif defined HAVE__STRTOI64
889 long long r
= _strtoi64(str
, &end
, radix
);
890 #elif defined HAVE_STRTOIMAX
891 long long r
= strtoimax(str
, &end
, radix
);
893 #error parse_longlong_radix: cannot convert input to a long-long
895 if (end
!= str
+ n
) return false; // Leftover junk
896 if (errno
) return false;
897 if (dest
== NULL
) return true;
898 *(reinterpret_cast<long long*>(dest
)) = r
;
900 #endif /* HAVE_LONG_LONG */
903 bool Arg::parse_ulonglong_radix(const char* str
,
907 #ifndef HAVE_UNSIGNED_LONG_LONG
910 if (n
== 0) return false;
911 char buf
[kMaxNumberLength
+1];
912 str
= TerminateNumber(buf
, str
, n
);
913 if (str
[0] == '-') return false; // strtoull() on a negative number?!
916 #if defined HAVE_STRTOQ
917 unsigned long long r
= strtouq(str
, &end
, radix
);
918 #elif defined HAVE_STRTOLL
919 unsigned long long r
= strtoull(str
, &end
, radix
);
920 #elif defined HAVE__STRTOI64
921 unsigned long long r
= _strtoui64(str
, &end
, radix
);
922 #elif defined HAVE_STRTOIMAX
923 unsigned long long r
= strtoumax(str
, &end
, radix
);
925 #error parse_ulonglong_radix: cannot convert input to a long-long
927 if (end
!= str
+ n
) return false; // Leftover junk
928 if (errno
) return false;
929 if (dest
== NULL
) return true;
930 *(reinterpret_cast<unsigned long long*>(dest
)) = r
;
932 #endif /* HAVE_UNSIGNED_LONG_LONG */
935 bool Arg::parse_double(const char* str
, int n
, void* dest
) {
936 if (n
== 0) return false;
937 static const int kMaxLength
= 200;
938 char buf
[kMaxLength
];
939 if (n
>= kMaxLength
) return false;
944 double r
= strtod(buf
, &end
);
945 if (end
!= buf
+ n
) return false; // Leftover junk
946 if (errno
) return false;
947 if (dest
== NULL
) return true;
948 *(reinterpret_cast<double*>(dest
)) = r
;
952 bool Arg::parse_float(const char* str
, int n
, void* dest
) {
954 if (!parse_double(str
, n
, &r
)) return false;
955 if (dest
== NULL
) return true;
956 *(reinterpret_cast<float*>(dest
)) = static_cast<float>(r
);
961 #define DEFINE_INTEGER_PARSERS(name) \
962 bool Arg::parse_##name(const char* str, int n, void* dest) { \
963 return parse_##name##_radix(str, n, dest, 10); \
965 bool Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
966 return parse_##name##_radix(str, n, dest, 16); \
968 bool Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
969 return parse_##name##_radix(str, n, dest, 8); \
971 bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
972 return parse_##name##_radix(str, n, dest, 0); \
975 DEFINE_INTEGER_PARSERS(short) /* */
976 DEFINE_INTEGER_PARSERS(ushort
) /* */
977 DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */
978 DEFINE_INTEGER_PARSERS(uint
) /* statements because they can cause */
979 DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */
980 DEFINE_INTEGER_PARSERS(ulong
) /* level is turned up high enough. */
981 DEFINE_INTEGER_PARSERS(longlong
) /* */
982 DEFINE_INTEGER_PARSERS(ulonglong
) /* */
984 #undef DEFINE_INTEGER_PARSERS
986 } // namespace pcrecpp