pcrecpp.cc

   1 // Copyright (c) 2010, Google Inc.
   2 // All rights reserved.
   3 //
   4 // Redistribution and use in source and binary forms, with or without
   5 // modification, are permitted provided that the following conditions are
   6 // met:
   7 //
   8 //     * Redistributions of source code must retain the above copyright
   9 // notice, this list of conditions and the following disclaimer.
  10 //     * Redistributions in binary form must reproduce the above
  11 // copyright notice, this list of conditions and the following disclaimer
  12 // in the documentation and/or other materials provided with the
  13 // distribution.
  14 //     * Neither the name of Google Inc. nor the names of its
  15 // contributors may be used to endorse or promote products derived from
  16 // this software without specific prior written permission.
  17 //
  18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29 //
  30 // Author: Sanjay Ghemawat
  31
  32 #ifdef HAVE_CONFIG_H
  33 #include "config.h"
  34 #endif
  35
  36 #include <stdlib.h>
  37 #include <stdio.h>
  38 #include <ctype.h>
  39 #include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
  40 #include <string.h>      /* for memcpy */
  41 #include <assert.h>
  42 #include <errno.h>
  43 #include <string>
  44 #include <algorithm>
  45
  46 #include "pcrecpp_internal.h"
  47 #include "pcre.h"
  48 #include "pcrecpp.h"
  49 #include "pcre_stringpiece.h"
  50
  51
  52 namespace pcrecpp {
  53
  54 // Maximum number of args we can set
  55 static const int kMaxArgs = 16;
  56 static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
  57
  58 // Special object that stands-in for no argument
  59 Arg RE::no_arg((void*)NULL);
  60
  61 // This is for ABI compatibility with old versions of pcre (pre-7.6),
  62 // which defined a global no_arg variable instead of putting it in the
  63 // RE class.  This works on GCC >= 3, at least.  It definitely works
  64 // for ELF, but may not for other object formats (Mach-O, for
  65 // instance, does not support aliases.)  We could probably have a more
  66 // inclusive test if we ever needed it.  (Note that not only the
  67 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
  68 // gnu-specific.)
  69 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) \
  70        && !defined(__INTEL_COMPILER) && !defined(__LCC__)
  71 # define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
  72 # define ULP_AS_STRING_INTERNAL(x)   #x
  73 # define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
  74 extern Arg no_arg
  75   __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
  76 #endif
  77
  78 // If a regular expression has no error, its error_ field points here
  79 static const string empty_string;
  80
  81 // If the user doesn't ask for any options, we just use this one
  82 static RE_Options default_options;
  83
  84 // Specials for the start of patterns. See comments where start_options is used
  85 // below. (PH June 2018)
  86 static const char *start_options[] = {
  87   "(*UTF8)",
  88   "(*UTF)",
  89   "(*UCP)",
  90   "(*NO_START_OPT)",
  91   "(*NO_AUTO_POSSESS)",
  92   "(*LIMIT_RECURSION=",
  93   "(*LIMIT_MATCH=",
  94   "(*CRLF)",
  95   "(*LF)",
  96   "(*CR)",
  97   "(*BSR_UNICODE)",
  98   "(*BSR_ANYCRLF)",
  99   "(*ANYCRLF)",
 100   "(*ANY)",
 101   "" };
 102
 103 void RE::Init(const string& pat, const RE_Options* options) {
 104   pattern_ = pat;
 105   if (options == NULL) {
 106     options_ = default_options;
 107   } else {
 108     options_ = *options;
 109   }
 110   error_ = &empty_string;
 111   re_full_ = NULL;
 112   re_partial_ = NULL;
 113
 114   re_partial_ = Compile(UNANCHORED);
 115   if (re_partial_ != NULL) {
 116     re_full_ = Compile(ANCHOR_BOTH);
 117   }
 118 }
 119
 120 void RE::Cleanup() {
 121   if (re_full_ != NULL)         (*pcre_free)(re_full_);
 122   if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
 123   if (error_ != &empty_string)  delete error_;
 124 }
 125
 126
 127 RE::~RE() {
 128   Cleanup();
 129 }
 130
 131
 132 pcre* RE::Compile(Anchor anchor) {
 133   // First, convert RE_Options into pcre options
 134   int pcre_options = 0;
 135   pcre_options = options_.all_options();
 136
 137   // Special treatment for anchoring.  This is needed because at
 138   // runtime pcre only provides an option for anchoring at the
 139   // beginning of a string (unless you use offset).
 140   //
 141   // There are three types of anchoring we want:
 142   //    UNANCHORED      Compile the original pattern, and use
 143   //                    a pcre unanchored match.
 144   //    ANCHOR_START    Compile the original pattern, and use
 145   //                    a pcre anchored match.
 146   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
 147   //                    and use a pcre anchored match.
 148
 149   const char* compile_error;
 150   int eoffset;
 151   pcre* re;
 152   if (anchor != ANCHOR_BOTH) {
 153     re = pcre_compile(pattern_.c_str(), pcre_options,
 154                       &compile_error, &eoffset, NULL);
 155   } else {
 156     // Tack a '\z' at the end of RE.  Parenthesize it first so that
 157     // the '\z' applies to all top-level alternatives in the regexp.
 158
 159     /* When this code was written (for PCRE 6.0) it was enough just to
 160     parenthesize the entire pattern. Unfortunately, when the feature of
 161     starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns,
 162     this code was never updated. This bug was not noticed till 2018, long after
 163     PCRE became obsolescent and its maintainer no longer around. Since PCRE is
 164     frozen, I have added a hack to check for all the existing "start of
 165     pattern" specials - knowing that no new ones will ever be added. I am not a
 166     C++ programmer, so the code style is no doubt crude. It is also
 167     inefficient, but is only run when the pattern starts with "(*".
 168     PH June 2018. */
 169
 170     string wrapped = "";
 171
 172     if (pattern_.c_str()[0] == '(' && pattern_.c_str()[1] == '*') {
 173       int kk, klen, kmat;
 174       for (;;) {   // Loop for any number of leading items
 175
 176         for (kk = 0; start_options[kk][0] != 0; kk++) {
 177           klen = strlen(start_options[kk]);
 178           kmat = strncmp(pattern_.c_str(), start_options[kk], klen);
 179           if (kmat >= 0) break;
 180         }
 181         if (kmat != 0) break;  // Not found
 182
 183         // If the item ended in "=" we must copy digits up to ")".
 184
 185         if (start_options[kk][klen-1] == '=') {
 186           while (isdigit(pattern_.c_str()[klen])) klen++;
 187           if (pattern_.c_str()[klen] != ')') break;  // Syntax error
 188           klen++;
 189         }
 190
 191         // Move the item from the pattern to the start of the wrapped string.
 192
 193         wrapped += pattern_.substr(0, klen);
 194         pattern_.erase(0, klen);
 195       }
 196     }
 197
 198     // Wrap the rest of the pattern.
 199
 200     wrapped += "(?:";  // A non-counting grouping operator
 201     wrapped += pattern_;
 202     wrapped += ")\\z";
 203     re = pcre_compile(wrapped.c_str(), pcre_options,
 204                       &compile_error, &eoffset, NULL);
 205   }
 206   if (re == NULL) {
 207     if (error_ == &empty_string) error_ = new string(compile_error);
 208   }
 209   return re;
 210 }
 211
 212 /***** Matching interfaces *****/
 213
 214 bool RE::FullMatch(const StringPiece& text,
 215                    const Arg& ptr1,
 216                    const Arg& ptr2,
 217                    const Arg& ptr3,
 218                    const Arg& ptr4,
 219                    const Arg& ptr5,
 220                    const Arg& ptr6,
 221                    const Arg& ptr7,
 222                    const Arg& ptr8,
 223                    const Arg& ptr9,
 224                    const Arg& ptr10,
 225                    const Arg& ptr11,
 226                    const Arg& ptr12,
 227                    const Arg& ptr13,
 228                    const Arg& ptr14,
 229                    const Arg& ptr15,
 230                    const Arg& ptr16) const {
 231   const Arg* args[kMaxArgs];
 232   int n = 0;
 233   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
 234   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
 235   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
 236   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
 237   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
 238   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
 239   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
 240   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
 241   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
 242   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
 243   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
 244   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
 245   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
 246   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
 247   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
 248   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
 249  done:
 250
 251   int consumed;
 252   int vec[kVecSize];
 253   return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
 254 }
 255
 256 bool RE::PartialMatch(const StringPiece& text,
 257                       const Arg& ptr1,
 258                       const Arg& ptr2,
 259                       const Arg& ptr3,
 260                       const Arg& ptr4,
 261                       const Arg& ptr5,
 262                       const Arg& ptr6,
 263                       const Arg& ptr7,
 264                       const Arg& ptr8,
 265                       const Arg& ptr9,
 266                       const Arg& ptr10,
 267                       const Arg& ptr11,
 268                       const Arg& ptr12,
 269                       const Arg& ptr13,
 270                       const Arg& ptr14,
 271                       const Arg& ptr15,
 272                       const Arg& ptr16) const {
 273   const Arg* args[kMaxArgs];
 274   int n = 0;
 275   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
 276   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
 277   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
 278   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
 279   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
 280   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
 281   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
 282   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
 283   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
 284   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
 285   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
 286   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
 287   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
 288   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
 289   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
 290   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
 291  done:
 292
 293   int consumed;
 294   int vec[kVecSize];
 295   return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
 296 }
 297
 298 bool RE::Consume(StringPiece* input,
 299                  const Arg& ptr1,
 300                  const Arg& ptr2,
 301                  const Arg& ptr3,
 302                  const Arg& ptr4,
 303                  const Arg& ptr5,
 304                  const Arg& ptr6,
 305                  const Arg& ptr7,
 306                  const Arg& ptr8,
 307                  const Arg& ptr9,
 308                  const Arg& ptr10,
 309                  const Arg& ptr11,
 310                  const Arg& ptr12,
 311                  const Arg& ptr13,
 312                  const Arg& ptr14,
 313                  const Arg& ptr15,
 314                  const Arg& ptr16) const {
 315   const Arg* args[kMaxArgs];
 316   int n = 0;
 317   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
 318   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
 319   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
 320   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
 321   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
 322   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
 323   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
 324   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
 325   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
 326   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
 327   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
 328   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
 329   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
 330   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
 331   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
 332   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
 333  done:
 334
 335   int consumed;
 336   int vec[kVecSize];
 337   if (DoMatchImpl(*input, ANCHOR_START, &consumed,
 338                   args, n, vec, kVecSize)) {
 339     input->remove_prefix(consumed);
 340     return true;
 341   } else {
 342     return false;
 343   }
 344 }
 345
 346 bool RE::FindAndConsume(StringPiece* input,
 347                         const Arg& ptr1,
 348                         const Arg& ptr2,
 349                         const Arg& ptr3,
 350                         const Arg& ptr4,
 351                         const Arg& ptr5,
 352                         const Arg& ptr6,
 353                         const Arg& ptr7,
 354                         const Arg& ptr8,
 355                         const Arg& ptr9,
 356                         const Arg& ptr10,
 357                         const Arg& ptr11,
 358                         const Arg& ptr12,
 359                         const Arg& ptr13,
 360                         const Arg& ptr14,
 361                         const Arg& ptr15,
 362                         const Arg& ptr16) const {
 363   const Arg* args[kMaxArgs];
 364   int n = 0;
 365   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
 366   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
 367   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
 368   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
 369   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
 370   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
 371   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
 372   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
 373   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
 374   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
 375   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
 376   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
 377   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
 378   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
 379   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
 380   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
 381  done:
 382
 383   int consumed;
 384   int vec[kVecSize];
 385   if (DoMatchImpl(*input, UNANCHORED, &consumed,
 386                   args, n, vec, kVecSize)) {
 387     input->remove_prefix(consumed);
 388     return true;
 389   } else {
 390     return false;
 391   }
 392 }
 393
 394 bool RE::Replace(const StringPiece& rewrite,
 395                  string *str) const {
 396   int vec[kVecSize];
 397   int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
 398   if (matches == 0)
 399     return false;
 400
 401   string s;
 402   if (!Rewrite(&s, rewrite, *str, vec, matches))
 403     return false;
 404
 405   assert(vec[0] >= 0);
 406   assert(vec[1] >= 0);
 407   str->replace(vec[0], vec[1] - vec[0], s);
 408   return true;
 409 }
 410
 411 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
 412 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
 413 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
 414
 415 static int NewlineMode(int pcre_options) {
 416   // TODO: if we can make it threadsafe, cache this var
 417   int newline_mode = 0;
 418   /* if (newline_mode) return newline_mode; */  // do this once it's cached
 419   if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
 420                       PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
 421     newline_mode = (pcre_options &
 422                     (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
 423                      PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
 424   } else {
 425     int newline;
 426     pcre_config(PCRE_CONFIG_NEWLINE, &newline);
 427     if (newline == 10)
 428       newline_mode = PCRE_NEWLINE_LF;
 429     else if (newline == 13)
 430       newline_mode = PCRE_NEWLINE_CR;
 431     else if (newline == 3338)
 432       newline_mode = PCRE_NEWLINE_CRLF;
 433     else if (newline == -1)
 434       newline_mode = PCRE_NEWLINE_ANY;
 435     else if (newline == -2)
 436       newline_mode = PCRE_NEWLINE_ANYCRLF;
 437     else
 438       assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
 439   }
 440   return newline_mode;
 441 }
 442
 443 int RE::GlobalReplace(const StringPiece& rewrite,
 444                       string *str) const {
 445   int count = 0;
 446   int vec[kVecSize];
 447   string out;
 448   int start = 0;
 449   bool last_match_was_empty_string = false;
 450
 451   while (start <= static_cast<int>(str->length())) {
 452     // If the previous match was for the empty string, we shouldn't
 453     // just match again: we'll match in the same way and get an
 454     // infinite loop.  Instead, we do the match in a special way:
 455     // anchored -- to force another try at the same position --
 456     // and with a flag saying that this time, ignore empty matches.
 457     // If this special match returns, that means there's a non-empty
 458     // match at this position as well, and we can continue.  If not,
 459     // we do what perl does, and just advance by one.
 460     // Notice that perl prints '@@@' for this;
 461     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
 462     int matches;
 463     if (last_match_was_empty_string) {
 464       matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
 465       if (matches <= 0) {
 466         int matchend = start + 1;     // advance one character.
 467         // If the current char is CR and we're in CRLF mode, skip LF too.
 468         // Note it's better to call pcre_fullinfo() than to examine
 469         // all_options(), since options_ could have changed bewteen
 470         // compile-time and now, but this is simpler and safe enough.
 471         // Modified by PH to add ANY and ANYCRLF.
 472         if (matchend < static_cast<int>(str->length()) &&
 473             (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
 474             (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
 475              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
 476              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
 477           matchend++;
 478         }
 479         // We also need to advance more than one char if we're in utf8 mode.
 480 #ifdef SUPPORT_UTF
 481         if (options_.utf8()) {
 482           while (matchend < static_cast<int>(str->length()) &&
 483                  ((*str)[matchend] & 0xc0) == 0x80)
 484             matchend++;
 485         }
 486 #endif
 487         if (start < static_cast<int>(str->length()))
 488           out.append(*str, start, matchend - start);
 489         start = matchend;
 490         last_match_was_empty_string = false;
 491         continue;
 492       }
 493     } else {
 494       matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
 495       if (matches <= 0)
 496         break;
 497     }
 498     int matchstart = vec[0], matchend = vec[1];
 499     assert(matchstart >= start);
 500     assert(matchend >= matchstart);
 501     out.append(*str, start, matchstart - start);
 502     Rewrite(&out, rewrite, *str, vec, matches);
 503     start = matchend;
 504     count++;
 505     last_match_was_empty_string = (matchstart == matchend);
 506   }
 507
 508   if (count == 0)
 509     return 0;
 510
 511   if (start < static_cast<int>(str->length()))
 512     out.append(*str, start, str->length() - start);
 513   swap(out, *str);
 514   return count;
 515 }
 516
 517 bool RE::Extract(const StringPiece& rewrite,
 518                  const StringPiece& text,
 519                  string *out) const {
 520   int vec[kVecSize];
 521   int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
 522   if (matches == 0)
 523     return false;
 524   out->erase();
 525   return Rewrite(out, rewrite, text, vec, matches);
 526 }
 527
 528 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
 529   string result;
 530
 531   // Escape any ascii character not in [A-Za-z_0-9].
 532   //
 533   // Note that it's legal to escape a character even if it has no
 534   // special meaning in a regular expression -- so this function does
 535   // that.  (This also makes it identical to the perl function of the
 536   // same name; see `perldoc -f quotemeta`.)  The one exception is
 537   // escaping NUL: rather than doing backslash + NUL, like perl does,
 538   // we do '\0', because pcre itself doesn't take embedded NUL chars.
 539   for (int ii = 0; ii < unquoted.size(); ++ii) {
 540     // Note that using 'isalnum' here raises the benchmark time from
 541     // 32ns to 58ns:
 542     if (unquoted[ii] == '\0') {
 543       result += "\\0";
 544     } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
 545                (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
 546                (unquoted[ii] < '0' || unquoted[ii] > '9') &&
 547                unquoted[ii] != '_' &&
 548                // If this is the part of a UTF8 or Latin1 character, we need
 549                // to copy this byte without escaping.  Experimentally this is
 550                // what works correctly with the regexp library.
 551                !(unquoted[ii] & 128)) {
 552       result += '\\';
 553       result += unquoted[ii];
 554     } else {
 555       result += unquoted[ii];
 556     }
 557   }
 558
 559   return result;
 560 }
 561
 562 /***** Actual matching and rewriting code *****/
 563
 564 int RE::TryMatch(const StringPiece& text,
 565                  int startpos,
 566                  Anchor anchor,
 567                  bool empty_ok,
 568                  int *vec,
 569                  int vecsize) const {
 570   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
 571   if (re == NULL) {
 572     //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
 573     return 0;
 574   }
 575
 576   pcre_extra extra = { 0, 0, 0, 0, 0, 0, 0, 0 };
 577   if (options_.match_limit() > 0) {
 578     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
 579     extra.match_limit = options_.match_limit();
 580   }
 581   if (options_.match_limit_recursion() > 0) {
 582     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 583     extra.match_limit_recursion = options_.match_limit_recursion();
 584   }
 585
 586   // int options = 0;
 587   // Changed by PH as a result of bugzilla #1288
 588   int options = (options_.all_options() & PCRE_NO_UTF8_CHECK);
 589
 590   if (anchor != UNANCHORED)
 591     options |= PCRE_ANCHORED;
 592   if (!empty_ok)
 593     options |= PCRE_NOTEMPTY;
 594
 595   int rc = pcre_exec(re,              // The regular expression object
 596                      &extra,
 597                      (text.data() == NULL) ? "" : text.data(),
 598                      text.size(),
 599                      startpos,
 600                      options,
 601                      vec,
 602                      vecsize);
 603
 604   // Handle errors
 605   if (rc == PCRE_ERROR_NOMATCH) {
 606     return 0;
 607   } else if (rc < 0) {
 608     //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
 609     //        re, pattern_.c_str());
 610     return 0;
 611   } else if (rc == 0) {
 612     // pcre_exec() returns 0 as a special case when the number of
 613     // capturing subpatterns exceeds the size of the vector.
 614     // When this happens, there is a match and the output vector
 615     // is filled, but we miss out on the positions of the extra subpatterns.
 616     rc = vecsize / 2;
 617   }
 618
 619   return rc;
 620 }
 621
 622 bool RE::DoMatchImpl(const StringPiece& text,
 623                      Anchor anchor,
 624                      int* consumed,
 625                      const Arg* const* args,
 626                      int n,
 627                      int* vec,
 628                      int vecsize) const {
 629   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
 630   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
 631   assert(matches >= 0);  // TryMatch never returns negatives
 632   if (matches == 0)
 633     return false;
 634
 635   *consumed = vec[1];
 636
 637   if (n == 0 || args == NULL) {
 638     // We are not interested in results
 639     return true;
 640   }
 641
 642   if (NumberOfCapturingGroups() < n) {
 643     // RE has fewer capturing groups than number of arg pointers passed in
 644     return false;
 645   }
 646
 647   // If we got here, we must have matched the whole pattern.
 648   // We do not need (can not do) any more checks on the value of 'matches' here
 649   // -- see the comment for TryMatch.
 650   for (int i = 0; i < n; i++) {
 651     const int start = vec[2*(i+1)];
 652     const int limit = vec[2*(i+1)+1];
 653     if (!args[i]->Parse(text.data() + start, limit-start)) {
 654       // TODO: Should we indicate what the error was?
 655       return false;
 656     }
 657   }
 658
 659   return true;
 660 }
 661
 662 bool RE::DoMatch(const StringPiece& text,
 663                  Anchor anchor,
 664                  int* consumed,
 665                  const Arg* const args[],
 666                  int n) const {
 667   assert(n >= 0);
 668   size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
 669                                        // (as for kVecSize)
 670   int space[21];   // use stack allocation for small vecsize (common case)
 671   int* vec = vecsize <= 21 ? space : new int[vecsize];
 672   bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
 673   if (vec != space) delete [] vec;
 674   return retval;
 675 }
 676
 677 bool RE::Rewrite(string *out, const StringPiece &rewrite,
 678                  const StringPiece &text, int *vec, int veclen) const {
 679   for (const char *s = rewrite.data(), *end = s + rewrite.size();
 680        s < end; s++) {
 681     int c = *s;
 682     if (c == '\\') {
 683       c = *++s;
 684       if (isdigit(c)) {
 685         int n = (c - '0');
 686         if (n >= veclen) {
 687           //fprintf(stderr, requested group %d in regexp %.*s\n",
 688           //        n, rewrite.size(), rewrite.data());
 689           return false;
 690         }
 691         int start = vec[2 * n];
 692         if (start >= 0)
 693           out->append(text.data() + start, vec[2 * n + 1] - start);
 694       } else if (c == '\\') {
 695         *out += '\\';
 696       } else {
 697         //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
 698         //        rewrite.size(), rewrite.data());
 699         return false;
 700       }
 701     } else {
 702       *out += c;
 703     }
 704   }
 705   return true;
 706 }
 707
 708 // Return the number of capturing subpatterns, or -1 if the
 709 // regexp wasn't valid on construction.
 710 int RE::NumberOfCapturingGroups() const {
 711   if (re_partial_ == NULL) return -1;
 712
 713   int result;
 714   int pcre_retval = pcre_fullinfo(re_partial_,  // The regular expression object
 715                                   NULL,         // We did not study the pattern
 716                                   PCRE_INFO_CAPTURECOUNT,
 717                                   &result);
 718   assert(pcre_retval == 0);
 719   return result;
 720 }
 721
 722 /***** Parsers for various types *****/
 723
 724 bool Arg::parse_null(const char* str, int n, void* dest) {
 725   (void)str;
 726   (void)n;
 727   // We fail if somebody asked us to store into a non-NULL void* pointer
 728   return (dest == NULL);
 729 }
 730
 731 bool Arg::parse_string(const char* str, int n, void* dest) {
 732   if (dest == NULL) return true;
 733   reinterpret_cast<string*>(dest)->assign(str, n);
 734   return true;
 735 }
 736
 737 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
 738   if (dest == NULL) return true;
 739   reinterpret_cast<StringPiece*>(dest)->set(str, n);
 740   return true;
 741 }
 742
 743 bool Arg::parse_char(const char* str, int n, void* dest) {
 744   if (n != 1) return false;
 745   if (dest == NULL) return true;
 746   *(reinterpret_cast<char*>(dest)) = str[0];
 747   return true;
 748 }
 749
 750 bool Arg::parse_uchar(const char* str, int n, void* dest) {
 751   if (n != 1) return false;
 752   if (dest == NULL) return true;
 753   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
 754   return true;
 755 }
 756
 757 // Largest number spec that we are willing to parse
 758 static const int kMaxNumberLength = 32;
 759
 760 // REQUIRES "buf" must have length at least kMaxNumberLength+1
 761 // REQUIRES "n > 0"
 762 // Copies "str" into "buf" and null-terminates if necessary.
 763 // Returns one of:
 764 //      a. "str" if no termination is needed
 765 //      b. "buf" if the string was copied and null-terminated
 766 //      c. "" if the input was invalid and has no hope of being parsed
 767 static const char* TerminateNumber(char* buf, const char* str, int n) {
 768   if ((n > 0) && isspace(*str)) {
 769     // We are less forgiving than the strtoxxx() routines and do not
 770     // allow leading spaces.
 771     return "";
 772   }
 773
 774   // See if the character right after the input text may potentially
 775   // look like a digit.
 776   if (isdigit(str[n]) ||
 777       ((str[n] >= 'a') && (str[n] <= 'f')) ||
 778       ((str[n] >= 'A') && (str[n] <= 'F'))) {
 779     if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
 780     memcpy(buf, str, n);
 781     buf[n] = '\0';
 782     return buf;
 783   } else {
 784     // We can parse right out of the supplied string, so return it.
 785     return str;
 786   }
 787 }
 788
 789 bool Arg::parse_long_radix(const char* str,
 790                            int n,
 791                            void* dest,
 792                            int radix) {
 793   if (n == 0) return false;
 794   char buf[kMaxNumberLength+1];
 795   str = TerminateNumber(buf, str, n);
 796   char* end;
 797   errno = 0;
 798   long r = strtol(str, &end, radix);
 799   if (end != str + n) return false;   // Leftover junk
 800   if (errno) return false;
 801   if (dest == NULL) return true;
 802   *(reinterpret_cast<long*>(dest)) = r;
 803   return true;
 804 }
 805
 806 bool Arg::parse_ulong_radix(const char* str,
 807                             int n,
 808                             void* dest,
 809                             int radix) {
 810   if (n == 0) return false;
 811   char buf[kMaxNumberLength+1];
 812   str = TerminateNumber(buf, str, n);
 813   if (str[0] == '-') return false;    // strtoul() on a negative number?!
 814   char* end;
 815   errno = 0;
 816   unsigned long r = strtoul(str, &end, radix);
 817   if (end != str + n) return false;   // Leftover junk
 818   if (errno) return false;
 819   if (dest == NULL) return true;
 820   *(reinterpret_cast<unsigned long*>(dest)) = r;
 821   return true;
 822 }
 823
 824 bool Arg::parse_short_radix(const char* str,
 825                             int n,
 826                             void* dest,
 827                             int radix) {
 828   long r;
 829   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
 830   if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
 831   if (dest == NULL) return true;
 832   *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
 833   return true;
 834 }
 835
 836 bool Arg::parse_ushort_radix(const char* str,
 837                              int n,
 838                              void* dest,
 839                              int radix) {
 840   unsigned long r;
 841   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
 842   if (r > USHRT_MAX) return false;                      // Out of range
 843   if (dest == NULL) return true;
 844   *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
 845   return true;
 846 }
 847
 848 bool Arg::parse_int_radix(const char* str,
 849                           int n,
 850                           void* dest,
 851                           int radix) {
 852   long r;
 853   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
 854   if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
 855   if (dest == NULL) return true;
 856   *(reinterpret_cast<int*>(dest)) = r;
 857   return true;
 858 }
 859
 860 bool Arg::parse_uint_radix(const char* str,
 861                            int n,
 862                            void* dest,
 863                            int radix) {
 864   unsigned long r;
 865   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
 866   if (r > UINT_MAX) return false;                       // Out of range
 867   if (dest == NULL) return true;
 868   *(reinterpret_cast<unsigned int*>(dest)) = r;
 869   return true;
 870 }
 871
 872 bool Arg::parse_longlong_radix(const char* str,
 873                                int n,
 874                                void* dest,
 875                                int radix) {
 876 #ifndef HAVE_LONG_LONG
 877   return false;
 878 #else
 879   if (n == 0) return false;
 880   char buf[kMaxNumberLength+1];
 881   str = TerminateNumber(buf, str, n);
 882   char* end;
 883   errno = 0;
 884 #if defined HAVE_STRTOQ
 885   long long r = strtoq(str, &end, radix);
 886 #elif defined HAVE_STRTOLL
 887   long long r = strtoll(str, &end, radix);
 888 #elif defined HAVE__STRTOI64
 889   long long r = _strtoi64(str, &end, radix);
 890 #elif defined HAVE_STRTOIMAX
 891   long long r = strtoimax(str, &end, radix);
 892 #else
 893 #error parse_longlong_radix: cannot convert input to a long-long
 894 #endif
 895   if (end != str + n) return false;   // Leftover junk
 896   if (errno) return false;
 897   if (dest == NULL) return true;
 898   *(reinterpret_cast<long long*>(dest)) = r;
 899   return true;
 900 #endif   /* HAVE_LONG_LONG */
 901 }
 902
 903 bool Arg::parse_ulonglong_radix(const char* str,
 904                                 int n,
 905                                 void* dest,
 906                                 int radix) {
 907 #ifndef HAVE_UNSIGNED_LONG_LONG
 908   return false;
 909 #else
 910   if (n == 0) return false;
 911   char buf[kMaxNumberLength+1];
 912   str = TerminateNumber(buf, str, n);
 913   if (str[0] == '-') return false;    // strtoull() on a negative number?!
 914   char* end;
 915   errno = 0;
 916 #if defined HAVE_STRTOQ
 917   unsigned long long r = strtouq(str, &end, radix);
 918 #elif defined HAVE_STRTOLL
 919   unsigned long long r = strtoull(str, &end, radix);
 920 #elif defined HAVE__STRTOI64
 921   unsigned long long r = _strtoui64(str, &end, radix);
 922 #elif defined HAVE_STRTOIMAX
 923   unsigned long long r = strtoumax(str, &end, radix);
 924 #else
 925 #error parse_ulonglong_radix: cannot convert input to a long-long
 926 #endif
 927   if (end != str + n) return false;   // Leftover junk
 928   if (errno) return false;
 929   if (dest == NULL) return true;
 930   *(reinterpret_cast<unsigned long long*>(dest)) = r;
 931   return true;
 932 #endif   /* HAVE_UNSIGNED_LONG_LONG */
 933 }
 934
 935 bool Arg::parse_double(const char* str, int n, void* dest) {
 936   if (n == 0) return false;
 937   static const int kMaxLength = 200;
 938   char buf[kMaxLength];
 939   if (n >= kMaxLength) return false;
 940   memcpy(buf, str, n);
 941   buf[n] = '\0';
 942   errno = 0;
 943   char* end;
 944   double r = strtod(buf, &end);
 945   if (end != buf + n) return false;   // Leftover junk
 946   if (errno) return false;
 947   if (dest == NULL) return true;
 948   *(reinterpret_cast<double*>(dest)) = r;
 949   return true;
 950 }
 951
 952 bool Arg::parse_float(const char* str, int n, void* dest) {
 953   double r;
 954   if (!parse_double(str, n, &r)) return false;
 955   if (dest == NULL) return true;
 956   *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
 957   return true;
 958 }
 959
 960
 961 #define DEFINE_INTEGER_PARSERS(name)                                    \
 962   bool Arg::parse_##name(const char* str, int n, void* dest) {          \
 963     return parse_##name##_radix(str, n, dest, 10);                      \
 964   }                                                                     \
 965   bool Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
 966     return parse_##name##_radix(str, n, dest, 16);                      \
 967   }                                                                     \
 968   bool Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
 969     return parse_##name##_radix(str, n, dest, 8);                       \
 970   }                                                                     \
 971   bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
 972     return parse_##name##_radix(str, n, dest, 0);                       \
 973   }
 974
 975 DEFINE_INTEGER_PARSERS(short)      /*                                   */
 976 DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
 977 DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
 978 DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
 979 DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
 980 DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
 981 DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
 982 DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
 983
 984 #undef DEFINE_INTEGER_PARSERS
 985
 986 }   // namespace pcrecpp