hphp/runtime/base/preg.cpp

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   6    +----------------------------------------------------------------------+
   7    | This source file is subject to version 3.01 of the PHP license,      |
   8    | that is bundled with this package in the file LICENSE, and is        |
   9    | available through the world-wide-web at the following url:           |
  10    | http://www.php.net/license/3_01.txt                                  |
  11    | If you did not receive a copy of the PHP license and are unable to   |
  12    | obtain it through the world-wide-web, please send a note to          |
  13    | license@php.net so we can mail you a copy immediately.               |
  14    +----------------------------------------------------------------------+
  15 */
  16
  17 #include "hphp/runtime/base/preg.h"
  18
  19 #include <atomic>
  20 #include <fstream>
  21 #include <mutex>
  22 #include <pcre.h>
  23 #include <onigposix.h>
  24 #include <utility>
  25
  26 #include <folly/AtomicHashArray.h>
  27
  28 #include "hphp/runtime/base/array-init.h"
  29 #include "hphp/runtime/base/array-iterator.h"
  30 #include "hphp/runtime/base/builtin-functions.h"
  31 #include "hphp/runtime/base/container-functions.h"
  32 #include "hphp/runtime/base/execution-context.h"
  33 #include "hphp/runtime/base/ini-setting.h"
  34 #include "hphp/runtime/base/init-fini-node.h"
  35 #include "hphp/runtime/base/runtime-option.h"
  36 #include "hphp/runtime/base/string-util.h"
  37 #include "hphp/runtime/base/tv-uncounted.h"
  38 #include "hphp/runtime/base/zend-functions.h"
  39 #include "hphp/runtime/vm/debug/debug.h"
  40 #include "hphp/runtime/vm/treadmill.h"
  41 #include "hphp/runtime/vm/vm-regs.h"
  42
  43 #include "hphp/runtime/ext/std/ext_std_function.h"
  44 #include "hphp/runtime/ext/string/ext_string.h"
  45
  46 #include "hphp/runtime/vm/jit/mcgen.h"
  47 #include "hphp/runtime/vm/jit/types.h"
  48 #include "hphp/runtime/vm/jit/vtune-jit.h"
  49
  50 #include "hphp/util/logger.h"
  51 #include "hphp/util/concurrent-scalable-cache.h"
  52
  53 #include <folly/FileUtil.h>
  54 #include <folly/json.h>
  55
  56 /* Only defined in pcre >= 8.32 */
  57 #ifndef PCRE_STUDY_JIT_COMPILE
  58 # define PCRE_STUDY_JIT_COMPILE 0
  59 #endif
  60
  61 namespace HPHP {
  62
// Select the "preg" trace module for this file; presumably this is what the
// ITRACE call in the literal-pattern constructor reports under (TODO confirm
// against the trace macros).
TRACE_SET_MOD(preg);

// Shorthand for the JIT code-address type, used below when computing the
// address range of PCRE-JIT-compiled code.
using jit::TCA;
  66
  67 ///////////////////////////////////////////////////////////////////////////////
  68 // PCREglobals definition
  69
  70 PCREglobals::PCREglobals() {
  71   jit_stack = pcre_jit_stack_alloc(32768, 524288);
  72   // Set these to handle uses of pcre prior to PcreExtension::threadInit
  73   // In particular, for matching tier overrides during RuntimeOption::Load
  74   preg_backtrace_limit = RuntimeOption::PregBacktraceLimit;
  75   preg_recursion_limit = RuntimeOption::PregRecursionLimit;
  76 }
  77
// Releases the JIT stack allocated by the constructor.
PCREglobals::~PCREglobals() {
  pcre_jit_stack_free(jit_stack);
}
  81
  82 ///////////////////////////////////////////////////////////////////////////////
  83 // PCRECache definition
  84
// Cache of pcre_cache_entry objects keyed by the regex string.  Exactly one
// of three backing stores is in use at a time, chosen through reinit(): a
// folly::AtomicHashArray (Static), a ConcurrentLRUCache (Lru) or a
// ConcurrentScalableCache (Scalable).
struct PCRECache {
  // Shared-ownership handle to an entry, used by the Lru/Scalable stores and
  // by callers that end up owning an entry themselves.
  typedef std::shared_ptr<const pcre_cache_entry> EntryPtr;
  // Lazily created key for the Lru/Scalable stores; find() and insert()
  // build it on first use so one lookup+insert sequence shares a single key.
  typedef std::unique_ptr<LRUCacheKey> TempKeyCache;

  // Which backing store is active.
  enum class CacheKind {
    Static,
    Lru,
    Scalable
  };

private:
  // Key equality for the static table.
  struct ahm_string_data_same {
    bool operator()(const StringData* s1, const StringData* s2) {
      // ahm uses -1, -2, -3 as magic values, so only a positive pointer
      // value is treated as a real key that may be dereferenced.
      return int64_t(s1) > 0 && (s1 == s2 || s1->same(s2));
    }
  };

  typedef folly::AtomicHashArray<StringData*, const pcre_cache_entry*,
          string_data_hash, ahm_string_data_same> StaticCache;
  typedef ConcurrentLRUCache<LRUCacheKey, EntryPtr,
          LRUCacheKey::HashCompare> LRUCache;
  typedef ConcurrentScalableCache<LRUCacheKey, EntryPtr,
          LRUCacheKey::HashCompare> ScalableCache;
  typedef StaticCache::value_type StaticCachePair;

public:
  // Handle through which callers reach a cache entry.  It is a hand-managed
  // tagged union: m_kind records which member of m_u is live, and each
  // transition destroys the old member / placement-constructs the new one.
  struct Accessor {
    Accessor()
      : m_kind(Kind::Empty)
    {}

    // Destroys whichever non-trivial union member is currently live.
    ~Accessor() {
      switch (m_kind) {
        case Kind::Empty:
        case Kind::Ptr:
          break;
        case Kind::SmartPtr:
          m_u.smart_ptr.~EntryPtr();
          break;
        case Kind::AccessorKind:
          m_u.accessor.~ConstAccessor();
          break;
      }
    }

    // Store a raw pointer to an entry.  Only permitted while the accessor is
    // empty or already holds a raw pointer, so nothing needs destroying.
    Accessor& operator=(const pcre_cache_entry* ptr) {
      assertx(m_kind == Kind::Empty || m_kind == Kind::Ptr);
      m_kind = Kind::Ptr;
      m_u.ptr = ptr;
      return *this;
    }

    // Take over a shared pointer to an entry, from any prior state.
    Accessor& operator=(EntryPtr&& ep) {
      switch (m_kind) {
        case Kind::AccessorKind:
          m_u.accessor.~ConstAccessor();
          // intentional fall through: the slot is now vacant
        case Kind::Empty:
        case Kind::Ptr:
          m_kind = Kind::SmartPtr;
          new (&m_u.smart_ptr) EntryPtr(std::move(ep));
          break;
        case Kind::SmartPtr:
          m_u.smart_ptr = std::move(ep);
          break;
      }
      return *this;
    }

    // No assignment from LRUCache::ConstAccessor since it is non-copyable
    // Use resetToLRU instead
    //
    // Ensures the accessor holds an LRUCache::ConstAccessor and returns a
    // reference to it.  If one is already live it is returned unchanged.
    LRUCache::ConstAccessor& resetToLRU() {
      switch (m_kind) {
        case Kind::SmartPtr:
          m_u.smart_ptr.~EntryPtr();
          // intentional fall through: the slot is now vacant
        case Kind::Empty:
        case Kind::Ptr:
          m_kind = Kind::AccessorKind;
          new (&m_u.accessor) LRUCache::ConstAccessor();
          break;
        case Kind::AccessorKind:
          break;
      }
      return m_u.accessor;
    }

    // Returns the entry currently referenced, or nullptr when empty.
    const pcre_cache_entry* get() {
      switch (m_kind) {
        case Kind::Empty:    return nullptr;
        case Kind::Ptr:      return m_u.ptr;
        case Kind::SmartPtr: return m_u.smart_ptr.get();
        case Kind::AccessorKind: return m_u.accessor->get();
      }
      always_assert(false);
    }

    // Returns the held shared pointer; only valid in the SmartPtr state.
    const EntryPtr& entryPtr() const {
      assertx(m_kind == Kind::SmartPtr);
      return m_u.smart_ptr;
    }

   private:
    // Discriminator for the union below.
    enum class Kind : uint8_t {
      Empty,
      Ptr,
      SmartPtr,
      AccessorKind,
    };

    // Storage for the three possible payloads.  The constructor and
    // destructor are deliberately empty; Accessor manages member lifetimes.
    union Ptr {
       Ptr() {}
      ~Ptr() {}

      const pcre_cache_entry* ptr;
      EntryPtr smart_ptr;
      LRUCache::ConstAccessor accessor;
    };

    Ptr m_u;
    Kind m_kind;
  };

  // Starts out as a Static cache; reinit() performs the allocation.
  PCRECache()
    : m_kind(CacheKind::Static), m_staticCache(nullptr)
  {
    reinit(CacheKind::Static);
  }

  // Only the static table needs explicit teardown here; the other two
  // stores are held in unique_ptr members.
  ~PCRECache() {
    if (m_kind == CacheKind::Static && m_staticCache.load()) {
      DestroyStatic(m_staticCache);
    }
  }

  // Discards the current store and creates a fresh one of the given kind.
  void reinit(CacheKind kind);
  // Looks up `key`; on a hit `accessor` refers to the entry and true is
  // returned.
  bool find(Accessor& accessor, const StringData* key,
            TempKeyCache& keyCache);
  // Adds `ent` under `regex` and points `accessor` at it.
  void insert(Accessor& accessor, StringData* regex,
              TempKeyCache& keyCache, const pcre_cache_entry* ent);
  // Writes every cached key to `file`, one per line.
  void dump(folly::File& file);
  // Size as reported by the active store.
  size_t size() const;

private:
  // Swaps in an empty static table and schedules the old one for deletion.
  void clearStatic();

  static void DestroyStatic(StaticCache* cache);
  static StaticCache* CreateStatic();

  CacheKind m_kind;
  std::atomic<StaticCache*> m_staticCache;
  std::unique_ptr<LRUCache> m_lruCache;
  std::unique_ptr<ScalableCache> m_scalableCache;
  // Time after which insert() clears the static table.
  std::atomic<time_t> m_expire{};
  // Try-locked by clearStatic() so only one caller performs the clear.
  std::mutex m_clearMutex;
};
 240
 241 ///////////////////////////////////////////////////////////////////////////////
 242 // Data
 243
// RDS_LOCAL storage for the PCRE JIT stack and match limits that
// alloc_jit_stack() and init_local_extra() read.
RDS_LOCAL(PCREglobals, tl_pcre_globals);

// The cache of compiled patterns used by the functions in this file.
static PCRECache s_pcreCache;

// The last pcre error code is available for the whole thread.
static RDS_LOCAL(int, rl_last_error_code);
 250
 251 ///////////////////////////////////////////////////////////////////////////////
 252 // pcre_cache_entry implementation
 253
 254 pcre_cache_entry::~pcre_cache_entry() {
 255   if (extra) {
 256 #if PCRE_MAJOR < 8 || (PCRE_MAJOR == 8 && PCRE_MINOR < 20)
 257     free(extra);
 258 #else
 259     pcre_free_study(extra);
 260 #endif
 261   }
 262   free(subpat_names);
 263   pcre_free(re);
 264 }
 265
 266 bool literalOptions(int options) {
 267   constexpr int mask =
 268     PCRE_ANCHORED | PCRE_CASELESS |
 269     PCRE_DOLLAR_ENDONLY | PCRE_NOTEMPTY;
 270   return !(options & ~mask);
 271 }
 272
 273 pcre_literal_data::pcre_literal_data(const char* pattern, int coptions) {
 274   if (!literalOptions(coptions)) return;
 275
 276   auto p = pattern;
 277   options = coptions;
 278
 279   if (*p == '^') {
 280     match_start_of_line = true;
 281     p++;
 282   }
 283
 284   std::string pattern_buffer;
 285   while (isalnum((unsigned char)*p) || (*p && strchr("/\\ :-_", *p))) {
 286     // backslash + alphanumeric character --> not a literal (i.e. \d).
 287     // backslash + non-alphanumeric character --> literal symbol (i.e. \.)
 288     if (*p == '\\') {
 289       if (!p[1] || isalnum((unsigned char)p[1])) {
 290         break;
 291       } else {
 292         p++;
 293       }
 294     }
 295     pattern_buffer += *p++;
 296   }
 297   if (*p == '$') {
 298     options |= PCRE_DOLLAR_ENDONLY;
 299     p++;
 300   }
 301   if (!*p) {
 302     /* This is an encoding of a literal string. */
 303      ITRACE(2, "Literal pattern: {}\n", pattern_buffer);
 304     literal_str = std::move(pattern_buffer);
 305   }
 306 }
 307
// True when the constructor recognised the pattern as a literal string,
// i.e. literal_str has been populated.
bool pcre_literal_data::isLiteral() const {
  return literal_str.has_value();
}
 311
 312 bool pcre_literal_data::matches(const StringData* subject,
 313                                 int pos,
 314                                 int* offsets,
 315                                 int extra_options) const {
 316   assertx(isLiteral() && literalOptions(extra_options));
 317   assertx(pos >= 0);
 318
 319   // Subject must be at least as long as the literal pattern
 320   // for a match to occur.
 321   if (subject->size() < literal_str->length() + pos) {
 322     return false;
 323   }
 324
 325   size_t literal_strlen = literal_str->length();
 326   auto const g_empty = (options | extra_options) & PCRE_NOTEMPTY;
 327   if (g_empty && !literal_strlen) return false;
 328   auto const subject_c = subject->data();
 329   auto const literal_c = literal_str->c_str();
 330
 331   // Compare the literal pattern at an offset of the subject.
 332   auto const subject_substr = subject_c + pos;
 333
 334   auto const match_start = [&]() {
 335     if (match_end() && (subject->size() - pos) != literal_strlen) {
 336       return false;
 337     }
 338     // If only matching the start (^), compare the strings
 339     // for the length of the literal pattern.
 340     if (case_insensitive() ?
 341         bstrcaseeq(subject_substr, literal_c, literal_strlen) :
 342         memcmp(subject_substr, literal_c, literal_strlen) == 0) {
 343       offsets[0] = pos * sizeof(char);
 344       offsets[1] = offsets[0] + literal_strlen * sizeof(char);
 345       return true;
 346     }
 347     return false;
 348   };
 349
 350   if (match_start_of_line) {
 351     return !pos && match_start();
 352   } else if (match_start_of_string()) {
 353     return match_start();
 354   } else if (match_end()) {
 355     // Compare the literal pattern against the tail end of the subject.
 356     auto const subject_tail = subject_c + (subject->size() - literal_strlen);
 357     if (case_insensitive() ?
 358         bstrcaseeq(subject_tail, literal_c, literal_strlen) :
 359         memcmp(subject_tail, literal_c, literal_strlen) == 0) {
 360       offsets[0] = (subject->size() - literal_strlen) * sizeof(char);
 361       offsets[1] = subject->size() * sizeof(char);
 362       return true;
 363     }
 364   } else {
 365     if (!literal_strlen) {
 366       offsets[0] = offsets[1] = pos;
 367       return true;
 368     }
 369     // Check if the literal pattern occurs as a substring of the subject.
 370     auto const subject_str = StrNR(subject);
 371     auto const find_response = subject_str.asString().find(
 372       *literal_str, pos, !case_insensitive());
 373     if (find_response >= 0) {
 374       offsets[0] = find_response * sizeof(char);
 375       offsets[1] = offsets[0] + literal_strlen * sizeof(char);
 376       return true;
 377     }
 378   }
 379   return false;
 380 }
 381
 382 ///////////////////////////////////////////////////////////////////////////////
 383 // PCRECache implementation
 384
 385 PCRECache::StaticCache* PCRECache::CreateStatic() {
 386   StaticCache::Config config;
 387   config.maxLoadFactor = 0.5;
 388   return StaticCache::create(
 389       RuntimeOption::EvalPCRETableSize, config).release();
 390 }
 391
 392 void PCRECache::DestroyStatic(StaticCache* cache) {
 393   // We delete uncounted keys while iterating the cache, which is OK for
 394   // AtomicHashArray, but not OK for other containers, such as
 395   // std::unordered_map.  If you change the cache type make sure that property
 396   // holds or fix this function.
 397   static_assert(std::is_same<PCRECache::StaticCache,
 398       folly::AtomicHashArray<StringData*, const pcre_cache_entry*,
 399                              string_data_hash, ahm_string_data_same>>::value,
 400       "StaticCache must be an AtomicHashArray or this destructor is wrong.");
 401   for (auto& it : *cache) {
 402     DecRefUncountedString(it.first);
 403     delete it.second;
 404   }
 405   StaticCache::destroy(cache);
 406 }
 407
 408 void PCRECache::reinit(CacheKind kind) {
 409   switch (m_kind) {
 410     case CacheKind::Static:
 411       if (m_staticCache.load()) {
 412         DestroyStatic(m_staticCache);
 413         m_staticCache = nullptr;
 414       }
 415       break;
 416     case CacheKind::Lru:
 417       m_lruCache.reset();
 418       break;
 419     case CacheKind::Scalable:
 420       m_scalableCache.reset();
 421       break;
 422   }
 423   m_kind = kind;
 424
 425   switch (kind) {
 426     case CacheKind::Static:
 427       m_staticCache = CreateStatic();
 428       m_expire = time(nullptr) + RuntimeOption::EvalPCREExpireInterval;
 429       break;
 430     case CacheKind::Lru:
 431       m_lruCache.reset(new LRUCache(RuntimeOption::EvalPCRETableSize));
 432       break;
 433     case CacheKind::Scalable:
 434       m_scalableCache.reset(
 435         new ScalableCache(RuntimeOption::EvalPCRETableSize));
 436       break;
 437   }
 438 }
 439
 440 bool PCRECache::find(Accessor& accessor,
 441                      const StringData* regex,
 442                      TempKeyCache& keyCache)
 443 {
 444   switch (m_kind) {
 445     case CacheKind::Static:
 446       {
 447         assertx(m_staticCache.load());
 448         StaticCache::iterator it;
 449         auto cache = m_staticCache.load(std::memory_order_acquire);
 450         if ((it = cache->find(regex)) != cache->end()) {
 451           accessor = it->second;
 452           return true;
 453         }
 454         return false;
 455       }
 456     case CacheKind::Lru:
 457     case CacheKind::Scalable:
 458       {
 459         if (!keyCache) {
 460           keyCache.reset(new LRUCacheKey(regex->data(), regex->size()));
 461         }
 462         bool found;
 463         if (m_kind == CacheKind::Lru) {
 464           found = m_lruCache->find(accessor.resetToLRU(), *keyCache);
 465         } else {
 466           found = m_scalableCache->find(accessor.resetToLRU(), *keyCache);
 467         }
 468         return found;
 469       }
 470   }
 471   always_assert(false);
 472 }
 473
 474 void PCRECache::clearStatic() {
 475   std::unique_lock<std::mutex> lock(m_clearMutex, std::try_to_lock);
 476   if (!lock) return;
 477
 478   auto newExpire = time(nullptr) + RuntimeOption::EvalPCREExpireInterval;
 479   m_expire.store(newExpire, std::memory_order_relaxed);
 480
 481   auto tmpMap = CreateStatic();
 482   tmpMap = m_staticCache.exchange(tmpMap, std::memory_order_acq_rel);
 483
 484   Treadmill::enqueue([tmpMap]() {
 485       DestroyStatic(tmpMap);
 486    });
 487 }
 488
 489 void PCRECache::insert(
 490   Accessor& accessor,
 491   StringData* regex,
 492   TempKeyCache& keyCache,
 493   const pcre_cache_entry* ent
 494 ) {
 495   switch (m_kind) {
 496     case CacheKind::Static:
 497       {
 498         assertx(m_staticCache.load());
 499         // Clear the cache if we haven't refreshed it in a while
 500         if (time(nullptr) > m_expire) {
 501           clearStatic();
 502         }
 503         auto const cache = m_staticCache.load(std::memory_order_acquire);
 504         auto const key = !regex->persistentIncRef()
 505           ? StringData::MakeUncounted(regex->slice())
 506           : regex;
 507         auto pair = cache->insert(StaticCachePair(key, ent));
 508         if (pair.second) {
 509           // Inserted, container owns the pointer
 510           accessor = ent;
 511         } else {
 512           // Not inserted, caller needs to own the pointer
 513           DecRefUncountedString(key);
 514           accessor = EntryPtr(ent);
 515         }
 516       }
 517       break;
 518     case CacheKind::Lru:
 519     case CacheKind::Scalable:
 520       {
 521         if (!keyCache) {
 522           keyCache.reset(new LRUCacheKey(regex->data(), regex->size()));
 523         }
 524         // Pointer ownership is shared between container and caller
 525         accessor = EntryPtr(ent);
 526         if (m_kind == CacheKind::Lru) {
 527           m_lruCache->insert(*keyCache, accessor.entryPtr());
 528         } else {
 529           m_scalableCache->insert(*keyCache, accessor.entryPtr());
 530         }
 531       }
 532       break;
 533   }
 534 }
 535
 536 void PCRECache::dump(folly::File& file) {
 537   switch (m_kind) {
 538     case CacheKind::Static:
 539       for (auto& it : *m_staticCache) {
 540         folly::writeFull(file.fd(), it.first->data(), it.first->size());
 541         folly::writeFull(file.fd(), "\n", 1);
 542       }
 543       break;
 544     case CacheKind::Lru:
 545     case CacheKind::Scalable:
 546       {
 547         std::vector<LRUCacheKey> keys;
 548         if (m_kind == CacheKind::Lru) {
 549           m_lruCache->snapshotKeys(keys);
 550         } else {
 551           m_scalableCache->snapshotKeys(keys);
 552         }
 553         for (auto& key: keys) {
 554           folly::writeFull(file.fd(), key.data(), key.size());
 555           folly::writeFull(file.fd(), "\n", 1);
 556         }
 557       }
 558       break;
 559   }
 560 }
 561
 562 size_t PCRECache::size() const {
 563   switch (m_kind) {
 564     case CacheKind::Static:
 565       return m_staticCache.load(std::memory_order_acquire)->size();
 566     case CacheKind::Lru:
 567       return m_lruCache->size();
 568     case CacheKind::Scalable:
 569      return m_scalableCache->size();
 570   }
 571   always_assert(false);
 572 }
 573
 574 ///////////////////////////////////////////////////////////////////////////////
 575 // Public interface and helper functions
 576
 577 void pcre_reinit() {
 578   PCRECache::CacheKind kind;
 579   if (RuntimeOption::EvalPCRECacheType == "static") {
 580     kind = PCRECache::CacheKind::Static;
 581   } else if (RuntimeOption::EvalPCRECacheType == "lru") {
 582     kind = PCRECache::CacheKind::Lru;
 583   } else if (RuntimeOption::EvalPCRECacheType == "scalable") {
 584     kind = PCRECache::CacheKind::Scalable;
 585   } else {
 586     Logger::Warning("Eval.PCRECacheType should be either static, "
 587                     "lru or scalable");
 588     kind = PCRECache::CacheKind::Scalable;
 589   }
 590   s_pcreCache.reinit(kind);
 591 }
 592
// Intentionally empty: there is no work to do here.  The pattern cache is
// set up by its constructor and by pcre_reinit().
void pcre_init() {
}
 595
// Writes the keys of the pattern cache to `file`, one per line.
void pcre_dump_cache(folly::File& file) {
  s_pcreCache.dump(file);
}
 599
// Callback registered through pcre_assign_jit_stack(): returns the JIT stack
// held in tl_pcre_globals.  The callback argument is unused.
static pcre_jit_stack* alloc_jit_stack(void* /*data*/) {
  return tl_pcre_globals->jit_stack;
}
 603
 604 namespace {
 605
 606 template<bool useSmartFree = false>
 607 struct FreeHelperImpl {
 608   explicit FreeHelperImpl(void* p) : p(p) {}
 609   ~FreeHelperImpl() {
 610     useSmartFree ? req::free(p) : free(p);
 611   }
 612
 613   FreeHelperImpl(const FreeHelperImpl&) = delete;
 614   FreeHelperImpl& operator=(const FreeHelperImpl&) = delete;
 615
 616 private:
 617   void* p;
 618 };
 619
 620 typedef FreeHelperImpl<true> SmartFreeHelper;
 621 }
 622
 623 static void init_local_extra(pcre_extra* local, pcre_extra* shared) {
 624   if (shared) {
 625     memcpy(local, shared, sizeof(pcre_extra));
 626   } else {
 627     memset(local, 0, sizeof(pcre_extra));
 628     local->flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 629   }
 630   local->match_limit = tl_pcre_globals->preg_backtrace_limit;
 631   local->match_limit_recursion = tl_pcre_globals->preg_recursion_limit;
 632 }
 633
 634 static const char* const*
 635 get_subpat_names(const pcre_cache_entry* pce) {
 636   assertx(!pce->literal_data);
 637   char **subpat_names = pce->subpat_names.load(std::memory_order_relaxed);
 638   if (subpat_names) return subpat_names;
 639
 640   /*
 641   * Build a mapping from subpattern numbers to their names. We will always
 642   * allocate the table, even though there may be no named subpatterns. This
 643   * avoids somewhat more complicated logic in the inner loops.
 644   */
 645   pcre_extra extra;
 646   init_local_extra(&extra, pce->extra);
 647
 648   int name_count;
 649
 650   subpat_names = (char **)calloc(pce->num_subpats, sizeof(char *));
 651   int rc = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMECOUNT, &name_count);
 652   if (rc < 0) {
 653     raise_warning("Internal pcre_fullinfo() error %d", rc);
 654     return nullptr;
 655   }
 656   if (name_count > 0) {
 657     int name_size, ni = 0;
 658     unsigned short name_idx;
 659     char* name_table;
 660     int rc1, rc2;
 661
 662     rc1 = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMETABLE, &name_table);
 663     rc2 = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
 664     rc = rc2 ? rc2 : rc1;
 665     if (rc < 0) {
 666       raise_warning("Internal pcre_fullinfo() error %d", rc);
 667       return nullptr;
 668     }
 669     // The table returned by PCRE_INFO_NAMETABLE is an array of fixed length
 670     // strings of size PCRE_INFO_NAMEENTRYSIZE.  The first two bytes are a
 671     // big-endian uint16_t defining the array index followed by the
 672     // zero-terminated name string.
 673     // (See https://www.pcre.org/original/doc/html/pcreapi.html)
 674     while (ni++ < name_count) {
 675       name_idx = 0x100 * (unsigned char)name_table[0] +
 676                  (unsigned char)name_table[1];
 677       subpat_names[name_idx] = name_table + 2;
 678       if (is_numeric_string(subpat_names[name_idx],
 679                             strlen(subpat_names[name_idx]),
 680                             nullptr, nullptr, 0) != KindOfNull) {
 681         raise_warning("Numeric named subpatterns are not allowed");
 682         return nullptr;
 683       }
 684       name_table += name_size;
 685     }
 686   }
 687   // Store subpat_names into the cache entry
 688   char **expected = nullptr;
 689   if (!pce->subpat_names.compare_exchange_strong(expected, subpat_names)) {
 690     // Another thread stored subpat_names already. The array created by the
 691     // other thread is now in expected, return it instead and delete the one
 692     // we just made.
 693     free(subpat_names);
 694     return expected;
 695   }
 696   return subpat_names;
 697 }
 698
 699 static bool get_pcre_fullinfo(pcre_cache_entry* pce) {
 700   pcre_extra extra;
 701   init_local_extra(&extra, pce->extra);
 702
 703   /* Calculate the size of the offsets array*/
 704   int rc = pcre_fullinfo(pce->re, &extra, PCRE_INFO_CAPTURECOUNT,
 705                          &pce->num_subpats);
 706   if (rc < 0) {
 707     raise_warning("Internal pcre_fullinfo() error %d", rc);
 708     return false;
 709   }
 710   pce->num_subpats++;
 711   return true;
 712 }
 713
 714 static bool
 715 pcre_get_compiled_regex_cache(PCRECache::Accessor& accessor,
 716                               StringData* regex) {
 717   PCRECache::TempKeyCache tkc;
 718
 719   /* Try to lookup the cached regex entry, and if successful, just pass
 720      back the compiled pattern, otherwise go on and compile it. */
 721   if (s_pcreCache.find(accessor, regex, tkc)) return true;
 722
 723   /* Parse through the leading whitespace, and display a warning if we
 724      get to the end without encountering a delimiter. */
 725   const char *p = regex->data();
 726   while (isspace((int)*(unsigned char *)p)) p++;
 727   if (*p == 0) {
 728     raise_warning("Empty regular expression");
 729     return false;
 730   }
 731
 732   /* Get the delimiter and display a warning if it is alphanumeric
 733      or a backslash. */
 734   char delimiter = *p++;
 735   if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
 736     raise_warning("Delimiter must not be alphanumeric or backslash");
 737     return false;
 738   }
 739
 740   char start_delimiter = delimiter;
 741   const char *pp = strchr("([{< )]}> )]}>", delimiter);
 742   if (pp) {
 743     delimiter = pp[5];
 744   }
 745   char end_delimiter = delimiter;
 746
 747   if (start_delimiter == end_delimiter) {
 748     /* We need to iterate through the pattern, searching for the ending
 749      * delimiter, but skipping the backslashed delimiters. If the ending
 750      * delimiter is not found, display a warning. */
 751     pp = p;
 752     while (*pp != 0) {
 753       if (*pp == '\\' && pp[1] != 0) pp++;
 754       else if (*pp == delimiter)
 755         break;
 756       pp++;
 757     }
 758     if (*pp == 0) {
 759       raise_warning("No ending delimiter '%c' found: [%s]", delimiter,
 760                       regex->data());
 761       return false;
 762     }
 763   } else {
 764     /* We iterate through the pattern, searching for the matching ending
 765      * delimiter. For each matching starting delimiter, we increment nesting
 766      * level, and decrement it for each matching ending delimiter. If we
 767      * reach the end of the pattern without matching, display a warning.
 768      */
 769     int brackets = 1; // brackets nesting level
 770     pp = p;
 771     while (*pp != 0) {
 772       if (*pp == '\\' && pp[1] != 0) pp++;
 773       else if (*pp == end_delimiter && --brackets <= 0)
 774         break;
 775       else if (*pp == start_delimiter)
 776         brackets++;
 777       pp++;
 778     }
 779     if (*pp == 0) {
 780       raise_warning("No ending matching delimiter '%c' found: [%s]",
 781                       end_delimiter, regex->data());
 782       return false;
 783     }
 784   }
 785
 786   /* Make a copy of the actual pattern. */
 787   String spattern(p, pp-p, CopyString);
 788   const char *pattern = spattern.data();
 789
 790   /* Move on to the options */
 791   pp++;
 792
 793   /* Parse through the options, setting appropriate flags.  Display
 794      a warning if we encounter an unknown modifier. */
 795   int coptions = 0;
 796   int poptions = 0;
 797   bool do_study = false;
 798   while (*pp != 0) {
 799     switch (*pp++) {
 800       /* Perl compatible options */
 801     case 'i':  coptions |= PCRE_CASELESS;       break;
 802     case 'm':  coptions |= PCRE_MULTILINE;      break;
 803     case 's':  coptions |= PCRE_DOTALL;         break;
 804     case 'x':  coptions |= PCRE_EXTENDED;       break;
 805
 806       /* PCRE specific options */
 807     case 'A':  coptions |= PCRE_ANCHORED;       break;
 808     case 'D':  coptions |= PCRE_DOLLAR_ENDONLY; break;
 809     case 'S':  do_study = true;                 break;
 810     case 'U':  coptions |= PCRE_UNGREEDY;       break;
 811     case 'X':  coptions |= PCRE_EXTRA;          break;
 812     case 'u':  coptions |= PCRE_UTF8;
 813   /* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
 814        characters, even in UTF-8 mode. However, this can be changed by setting
 815        the PCRE_UCP option. */
 816 #ifdef PCRE_UCP
 817             coptions |= PCRE_UCP;
 818 #endif
 819       break;
 820
 821       /* Custom preg options */
 822     case 'e':  poptions |= PREG_REPLACE_EVAL;   break;
 823
 824     case ' ':
 825     case '\n':
 826     case '\r':
 827       break;
 828
 829     default:
 830       raise_warning("Unknown modifier '%c': [%s]", pp[-1], regex->data());
 831       return false;
 832     }
 833   }
 834
 835   /* We've reached a null byte, now check if we're actually at the end of the
 836      string.  If not this is a bad expression, and a potential security hole. */
 837   if (regex->size() != (pp - regex->data())) {
 838     raise_error("Error: Null byte found in pattern");
 839   }
 840
 841   /* Store the compiled pattern and extra info in the cache. */
 842   auto const store_pcre_entry =
 843     [&](pcre_literal_data& pld, pcre* re=nullptr, pcre_extra* extra=nullptr) {
 844     assertx((poptions & ~0x1) == 0);
 845     assertx((coptions & 0x80000000) == 0);
 846     pcre_cache_entry* new_entry = new pcre_cache_entry();
 847     new_entry->re = re;
 848     new_entry->extra = extra;
 849     new_entry->preg_options = poptions;
 850     new_entry->compile_options = coptions;
 851
 852     if (pld.isLiteral()) {
 853       new_entry->literal_data =
 854         std::make_unique<pcre_literal_data>(std::move(pld));
 855       new_entry->num_subpats = 1;
 856     } else {
 857        /* Get pcre full info */
 858       if (!get_pcre_fullinfo(new_entry)) {
 859         delete new_entry;
 860         return false;
 861       }
 862     }
 863
 864     s_pcreCache.insert(accessor, regex, tkc, new_entry);
 865     return true;
 866   };
 867
 868   // If the pattern is a literal, we can skip compiling it.
 869   auto literal_data = pcre_literal_data(pattern, coptions);
 870   if (literal_data.isLiteral()) return store_pcre_entry(literal_data);
 871
 872   /* Compile pattern and display a warning if compilation failed. */
 873   const char  *error;
 874   int erroffset;
 875   pcre *re = pcre_compile(pattern, coptions, &error, &erroffset, 0);
 876   if (re == nullptr) {
 877     raise_warning("Compilation failed: %s at offset %d", error, erroffset);
 878     return false;
 879   }
 880
 881   // Careful: from here 're' needs to be freed if something throws.
 882
 883   /* If study option was specified, study the pattern and
 884      store the result in extra for passing to pcre_exec. */
 885   pcre_extra *extra = nullptr;
 886   if (!literal_data.isLiteral()) {
 887     if (do_study || PCRE_STUDY_JIT_COMPILE) {
 888       int soptions = PCRE_STUDY_JIT_COMPILE;
 889       extra = pcre_study(re, soptions, &error);
 890       if (extra) {
 891         extra->flags |= PCRE_EXTRA_MATCH_LIMIT |
 892           PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 893         pcre_assign_jit_stack(extra, alloc_jit_stack, nullptr);
 894       }
 895       if (error != nullptr) {
 896         try {
 897           raise_warning("Error while studying pattern");
 898         } catch (...) {
 899           pcre_free(re);
 900           throw;
 901         }
 902       }
 903       if ((!RuntimeOption::EvalJitNoGdb ||
 904            RuntimeOption::EvalJitUseVtuneAPI ||
 905            RuntimeOption::EvalPerfPidMap) &&
 906           extra &&
 907           extra->executable_jit != nullptr) {
 908         size_t size;
 909         pcre_fullinfo(re, extra, PCRE_INFO_JITSIZE, &size);
 910
 911         TCA start = *(TCA *)(extra->executable_jit);
 912         TCA end = start + size;
 913         std::string name = folly::sformat("HHVM::pcre_jit::{}", pattern);
 914
 915         if (!RuntimeOption::EvalJitNoGdb && jit::mcgen::initialized()) {
 916           Debug::DebugInfo::Get()->recordStub(Debug::TCRange(start, end, false),
 917                                               name);
 918         }
 919         if (RuntimeOption::EvalJitUseVtuneAPI) {
 920           HPHP::jit::reportHelperToVtune(name.c_str(), start, end);
 921         }
 922         if (RuntimeOption::EvalPerfPidMap && jit::mcgen::initialized()) {
 923           std::string escaped_name;
 924           folly::json::escapeString(name, escaped_name,
 925                                     folly::json::serialization_opts());
 926           Debug::DebugInfo::Get()->recordPerfMap(
 927             Debug::TCRange(start, end, false),
 928             SrcKey{}, escaped_name
 929           );
 930         }
 931       }
 932     }
 933   }
 934
 935   return store_pcre_entry(literal_data, re, extra);
 936 }
 937
 938 static int* create_offset_array(const pcre_cache_entry* pce,
 939                                 int& size_offsets) {
 940   /* Allocate memory for the offsets array */
 941   size_offsets = pce->num_subpats * 3;
 942   return (int *)req::malloc_noptrs(size_offsets * sizeof(int));
 943 }
 944
 945 static Array str_offset_pair(const String& str, int offset) {
 946   return make_vec_array(str, offset);
 947 }
 948
 949 static inline bool pcre_need_log_error(int pcre_code) {
 950   return RuntimeOption::EnablePregErrorLog &&
 951          (pcre_code == PCRE_ERROR_MATCHLIMIT ||
 952           pcre_code == PCRE_ERROR_RECURSIONLIMIT);
 953 }
 954
 955 static void pcre_log_error(const char* func, int line, int pcre_code,
 956                            const char* pattern, int pattern_size,
 957                            const char* subject, int subject_size,
 958                            const char* repl, int repl_size,
 959                            int arg1 = 0, int arg2 = 0,
 960                            int arg3 = 0, int arg4 = 0) {
 961   const char* escapedPattern;
 962   const char* escapedSubject;
 963   const char* escapedRepl;
 964   std::string p(pattern, pattern_size);
 965   std::string s(subject, subject_size);
 966   std::string r(repl, repl_size);
 967   escapedPattern = Logger::EscapeString(p);
 968   escapedSubject = Logger::EscapeString(s);
 969   escapedRepl = Logger::EscapeString(r);
 970   const char* errString =
 971     (pcre_code == PCRE_ERROR_MATCHLIMIT) ? "PCRE_ERROR_MATCHLIMIT" :
 972     (pcre_code == PCRE_ERROR_RECURSIONLIMIT) ? "PCRE_ERROR_RECURSIONLIMIT" :
 973     "UNKNOWN";
 974   raise_warning_unsampled(
 975     "REGEXERR: %s/%d: err=%d(%s), pattern='%s', subject='%s', repl='%s', "
 976     "limits=(%" PRId64 ", %" PRId64 "), extra=(%d, %d, %d, %d)",
 977     func, line, pcre_code, errString,
 978     escapedPattern, escapedSubject, escapedRepl,
 979     tl_pcre_globals->preg_backtrace_limit,
 980     tl_pcre_globals->preg_recursion_limit,
 981     arg1, arg2, arg3, arg4);
 982   free((void *)escapedPattern);
 983   free((void *)escapedSubject);
 984   free((void *)escapedRepl);
 985 }
 986
 987 namespace {
 988
 989 ALWAYS_INLINE Variant preg_return_internal_error(Variant&& return_value) {
 990   *rl_last_error_code = PHP_PCRE_INTERNAL_ERROR;
 991   return std::move(return_value);
 992 }
 993
 994 ALWAYS_INLINE Variant preg_return_bad_regex_error(Variant&& return_value) {
 995   *rl_last_error_code = PHP_PCRE_BAD_REGEX_ERROR;
 996   return std::move(return_value);
 997 }
 998
 999 void pcre_handle_exec_error(int pcre_code) {
1000   int preg_code = 0;
1001   switch (pcre_code) {
1002   case PCRE_ERROR_MATCHLIMIT:
1003     preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
1004     break;
1005   case PCRE_ERROR_RECURSIONLIMIT:
1006     preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
1007     break;
1008   case PCRE_ERROR_BADUTF8:
1009     preg_code = PHP_PCRE_BAD_UTF8_ERROR;
1010     break;
1011   case PCRE_ERROR_BADUTF8_OFFSET:
1012     preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
1013     break;
1014   default:
1015     preg_code = PHP_PCRE_INTERNAL_ERROR;
1016     break;
1017   }
1018   *rl_last_error_code = preg_code;
1019 }
1020
1021 ALWAYS_INLINE Variant
1022 preg_return_pcre_error(int pcre_code, Variant&& return_value) {
1023   pcre_handle_exec_error(pcre_code);
1024   return std::move(return_value);
1025 }
1026
1027 ALWAYS_INLINE Variant preg_return_no_error(Variant&& return_value) {
1028   *rl_last_error_code = PHP_PCRE_NO_ERROR;
1029   return std::move(return_value);
1030 }
1031
1032 } // namespace
1033
1034 ///////////////////////////////////////////////////////////////////////////////
1035
1036 Variant preg_grep(const String& pattern, const Array& input, int flags /* = 0 */) {
1037   PCRECache::Accessor accessor;
1038   if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1039     return preg_return_bad_regex_error(false);
1040   }
1041   const pcre_cache_entry* pce = accessor.get();
1042
1043   int size_offsets = 0;
1044   int* offsets = create_offset_array(pce, size_offsets);
1045   if (offsets == nullptr) {
1046     return preg_return_internal_error(false);
1047   }
1048   SmartFreeHelper freer(offsets);
1049
1050   /* Initialize return array */
1051   auto ret = Array::CreateDict();
1052
1053   /* Go through the input array */
1054   bool invert = (flags & PREG_GREP_INVERT);
1055   pcre_extra extra;
1056   init_local_extra(&extra, pce->extra);
1057
1058   for (ArrayIter iter(input); iter; ++iter) {
1059     String entry = iter.second().toString();
1060     int count = 0;
1061
1062     if (pce->literal_data) {
1063       assertx(pce->literal_data->isLiteral());
1064       count = pce->literal_data->matches(entry.get(), 0, offsets, 0)
1065         ? 1 : PCRE_ERROR_NOMATCH;
1066     } else {
1067       /* Perform the match */
1068       count = pcre_exec(pce->re, &extra, entry.data(), entry.size(),
1069                         0, 0, offsets, size_offsets);
1070     }
1071     /* Check for too many substrings condition. */
1072     if (count == 0) {
1073       raise_warning("Matched, but too many substrings");
1074       count = pce->num_subpats;
1075     } else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1076       if (pcre_need_log_error(count)) {
1077         pcre_log_error(__FUNCTION__, __LINE__, count,
1078                        pattern.data(), pattern.size(),
1079                        entry.data(), entry.size(),
1080                        "", 0,
1081                        flags);
1082       }
1083       // NOTE: this returns an error together with a partial result :-(
1084       return preg_return_pcre_error(count, std::move(ret));
1085     }
1086
1087     /* If the entry fits our requirements */
1088     if ((count > 0 && !invert) ||
1089         (count == PCRE_ERROR_NOMATCH && invert)) {
1090
1091       /* Add to return array */
1092       ret.set(iter.first(), entry);
1093     }
1094   }
1095
1096   return preg_return_no_error(std::move(ret));
1097 }
1098
1099 ///////////////////////////////////////////////////////////////////////////////
1100
/*
 * Shared implementation of preg_match() (global == false) and
 * preg_match_all() (global == true).
 *
 * pattern      - the regex; looked up through pcre_get_compiled_regex_cache.
 * subject      - the string to search.
 * subpats      - optional out-parameter. When non-null it is reset to an empty
 *                dict and then filled with the captured substrings.
 * flags        - the low byte selects the result ordering (only meaningful for
 *                global matches); PREG_OFFSET_CAPTURE makes every reported
 *                match a [string, offset] pair; PREG_FB__PRIVATE__HSL_IMPL
 *                pads non-participating trailing captures with empty strings
 *                in the per-match result sets.
 * start_offset - byte offset to start at; a negative value counts from the end
 *                of the subject and is clamped at 0.
 *
 * Returns the number of matches found on success. Returns false with the
 * last-error code set when the regex cannot be compiled, when setup fails, or
 * when matching fails; returns null when the flags are invalid.
 */
static Variant preg_match_impl(StringData* pattern,
                               const StringData* subject,
                               Variant* subpats, int flags, int start_offset,
                               bool global) {
  PCRECache::Accessor accessor;
  if (!pcre_get_compiled_regex_cache(accessor, pattern)) {
    return preg_return_bad_regex_error(false);
  }
  pcre_extra extra;
  const pcre_cache_entry* pce = accessor.get();
  init_local_extra(&extra, pce->extra);
  int subpats_order = global ? PREG_PATTERN_ORDER : 0;
  if (subpats) *subpats = Array::CreateDict();

  if (flags) {
    /*
     * subpats_order is pre-set to pattern mode so we change it only if
     * necessary.
     */
    if (flags & 0xff) {
      subpats_order = flags & 0xff;
    }
    // Global matches accept PREG_PATTERN_ORDER..PREG_SET_ORDER; non-global
    // matches accept no ordering at all.
    if ((global && (subpats_order < PREG_PATTERN_ORDER ||
                    subpats_order > PREG_SET_ORDER)) ||
        (!global && subpats_order != 0)) {
      raise_warning("Invalid flags specified");
      return preg_return_internal_error(init_null());
    }
  }

  /* Negative offset counts from the end of the string. */
  if (start_offset < 0) {
    start_offset = subject->size() + start_offset;
    if (start_offset < 0) {
      start_offset = 0;
    }
  }

  int size_offsets = 0;
  int* offsets = create_offset_array(pce, size_offsets);
  SmartFreeHelper offsetsFreer(offsets);
  int num_subpats = pce->num_subpats;
  if (offsets == nullptr) return preg_return_internal_error(false);

  /* Allocate match sets array and initialize the values. */

  /* An array of sets of matches for each subpattern after a global match */
  auto match_sets = Array::CreateDict();
  if (global && subpats_order == PREG_PATTERN_ORDER) {
    for (int i = 0; i < num_subpats; i++) {
      match_sets.set(i, Array::CreateDict());
    }
  }

  /*
   * If PREG_OFFSET_CAPTURE, each match, instead of being a string, will
   * be an array where the first element is a substring containing the
   * match and the second element is the position of the first character of
   * the substring in the input.
   */
  bool offset_capture = flags & PREG_OFFSET_CAPTURE;
  const char** stringlist; // Holds list of subpatterns
  // Value reported for capture i of the current match; reads the current
  // contents of `offsets` and `stringlist`.
  auto const get_value = [&](int i) {
    auto const length = offsets[(i<<1)+1] - offsets[i<<1];
    auto const match = String(stringlist[i], length, CopyString);
    return offset_capture
      ? Variant(str_offset_pair(match, offsets[i<<1]))
      : Variant(match);
  };
  // Value reported for a capture that is padded in as an empty string.
  auto const get_value_empty = [&](int i) {
    auto const match = empty_string();
    return offset_capture
      ? Variant(str_offset_pair(match, offsets[i<<1]))
      : Variant(match);
  };

  /*
   * Skip building name table when using literal_data. Name table is used
   * to add named subpatterns to result array. Literal data has none of these,
   * so we can skip this step.
   */
  const char* const* subpat_names = nullptr;
  auto const is_literal = pce->literal_data != nullptr;
  if (!is_literal) {
    subpat_names = get_subpat_names(pce);
    if (subpat_names == nullptr) return preg_return_internal_error(false);
  }
  // Store `value` under the name of capture i, if that capture has a name.
  auto const set_subpats = [&](auto& arr, int i, const Variant& value) {
    if (is_literal) return;
    if (subpat_names[i]) arr.set(String(subpat_names[i]), value);
  };

  int i;
  const bool includeNonMatchingCaptures = flags & PREG_FB__PRIVATE__HSL_IMPL;

  // Add matches to result array for this run
  auto add_match_set = [&](auto& arr, int count) {
    for (i = 0; i < count; i++) {
      auto const value = get_value(i);
      set_subpats(arr, i, value);
      arr.set(i, value);
    }
    if (includeNonMatchingCaptures) {
      // Continue from `count` up to the total number of subpatterns.
      for (; i < num_subpats; i++) {
        auto const value = get_value_empty(i);
        set_subpats(arr, i, value);
        arr.set(i, value);
      }
    }
  };

  int matched = 0;
  int g_notempty = 0; // If the match should not be empty
  int exec_options = 0;

  // Runs once for a non-global match; for a global match, repeats until a
  // "no match" result ends the loop or an error returns.
  do {
    int count = 0;
    int options = exec_options | g_notempty;
    if (is_literal) {
      assertx(literalOptions(options));
      count = pce->literal_data->matches(subject, start_offset, offsets, options)
        ? 1 : PCRE_ERROR_NOMATCH;
    } else {
      /* Execute the regular expression. */
      count = pcre_exec(pce->re, &extra, subject->data(), subject->size(),
                        start_offset, options,
                        offsets, size_offsets);

      /* The string was already proved to be valid UTF-8 */
      exec_options |= PCRE_NO_UTF8_CHECK;
    }
    /* Check for too many substrings condition. */
    if (count == 0) {
      raise_warning("Matched, but too many substrings");
      count = num_subpats;
    }

    /* If something has matched */
    if (count > 0) {
      matched++;

      if (subpats) {
        // Try to get the list of substrings and display a warning if failed.
        // NOTE(review): the offsets[1] < offsets[0] sanity check only runs
        // when the caller asked for subpatterns.
        if (offsets[1] < offsets[0] ||
            pcre_get_substring_list(subject->data(), offsets, count,
                                    &stringlist) < 0) {
          raise_warning("Get subpatterns list failed");
          return preg_return_internal_error(false);
        }

        if (global) {
          if (subpats_order == PREG_PATTERN_ORDER) {
            /* For each subpattern, insert it into the appropriate array. */
            for (i = 0; i < count; i++) {
              auto const value = get_value(i);
              auto& arr = asArrRef(match_sets.lval(i));
              assertx(arr->isVectorData());
              arr.set(safe_cast<int64_t>(arr.size()), value);
            }
            /*
             * If the number of captured subpatterns on this run is
             * less than the total possible number, pad the result
             * arrays with empty strings.
             */
            for (; i < num_subpats; i++) {
              auto& arr = asArrRef(match_sets.lval(i));
              assertx(arr->isVectorData());
              arr.set(safe_cast<int64_t>(arr.size()), empty_string());
            }
          } else {
            // Set order: append one dict per match to the output.
            auto result_set = Array::CreateDict();
            add_match_set(result_set, count);
            auto& arr = subpats->asArrRef();
            assertx(arr->isVectorData());
            arr.set(safe_cast<int64_t>(arr.size()), std::move(result_set));
          }
        } else {
          // Non-global: the captures go directly into the output array.
          auto& arr = subpats->asArrRef();
          add_match_set(arr, count);
        }
        // stringlist was allocated by pcre_get_substring_list above.
        pcre_free((void *) stringlist);
      }
    } else if (count == PCRE_ERROR_NOMATCH) {
      /* If we previously set PCRE_NOTEMPTY after a null match,
         this is not necessarily the end. We need to advance
         the start offset, and continue. Fudge the offset values
         to achieve this, unless we're already at the end of the string. */
      if (g_notempty && start_offset < subject->size()) {
        offsets[0] = start_offset;
        offsets[1] = start_offset + 1;
      } else
        break;
    } else {
      if (pcre_need_log_error(count)) {
        pcre_log_error(__FUNCTION__, __LINE__, count,
                       pattern->data(), pattern->size(),
                       subject->data(), subject->size(),
                       "", 0,
                       flags, start_offset, g_notempty, global);
      }
      return preg_return_pcre_error(count, false);
    }

    /* If we have matched an empty string, mimic what Perl's /g options does.
       This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
       the match again at the same point. If this fails (picked up above) we
       advance to the next character. */
    g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;

    /* Advance to the position right after the last full match */
    start_offset = offsets[1];
  } while (global);

  /* Add the match sets to the output array and clean up */
  if (subpats && global && subpats_order == PREG_PATTERN_ORDER) {
    auto& arr = subpats->asArrRef();
    for (i = 0; i < num_subpats; i++) {
      auto const value = match_sets[i];
      set_subpats(arr, i, value);
      arr.set(i, match_sets[i]);
    }
  }
  return preg_return_no_error(std::move(matched));
}
1325
1326 Variant preg_match(const String& pattern, const String& subject,
1327                    Variant* matches /* = nullptr */, int flags /* = 0 */,
1328                    int offset /* = 0 */) {
1329   return preg_match(pattern.get(), subject.get(), matches, flags, offset);
1330 }
1331
1332 Variant preg_match(StringData* pattern, const StringData* subject,
1333                    Variant* matches /* = nullptr */, int flags /* = 0 */,
1334                    int offset /* = 0 */) {
1335   return preg_match_impl(pattern, subject, matches, flags, offset, false);
1336 }
1337
1338 Variant preg_match_all(const String& pattern, const String& subject,
1339                        Variant* matches /* = nullptr */,
1340                        int flags /* = 0 */, int offset /* = 0 */) {
1341   return preg_match_all(pattern.get(), subject.get(), matches, flags, offset);
1342 }
1343
1344 Variant preg_match_all(StringData* pattern, const StringData* subject,
1345                        Variant* matches /* = nullptr */,
1346                        int flags /* = 0 */, int offset /* = 0 */) {
1347   return preg_match_impl(pattern, subject, matches, flags, offset, true);
1348 }
1349
1350 ///////////////////////////////////////////////////////////////////////////////
1351
1352 static String preg_do_repl_func(const Variant& function, const String& subject,
1353                                 int* offsets, const char* const* subpat_names,
1354                                 int count) {
1355   Array subpats = Array::CreateDict();
1356   for (int i = 0; i < count; i++) {
1357     auto off1 = offsets[i<<1];
1358     auto off2 = offsets[(i<<1)+1];
1359     auto sub = subject.substr(off1, off2 - off1);
1360
1361     if (subpat_names && subpat_names[i]) {
1362       subpats.set(String(subpat_names[i]), sub);
1363     }
1364     subpats.set(i, sub);
1365   }
1366
1367   return vm_call_user_func(function, make_vec_array(subpats)).toString();
1368 }
1369
/*
 * Try to parse a back-reference at *str, which must point at the introducing
 * '\\' or '$' of a NUL-terminated string.
 *
 * Accepted forms are "\N", "$N" and "${N}", where N is one or two decimal
 * digits. On success the number is stored in *backref, *str is advanced past
 * the reference, and true is returned. On failure *str is left untouched and
 * false is returned (*backref may already have been written if the closing
 * '}' turned out to be missing).
 */
static bool preg_get_backref(const char** str, int* backref) {
  auto const is_digit = [](char c) { return c >= '0' && c <= '9'; };

  const char* cur = *str;
  if (cur[1] == '\0') {
    return false;
  }

  // Only the '$' introducer supports the braced "${N}" spelling.
  bool const braced = (cur[0] == '$' && cur[1] == '{');
  cur += braced ? 2 : 1;

  if (!is_digit(*cur)) {
    return false;
  }
  *backref = *cur++ - '0';

  // Optional second digit.
  if (is_digit(*cur)) {
    *backref = *backref * 10 + (*cur++ - '0');
  }

  if (braced) {
    if (*cur != '}') {
      return false;
    }
    ++cur;
  }

  *str = cur;
  return true;
}
1406
1407 static Variant php_pcre_replace(const String& pattern, const String& subject,
1408                                 const Variant& replace_var, bool callable,
1409                                 int limit, int* replace_count) {
1410   PCRECache::Accessor accessor;
1411   if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1412     return preg_return_bad_regex_error(init_null());
1413   }
1414   const pcre_cache_entry* pce = accessor.get();
1415   if (pce->preg_options & PREG_REPLACE_EVAL) {
1416     raise_error("preg_replace(): Support for the /e modifier has been removed, use "
1417                 "preg_replace_callback instead");
1418   }
1419
1420   int size_offsets;
1421   int* offsets = create_offset_array(pce, size_offsets);
1422   SmartFreeHelper offsetsFreer(offsets);
1423   if (offsets == nullptr) {
1424     return preg_return_internal_error(init_null());
1425   }
1426   auto const is_literal = pce->literal_data != nullptr;
1427   const char* const* subpat_names = nullptr;
1428   if (!is_literal) {
1429     subpat_names = get_subpat_names(pce);
1430     if (subpat_names == nullptr) return preg_return_internal_error(init_null());
1431   }
1432
1433   const char* replace = nullptr;
1434   const char* replace_end = nullptr;
1435   int replace_len = 0;
1436   String replace_val;
1437
1438   if (!callable) {
1439     replace_val = replace_var.toString();
1440     replace = replace_val.data();
1441     replace_len = replace_val.size();
1442     replace_end = replace + replace_len;
1443   }
1444
1445   StringBuffer result(2 * subject.size());
1446
1447   try {
1448
1449     /* Initialize */
1450     const char* match = nullptr;
1451     int start_offset = 0;
1452     pcre_extra extra;
1453     init_local_extra(&extra, pce->extra);
1454
1455     const char* walk;     // Used to walk the replacement string
1456     char walk_last;       // Last walked character
1457     int match_len;        // Length of the current match
1458     int backref;          // Backreference number
1459     int g_notempty = 0;   // If the match should not be empty
1460     int exec_options = 0; // Options passed to pcre_exec
1461     while (1) {
1462       int count = 0;
1463       int options = exec_options | g_notempty;
1464       if (pce->literal_data && literalOptions(options)) {
1465         assertx(pce->literal_data->isLiteral());
1466         count =
1467           pce->literal_data->matches(subject.get(), start_offset, offsets, options)
1468           ? 1 : PCRE_ERROR_NOMATCH;
1469       } else {
1470         /* Execute the regular expression. */
1471         count = pcre_exec(pce->re, &extra, subject.data(), subject.size(),
1472                           start_offset, options, offsets, size_offsets);
1473
1474         /* The string was already proved to be valid UTF-8 */
1475         exec_options |= PCRE_NO_UTF8_CHECK;
1476       }
1477
1478       /* Check for too many substrings condition. */
1479       if (count == 0) {
1480         raise_warning("Matched, but too many substrings");
1481         count = pce->num_subpats;
1482       }
1483
1484       const char* piece = subject.data() + start_offset;
1485       if (count > 0 && offsets[1] >= offsets[0] &&
1486           (limit == -1 || limit > 0)) {
1487         if (replace_count) {
1488           ++*replace_count;
1489         }
1490         /* Set the match location in subject */
1491         match = subject.data() + offsets[0];
1492
1493         String callable_result;
1494         if (callable) {
1495           /* Use custom function to get replacement string and its length. */
1496           callable_result = preg_do_repl_func(replace_var, subject, offsets,
1497                                               subpat_names, count);
1498         } else { /* do regular substitution */
1499           walk = replace;
1500           walk_last = 0;
1501           while (walk < replace_end) {
1502             if ('\\' == *walk || '$' == *walk) {
1503               if (walk_last == '\\') {
1504                 walk++;
1505                 walk_last = 0;
1506                 continue;
1507               }
1508               if (preg_get_backref(&walk, &backref)) {
1509                 if (backref < count) {
1510                   match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1511                 }
1512                 continue;
1513               }
1514             }
1515             walk++;
1516             walk_last = walk[-1];
1517           }
1518         }
1519
1520         /* copy the part of the string before the match */
1521         result.append(piece, match-piece);
1522
1523         /* copy replacement and backrefs */
1524         int result_len = result.size();
1525
1526         if (callable) {
1527           /* Copy result from custom function to buffer and clean up. */
1528           result.append(callable_result.data(), callable_result.size());
1529           result_len += callable_result.size();
1530         } else { /* do regular backreference copying */
1531           walk = replace;
1532           walk_last = 0;
1533           Array params;
1534           while (walk < replace_end) {
1535             if ('\\' == *walk || '$' == *walk) {
1536               if (walk_last == '\\') {
1537                 result.set(result.size() - 1, *walk++);
1538                 walk_last = 0;
1539                 continue;
1540               }
1541               if (preg_get_backref(&walk, &backref)) {
1542                 if (backref < count) {
1543                   match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1544                   result.append(
1545                     subject.data() + offsets[backref<<1],
1546                     match_len
1547                   );
1548                 }
1549                 continue;
1550               }
1551             }
1552             result.append(*walk++);
1553             walk_last = walk[-1];
1554           }
1555         }
1556
1557         if (limit != -1) {
1558           limit--;
1559         }
1560
1561       } else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1562         /* If we previously set PCRE_NOTEMPTY after a null match,
1563            this is not necessarily the end. We need to advance
1564            the start offset, and continue. Fudge the offset values
1565            to achieve this, unless we're already at the end of the string. */
1566         if (g_notempty != 0 && start_offset < subject.size()) {
1567           offsets[0] = start_offset;
1568           offsets[1] = start_offset + 1;
1569           result.append(piece, 1);
1570         } else {
1571           /* stick that last bit of string on our output */
1572           result.append(piece, subject.size() - start_offset);
1573           break;
1574         }
1575       } else {
1576         if (pcre_need_log_error(count)) {
1577           const char* s;
1578           int size;
1579           String stemp;
1580           if (callable) {
1581             if (replace_var.isObject()) {
1582               stemp = replace_var.asCObjRef()->getClassName().asString()
1583                     + "::__invoke";
1584             } else {
1585               stemp = replace_var.toString();
1586             }
1587             s = stemp.data();
1588             size = stemp.size();
1589           } else {
1590             s = replace_val.data();
1591             size = replace_val.size();
1592           }
1593           pcre_log_error(__FUNCTION__, __LINE__, count,
1594                          pattern.data(), pattern.size(),
1595                          subject.data(), subject.size(),
1596                          s, size,
1597                          callable, limit, start_offset, g_notempty);
1598         }
1599         return preg_return_pcre_error(count, init_null());
1600       }
1601
1602       /* If we have matched an empty string, mimic what Perl's /g options does.
1603          This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1604          the match again at the same point. If this fails (picked up above) we
1605          advance to the next character. */
1606       g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1607
1608       /* Advance to the next piece. */
1609       start_offset = offsets[1];
1610     }
1611
1612     return preg_return_no_error(result.detach());
1613   } catch (...) {
1614     throw;
1615   }
1616 }
1617
1618 static Variant php_replace_in_subject(const Variant& regex, const Variant& replace,
1619                                       String subject, int limit, bool callable,
1620                                       int* replace_count) {
1621   if (!regex.isArray()) {
1622     return php_pcre_replace(regex.toString(), subject, replace, callable,
1623                             limit, replace_count);
1624   }
1625
1626   if (callable || !replace.isArray()) {
1627     Array arr = regex.toDict();
1628     for (ArrayIter iterRegex(arr); iterRegex; ++iterRegex) {
1629       String regex_entry = iterRegex.second().toString();
1630       auto ret = php_pcre_replace(regex_entry, subject, replace, callable,
1631                                   limit, replace_count);
1632       if (!ret.isString()) {
1633         assertx(ret.isNull());
1634         return ret; // php_pcre_replace already set error
1635       }
1636       subject = ret.asStrRef();
1637       assertx(!subject.isNull());
1638     }
1639     return preg_return_no_error(std::move(subject));
1640   }
1641
1642   Array arrReplace = replace.toDict();
1643   Array arrRegex = regex.toDict();
1644   ArrayIter iterReplace(arrReplace);
1645   for (ArrayIter iterRegex(arrRegex); iterRegex; ++iterRegex) {
1646     String regex_entry = iterRegex.second().toString();
1647     Variant replace_value;
1648     if (iterReplace) {
1649       replace_value = iterReplace.second();
1650       ++iterReplace;
1651     }
1652
1653     auto ret = php_pcre_replace(regex_entry, subject, replace_value, callable,
1654                                 limit, replace_count);
1655     if (!ret.isString()) {
1656       assertx(ret.isNull());
1657       return ret; // php_pcre_replace already set error
1658     }
1659     subject = ret.asStrRef();
1660     assertx(!subject.isNull());
1661   }
1662   return preg_return_no_error(std::move(subject));
1663 }
1664
1665 Variant preg_replace_impl(const Variant& pattern, const Variant& replacement,
1666                           const Variant& subject, int limit, int64_t* count,
1667                           bool is_callable, bool is_filter) {
1668   assertx(!(is_callable && is_filter));
1669   if (!is_callable &&
1670       replacement.isArray() && !pattern.isArray()) {
1671     raise_warning("Parameter mismatch, pattern is a string while "
1672                     "replacement is an array");
1673     return preg_return_internal_error(false);
1674   }
1675
1676   int replace_count = 0;
1677   if (!isContainer(subject)) {
1678     auto ret = php_replace_in_subject(pattern, replacement, subject.toString(),
1679                                       limit, is_callable, &replace_count);
1680
1681     if (ret.isNull()) return ret; // php_replace_in_subject already set error
1682     assertx(ret.isString());
1683     if (count) *count = replace_count;
1684     if (is_filter && replace_count == 0) {
1685       return preg_return_internal_error(init_null());
1686     }
1687     return preg_return_no_error(std::move(ret));
1688   }
1689
1690   Array return_value = Array::CreateDict();
1691   Array arrSubject = subject.toDict();
1692   for (ArrayIter iter(arrSubject); iter; ++iter) {
1693     auto old_replace_count = replace_count;
1694     String subject_entry = iter.second().toString();
1695     auto ret = php_replace_in_subject(pattern, replacement, subject_entry,
1696                                       limit, is_callable, &replace_count);
1697
1698     if (ret.isString() && (!is_filter || replace_count > old_replace_count)) {
1699       return_value.set(iter.first(), ret.asStrRef());
1700     }
1701   }
1702   if (count) *count = replace_count;
1703   return preg_return_no_error(std::move(return_value));
1704 }
1705
1706 int preg_replace(Variant& result,
1707                  const Variant& pattern,
1708                  const Variant& replacement,
1709                  const Variant& subject,
1710                  int limit /* = -1 */) {
1711   int64_t count;
1712   result = preg_replace_impl(pattern, replacement, subject,
1713                              limit, &count, false, false);
1714   return count;
1715 }
1716
1717 int preg_replace_callback(Variant& result,
1718                           const Variant& pattern,
1719                           const Variant& callback,
1720                           const Variant& subject,
1721                           int limit /* = -1 */) {
1722   int64_t count;
1723   result = preg_replace_impl(pattern, callback, subject,
1724                              limit, &count, true, false);
1725   return count;
1726 }
1727
1728 ///////////////////////////////////////////////////////////////////////////////
1729
1730 namespace {
1731
1732 const StaticString s_OneUnicodeCharPattern("/./us");
1733
1734 } // namespace
1735
1736 Variant preg_split(const String& pattern, const String& subject,
1737                    int limit /* = -1 */, int flags /* = 0 */) {
1738   PCRECache::Accessor accessor;
1739   if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1740     return preg_return_bad_regex_error(false);
1741   }
1742   const pcre_cache_entry* pce = accessor.get();
1743
1744   int no_empty = flags & PREG_SPLIT_NO_EMPTY;
1745   bool delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1746   bool offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1747
1748   if (limit == 0) {
1749     limit = -1;
1750   }
1751
1752   int size_offsets = 0;
1753   int* offsets = create_offset_array(pce, size_offsets);
1754   SmartFreeHelper offsetsFreer(offsets);
1755   if (offsets == nullptr) {
1756     return preg_return_internal_error(false);
1757   }
1758
1759   /* Start at the beginning of the string */
1760   int start_offset = 0;
1761   int next_offset = 0;
1762   const char* last_match = subject.data();
1763   pcre_extra extra;
1764   init_local_extra(&extra, pce->extra);
1765
1766   // Get next piece if no limit or limit not yet reached and something matched
1767   Array result = Array::CreateDict();
1768   int g_notempty = 0;   /* If the match should not be empty */
1769   int utf8_check = 0;
1770   PCRECache::Accessor bump_accessor;
1771   const pcre_cache_entry* bump_pce = nullptr; /* instance for empty matches */
1772   while ((limit == -1 || limit > 1)) {
1773     int count = 0;
1774     int options = g_notempty | utf8_check;
1775     if (pce->literal_data && literalOptions(options)) {
1776       assertx(pce->literal_data->isLiteral());
1777       count =
1778         pce->literal_data->matches(subject.get(), start_offset, offsets, options)
1779         ? 1 : PCRE_ERROR_NOMATCH;
1780     } else {
1781       count = pcre_exec(pce->re, &extra, subject.data(), subject.size(),
1782                         start_offset, options, offsets, size_offsets);
1783       /* Subsequent calls to pcre_exec don't need to bother with the
1784       * utf8 validity check: if the subject isn't valid, the first
1785       * call to pcre_exec will have failed, and as long as we only
1786       * set start_offset to known character boundaries we won't
1787       * supply an invalid offset. */
1788       utf8_check = PCRE_NO_UTF8_CHECK;
1789     }
1790
1791     /* Check for too many substrings condition. */
1792     if (count == 0) {
1793       raise_warning("Matched, but too many substrings");
1794       count = pce->num_subpats;
1795     }
1796
1797     /* If something matched */
1798     if (count > 0 && offsets[1] >= offsets[0]) {
1799       if (!no_empty || subject.data() + offsets[0] != last_match) {
1800         auto const length = subject.data() + offsets[0] - last_match;
1801         auto const match = String(last_match, length, CopyString);
1802         auto const value = offset_capture
1803           ? Variant(str_offset_pair(match, next_offset))
1804           : Variant(match);
1805         assertx(result->isVectorData());
1806         result.set(safe_cast<int64_t>(result.size()), value);
1807
1808         /* One less left to do */
1809         if (limit != -1) limit--;
1810       }
1811
1812       last_match = subject.data() + offsets[1];
1813       next_offset = offsets[1];
1814
1815       if (delim_capture) {
1816         int i, match_len;
1817         for (i = 1; i < count; i++) {
1818           match_len = offsets[(i<<1)+1] - offsets[i<<1];
1819           /* If we have matched a delimiter */
1820           if (!no_empty || match_len > 0) {
1821             auto const match = subject.substr(offsets[i<<1], match_len);
1822             auto const value = offset_capture
1823               ? Variant(str_offset_pair(match, offsets[i<<1]))
1824               : Variant(match);
1825             assertx(result->isVectorData());
1826             result.set(safe_cast<int64_t>(result.size()), value);
1827           }
1828         }
1829       }
1830     } else if (count == PCRE_ERROR_NOMATCH) {
1831       /* If we previously set PCRE_NOTEMPTY after a null match,
1832          this is not necessarily the end. We need to advance
1833          the start offset, and continue. Fudge the offset values
1834          to achieve this, unless we're already at the end of the string. */
1835       if (g_notempty != 0 && start_offset < subject.size()) {
1836         if (pce->compile_options & PCRE_UTF8) {
1837           if (bump_pce == nullptr) {
1838             auto const DEBUG_ONLY ok = pcre_get_compiled_regex_cache(
1839               bump_accessor, s_OneUnicodeCharPattern.get());
1840             assertx(ok);
1841             bump_pce = bump_accessor.get();
1842           }
1843           pcre_extra bump_extra;
1844           init_local_extra(&bump_extra, bump_pce->extra);
1845           count = pcre_exec(bump_pce->re, &bump_extra, subject.data(),
1846                             subject.size(), start_offset,
1847                             utf8_check, offsets, size_offsets);
1848           if (count < 1) {
1849             raise_warning("Unknown error");
1850             offsets[0] = start_offset;
1851             offsets[1] = start_offset + 1;
1852             if (pcre_need_log_error(count)) {
1853               pcre_log_error(__FUNCTION__, __LINE__, count,
1854                              pattern.data(), pattern.size(),
1855                              subject.data(), subject.size(),
1856                              "", 0,
1857                              limit, flags, start_offset);
1858             }
1859           }
1860         } else {
1861           offsets[0] = start_offset;
1862           offsets[1] = start_offset + 1;
1863         }
1864       } else
1865         break;
1866     } else {
1867       if (pcre_need_log_error(count)) {
1868         pcre_log_error(__FUNCTION__, __LINE__, count,
1869                        pattern.data(), pattern.size(),
1870                        subject.data(), subject.size(),
1871                        "", 0,
1872                        limit, flags, start_offset, g_notempty);
1873       }
1874       // NOTE: this returns an error together with a partial result :-(
1875       start_offset = last_match - subject.data(); /* offset might have
1876                                                    * been incremented,
1877                                                    * but without further
1878                                                    * successful matches */
1879       if (!no_empty || start_offset < subject.size()) {
1880         auto const match = subject.substr(start_offset);
1881         auto const value = offset_capture
1882           ? Variant(str_offset_pair(match, start_offset))
1883           : Variant(match);
1884         assertx(result->isVectorData());
1885         result.set(safe_cast<int64_t>(result.size()), value);
1886       }
1887       return preg_return_pcre_error(count, std::move(result));
1888     }
1889
1890     /* If we have matched an empty string, mimic what Perl's /g options does.
1891        This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1892        the match again at the same point. If this fails (picked up above) we
1893        advance to the next character. */
1894     g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1895
1896     /* Advance to the position right after the last full match */
1897     start_offset = offsets[1];
1898   }
1899
1900   start_offset = last_match - subject.data(); /* offset might have
1901                                                 * been incremented,
1902                                                 * but without further
1903                                                 * successful matches */
1904   if (!no_empty || start_offset < subject.size()) {
1905     auto const match = subject.substr(start_offset);
1906     auto const value = offset_capture
1907       ? Variant(str_offset_pair(match, start_offset))
1908       : Variant(match);
1909     assertx(result->isVectorData());
1910     result.set(safe_cast<int64_t>(result.size()), value);
1911   }
1912
1913   return preg_return_no_error(std::move(result));
1914 }
1915
1916 ///////////////////////////////////////////////////////////////////////////////
1917
1918 String preg_quote(const String& str,
1919                   const String& delimiter /* = null_string */) {
1920   const char* in_str = str.data();
1921   const char* in_str_end = in_str + str.size();
1922
1923   /* Nothing to do if we got an empty string */
1924   if (in_str == in_str_end) {
1925     return str;
1926   }
1927
1928   char delim_char = 0;      /* Delimiter character to be quoted */
1929   bool quote_delim = false; /* Whether to quote additional delim char */
1930   if (!delimiter.empty()) {
1931     delim_char = delimiter.charAt(0);
1932     quote_delim = true;
1933   }
1934
1935   /* Allocate enough memory so that even if each character
1936      is quoted, we won't run out of room */
1937   static_assert(
1938     (StringData::MaxSize * 4 + 1) < std::numeric_limits<int64_t>::max()
1939   );
1940   String ret(4 * str.size() + 1, ReserveString);
1941   char* out_str = ret.mutableData();
1942
1943   /* Go through the string and quote necessary characters */
1944   const char* p;
1945   char* q;
1946   for (p = in_str, q = out_str; p != in_str_end; p++) {
1947     char c = *p;
1948     switch (c) {
1949     case '.': case '\\': case '+': case '*': case '?':
1950     case '[': case '^':  case ']': case '$': case '(':
1951     case ')': case '{':  case '}': case '=': case '!':
1952     case '>': case '<':  case '|': case ':': case '-':
1953     case '#':
1954       *q++ = '\\';
1955       *q++ = c;
1956       break;
1957
1958     case '\0':
1959       *q++ = '\\';
1960       *q++ = '0';
1961       *q++ = '0';
1962       *q++ = '0';
1963       break;
1964
1965     default:
1966       if (quote_delim && c == delim_char)
1967         *q++ = '\\';
1968       *q++ = c;
1969       break;
1970     }
1971   }
1972   *q = '\0';
1973
1974   return ret.setSize(q - out_str);
1975 }
1976
1977 ///////////////////////////////////////////////////////////////////////////////
1978 // last_error
1979
1980 int preg_last_error() {
1981   return *rl_last_error_code;
1982 }
1983
1984 PregWithErrorGuard::~PregWithErrorGuard() {
1985   if (*rl_last_error_code == PHP_PCRE_NO_ERROR) {
1986     error.setNull();
1987   } else {
1988     error = *rl_last_error_code;
1989   }
1990   *rl_last_error_code = prior_error;
1991 }
1992
1993 size_t preg_pcre_cache_size() {
1994   return s_pcreCache.size();
1995 }
1996
1997 ///////////////////////////////////////////////////////////////////////////////
1998 // regexec
1999
2000 static void php_reg_eprint(int err, regex_t* re) {
2001   char *buf = nullptr, *message = nullptr;
2002   size_t len;
2003   size_t buf_len;
2004
2005 #ifdef REG_ITOA
2006   /* get the length of the message */
2007   buf_len = regerror(REG_ITOA | err, re, nullptr, 0);
2008   if (buf_len) {
2009     buf = (char *)req::malloc_noptrs(buf_len);
2010     if (!buf) return; /* fail silently */
2011     /* finally, get the error message */
2012     regerror(REG_ITOA | err, re, buf, buf_len);
2013   }
2014 #else
2015   buf_len = 0;
2016 #endif
2017   len = regerror(err, re, nullptr, 0);
2018   if (len) {
2019     message = (char *)req::malloc_noptrs(buf_len + len + 2);
2020     if (!message) {
2021       return; /* fail silently */
2022     }
2023     if (buf_len) {
2024       snprintf(message, buf_len, "%s: ", buf);
2025       buf_len += 1; /* so pointer math below works */
2026     }
2027     /* drop the message into place */
2028     regerror(err, re, message + buf_len, len);
2029     raise_warning("%s", message);
2030   }
2031   req::free(buf);
2032   req::free(message);
2033 }
2034
2035 Variant php_split(const String& spliton, const String& str, int count,
2036                   bool icase) {
2037   const char* strp = str.data();
2038   const char* endp = strp + str.size();
2039
2040   regex_t re;
2041   int copts = icase ? REG_ICASE : 0;
2042   int err = regcomp(&re, spliton.data(), REG_EXTENDED | copts);
2043   if (err) {
2044     php_reg_eprint(err, &re);
2045     return false;
2046   }
2047
2048   Array return_value = Array::CreateVec();
2049   regmatch_t subs[1];
2050
2051   /* churn through str, generating array entries as we go */
2052   while ((count == -1 || count > 1) &&
2053          !(err = regexec(&re, strp, 1, subs, 0))) {
2054     if (subs[0].rm_so == 0 && subs[0].rm_eo) {
2055       /* match is at start of string, return empty string */
2056       return_value.append("");
2057       /* skip ahead the length of the regex match */
2058       strp += subs[0].rm_eo;
2059     } else if (subs[0].rm_so == 0 && subs[0].rm_eo == 0) {
2060       /* No more matches */
2061       regfree(&re);
2062       raise_warning("Invalid Regular Expression to split()");
2063       return false;
2064     } else {
2065       /* On a real match */
2066
2067       /* make a copy of the substring */
2068       int size = subs[0].rm_so;
2069
2070       /* add it to the array */
2071       return_value.append(String(strp, size, CopyString));
2072
2073       /* point at our new starting point */
2074       strp = strp + subs[0].rm_eo;
2075     }
2076
2077     /* if we're only looking for a certain number of points,
2078        stop looking once we hit it */
2079     if (count != -1) {
2080       count--;
2081     }
2082   }
2083
2084   /* see if we encountered an error */
2085   if (err && err != REG_NOMATCH) {
2086     php_reg_eprint(err, &re);
2087     regfree(&re);
2088     return false;
2089   }
2090
2091   /* otherwise we just have one last element to add to the array */
2092   int size = endp - strp;
2093   return_value.append(String(strp, size, CopyString));
2094
2095   regfree(&re);
2096   return return_value;
2097 }
2098
2099 ///////////////////////////////////////////////////////////////////////////////
2100 }