hphp/runtime/base/preg.cpp

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   6    +----------------------------------------------------------------------+
   7    | This source file is subject to version 3.01 of the PHP license,      |
   8    | that is bundled with this package in the file LICENSE, and is        |
   9    | available through the world-wide-web at the following url:           |
  10    | http://www.php.net/license/3_01.txt                                  |
  11    | If you did not receive a copy of the PHP license and are unable to   |
  12    | obtain it through the world-wide-web, please send a note to          |
  13    | license@php.net so we can mail you a copy immediately.               |
  14    +----------------------------------------------------------------------+
  15 */
  16
  17 #include "hphp/runtime/base/preg.h"
  18
  19 #include <atomic>
  20 #include <fstream>
  21 #include <mutex>
  22 #include <pcre.h>
  23 #include <onigposix.h>
  24 #include <utility>
  25
  26 #include <folly/AtomicHashArray.h>
  27
  28 #include "hphp/runtime/base/array-init.h"
  29 #include "hphp/runtime/base/array-iterator.h"
  30 #include "hphp/runtime/base/builtin-functions.h"
  31 #include "hphp/runtime/base/container-functions.h"
  32 #include "hphp/runtime/base/execution-context.h"
  33 #include "hphp/runtime/base/ini-setting.h"
  34 #include "hphp/runtime/base/runtime-option.h"
  35 #include "hphp/runtime/base/string-util.h"
  36 #include "hphp/runtime/base/init-fini-node.h"
  37 #include "hphp/runtime/base/zend-functions.h"
  38 #include "hphp/runtime/vm/debug/debug.h"
  39 #include "hphp/runtime/vm/treadmill.h"
  40 #include "hphp/runtime/vm/vm-regs.h"
  41
  42 #include "hphp/runtime/ext/std/ext_std_function.h"
  43 #include "hphp/runtime/ext/string/ext_string.h"
  44
  45 #include "hphp/runtime/vm/jit/mcgen.h"
  46 #include "hphp/runtime/vm/jit/types.h"
  47 #include "hphp/runtime/vm/jit/vtune-jit.h"
  48
  49 #include "hphp/util/logger.h"
  50 #include "hphp/util/concurrent-scalable-cache.h"
  51
  52 #include <folly/json.h>
  53
  54 /* Only defined in pcre >= 8.32 */
  55 #ifndef PCRE_STUDY_JIT_COMPILE
  56 # define PCRE_STUDY_JIT_COMPILE 0
  57 #endif
  58
  59 namespace HPHP {
  60
  61 using jit::TCA;
  62
  63 ///////////////////////////////////////////////////////////////////////////////
  64 // PCREglobals definition
  65
  66 PCREglobals::PCREglobals() {
  67   jit_stack = pcre_jit_stack_alloc(32768, 524288);
  68   // Set these to handle uses of pcre prior to PcreExtension::threadInit
  69   // In particular, for matching tier overrides during RuntimeOption::Load
  70   preg_backtrace_limit = RuntimeOption::PregBacktraceLimit;
  71   preg_recursion_limit = RuntimeOption::PregRecursionLimit;
  72 }
  73
  74 PCREglobals::~PCREglobals() {
  75   pcre_jit_stack_free(jit_stack);
  76 }
  77
  78 ///////////////////////////////////////////////////////////////////////////////
  79 // PCRECache definition
  80
  81 struct PCRECache {
  82   typedef std::shared_ptr<const pcre_cache_entry> EntryPtr;
  83   typedef std::unique_ptr<LRUCacheKey> TempKeyCache;
  84
  85   enum class CacheKind {
  86     Static,
  87     Lru,
  88     Scalable
  89   };
  90
  91 private:
  92   struct ahm_string_data_same {
  93     bool operator()(const StringData* s1, const StringData* s2) {
  94       // ahm uses -1, -2, -3 as magic values
  95       return int64_t(s1) > 0 && (s1 == s2 || s1->same(s2));
  96     }
  97   };
  98
  99   typedef folly::AtomicHashArray<const StringData*, const pcre_cache_entry*,
 100           string_data_hash, ahm_string_data_same> StaticCache;
 101   typedef ConcurrentLRUCache<LRUCacheKey, EntryPtr,
 102           LRUCacheKey::HashCompare> LRUCache;
 103   typedef ConcurrentScalableCache<LRUCacheKey, EntryPtr,
 104           LRUCacheKey::HashCompare> ScalableCache;
 105   typedef StaticCache::value_type StaticCachePair;
 106
 107 public:
 108   struct Accessor {
 109     Accessor()
 110       : m_kind(Kind::Empty)
 111     {}
 112
 113     ~Accessor() {
 114       switch (m_kind) {
 115         case Kind::Empty:
 116         case Kind::Ptr:
 117           break;
 118         case Kind::SmartPtr:
 119           m_u.smart_ptr.~EntryPtr();
 120           break;
 121         case Kind::AccessorKind:
 122           m_u.accessor.~ConstAccessor();
 123           break;
 124       }
 125     }
 126
 127     Accessor& operator=(const pcre_cache_entry* ptr) {
 128       assertx(m_kind == Kind::Empty || m_kind == Kind::Ptr);
 129       m_kind = Kind::Ptr;
 130       m_u.ptr = ptr;
 131       return *this;
 132     }
 133
 134     Accessor& operator=(EntryPtr&& ep) {
 135       switch (m_kind) {
 136         case Kind::AccessorKind:
 137           m_u.accessor.~ConstAccessor();
 138         case Kind::Empty:
 139         case Kind::Ptr:
 140           m_kind = Kind::SmartPtr;
 141           new (&m_u.smart_ptr) EntryPtr(std::move(ep));
 142           break;
 143         case Kind::SmartPtr:
 144           m_u.smart_ptr = std::move(ep);
 145           break;
 146       }
 147       return *this;
 148     }
 149
 150     // No assignment from LRUCache::ConstAccessor since it is non-copyable
 151     // Use resetToLRU instead
 152     LRUCache::ConstAccessor& resetToLRU() {
 153       switch (m_kind) {
 154         case Kind::SmartPtr:
 155           m_u.smart_ptr.~EntryPtr();
 156         case Kind::Empty:
 157         case Kind::Ptr:
 158           m_kind = Kind::AccessorKind;
 159           new (&m_u.accessor) LRUCache::ConstAccessor();
 160           break;
 161         case Kind::AccessorKind:
 162           break;
 163       }
 164       return m_u.accessor;
 165     }
 166
 167     const pcre_cache_entry* get() {
 168       switch (m_kind) {
 169         case Kind::Empty:    return nullptr;
 170         case Kind::Ptr:      return m_u.ptr;
 171         case Kind::SmartPtr: return m_u.smart_ptr.get();
 172         case Kind::AccessorKind: return m_u.accessor->get();
 173       }
 174       always_assert(false);
 175     }
 176
 177     const EntryPtr& entryPtr() const {
 178       assertx(m_kind == Kind::SmartPtr);
 179       return m_u.smart_ptr;
 180     }
 181
 182    private:
 183     enum class Kind : uint8_t {
 184       Empty,
 185       Ptr,
 186       SmartPtr,
 187       AccessorKind,
 188     };
 189
 190     union Ptr {
 191        Ptr() {}
 192       ~Ptr() {}
 193
 194       const pcre_cache_entry* ptr;
 195       EntryPtr smart_ptr;
 196       LRUCache::ConstAccessor accessor;
 197     };
 198
 199     Ptr m_u;
 200     Kind m_kind;
 201   };
 202
 203   PCRECache()
 204     : m_kind(CacheKind::Static), m_staticCache(nullptr)
 205   {
 206     reinit(CacheKind::Static);
 207   }
 208
 209   ~PCRECache() {
 210     if (m_kind == CacheKind::Static && m_staticCache.load()) {
 211       DestroyStatic(m_staticCache);
 212     }
 213   }
 214
 215   void reinit(CacheKind kind);
 216   bool find(Accessor& accessor, const StringData* key,
 217             TempKeyCache& keyCache);
 218   void insert(Accessor& accessor, const StringData* regex,
 219               TempKeyCache& keyCache, const pcre_cache_entry* ent);
 220   void dump(const std::string& filename);
 221   size_t size() const;
 222
 223 private:
 224   void clearStatic();
 225
 226   static void DestroyStatic(StaticCache* cache);
 227   static StaticCache* CreateStatic();
 228
 229   CacheKind m_kind;
 230   std::atomic<StaticCache*> m_staticCache;
 231   std::unique_ptr<LRUCache> m_lruCache;
 232   std::unique_ptr<ScalableCache> m_scalableCache;
 233   std::atomic<time_t> m_expire{};
 234   std::mutex m_clearMutex;
 235 };
 236
 237 ///////////////////////////////////////////////////////////////////////////////
 238 // Data
 239
 240 RDS_LOCAL(PCREglobals, tl_pcre_globals);
 241
 242 static PCRECache s_pcreCache;
 243
 244 // The last pcre error code is available for the whole thread.
 245 static RDS_LOCAL(int, rl_last_error_code);
 246
 247 ///////////////////////////////////////////////////////////////////////////////
 248 // pcre_cache_entry implementation
 249
 250 pcre_cache_entry::~pcre_cache_entry() {
 251   if (extra) {
 252 #if PCRE_MAJOR < 8 || (PCRE_MAJOR == 8 && PCRE_MINOR < 20)
 253     free(extra);
 254 #else
 255     pcre_free_study(extra);
 256 #endif
 257   }
 258   free(subpat_names);
 259   pcre_free(re);
 260 }
 261
 262 pcre_literal_data::pcre_literal_data(const char* pattern, int coptions) {
 263   if (coptions & ~PCRE_CASELESS) {
 264     return;
 265   }
 266
 267   auto p = pattern;
 268   if (*p == '^') {
 269     match_start = true;
 270     p++;
 271   }
 272
 273   std::string pattern_buffer;
 274   while (isalnum((unsigned char)*p) || (*p && strchr("/\\ :-_", *p))) {
 275     // backslash + alphanumeric character --> not a literal (i.e. \d).
 276     // backslash + non-alphanumeric character --> literal symbol (i.e. \.)
 277     if (*p == '\\') {
 278       if (!p[1] || isalnum((unsigned char)p[1])) {
 279         break;
 280       } else {
 281         p++;
 282       }
 283     }
 284     pattern_buffer += *p++;
 285   }
 286   if (*p == '$') {
 287     match_end = true;
 288     p++;
 289   }
 290   if (!*p) {
 291     /* This is an encoding of a literal string. */
 292     case_insensitive = coptions & PCRE_CASELESS;
 293     literal_str = std::move(pattern_buffer);
 294   }
 295 }
 296
 297 bool pcre_literal_data::isLiteral() const {
 298   return literal_str.has_value();
 299 }
 300
 301 bool pcre_literal_data::matches(const StringData* subject,
 302                                 int pos,
 303                                 int* offsets) const {
 304   assertx(isLiteral());
 305   assertx(pos >= 0);
 306
 307   // Subject must be at least as long as the literal pattern
 308   // for a match to occur.
 309   if (subject->size() < literal_str->length() + pos) {
 310     return false;
 311   }
 312
 313   size_t literal_strlen = literal_str->length();
 314   auto const subject_c = subject->data();
 315   auto const literal_c = literal_str->c_str();
 316   if (match_start) {
 317     // Make sure an exact match has the right length.
 318     if (pos || (match_end && subject->size() != literal_strlen)) {
 319       return false;
 320     }
 321     // If only matching the start (^), compare the strings
 322     // for the length of the literal pattern.
 323     if (case_insensitive ?
 324         bstrcaseeq(subject_c, literal_c, literal_strlen) :
 325         memcmp(subject_c, literal_c, literal_strlen) == 0) {
 326       offsets[0] = 0;
 327       offsets[1] = literal_strlen * sizeof(char);
 328       return true;
 329     }
 330   } else if (match_end) {
 331     // Compare the literal pattern against the tail end of the subject.
 332     auto const subject_tail = subject_c + (subject->size() - literal_strlen);
 333     if (case_insensitive ?
 334         bstrcaseeq(subject_tail, literal_c, literal_strlen) :
 335         memcmp(subject_tail, literal_c, literal_strlen) == 0) {
 336       offsets[0] = (subject->size() - literal_strlen) * sizeof(char);
 337       offsets[1] = subject->size() * sizeof(char);
 338       return true;
 339     }
 340   } else {
 341     if (!literal_strlen) {
 342       offsets[0] = offsets[1] = pos;
 343       return true;
 344     }
 345     // Check if the literal pattern occurs as a substring of the subject.
 346     auto const subject_str = StrNR(subject);
 347     auto const find_response = subject_str.asString().find(
 348       *literal_str, pos, !case_insensitive);
 349     if (find_response >= 0) {
 350       offsets[0] = find_response * sizeof(char);
 351       offsets[1] = offsets[0] + literal_strlen * sizeof(char);
 352       return true;
 353     }
 354   }
 355   return false;
 356 }
 357
 358 ///////////////////////////////////////////////////////////////////////////////
 359 // PCRECache implementation
 360
 361 PCRECache::StaticCache* PCRECache::CreateStatic() {
 362   StaticCache::Config config;
 363   config.maxLoadFactor = 0.5;
 364   return StaticCache::create(
 365       RuntimeOption::EvalPCRETableSize, config).release();
 366 }
 367
 368 void PCRECache::DestroyStatic(StaticCache* cache) {
 369   // We delete uncounted keys while iterating the cache, which is OK for
 370   // AtomicHashArray, but not OK for other containers, such as
 371   // std::unordered_map.  If you change the cache type make sure that property
 372   // holds or fix this function.
 373   static_assert(std::is_same<PCRECache::StaticCache,
 374       folly::AtomicHashArray<const StringData*, const pcre_cache_entry*,
 375                              string_data_hash, ahm_string_data_same>>::value,
 376       "StaticCache must be an AtomicHashArray or this destructor is wrong.");
 377   for (auto& it : *cache) {
 378     if (it.first->isUncounted()) {
 379       StringData::ReleaseUncounted(it.first);
 380     }
 381     delete it.second;
 382   }
 383   StaticCache::destroy(cache);
 384 }
 385
 386 void PCRECache::reinit(CacheKind kind) {
 387   switch (m_kind) {
 388     case CacheKind::Static:
 389       if (m_staticCache.load()) {
 390         DestroyStatic(m_staticCache);
 391         m_staticCache = nullptr;
 392       }
 393       break;
 394     case CacheKind::Lru:
 395       m_lruCache.reset();
 396       break;
 397     case CacheKind::Scalable:
 398       m_scalableCache.reset();
 399       break;
 400   }
 401   m_kind = kind;
 402
 403   switch (kind) {
 404     case CacheKind::Static:
 405       m_staticCache = CreateStatic();
 406       m_expire = time(nullptr) + RuntimeOption::EvalPCREExpireInterval;
 407       break;
 408     case CacheKind::Lru:
 409       m_lruCache.reset(new LRUCache(RuntimeOption::EvalPCRETableSize));
 410       break;
 411     case CacheKind::Scalable:
 412       m_scalableCache.reset(
 413         new ScalableCache(RuntimeOption::EvalPCRETableSize));
 414       break;
 415   }
 416 }
 417
 418 bool PCRECache::find(Accessor& accessor,
 419                      const StringData* regex,
 420                      TempKeyCache& keyCache)
 421 {
 422   switch (m_kind) {
 423     case CacheKind::Static:
 424       {
 425         assertx(m_staticCache.load());
 426         StaticCache::iterator it;
 427         auto cache = m_staticCache.load(std::memory_order_acquire);
 428         if ((it = cache->find(regex)) != cache->end()) {
 429           accessor = it->second;
 430           return true;
 431         }
 432         return false;
 433       }
 434     case CacheKind::Lru:
 435     case CacheKind::Scalable:
 436       {
 437         if (!keyCache) {
 438           keyCache.reset(new LRUCacheKey(regex->data(), regex->size()));
 439         }
 440         bool found;
 441         if (m_kind == CacheKind::Lru) {
 442           found = m_lruCache->find(accessor.resetToLRU(), *keyCache);
 443         } else {
 444           found = m_scalableCache->find(accessor.resetToLRU(), *keyCache);
 445         }
 446         return found;
 447       }
 448   }
 449   always_assert(false);
 450 }
 451
 452 void PCRECache::clearStatic() {
 453   std::unique_lock<std::mutex> lock(m_clearMutex, std::try_to_lock);
 454   if (!lock) return;
 455
 456   auto newExpire = time(nullptr) + RuntimeOption::EvalPCREExpireInterval;
 457   m_expire.store(newExpire, std::memory_order_relaxed);
 458
 459   auto tmpMap = CreateStatic();
 460   tmpMap = m_staticCache.exchange(tmpMap, std::memory_order_acq_rel);
 461
 462   Treadmill::enqueue([tmpMap]() {
 463       DestroyStatic(tmpMap);
 464    });
 465 }
 466
 467 void PCRECache::insert(
 468   Accessor& accessor,
 469   const StringData* regex,
 470   TempKeyCache& keyCache,
 471   const pcre_cache_entry* ent
 472 ) {
 473   switch (m_kind) {
 474     case CacheKind::Static:
 475       {
 476         assertx(m_staticCache.load());
 477         // Clear the cache if we haven't refreshed it in a while
 478         if (time(nullptr) > m_expire) {
 479           clearStatic();
 480         }
 481         auto const cache = m_staticCache.load(std::memory_order_acquire);
 482         auto const key =
 483           regex->isStatic() ||
 484           (regex->isUncounted() && regex->uncountedIncRef()) ?
 485           regex : StringData::MakeUncounted(regex->slice());
 486         auto pair = cache->insert(StaticCachePair(key, ent));
 487         if (pair.second) {
 488           // Inserted, container owns the pointer
 489           accessor = ent;
 490         } else {
 491           // Not inserted, caller needs to own the pointer
 492           if (regex->isUncounted()) StringData::ReleaseUncounted(key);
 493           accessor = EntryPtr(ent);
 494         }
 495       }
 496       break;
 497     case CacheKind::Lru:
 498     case CacheKind::Scalable:
 499       {
 500         if (!keyCache) {
 501           keyCache.reset(new LRUCacheKey(regex->data(), regex->size()));
 502         }
 503         // Pointer ownership is shared between container and caller
 504         accessor = EntryPtr(ent);
 505         if (m_kind == CacheKind::Lru) {
 506           m_lruCache->insert(*keyCache, accessor.entryPtr());
 507         } else {
 508           m_scalableCache->insert(*keyCache, accessor.entryPtr());
 509         }
 510       }
 511       break;
 512   }
 513 }
 514
 515 void PCRECache::dump(const std::string& filename) {
 516   std::ofstream out(filename.c_str());
 517   switch (m_kind) {
 518     case CacheKind::Static:
 519       for (auto& it : *m_staticCache) {
 520         out << it.first->data() << "\n";
 521       }
 522       break;
 523     case CacheKind::Lru:
 524     case CacheKind::Scalable:
 525       {
 526         std::vector<LRUCacheKey> keys;
 527         if (m_kind == CacheKind::Lru) {
 528           m_lruCache->snapshotKeys(keys);
 529         } else {
 530           m_scalableCache->snapshotKeys(keys);
 531         }
 532         for (auto& key: keys) {
 533           out << key.c_str() << "\n";
 534         }
 535       }
 536       break;
 537   }
 538   out.close();
 539 }
 540
 541 size_t PCRECache::size() const {
 542   switch (m_kind) {
 543     case CacheKind::Static:
 544       return m_staticCache.load(std::memory_order_acquire)->size();
 545     case CacheKind::Lru:
 546       return m_lruCache->size();
 547     case CacheKind::Scalable:
 548      return m_scalableCache->size();
 549   }
 550   always_assert(false);
 551 }
 552
 553 ///////////////////////////////////////////////////////////////////////////////
 554 // Public interface and helper functions
 555
 556 void pcre_reinit() {
 557   PCRECache::CacheKind kind;
 558   if (RuntimeOption::EvalPCRECacheType == "static") {
 559     kind = PCRECache::CacheKind::Static;
 560   } else if (RuntimeOption::EvalPCRECacheType == "lru") {
 561     kind = PCRECache::CacheKind::Lru;
 562   } else if (RuntimeOption::EvalPCRECacheType == "scalable") {
 563     kind = PCRECache::CacheKind::Scalable;
 564   } else {
 565     Logger::Warning("Eval.PCRECacheType should be either static, "
 566                     "lru or scalable");
 567     kind = PCRECache::CacheKind::Scalable;
 568   }
 569   s_pcreCache.reinit(kind);
 570 }
 571
 572 void pcre_init() {
 573 }
 574
 575 void pcre_dump_cache(const std::string& filename) {
 576   s_pcreCache.dump(filename);
 577 }
 578
 579 static pcre_jit_stack* alloc_jit_stack(void* /*data*/) {
 580   return tl_pcre_globals->jit_stack;
 581 }
 582
 583 namespace {
 584
 585 template<bool useSmartFree = false>
 586 struct FreeHelperImpl {
 587   explicit FreeHelperImpl(void* p) : p(p) {}
 588   ~FreeHelperImpl() {
 589     useSmartFree ? req::free(p) : free(p);
 590   }
 591
 592   FreeHelperImpl(const FreeHelperImpl&) = delete;
 593   FreeHelperImpl& operator=(const FreeHelperImpl&) = delete;
 594
 595 private:
 596   void* p;
 597 };
 598
 599 typedef FreeHelperImpl<true> SmartFreeHelper;
 600 }
 601
 602 static void init_local_extra(pcre_extra* local, pcre_extra* shared) {
 603   if (shared) {
 604     memcpy(local, shared, sizeof(pcre_extra));
 605   } else {
 606     memset(local, 0, sizeof(pcre_extra));
 607     local->flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 608   }
 609   local->match_limit = tl_pcre_globals->preg_backtrace_limit;
 610   local->match_limit_recursion = tl_pcre_globals->preg_recursion_limit;
 611 }
 612
 613 static const char* const*
 614 get_subpat_names(const pcre_cache_entry* pce) {
 615   char **subpat_names = pce->subpat_names.load(std::memory_order_relaxed);
 616   if (subpat_names) {
 617     return subpat_names;
 618   }
 619
 620   /*
 621   * Build a mapping from subpattern numbers to their names. We will always
 622   * allocate the table, even though there may be no named subpatterns. This
 623   * avoids somewhat more complicated logic in the inner loops.
 624   */
 625   pcre_extra extra;
 626   init_local_extra(&extra, pce->extra);
 627
 628   int name_count;
 629
 630   subpat_names = (char **)calloc(pce->num_subpats, sizeof(char *));
 631   int rc = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMECOUNT, &name_count);
 632   if (rc < 0) {
 633     raise_warning("Internal pcre_fullinfo() error %d", rc);
 634     return nullptr;
 635   }
 636   if (name_count > 0) {
 637     int name_size, ni = 0;
 638     unsigned short name_idx;
 639     char* name_table;
 640     int rc1, rc2;
 641
 642     rc1 = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMETABLE, &name_table);
 643     rc2 = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
 644     rc = rc2 ? rc2 : rc1;
 645     if (rc < 0) {
 646       raise_warning("Internal pcre_fullinfo() error %d", rc);
 647       return nullptr;
 648     }
 649     while (ni++ < name_count) {
 650       name_idx = 0xff * (unsigned char)name_table[0] +
 651                  (unsigned char)name_table[1];
 652       subpat_names[name_idx] = name_table + 2;
 653       if (is_numeric_string(subpat_names[name_idx],
 654                             strlen(subpat_names[name_idx]),
 655                             nullptr, nullptr, 0) != KindOfNull) {
 656         raise_warning("Numeric named subpatterns are not allowed");
 657         return nullptr;
 658       }
 659       name_table += name_size;
 660     }
 661   }
 662   // Store subpat_names into the cache entry
 663   char **expected = nullptr;
 664   if (!pce->subpat_names.compare_exchange_strong(expected, subpat_names)) {
 665     // Another thread stored subpat_names already. The array created by the
 666     // other thread is now in expected, return it instead and delete the one
 667     // we just made.
 668     free(subpat_names);
 669     return expected;
 670   }
 671   return subpat_names;
 672 }
 673
 674 static bool get_pcre_fullinfo(pcre_cache_entry* pce) {
 675   pcre_extra extra;
 676   init_local_extra(&extra, pce->extra);
 677
 678   /* Calculate the size of the offsets array*/
 679   int rc = pcre_fullinfo(pce->re, &extra, PCRE_INFO_CAPTURECOUNT,
 680                          &pce->num_subpats);
 681   if (rc < 0) {
 682     raise_warning("Internal pcre_fullinfo() error %d", rc);
 683     return false;
 684   }
 685   pce->num_subpats++;
 686   return true;
 687 }
 688
 689 static bool
 690 pcre_get_compiled_regex_cache(PCRECache::Accessor& accessor,
 691                               const StringData* regex) {
 692   PCRECache::TempKeyCache tkc;
 693
 694   /* Try to lookup the cached regex entry, and if successful, just pass
 695      back the compiled pattern, otherwise go on and compile it. */
 696   if (s_pcreCache.find(accessor, regex, tkc)) {
 697     return true;
 698   }
 699
 700   /* Parse through the leading whitespace, and display a warning if we
 701      get to the end without encountering a delimiter. */
 702   const char *p = regex->data();
 703   while (isspace((int)*(unsigned char *)p)) p++;
 704   if (*p == 0) {
 705     raise_warning("Empty regular expression");
 706     return false;
 707   }
 708
 709   /* Get the delimiter and display a warning if it is alphanumeric
 710      or a backslash. */
 711   char delimiter = *p++;
 712   if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
 713     raise_warning("Delimiter must not be alphanumeric or backslash");
 714     return false;
 715   }
 716
 717   char start_delimiter = delimiter;
 718   const char *pp = strchr("([{< )]}> )]}>", delimiter);
 719   if (pp) {
 720     delimiter = pp[5];
 721   }
 722   char end_delimiter = delimiter;
 723
 724   if (start_delimiter == end_delimiter) {
 725     /* We need to iterate through the pattern, searching for the ending
 726      * delimiter, but skipping the backslashed delimiters. If the ending
 727      * delimiter is not found, display a warning. */
 728     pp = p;
 729     while (*pp != 0) {
 730       if (*pp == '\\' && pp[1] != 0) pp++;
 731       else if (*pp == delimiter)
 732         break;
 733       pp++;
 734     }
 735     if (*pp == 0) {
 736       raise_warning("No ending delimiter '%c' found: [%s]", delimiter,
 737                       regex->data());
 738       return false;
 739     }
 740   } else {
 741     /* We iterate through the pattern, searching for the matching ending
 742      * delimiter. For each matching starting delimiter, we increment nesting
 743      * level, and decrement it for each matching ending delimiter. If we
 744      * reach the end of the pattern without matching, display a warning.
 745      */
 746     int brackets = 1; // brackets nesting level
 747     pp = p;
 748     while (*pp != 0) {
 749       if (*pp == '\\' && pp[1] != 0) pp++;
 750       else if (*pp == end_delimiter && --brackets <= 0)
 751         break;
 752       else if (*pp == start_delimiter)
 753         brackets++;
 754       pp++;
 755     }
 756     if (*pp == 0) {
 757       raise_warning("No ending matching delimiter '%c' found: [%s]",
 758                       end_delimiter, regex->data());
 759       return false;
 760     }
 761   }
 762
 763   /* Make a copy of the actual pattern. */
 764   String spattern(p, pp-p, CopyString);
 765   const char *pattern = spattern.data();
 766
 767   /* Move on to the options */
 768   pp++;
 769
 770   /* Parse through the options, setting appropriate flags.  Display
 771      a warning if we encounter an unknown modifier. */
 772   int coptions = 0;
 773   int poptions = 0;
 774   bool do_study = false;
 775   while (*pp != 0) {
 776     switch (*pp++) {
 777       /* Perl compatible options */
 778     case 'i':  coptions |= PCRE_CASELESS;       break;
 779     case 'm':  coptions |= PCRE_MULTILINE;      break;
 780     case 's':  coptions |= PCRE_DOTALL;         break;
 781     case 'x':  coptions |= PCRE_EXTENDED;       break;
 782
 783       /* PCRE specific options */
 784     case 'A':  coptions |= PCRE_ANCHORED;       break;
 785     case 'D':  coptions |= PCRE_DOLLAR_ENDONLY; break;
 786     case 'S':  do_study = true;                 break;
 787     case 'U':  coptions |= PCRE_UNGREEDY;       break;
 788     case 'X':  coptions |= PCRE_EXTRA;          break;
 789     case 'u':  coptions |= PCRE_UTF8;
 790   /* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
 791        characters, even in UTF-8 mode. However, this can be changed by setting
 792        the PCRE_UCP option. */
 793 #ifdef PCRE_UCP
 794             coptions |= PCRE_UCP;
 795 #endif
 796       break;
 797
 798       /* Custom preg options */
 799     case 'e':  poptions |= PREG_REPLACE_EVAL;   break;
 800
 801     case ' ':
 802     case '\n':
 803       break;
 804
 805     default:
 806       raise_warning("Unknown modifier '%c': [%s]", pp[-1], regex->data());
 807       return false;
 808     }
 809   }
 810
 811   /* We've reached a null byte, now check if we're actually at the end of the
 812      string.  If not this is a bad expression, and a potential security hole. */
 813   if (regex->size() != (pp - regex->data())) {
 814     raise_error("Error: Null byte found in pattern");
 815   }
 816
 817   /* Compile pattern and display a warning if compilation failed. */
 818   const char  *error;
 819   int erroffset;
 820   pcre *re = pcre_compile(pattern, coptions, &error, &erroffset, 0);
 821   if (re == nullptr) {
 822     raise_warning("Compilation failed: %s at offset %d", error, erroffset);
 823     return false;
 824   }
 825
 826   // Careful: from here 're' needs to be freed if something throws.
 827
 828   // TODO(t14969501): enable literal_data everywhere and skip the
 829   // pcre_compile above.
 830   auto const literal_data = pcre_literal_data(pattern, coptions);
 831
 832   /* If study option was specified, study the pattern and
 833      store the result in extra for passing to pcre_exec. */
 834   pcre_extra *extra = nullptr;
 835   if (!literal_data.isLiteral()) {
 836     if (do_study || PCRE_STUDY_JIT_COMPILE) {
 837       int soptions = PCRE_STUDY_JIT_COMPILE;
 838       extra = pcre_study(re, soptions, &error);
 839       if (extra) {
 840         extra->flags |= PCRE_EXTRA_MATCH_LIMIT |
 841           PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 842         pcre_assign_jit_stack(extra, alloc_jit_stack, nullptr);
 843       }
 844       if (error != nullptr) {
 845         try {
 846           raise_warning("Error while studying pattern");
 847         } catch (...) {
 848           pcre_free(re);
 849           throw;
 850         }
 851       }
 852       if ((!RuntimeOption::EvalJitNoGdb ||
 853            RuntimeOption::EvalJitUseVtuneAPI ||
 854            RuntimeOption::EvalPerfPidMap) &&
 855           extra &&
 856           extra->executable_jit != nullptr) {
 857         size_t size;
 858         pcre_fullinfo(re, extra, PCRE_INFO_JITSIZE, &size);
 859
 860         TCA start = *(TCA *)(extra->executable_jit);
 861         TCA end = start + size;
 862         std::string name = folly::sformat("HHVM::pcre_jit::{}", pattern);
 863
 864         if (!RuntimeOption::EvalJitNoGdb && jit::mcgen::initialized()) {
 865           Debug::DebugInfo::Get()->recordStub(Debug::TCRange(start, end, false),
 866                                               name);
 867         }
 868         if (RuntimeOption::EvalJitUseVtuneAPI) {
 869           HPHP::jit::reportHelperToVtune(name.c_str(), start, end);
 870         }
 871         if (RuntimeOption::EvalPerfPidMap && jit::mcgen::initialized()) {
 872           std::string escaped_name;
 873           folly::json::escapeString(name, escaped_name,
 874                                     folly::json::serialization_opts());
 875           Debug::DebugInfo::Get()->recordPerfMap(
 876             Debug::TCRange(start, end, false),
 877             SrcKey{}, nullptr, false, false, escaped_name
 878           );
 879         }
 880       }
 881     }
 882   }
 883
 884   /* Store the compiled pattern and extra info in the cache. */
 885   pcre_cache_entry* new_entry = new pcre_cache_entry();
 886   new_entry->re = re;
 887   new_entry->extra = extra;
 888   if (literal_data.isLiteral()) {
 889     new_entry->literal_data =
 890       std::make_unique<pcre_literal_data>(std::move(literal_data));
 891   }
 892
 893   assertx((poptions & ~0x1) == 0);
 894   new_entry->preg_options = poptions;
 895
 896   assertx((coptions & 0x80000000) == 0);
 897   new_entry->compile_options = coptions;
 898
 899   /* Get pcre full info */
 900   if (!get_pcre_fullinfo(new_entry)) {
 901     delete new_entry;
 902     return false;
 903   }
 904
 905   s_pcreCache.insert(accessor, regex, tkc, new_entry);
 906   return true;
 907 }
 908
 909 static int* create_offset_array(const pcre_cache_entry* pce,
 910                                 int& size_offsets) {
 911   /* Allocate memory for the offsets array */
 912   size_offsets = pce->num_subpats * 3;
 913   return (int *)req::malloc_noptrs(size_offsets * sizeof(int));
 914 }
 915
 916 static inline void add_offset_pair_split(Array& result,
 917                                          const String& str,
 918                                          int offset,
 919                                          const char* name,
 920                                          bool hackArrOutput) {
 921   auto match_pair = hackArrOutput
 922     ? make_vec_array(str, offset)
 923     : make_varray(str, offset);
 924   if (name) result.set(String(name), match_pair);
 925   result.append(match_pair);
 926 }
 927
 928 static inline void add_offset_pair_match(Array& result,
 929                                          const String& str,
 930                                          int offset,
 931                                          const char* name,
 932                                          bool hackArrOutput) {
 933   auto match_pair = hackArrOutput
 934     ? make_vec_array(str, offset)
 935     : make_varray(str, offset);
 936   if (name) result.set(String(name), match_pair);
 937   result.append(match_pair);
 938 }
 939
 940 static inline bool pcre_need_log_error(int pcre_code) {
 941   return RuntimeOption::EnablePregErrorLog &&
 942          (pcre_code == PCRE_ERROR_MATCHLIMIT ||
 943           pcre_code == PCRE_ERROR_RECURSIONLIMIT);
 944 }
 945
 946 static void pcre_log_error(const char* func, int line, int pcre_code,
 947                            const char* pattern, int pattern_size,
 948                            const char* subject, int subject_size,
 949                            const char* repl, int repl_size,
 950                            int arg1 = 0, int arg2 = 0,
 951                            int arg3 = 0, int arg4 = 0) {
 952   const char* escapedPattern;
 953   const char* escapedSubject;
 954   const char* escapedRepl;
 955   std::string p(pattern, pattern_size);
 956   std::string s(subject, subject_size);
 957   std::string r(repl, repl_size);
 958   escapedPattern = Logger::EscapeString(p);
 959   escapedSubject = Logger::EscapeString(s);
 960   escapedRepl = Logger::EscapeString(r);
 961   const char* errString =
 962     (pcre_code == PCRE_ERROR_MATCHLIMIT) ? "PCRE_ERROR_MATCHLIMIT" :
 963     (pcre_code == PCRE_ERROR_RECURSIONLIMIT) ? "PCRE_ERROR_RECURSIONLIMIT" :
 964     "UNKNOWN";
 965   raise_warning_unsampled(
 966     "REGEXERR: %s/%d: err=%d(%s), pattern='%s', subject='%s', repl='%s', "
 967     "limits=(%" PRId64 ", %" PRId64 "), extra=(%d, %d, %d, %d)",
 968     func, line, pcre_code, errString,
 969     escapedPattern, escapedSubject, escapedRepl,
 970     tl_pcre_globals->preg_backtrace_limit,
 971     tl_pcre_globals->preg_recursion_limit,
 972     arg1, arg2, arg3, arg4);
 973   free((void *)escapedPattern);
 974   free((void *)escapedSubject);
 975   free((void *)escapedRepl);
 976 }
 977
 978 static void pcre_handle_exec_error(int pcre_code) {
 979   int preg_code = 0;
 980   switch (pcre_code) {
 981   case PCRE_ERROR_MATCHLIMIT:
 982     preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
 983     break;
 984   case PCRE_ERROR_RECURSIONLIMIT:
 985     preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
 986     break;
 987   case PCRE_ERROR_BADUTF8:
 988     preg_code = PHP_PCRE_BAD_UTF8_ERROR;
 989     break;
 990   case PCRE_ERROR_BADUTF8_OFFSET:
 991     preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
 992     break;
 993   default:
 994     preg_code = PHP_PCRE_INTERNAL_ERROR;
 995     break;
 996   }
 997   *rl_last_error_code = preg_code;
 998 }
 999
1000 ///////////////////////////////////////////////////////////////////////////////
1001
1002 Variant preg_grep(const String& pattern, const Array& input, int flags /* = 0 */) {
1003   PCRECache::Accessor accessor;
1004   if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1005     return false;
1006   }
1007   const pcre_cache_entry* pce = accessor.get();
1008
1009   int size_offsets = 0;
1010   int* offsets = create_offset_array(pce, size_offsets);
1011   if (offsets == nullptr) {
1012     return false;
1013   }
1014   SmartFreeHelper freer(offsets);
1015
1016   const bool hackArrOutput = flags & PREG_FB_HACK_ARRAYS;
1017
1018   /* Initialize return array */
1019   auto ret = hackArrOutput ? Array::CreateDict() : Array::Create();
1020   *rl_last_error_code = PHP_PCRE_NO_ERROR;
1021
1022   /* Go through the input array */
1023   bool invert = (flags & PREG_GREP_INVERT);
1024   pcre_extra extra;
1025   init_local_extra(&extra, pce->extra);
1026
1027   for (ArrayIter iter(input); iter; ++iter) {
1028     String entry = iter.second().toString();
1029
1030     /* Perform the match */
1031     int count = pcre_exec(pce->re, &extra, entry.data(), entry.size(),
1032                           0, 0, offsets, size_offsets);
1033
1034     /* Check for too many substrings condition. */
1035     if (count == 0) {
1036       raise_warning("Matched, but too many substrings");
1037       count = size_offsets / 3;
1038     } else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1039       if (pcre_need_log_error(count)) {
1040         pcre_log_error(__FUNCTION__, __LINE__, count,
1041                        pattern.data(), pattern.size(),
1042                        entry.data(), entry.size(),
1043                        "", 0,
1044                        flags);
1045       }
1046       pcre_handle_exec_error(count);
1047       break;
1048     }
1049
1050     /* If the entry fits our requirements */
1051     if ((count > 0 && !invert) ||
1052         (count == PCRE_ERROR_NOMATCH && invert)) {
1053
1054       /* Add to return array */
1055       ret.set(iter.first(), entry);
1056     }
1057   }
1058
1059   return ret;
1060 }
1061
1062 ///////////////////////////////////////////////////////////////////////////////
1063
1064 namespace {
1065
1066 Array& forceToOutput(Variant& var, bool hackArrOutput) {
1067   return hackArrOutput ? forceToDict(var) : forceToDArray(var);
1068 }
1069
1070 Array& forceToOutput(tv_lval lval, bool hackArrOutput) {
1071   return hackArrOutput ? forceToDict(lval) : forceToDArray(lval);
1072 }
1073
1074 }
1075
1076 static Variant preg_match_impl(const StringData* pattern,
1077                                const StringData* subject,
1078                                Variant* subpats, int flags, int start_offset,
1079                                bool global) {
1080   PCRECache::Accessor accessor;
1081   if (!pcre_get_compiled_regex_cache(accessor, pattern)) {
1082     return false;
1083   }
1084   const pcre_cache_entry* pce = accessor.get();
1085
1086   const bool hackArrOutput = flags & PREG_FB_HACK_ARRAYS;
1087   const bool includeNonMatchingCaptures = flags & PREG_FB__PRIVATE__HSL_IMPL;
1088
1089   pcre_extra extra;
1090   init_local_extra(&extra, pce->extra);
1091   if (subpats) {
1092     *subpats = hackArrOutput ? Array::CreateDict() : Array::CreateDArray();
1093   }
1094   int exec_options = 0;
1095
1096   int subpats_order = global ? PREG_PATTERN_ORDER : 0;
1097   bool offset_capture = false;
1098   if (flags) {
1099     offset_capture = flags & PREG_OFFSET_CAPTURE;
1100
1101     /*
1102      * subpats_order is pre-set to pattern mode so we change it only if
1103      * necessary.
1104      */
1105     if (flags & 0xff) {
1106       subpats_order = flags & 0xff;
1107     }
1108     if ((global && (subpats_order < PREG_PATTERN_ORDER ||
1109                     subpats_order > PREG_SET_ORDER)) ||
1110         (!global && subpats_order != 0)) {
1111       raise_warning("Invalid flags specified");
1112       return init_null();
1113     }
1114   }
1115
1116   /* Negative offset counts from the end of the string. */
1117   if (start_offset < 0) {
1118     start_offset = subject->size() + start_offset;
1119     if (start_offset < 0) {
1120       start_offset = 0;
1121     }
1122   }
1123
1124   int size_offsets = 0;
1125   int* offsets = create_offset_array(pce, size_offsets);
1126   SmartFreeHelper offsetsFreer(offsets);
1127   int num_subpats = size_offsets / 3;
1128   if (offsets == nullptr) {
1129     return false;
1130   }
1131
1132   const char* const* subpat_names = get_subpat_names(pce);
1133   if (subpat_names == nullptr) {
1134     return false;
1135   }
1136
1137   /* Allocate match sets array and initialize the values. */
1138
1139   /* An array of sets of matches for each subpattern after a global match */
1140   auto match_sets = hackArrOutput ? Array::CreateDict() : Array::CreateDArray();
1141   if (global && subpats_order == PREG_PATTERN_ORDER) {
1142     for (int i = 0; i < num_subpats; i++) {
1143       match_sets.set(i,
1144         hackArrOutput ? Array::CreateDict() : Array::CreateDArray());
1145     }
1146   }
1147
1148   int matched = 0;
1149   *rl_last_error_code = PHP_PCRE_NO_ERROR;
1150
1151   int g_notempty = 0; // If the match should not be empty
1152   const char** stringlist; // Holds list of subpatterns
1153   int i;
1154   do {
1155
1156     int count = 0;
1157     /*
1158      * Optimization: If the pattern defines a literal substring,
1159      * compare the strings directly (i.e. memcmp) instead of performing
1160      * the full regular expression evaluation.
1161      * Take the slow path if there are any special compile options.
1162      */
1163     if (pce->literal_data && !global) {
1164       assertx(pce->literal_data->isLiteral());
1165       /* TODO(t13140878): compare literal against multiple substrings
1166        * in the preg_match_all (global == true) case. */
1167       count = pce->literal_data->matches(subject, start_offset, offsets) ? 1
1168         : PCRE_ERROR_NOMATCH;
1169     } else {
1170       /* Execute the regular expression. */
1171       count = pcre_exec(pce->re, &extra, subject->data(), subject->size(),
1172                         start_offset,
1173                         exec_options | g_notempty,
1174                         offsets, size_offsets);
1175
1176       /* The string was already proved to be valid UTF-8 */
1177       exec_options |= PCRE_NO_UTF8_CHECK;
1178     }
1179     /* Check for too many substrings condition. */
1180     if (count == 0) {
1181       raise_warning("Matched, but too many substrings");
1182       count = size_offsets / 3;
1183     }
1184
1185     /* If something has matched */
1186     if (count > 0) {
1187       matched++;
1188
1189       if (subpats) {
1190         // Try to get the list of substrings and display a warning if failed.
1191         if (offsets[1] < offsets[0] ||
1192             pcre_get_substring_list(subject->data(), offsets, count,
1193                                     &stringlist) < 0) {
1194           raise_warning("Get subpatterns list failed");
1195           return false;
1196         }
1197
1198         if (global) {  /* global pattern matching */
1199           if (subpats_order == PREG_PATTERN_ORDER) {
1200             /* For each subpattern, insert it into the appropriate array. */
1201             for (i = 0; i < count; i++) {
1202               if (offset_capture) {
1203                 auto const lval = match_sets.lval(i);
1204                 add_offset_pair_match(forceToOutput(lval, hackArrOutput),
1205                                       String(stringlist[i],
1206                                              offsets[(i<<1)+1] - offsets[i<<1],
1207                                              CopyString),
1208                                       offsets[i<<1],
1209                                       nullptr,
1210                                       hackArrOutput);
1211               } else {
1212                 auto const lval = match_sets.lval(i);
1213                 forceToOutput(lval, hackArrOutput).append(
1214                   String(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
1215                     CopyString)
1216                 );
1217               }
1218             }
1219             /*
1220              * If the number of captured subpatterns on this run is
1221              * less than the total possible number, pad the result
1222              * arrays with empty strings.
1223              */
1224             if (count < num_subpats) {
1225               for (; i < num_subpats; i++) {
1226                 auto const lval = match_sets.lval(i);
1227                 forceToOutput(lval, hackArrOutput).append("");
1228               }
1229             }
1230           } else {
1231             auto result_set = hackArrOutput
1232               ? Array::CreateDict()
1233               : Array::CreateDArray();
1234
1235             /* Add all the subpatterns to it */
1236             for (i = 0; i < count; i++) {
1237               if (offset_capture) {
1238                 add_offset_pair_match(result_set,
1239                                       String(stringlist[i],
1240                                              offsets[(i<<1)+1] - offsets[i<<1],
1241                                              CopyString),
1242                                       offsets[i<<1],
1243                                       subpat_names[i],
1244                                       hackArrOutput);
1245               } else {
1246                 String value(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
1247                              CopyString);
1248                 if (subpat_names[i]) {
1249                   result_set.set(String(subpat_names[i]), value);
1250                 }
1251                 result_set.append(value);
1252               }
1253             }
1254             if (includeNonMatchingCaptures && count < num_subpats) {
1255               for (; i < num_subpats; i++) {
1256                 // We don't want to set the numeric key if there is a string
1257                 // key, but we have do it usually to make migration from
1258                 // preg_match() practical; given that existing code gets
1259                 // nothing for unmatched captures, we don't need to set both
1260                 // here.
1261                 if (offset_capture) {
1262                   add_offset_pair_match(
1263                     forceToOutput(*subpats, hackArrOutput),
1264                     empty_string(),
1265                     offsets[i<<1],
1266                     subpat_names[i],
1267                     hackArrOutput
1268                   );
1269                 } else {
1270                   if (subpat_names[i]) {
1271                     result_set.set(String(subpat_names[i]), empty_string_tv());
1272                   }
1273                   result_set.append(empty_string());
1274                 }
1275               }
1276             }
1277             /* And add it to the output array */
1278             forceToOutput(*subpats, hackArrOutput).append(
1279               std::move(result_set)
1280             );
1281           }
1282         } else {      /* single pattern matching */
1283           /* For each subpattern, insert it into the subpatterns array. */
1284           for (i = 0; i < count; i++) {
1285             if (offset_capture) {
1286               add_offset_pair_match(forceToOutput(*subpats, hackArrOutput),
1287                                     String(stringlist[i],
1288                                            offsets[(i<<1)+1] - offsets[i<<1],
1289                                            CopyString),
1290                                     offsets[i<<1],
1291                                     subpat_names[i],
1292                                     hackArrOutput);
1293             } else {
1294               String value(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
1295                            CopyString);
1296               if (subpat_names[i]) {
1297                 forceToOutput(*subpats, hackArrOutput).set(
1298                   String(subpat_names[i]), value
1299                 );
1300               }
1301               forceToOutput(*subpats, hackArrOutput).append(value);
1302             }
1303           }
1304           if (includeNonMatchingCaptures && count < num_subpats) {
1305             for (; i < num_subpats; i++) {
1306               if (offset_capture) {
1307                 add_offset_pair_match(
1308                   forceToOutput(*subpats, hackArrOutput),
1309                   empty_string(),
1310                   offsets[i<<1],
1311                   subpat_names[i],
1312                   hackArrOutput
1313                 );
1314               } else {
1315                 if (subpat_names[i]) {
1316                   forceToOutput(*subpats, hackArrOutput).set(
1317                     String(subpat_names[i]), empty_string()
1318                   );
1319                 }
1320                 forceToOutput(*subpats, hackArrOutput).append(empty_string());
1321               }
1322             }
1323           }
1324         }
1325         pcre_free((void *) stringlist);
1326       }
1327     } else if (count == PCRE_ERROR_NOMATCH) {
1328       /* If we previously set PCRE_NOTEMPTY after a null match,
1329          this is not necessarily the end. We need to advance
1330          the start offset, and continue. Fudge the offset values
1331          to achieve this, unless we're already at the end of the string. */
1332       if (g_notempty && start_offset < subject->size()) {
1333         offsets[0] = start_offset;
1334         offsets[1] = start_offset + 1;
1335       } else
1336         break;
1337     } else {
1338       if (pcre_need_log_error(count)) {
1339         pcre_log_error(__FUNCTION__, __LINE__, count,
1340                        pattern->data(), pattern->size(),
1341                        subject->data(), subject->size(),
1342                        "", 0,
1343                        flags, start_offset, g_notempty, global);
1344       }
1345       pcre_handle_exec_error(count);
1346       return false;
1347     }
1348
1349     /* If we have matched an empty string, mimic what Perl's /g options does.
1350        This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1351        the match again at the same point. If this fails (picked up above) we
1352        advance to the next character. */
1353     g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1354
1355     /* Advance to the position right after the last full match */
1356     start_offset = offsets[1];
1357   } while (global);
1358
1359   /* Add the match sets to the output array and clean up */
1360   if (subpats && global && subpats_order == PREG_PATTERN_ORDER) {
1361     for (i = 0; i < num_subpats; i++) {
1362       if (subpat_names[i]) {
1363         forceToOutput(*subpats, hackArrOutput).set(
1364           String(subpat_names[i]), match_sets[i]
1365         );
1366       }
1367       forceToOutput(*subpats, hackArrOutput).append(match_sets[i]);
1368     }
1369   }
1370   return matched;
1371 }
1372
1373 Variant preg_match(const String& pattern, const String& subject,
1374                    Variant* matches /* = nullptr */, int flags /* = 0 */,
1375                    int offset /* = 0 */) {
1376   return preg_match(pattern.get(), subject.get(), matches, flags, offset);
1377 }
1378
1379 Variant preg_match(const StringData* pattern, const StringData* subject,
1380                    Variant* matches /* = nullptr */, int flags /* = 0 */,
1381                    int offset /* = 0 */) {
1382   return preg_match_impl(pattern, subject, matches, flags, offset, false);
1383 }
1384
1385 Variant preg_match_all(const String& pattern, const String& subject,
1386                        Variant* matches /* = nullptr */,
1387                        int flags /* = 0 */, int offset /* = 0 */) {
1388   return preg_match_all(pattern.get(), subject.get(), matches, flags, offset);
1389 }
1390
1391 Variant preg_match_all(const StringData* pattern, const StringData* subject,
1392                        Variant* matches /* = nullptr */,
1393                        int flags /* = 0 */, int offset /* = 0 */) {
1394   return preg_match_impl(pattern, subject, matches, flags, offset, true);
1395 }
1396
1397 ///////////////////////////////////////////////////////////////////////////////
1398
1399 static String preg_do_repl_func(const Variant& function, const String& subject,
1400                                 int* offsets, const char* const* subpat_names,
1401                                 int count) {
1402   Array subpats = Array::CreateDArray();
1403   for (int i = 0; i < count; i++) {
1404     auto off1 = offsets[i<<1];
1405     auto off2 = offsets[(i<<1)+1];
1406     auto sub = subject.substr(off1, off2 - off1);
1407
1408     if (subpat_names[i]) {
1409       subpats.set(String(subpat_names[i]), sub);
1410     }
1411     subpats.append(sub);
1412   }
1413
1414   return vm_call_user_func(function, make_varray(subpats)).toString();
1415 }
1416
/*
 * Try to parse a back-reference at *str.
 *
 * *str points at the introducer character (callers pass a '\\' or '$').
 * Accepted forms are the introducer followed by one or two decimal digits,
 * or "${" + one or two digits + "}". On success the parsed number is stored
 * in *backref, *str is advanced past the reference, and true is returned.
 * On failure *str is left untouched and false is returned; *backref may
 * already have been written if digits were seen before the failure.
 */
static bool preg_get_backref(const char** str, int* backref) {
  auto const isDigit = [](char c) { return c >= '0' && c <= '9'; };

  const char* cursor = *str;

  /* Nothing follows the introducer, so there is no reference here */
  if (cursor[1] == '\0') {
    return false;
  }

  /* "${" opens the braced form; otherwise only the introducer is skipped */
  const bool braced = (cursor[0] == '$' && cursor[1] == '{');
  cursor += braced ? 2 : 1;

  /* First digit is mandatory */
  if (!isDigit(*cursor)) {
    return false;
  }
  *backref = *cursor - '0';
  ++cursor;

  /* Second digit is optional */
  if (isDigit(*cursor)) {
    *backref = *backref * 10 + *cursor - '0';
    ++cursor;
  }

  /* The braced form has to be closed */
  if (braced) {
    if (*cursor != '}') {
      return false;
    }
    ++cursor;
  }

  *str = cursor;
  return true;
}
1453
1454 static Variant php_pcre_replace(const String& pattern, const String& subject,
1455                                 const Variant& replace_var, bool callable,
1456                                 int limit, int* replace_count) {
1457   PCRECache::Accessor accessor;
1458   if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1459     return false;
1460   }
1461   const pcre_cache_entry* pce = accessor.get();
1462   if (pce->preg_options & PREG_REPLACE_EVAL) {
1463     throw Exception(
1464       "preg_replace(): Support for the /e modifier has been removed, use "
1465       "preg_replace_callback instead"
1466     );
1467   }
1468
1469   int size_offsets;
1470   int* offsets = create_offset_array(pce, size_offsets);
1471   SmartFreeHelper offsetsFreer(offsets);
1472   if (offsets == nullptr) {
1473     return false;
1474   }
1475
1476   const char* const* subpat_names = get_subpat_names(pce);
1477   if (subpat_names == nullptr) {
1478     return false;
1479   }
1480
1481   const char* replace = nullptr;
1482   const char* replace_end = nullptr;
1483   int replace_len = 0;
1484   String replace_val;
1485
1486   if (!callable) {
1487     replace_val = replace_var.toString();
1488     replace = replace_val.data();
1489     replace_len = replace_val.size();
1490     replace_end = replace + replace_len;
1491   }
1492
1493   StringBuffer result(2 * subject.size());
1494
1495   try {
1496
1497     /* Initialize */
1498     const char* match = nullptr;
1499     int start_offset = 0;
1500     *rl_last_error_code = PHP_PCRE_NO_ERROR;
1501     pcre_extra extra;
1502     init_local_extra(&extra, pce->extra);
1503
1504     const char* walk;     // Used to walk the replacement string
1505     char walk_last;       // Last walked character
1506     int match_len;        // Length of the current match
1507     int backref;          // Backreference number
1508     int g_notempty = 0;   // If the match should not be empty
1509     int exec_options = 0; // Options passed to pcre_exec
1510     while (1) {
1511       /* Execute the regular expression. */
1512       int count = pcre_exec(pce->re, &extra, subject.data(), subject.size(),
1513                             start_offset,
1514                             exec_options | g_notempty,
1515                             offsets, size_offsets);
1516
1517       /* The string was already proved to be valid UTF-8 */
1518       exec_options |= PCRE_NO_UTF8_CHECK;
1519
1520       /* Check for too many substrings condition. */
1521       if (count == 0) {
1522         raise_warning("Matched, but too many substrings");
1523         count = size_offsets / 3;
1524       }
1525
1526       const char* piece = subject.data() + start_offset;
1527       if (count > 0 && offsets[1] >= offsets[0] &&
1528           (limit == -1 || limit > 0)) {
1529         if (replace_count) {
1530           ++*replace_count;
1531         }
1532         /* Set the match location in subject */
1533         match = subject.data() + offsets[0];
1534
1535         String callable_result;
1536         if (callable) {
1537           /* Use custom function to get replacement string and its length. */
1538           callable_result = preg_do_repl_func(replace_var, subject, offsets,
1539                                               subpat_names, count);
1540         } else { /* do regular substitution */
1541           walk = replace;
1542           walk_last = 0;
1543           while (walk < replace_end) {
1544             if ('\\' == *walk || '$' == *walk) {
1545               if (walk_last == '\\') {
1546                 walk++;
1547                 walk_last = 0;
1548                 continue;
1549               }
1550               if (preg_get_backref(&walk, &backref)) {
1551                 if (backref < count) {
1552                   match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1553                 }
1554                 continue;
1555               }
1556             }
1557             walk++;
1558             walk_last = walk[-1];
1559           }
1560         }
1561
1562         /* copy the part of the string before the match */
1563         result.append(piece, match-piece);
1564
1565         /* copy replacement and backrefs */
1566         int result_len = result.size();
1567
1568         if (callable) {
1569           /* Copy result from custom function to buffer and clean up. */
1570           result.append(callable_result.data(), callable_result.size());
1571           result_len += callable_result.size();
1572         } else { /* do regular backreference copying */
1573           walk = replace;
1574           walk_last = 0;
1575           Array params;
1576           while (walk < replace_end) {
1577             if ('\\' == *walk || '$' == *walk) {
1578               if (walk_last == '\\') {
1579                 result.set(result.size() - 1, *walk++);
1580                 walk_last = 0;
1581                 continue;
1582               }
1583               if (preg_get_backref(&walk, &backref)) {
1584                 if (backref < count) {
1585                   match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1586                   result.append(
1587                     subject.data() + offsets[backref<<1],
1588                     match_len
1589                   );
1590                 }
1591                 continue;
1592               }
1593             }
1594             result.append(*walk++);
1595             walk_last = walk[-1];
1596           }
1597         }
1598
1599         if (limit != -1) {
1600           limit--;
1601         }
1602
1603       } else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1604         /* If we previously set PCRE_NOTEMPTY after a null match,
1605            this is not necessarily the end. We need to advance
1606            the start offset, and continue. Fudge the offset values
1607            to achieve this, unless we're already at the end of the string. */
1608         if (g_notempty != 0 && start_offset < subject.size()) {
1609           offsets[0] = start_offset;
1610           offsets[1] = start_offset + 1;
1611           result.append(piece, 1);
1612         } else {
1613           /* stick that last bit of string on our output */
1614           result.append(piece, subject.size() - start_offset);
1615           break;
1616         }
1617       } else {
1618         if (pcre_need_log_error(count)) {
1619           const char* s;
1620           int size;
1621           String stemp;
1622           if (callable) {
1623             if (replace_var.isObject()) {
1624               stemp = replace_var.asCObjRef()->getClassName().asString()
1625                     + "::__invoke";
1626             } else {
1627               stemp = replace_var.toString();
1628             }
1629             s = stemp.data();
1630             size = stemp.size();
1631           } else {
1632             s = replace_val.data();
1633             size = replace_val.size();
1634           }
1635           pcre_log_error(__FUNCTION__, __LINE__, count,
1636                          pattern.data(), pattern.size(),
1637                          subject.data(), subject.size(),
1638                          s, size,
1639                          callable, limit, start_offset, g_notempty);
1640         }
1641         pcre_handle_exec_error(count);
1642         return init_null();
1643       }
1644
1645       /* If we have matched an empty string, mimic what Perl's /g options does.
1646          This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1647          the match again at the same point. If this fails (picked up above) we
1648          advance to the next character. */
1649       g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1650
1651       /* Advance to the next piece. */
1652       start_offset = offsets[1];
1653     }
1654
1655     return result.detach();
1656   } catch (...) {
1657     throw;
1658   }
1659 }
1660
1661 static Variant php_replace_in_subject(const Variant& regex, const Variant& replace,
1662                                       String subject, int limit, bool callable,
1663                                       int* replace_count) {
1664   if (!regex.isArray()) {
1665     Variant ret = php_pcre_replace(regex.toString(), subject, replace,
1666                                    callable, limit, replace_count);
1667
1668     if (ret.isBoolean()) {
1669       assertx(!ret.toBoolean());
1670       return init_null();
1671     }
1672
1673     return ret;
1674   }
1675
1676   if (callable || !replace.isArray()) {
1677     Array arr = regex.toDArray();
1678     for (ArrayIter iterRegex(arr); iterRegex; ++iterRegex) {
1679       String regex_entry = iterRegex.second().toString();
1680       Variant ret = php_pcre_replace(regex_entry, subject, replace,
1681                                      callable, limit, replace_count);
1682       if (ret.isBoolean()) {
1683         assertx(!ret.toBoolean());
1684         return init_null();
1685       }
1686       if (!ret.isString()) {
1687         return ret;
1688       }
1689       subject = ret.asStrRef();
1690       if (subject.isNull()) {
1691         return subject;
1692       }
1693     }
1694     return subject;
1695   }
1696
1697   Array arrReplace = replace.toDArray();
1698   Array arrRegex = regex.toDArray();
1699   ArrayIter iterReplace(arrReplace);
1700   for (ArrayIter iterRegex(arrRegex); iterRegex; ++iterRegex) {
1701     String regex_entry = iterRegex.second().toString();
1702     Variant replace_value;
1703     if (iterReplace) {
1704       replace_value = iterReplace.second();
1705       ++iterReplace;
1706     }
1707
1708     Variant ret = php_pcre_replace(regex_entry, subject, replace_value,
1709                                    callable, limit, replace_count);
1710
1711     if (ret.isBoolean()) {
1712       assertx(!ret.toBoolean());
1713       return init_null();
1714     }
1715     if (!ret.isString()) {
1716       return ret;
1717     }
1718     subject = ret.asStrRef();
1719     if (subject.isNull()) {
1720       return subject;
1721     }
1722   }
1723   return subject;
1724 }
1725
1726 Variant preg_replace_impl(const Variant& pattern, const Variant& replacement,
1727                           const Variant& subject, int limit, int64_t* count,
1728                           bool is_callable, bool is_filter) {
1729   assertx(!(is_callable && is_filter));
1730   if (!is_callable &&
1731       replacement.isArray() && !pattern.isArray()) {
1732     raise_warning("Parameter mismatch, pattern is a string while "
1733                     "replacement is an array");
1734     return false;
1735   }
1736
1737   int replace_count = 0;
1738   if (!isContainer(subject)) {
1739     Variant ret = php_replace_in_subject(pattern, replacement,
1740                                          subject.toString(),
1741                                          limit, is_callable, &replace_count);
1742
1743     if (ret.isString()) {
1744       if (count) *count = replace_count;
1745       if (is_filter && replace_count == 0) {
1746         return init_null();
1747       } else {
1748         return ret.asStrRef();
1749       }
1750     }
1751
1752     return ret;
1753   }
1754
1755   Array return_value = Array::CreateDArray();
1756   Array arrSubject = subject.toDArray();
1757   for (ArrayIter iter(arrSubject); iter; ++iter) {
1758     auto old_replace_count = replace_count;
1759     String subject_entry = iter.second().toString();
1760     Variant ret = php_replace_in_subject(pattern, replacement, subject_entry,
1761                                          limit, is_callable, &replace_count);
1762
1763     if (ret.isString() && !ret.isNull() &&
1764         (!is_filter || replace_count > old_replace_count)) {
1765       return_value.set(iter.first(), ret.asStrRef());
1766     }
1767   }
1768   if (count) *count = replace_count;
1769   return return_value;
1770 }
1771
1772 int preg_replace(Variant& result,
1773                  const Variant& pattern,
1774                  const Variant& replacement,
1775                  const Variant& subject,
1776                  int limit /* = -1 */) {
1777   int64_t count;
1778   result = preg_replace_impl(pattern, replacement, subject,
1779                              limit, &count, false, false);
1780   return count;
1781 }
1782
1783 int preg_replace_callback(Variant& result,
1784                           const Variant& pattern,
1785                           const Variant& callback,
1786                           const Variant& subject,
1787                           int limit /* = -1 */) {
1788   int64_t count;
1789   result = preg_replace_impl(pattern, callback, subject,
1790                              limit, &count, true, false);
1791   return count;
1792 }
1793
1794 int preg_filter(Variant& result,
1795                 const Variant& pattern,
1796                 const Variant& replacement,
1797                 const Variant& subject,
1798                 int limit /* = -1 */) {
1799   int64_t count;
1800   result = preg_replace_impl(pattern, replacement, subject,
1801                              limit, &count, false, true);
1802   return count;
1803 }
1804
1805 ///////////////////////////////////////////////////////////////////////////////
1806
1807 Variant preg_split(const String& pattern, const String& subject,
1808                    int limit /* = -1 */, int flags /* = 0 */) {
1809   PCRECache::Accessor accessor;
1810   if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1811     return false;
1812   }
1813   const pcre_cache_entry* pce = accessor.get();
1814
1815   int no_empty = flags & PREG_SPLIT_NO_EMPTY;
1816   bool delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1817   bool offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1818
1819   if (limit == 0) {
1820     limit = -1;
1821   }
1822
1823   int size_offsets = 0;
1824   int* offsets = create_offset_array(pce, size_offsets);
1825   SmartFreeHelper offsetsFreer(offsets);
1826   if (offsets == nullptr) {
1827     return false;
1828   }
1829
1830   /* Start at the beginning of the string */
1831   int start_offset = 0;
1832   int next_offset = 0;
1833   const char* last_match = subject.data();
1834   *rl_last_error_code = PHP_PCRE_NO_ERROR;
1835   pcre_extra extra;
1836   init_local_extra(&extra, pce->extra);
1837
1838   const bool hackArrOutput = flags & PREG_FB_HACK_ARRAYS;
1839
1840   // Get next piece if no limit or limit not yet reached and something matched
1841   Array return_value = hackArrOutput ? Array::CreateDict() : Array::Create();
1842   int g_notempty = 0;   /* If the match should not be empty */
1843   int utf8_check = 0;
1844   PCRECache::Accessor bump_accessor;
1845   const pcre_cache_entry* bump_pce = nullptr; /* instance for empty matches */
1846   while ((limit == -1 || limit > 1)) {
1847     int count = pcre_exec(pce->re, &extra, subject.data(), subject.size(),
1848                           start_offset, g_notempty | utf8_check,
1849                           offsets, size_offsets);
1850
1851     /* Subsequent calls to pcre_exec don't need to bother with the
1852      * utf8 validity check: if the subject isn't valid, the first
1853      * call to pcre_exec will have failed, and as long as we only
1854      * set start_offset to known character boundaries we won't
1855      * supply an invalid offset. */
1856     utf8_check = PCRE_NO_UTF8_CHECK;
1857
1858     /* Check for too many substrings condition. */
1859     if (count == 0) {
1860       raise_warning("Matched, but too many substrings");
1861       count = size_offsets / 3;
1862     }
1863
1864     /* If something matched */
1865     if (count > 0 && offsets[1] >= offsets[0]) {
1866       if (!no_empty || subject.data() + offsets[0] != last_match) {
1867         if (offset_capture) {
1868           /* Add (match, offset) pair to the return value */
1869           add_offset_pair_split(return_value,
1870                                 String(last_match,
1871                                        subject.data() + offsets[0] - last_match,
1872                                        CopyString),
1873                                 next_offset,
1874                                 nullptr,
1875                                 hackArrOutput);
1876         } else {
1877           /* Add the piece to the return value */
1878           return_value.append(String(last_match,
1879                                      subject.data() + offsets[0] - last_match,
1880                                      CopyString));
1881         }
1882
1883         /* One less left to do */
1884         if (limit != -1)
1885           limit--;
1886       }
1887
1888       last_match = subject.data() + offsets[1];
1889       next_offset = offsets[1];
1890
1891       if (delim_capture) {
1892         int i, match_len;
1893         for (i = 1; i < count; i++) {
1894           match_len = offsets[(i<<1)+1] - offsets[i<<1];
1895           /* If we have matched a delimiter */
1896           if (!no_empty || match_len > 0) {
1897             if (offset_capture) {
1898               add_offset_pair_split(return_value,
1899                                     String(subject.data() + offsets[i<<1],
1900                                            match_len, CopyString),
1901                                     offsets[i<<1],
1902                                     nullptr,
1903                                     hackArrOutput);
1904             } else {
1905               return_value.append(subject.substr(offsets[i<<1], match_len));
1906             }
1907           }
1908         }
1909       }
1910     } else if (count == PCRE_ERROR_NOMATCH) {
1911       /* If we previously set PCRE_NOTEMPTY after a null match,
1912          this is not necessarily the end. We need to advance
1913          the start offset, and continue. Fudge the offset values
1914          to achieve this, unless we're already at the end of the string. */
1915       if (g_notempty != 0 && start_offset < subject.size()) {
1916         if (pce->compile_options & PCRE_UTF8) {
1917           if (bump_pce == nullptr) {
1918             if (!pcre_get_compiled_regex_cache(bump_accessor,
1919                                                String("/./us").get())) {
1920               return false;
1921             }
1922             bump_pce = bump_accessor.get();
1923           }
1924           pcre_extra bump_extra;
1925           init_local_extra(&bump_extra, bump_pce->extra);
1926           count = pcre_exec(bump_pce->re, &bump_extra, subject.data(),
1927                             subject.size(), start_offset,
1928                             utf8_check, offsets, size_offsets);
1929           if (count < 1) {
1930             raise_warning("Unknown error");
1931             offsets[0] = start_offset;
1932             offsets[1] = start_offset + 1;
1933             if (pcre_need_log_error(count)) {
1934               pcre_log_error(__FUNCTION__, __LINE__, count,
1935                              pattern.data(), pattern.size(),
1936                              subject.data(), subject.size(),
1937                              "", 0,
1938                              limit, flags, start_offset);
1939             }
1940           }
1941         } else {
1942           offsets[0] = start_offset;
1943           offsets[1] = start_offset + 1;
1944         }
1945       } else
1946         break;
1947     } else {
1948       if (pcre_need_log_error(count)) {
1949         pcre_log_error(__FUNCTION__, __LINE__, count,
1950                        pattern.data(), pattern.size(),
1951                        subject.data(), subject.size(),
1952                        "", 0,
1953                        limit, flags, start_offset, g_notempty);
1954       }
1955       pcre_handle_exec_error(count);
1956       break;
1957     }
1958
1959     /* If we have matched an empty string, mimic what Perl's /g options does.
1960        This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1961        the match again at the same point. If this fails (picked up above) we
1962        advance to the next character. */
1963     g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1964
1965     /* Advance to the position right after the last full match */
1966     start_offset = offsets[1];
1967   }
1968
1969   start_offset = last_match - subject.data(); /* offset might have
1970                                                 * been incremented,
1971                                                 * but without further
1972                                                 * successful matches */
1973   if (!no_empty || start_offset < subject.size()) {
1974     if (offset_capture) {
1975       /* Add the last (match, offset) pair to the return value */
1976       add_offset_pair_split(return_value,
1977                             subject.substr(start_offset),
1978                             start_offset, nullptr, hackArrOutput);
1979     } else {
1980       /* Add the last piece to the return value */
1981       return_value.append
1982         (String(last_match, subject.data() + subject.size() - last_match,
1983                 CopyString));
1984     }
1985   }
1986
1987   return return_value;
1988 }
1989
1990 ///////////////////////////////////////////////////////////////////////////////
1991
1992 String preg_quote(const String& str,
1993                   const String& delimiter /* = null_string */) {
1994   const char* in_str = str.data();
1995   const char* in_str_end = in_str + str.size();
1996
1997   /* Nothing to do if we got an empty string */
1998   if (in_str == in_str_end) {
1999     return str;
2000   }
2001
2002   char delim_char = 0;      /* Delimiter character to be quoted */
2003   bool quote_delim = false; /* Whether to quote additional delim char */
2004   if (!delimiter.empty()) {
2005     delim_char = delimiter.charAt(0);
2006     quote_delim = true;
2007   }
2008
2009   /* Allocate enough memory so that even if each character
2010      is quoted, we won't run out of room */
2011   String ret(4 * str.size() + 1, ReserveString);
2012   char* out_str = ret.mutableData();
2013
2014   /* Go through the string and quote necessary characters */
2015   const char* p;
2016   char* q;
2017   for (p = in_str, q = out_str; p != in_str_end; p++) {
2018     char c = *p;
2019     switch (c) {
2020     case '.': case '\\': case '+': case '*': case '?':
2021     case '[': case '^':  case ']': case '$': case '(':
2022     case ')': case '{':  case '}': case '=': case '!':
2023     case '>': case '<':  case '|': case ':': case '-':
2024     case '#':
2025       *q++ = '\\';
2026       *q++ = c;
2027       break;
2028
2029     case '\0':
2030       *q++ = '\\';
2031       *q++ = '0';
2032       *q++ = '0';
2033       *q++ = '0';
2034       break;
2035
2036     default:
2037       if (quote_delim && c == delim_char)
2038         *q++ = '\\';
2039       *q++ = c;
2040       break;
2041     }
2042   }
2043   *q = '\0';
2044
2045   return ret.setSize(q - out_str);
2046 }
2047
2048 int preg_last_error() {
2049   return *rl_last_error_code;
2050 }
2051
2052 size_t preg_pcre_cache_size() {
2053   return s_pcreCache.size();
2054 }
2055
2056 ///////////////////////////////////////////////////////////////////////////////
2057 // regexec
2058
2059 static void php_reg_eprint(int err, regex_t* re) {
2060   char *buf = nullptr, *message = nullptr;
2061   size_t len;
2062   size_t buf_len;
2063
2064 #ifdef REG_ITOA
2065   /* get the length of the message */
2066   buf_len = regerror(REG_ITOA | err, re, nullptr, 0);
2067   if (buf_len) {
2068     buf = (char *)req::malloc_noptrs(buf_len);
2069     if (!buf) return; /* fail silently */
2070     /* finally, get the error message */
2071     regerror(REG_ITOA | err, re, buf, buf_len);
2072   }
2073 #else
2074   buf_len = 0;
2075 #endif
2076   len = regerror(err, re, nullptr, 0);
2077   if (len) {
2078     message = (char *)req::malloc_noptrs(buf_len + len + 2);
2079     if (!message) {
2080       return; /* fail silently */
2081     }
2082     if (buf_len) {
2083       snprintf(message, buf_len, "%s: ", buf);
2084       buf_len += 1; /* so pointer math below works */
2085     }
2086     /* drop the message into place */
2087     regerror(err, re, message + buf_len, len);
2088     raise_warning("%s", message);
2089   }
2090   req::free(buf);
2091   req::free(message);
2092 }
2093
2094 Variant php_split(const String& spliton, const String& str, int count,
2095                   bool icase) {
2096   const char* strp = str.data();
2097   const char* endp = strp + str.size();
2098
2099   regex_t re;
2100   int copts = icase ? REG_ICASE : 0;
2101   int err = regcomp(&re, spliton.data(), REG_EXTENDED | copts);
2102   if (err) {
2103     php_reg_eprint(err, &re);
2104     return false;
2105   }
2106
2107   Array return_value = Array::Create();
2108   regmatch_t subs[1];
2109
2110   /* churn through str, generating array entries as we go */
2111   while ((count == -1 || count > 1) &&
2112          !(err = regexec(&re, strp, 1, subs, 0))) {
2113     if (subs[0].rm_so == 0 && subs[0].rm_eo) {
2114       /* match is at start of string, return empty string */
2115       return_value.append("");
2116       /* skip ahead the length of the regex match */
2117       strp += subs[0].rm_eo;
2118     } else if (subs[0].rm_so == 0 && subs[0].rm_eo == 0) {
2119       /* No more matches */
2120       regfree(&re);
2121       raise_warning("Invalid Regular Expression to split()");
2122       return false;
2123     } else {
2124       /* On a real match */
2125
2126       /* make a copy of the substring */
2127       int size = subs[0].rm_so;
2128
2129       /* add it to the array */
2130       return_value.append(String(strp, size, CopyString));
2131
2132       /* point at our new starting point */
2133       strp = strp + subs[0].rm_eo;
2134     }
2135
2136     /* if we're only looking for a certain number of points,
2137        stop looking once we hit it */
2138     if (count != -1) {
2139       count--;
2140     }
2141   }
2142
2143   /* see if we encountered an error */
2144   if (err && err != REG_NOMATCH) {
2145     php_reg_eprint(err, &re);
2146     regfree(&re);
2147     return false;
2148   }
2149
2150   /* otherwise we just have one last element to add to the array */
2151   int size = endp - strp;
2152   return_value.append(String(strp, size, CopyString));
2153
2154   regfree(&re);
2155   return return_value;
2156 }
2157
2158 ///////////////////////////////////////////////////////////////////////////////
2159 }