hphp/runtime/base/preg.cpp

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   6    +----------------------------------------------------------------------+
   7    | This source file is subject to version 3.01 of the PHP license,      |
   8    | that is bundled with this package in the file LICENSE, and is        |
   9    | available through the world-wide-web at the following url:           |
  10    | http://www.php.net/license/3_01.txt                                  |
  11    | If you did not receive a copy of the PHP license and are unable to   |
  12    | obtain it through the world-wide-web, please send a note to          |
  13    | license@php.net so we can mail you a copy immediately.               |
  14    +----------------------------------------------------------------------+
  15 */
  16
  17 #include "hphp/runtime/base/preg.h"
  18
  19 #include <atomic>
  20 #include <fstream>
  21 #include <mutex>
  22 #include <pcre.h>
  23 #include <onigposix.h>
  24 #include <utility>
  25
  26 #include <folly/AtomicHashArray.h>
  27
  28 #include "hphp/runtime/base/array-init.h"
  29 #include "hphp/runtime/base/array-iterator.h"
  30 #include "hphp/runtime/base/builtin-functions.h"
  31 #include "hphp/runtime/base/container-functions.h"
  32 #include "hphp/runtime/base/execution-context.h"
  33 #include "hphp/runtime/base/ini-setting.h"
  34 #include "hphp/runtime/base/runtime-option.h"
  35 #include "hphp/runtime/base/string-util.h"
  36 #include "hphp/runtime/base/init-fini-node.h"
  37 #include "hphp/runtime/base/zend-functions.h"
  38 #include "hphp/runtime/vm/debug/debug.h"
  39 #include "hphp/runtime/vm/treadmill.h"
  40 #include "hphp/runtime/vm/vm-regs.h"
  41
  42 #include "hphp/runtime/ext/std/ext_std_function.h"
  43 #include "hphp/runtime/ext/string/ext_string.h"
  44
  45 #include "hphp/runtime/vm/jit/mcgen.h"
  46 #include "hphp/runtime/vm/jit/types.h"
  47 #include "hphp/runtime/vm/jit/vtune-jit.h"
  48
  49 #include "hphp/compiler/json.h"
  50
  51 #include "hphp/util/logger.h"
  52 #include "hphp/util/concurrent-scalable-cache.h"
  53
/* Only defined in pcre >= 8.32 */
/* Fall back to 0 so the flag can be passed to pcre_study() and tested
   unconditionally below; with the 0 fallback no JIT option is requested. */
#ifndef PCRE_STUDY_JIT_COMPILE
# define PCRE_STUDY_JIT_COMPILE 0
#endif
  58
  59 namespace HPHP {
  60
  61 using jit::TCA;
  62
  63 ///////////////////////////////////////////////////////////////////////////////
  64 // PCREglobals definition
  65
  66 PCREglobals::PCREglobals() {
  67   jit_stack = pcre_jit_stack_alloc(32768, 524288);
  68   // Set these to handle uses of pcre prior to PcreExtension::threadInit
  69   // In particular, for matching tier overrides during RuntimeOption::Load
  70   preg_backtrace_limit = RuntimeOption::PregBacktraceLimit;
  71   preg_recursion_limit = RuntimeOption::PregRecursionLimit;
  72 }
  73
  74 PCREglobals::~PCREglobals() {
  75   pcre_jit_stack_free(jit_stack);
  76 }
  77
  78 ///////////////////////////////////////////////////////////////////////////////
  79 // PCRECache definition
  80
  81 struct PCRECache {
  82   typedef std::shared_ptr<const pcre_cache_entry> EntryPtr;
  83   typedef std::unique_ptr<LRUCacheKey> TempKeyCache;
  84
  85   enum class CacheKind {
  86     Static,
  87     Lru,
  88     Scalable
  89   };
  90
  91 private:
  92   struct ahm_string_data_same {
  93     bool operator()(const StringData* s1, const StringData* s2) {
  94       // ahm uses -1, -2, -3 as magic values
  95       return int64_t(s1) > 0 && (s1 == s2 || s1->same(s2));
  96     }
  97   };
  98
  99   typedef folly::AtomicHashArray<const StringData*, const pcre_cache_entry*,
 100           string_data_hash, ahm_string_data_same> StaticCache;
 101   typedef ConcurrentLRUCache<LRUCacheKey, EntryPtr,
 102           LRUCacheKey::HashCompare> LRUCache;
 103   typedef ConcurrentScalableCache<LRUCacheKey, EntryPtr,
 104           LRUCacheKey::HashCompare> ScalableCache;
 105   typedef StaticCache::value_type StaticCachePair;
 106
 107 public:
 108   struct Accessor {
 109     Accessor()
 110       : m_kind(Kind::Empty)
 111     {}
 112
 113     ~Accessor() {
 114       switch (m_kind) {
 115         case Kind::Empty:
 116         case Kind::Ptr:
 117           break;
 118         case Kind::SmartPtr:
 119           m_u.smart_ptr.~EntryPtr();
 120           break;
 121         case Kind::AccessorKind:
 122           m_u.accessor.~ConstAccessor();
 123           break;
 124       }
 125     }
 126
 127     Accessor& operator=(const pcre_cache_entry* ptr) {
 128       assert(m_kind == Kind::Empty || m_kind == Kind::Ptr);
 129       m_kind = Kind::Ptr;
 130       m_u.ptr = ptr;
 131       return *this;
 132     }
 133
 134     Accessor& operator=(EntryPtr&& ep) {
 135       switch (m_kind) {
 136         case Kind::AccessorKind:
 137           m_u.accessor.~ConstAccessor();
 138         case Kind::Empty:
 139         case Kind::Ptr:
 140           m_kind = Kind::SmartPtr;
 141           new (&m_u.smart_ptr) EntryPtr(std::move(ep));
 142           break;
 143         case Kind::SmartPtr:
 144           m_u.smart_ptr = std::move(ep);
 145           break;
 146       }
 147       return *this;
 148     }
 149
 150     // No assignment from LRUCache::ConstAccessor since it is non-copyable
 151     // Use resetToLRU instead
 152     LRUCache::ConstAccessor& resetToLRU() {
 153       switch (m_kind) {
 154         case Kind::SmartPtr:
 155           m_u.smart_ptr.~EntryPtr();
 156         case Kind::Empty:
 157         case Kind::Ptr:
 158           m_kind = Kind::AccessorKind;
 159           new (&m_u.accessor) LRUCache::ConstAccessor();
 160           break;
 161         case Kind::AccessorKind:
 162           break;
 163       }
 164       return m_u.accessor;
 165     }
 166
 167     const pcre_cache_entry* get() {
 168       switch (m_kind) {
 169         case Kind::Empty:    return nullptr;
 170         case Kind::Ptr:      return m_u.ptr;
 171         case Kind::SmartPtr: return m_u.smart_ptr.get();
 172         case Kind::AccessorKind: return m_u.accessor->get();
 173       }
 174       always_assert(false);
 175     }
 176
 177     const EntryPtr& entryPtr() const {
 178       assert(m_kind == Kind::SmartPtr);
 179       return m_u.smart_ptr;
 180     }
 181
 182    private:
 183     enum class Kind : uint8_t {
 184       Empty,
 185       Ptr,
 186       SmartPtr,
 187       AccessorKind,
 188     };
 189
 190     union Ptr {
 191        Ptr() {}
 192       ~Ptr() {}
 193
 194       const pcre_cache_entry* ptr;
 195       EntryPtr smart_ptr;
 196       LRUCache::ConstAccessor accessor;
 197     };
 198
 199     Ptr m_u;
 200     Kind m_kind;
 201   };
 202
 203   PCRECache()
 204     : m_kind(CacheKind::Static), m_staticCache(nullptr)
 205   {
 206     reinit(CacheKind::Static);
 207   }
 208
 209   ~PCRECache() {
 210     if (m_kind == CacheKind::Static && m_staticCache.load()) {
 211       DestroyStatic(m_staticCache);
 212     }
 213   }
 214
 215   void reinit(CacheKind kind);
 216   bool find(Accessor& accessor, const StringData* key,
 217             TempKeyCache& keyCache);
 218   void insert(Accessor& accessor, const StringData* regex,
 219               TempKeyCache& keyCache, const pcre_cache_entry* ent);
 220   void dump(const std::string& filename);
 221   size_t size() const;
 222
 223 private:
 224   void clearStatic();
 225
 226   static void DestroyStatic(StaticCache* cache);
 227   static StaticCache* CreateStatic();
 228
 229   CacheKind m_kind;
 230   std::atomic<StaticCache*> m_staticCache;
 231   std::unique_ptr<LRUCache> m_lruCache;
 232   std::unique_ptr<ScalableCache> m_scalableCache;
 233   std::atomic<time_t> m_expire;
 234   std::mutex m_clearMutex;
 235 };
 236
 237 ///////////////////////////////////////////////////////////////////////////////
 238 // Data
 239
// Per-thread PCRE state: the JIT stack and the backtrace/recursion limits
// set up by the PCREglobals constructor.
THREAD_LOCAL(PCREglobals, tl_pcre_globals);

// The compiled-pattern cache shared by every function in this file.  It is
// constructed as a static cache; pcre_reinit() switches it to the configured
// kind.
static PCRECache s_pcreCache;

// The last pcre error code is available for the whole thread.
static __thread int tl_last_error_code;
 246
 247 ///////////////////////////////////////////////////////////////////////////////
 248 // pcre_cache_entry implementation
 249
 250 pcre_cache_entry::~pcre_cache_entry() {
 251   if (extra) {
 252 #if PCRE_MAJOR < 8 || (PCRE_MAJOR == 8 && PCRE_MINOR < 20)
 253     free(extra);
 254 #else
 255     pcre_free_study(extra);
 256 #endif
 257   }
 258   free(subpat_names);
 259   pcre_free(re);
 260 }
 261
 262 pcre_literal_data::pcre_literal_data(const char* pattern, int coptions) {
 263   if (coptions & ~PCRE_CASELESS) {
 264     return;
 265   }
 266
 267   auto p = pattern;
 268   if (*p == '^') {
 269     match_start = true;
 270     p++;
 271   }
 272
 273   std::string pattern_buffer;
 274   while (isalnum((unsigned char)*p) || (*p && strchr("/\\ :-_", *p))) {
 275     // backslash + alphanumeric character --> not a literal (i.e. \d).
 276     // backslash + non-alphanumeric character --> literal symbol (i.e. \.)
 277     if (*p == '\\') {
 278       if (!p[1] || isalnum((unsigned char)p[1])) {
 279         break;
 280       } else {
 281         p++;
 282       }
 283     }
 284     pattern_buffer += *p++;
 285   }
 286   if (*p == '$') {
 287     match_end = true;
 288     p++;
 289   }
 290   if (!*p) {
 291     /* This is an encoding of a literal string. */
 292     case_insensitive = coptions & PCRE_CASELESS;
 293     literal_str = std::move(pattern_buffer);
 294   }
 295 }
 296
 297 bool pcre_literal_data::isLiteral() const {
 298   return literal_str.hasValue();
 299 }
 300
 301 bool pcre_literal_data::matches(const StringData* subject,
 302                                 int pos,
 303                                 int* offsets) const {
 304   assertx(isLiteral());
 305   assertx(pos >= 0);
 306
 307   // Subject must be at least as long as the literal pattern
 308   // for a match to occur.
 309   if (subject->size() < literal_str->length() + pos) {
 310     return false;
 311   }
 312
 313   size_t literal_strlen = literal_str->length();
 314   auto const subject_c = subject->data();
 315   auto const literal_c = literal_str->c_str();
 316   if (match_start) {
 317     // Make sure an exact match has the right length.
 318     if (pos || (match_end && subject->size() != literal_strlen)) {
 319       return false;
 320     }
 321     // If only matching the start (^), compare the strings
 322     // for the length of the literal pattern.
 323     if (case_insensitive ?
 324         bstrcaseeq(subject_c, literal_c, literal_strlen) :
 325         memcmp(subject_c, literal_c, literal_strlen) == 0) {
 326       offsets[0] = 0;
 327       offsets[1] = literal_strlen * sizeof(char);
 328       return true;
 329     }
 330   } else if (match_end) {
 331     // Compare the literal pattern against the tail end of the subject.
 332     auto const subject_tail = subject_c + (subject->size() - literal_strlen);
 333     if (case_insensitive ?
 334         bstrcaseeq(subject_tail, literal_c, literal_strlen) :
 335         memcmp(subject_tail, literal_c, literal_strlen) == 0) {
 336       offsets[0] = (subject->size() - literal_strlen) * sizeof(char);
 337       offsets[1] = subject->size() * sizeof(char);
 338       return true;
 339     }
 340   } else {
 341     if (!literal_strlen) {
 342       offsets[0] = offsets[1] = pos;
 343       return true;
 344     }
 345     // Check if the literal pattern occurs as a substring of the subject.
 346     auto const subject_str = StrNR(subject);
 347     auto const find_response = subject_str.asString().find(
 348       *literal_str, pos, !case_insensitive);
 349     if (find_response >= 0) {
 350       offsets[0] = find_response * sizeof(char);
 351       offsets[1] = offsets[0] + literal_strlen * sizeof(char);
 352       return true;
 353     }
 354   }
 355   return false;
 356 }
 357
 358 ///////////////////////////////////////////////////////////////////////////////
 359 // PCRECache implementation
 360
 361 PCRECache::StaticCache* PCRECache::CreateStatic() {
 362   StaticCache::Config config;
 363   config.maxLoadFactor = 0.5;
 364   return StaticCache::create(
 365       RuntimeOption::EvalPCRETableSize, config).release();
 366 }
 367
 368 void PCRECache::DestroyStatic(StaticCache* cache) {
 369   // We delete uncounted keys while iterating the cache, which is OK for
 370   // AtomicHashArray, but not OK for other containers, such as
 371   // std::unordered_map.  If you change the cache type make sure that property
 372   // holds or fix this function.
 373   static_assert(std::is_same<PCRECache::StaticCache,
 374       folly::AtomicHashArray<const StringData*, const pcre_cache_entry*,
 375                              string_data_hash, ahm_string_data_same>>::value,
 376       "StaticCache must be an AtomicHashArray or this destructor is wrong.");
 377   for (auto& it : *cache) {
 378     if (it.first->isUncounted()) {
 379       const_cast<StringData*>(it.first)->destructUncounted();
 380     }
 381     delete it.second;
 382   }
 383   StaticCache::destroy(cache);
 384 }
 385
 386 void PCRECache::reinit(CacheKind kind) {
 387   switch (m_kind) {
 388     case CacheKind::Static:
 389       if (m_staticCache.load()) {
 390         DestroyStatic(m_staticCache);
 391         m_staticCache = nullptr;
 392       }
 393       break;
 394     case CacheKind::Lru:
 395       m_lruCache.reset();
 396       break;
 397     case CacheKind::Scalable:
 398       m_scalableCache.reset();
 399       break;
 400   }
 401   m_kind = kind;
 402
 403   switch (kind) {
 404     case CacheKind::Static:
 405       m_staticCache = CreateStatic();
 406       m_expire = time(nullptr) + RuntimeOption::EvalPCREExpireInterval;
 407       break;
 408     case CacheKind::Lru:
 409       m_lruCache.reset(new LRUCache(RuntimeOption::EvalPCRETableSize));
 410       break;
 411     case CacheKind::Scalable:
 412       m_scalableCache.reset(
 413         new ScalableCache(RuntimeOption::EvalPCRETableSize));
 414       break;
 415   }
 416 }
 417
 418 bool PCRECache::find(Accessor& accessor,
 419                      const StringData* regex,
 420                      TempKeyCache& keyCache)
 421 {
 422   switch (m_kind) {
 423     case CacheKind::Static:
 424       {
 425         assert(m_staticCache.load());
 426         StaticCache::iterator it;
 427         auto cache = m_staticCache.load(std::memory_order_acquire);
 428         if ((it = cache->find(regex)) != cache->end()) {
 429           accessor = it->second;
 430           return true;
 431         }
 432         return false;
 433       }
 434     case CacheKind::Lru:
 435     case CacheKind::Scalable:
 436       {
 437         if (!keyCache) {
 438           keyCache.reset(new LRUCacheKey(regex->data(), regex->size()));
 439         }
 440         bool found;
 441         if (m_kind == CacheKind::Lru) {
 442           found = m_lruCache->find(accessor.resetToLRU(), *keyCache);
 443         } else {
 444           found = m_scalableCache->find(accessor.resetToLRU(), *keyCache);
 445         }
 446         return found;
 447       }
 448   }
 449   always_assert(false);
 450 }
 451
 452 void PCRECache::clearStatic() {
 453   std::unique_lock<std::mutex> lock(m_clearMutex, std::try_to_lock);
 454   if (!lock) return;
 455
 456   auto newExpire = time(nullptr) + RuntimeOption::EvalPCREExpireInterval;
 457   m_expire.store(newExpire, std::memory_order_relaxed);
 458
 459   auto tmpMap = CreateStatic();
 460   tmpMap = m_staticCache.exchange(tmpMap, std::memory_order_acq_rel);
 461
 462   Treadmill::enqueue([tmpMap]() {
 463       DestroyStatic(tmpMap);
 464    });
 465 }
 466
 467 void PCRECache::insert(
 468   Accessor& accessor,
 469   const StringData* regex,
 470   TempKeyCache& keyCache,
 471   const pcre_cache_entry* ent
 472 ) {
 473   switch (m_kind) {
 474     case CacheKind::Static:
 475       {
 476         assert(m_staticCache.load());
 477         // Clear the cache if we haven't refreshed it in a while
 478         if (time(nullptr) > m_expire) {
 479           clearStatic();
 480         }
 481         auto cache = m_staticCache.load(std::memory_order_acquire);
 482         auto key = regex->isStatic()
 483           ? regex
 484           : StringData::MakeUncounted(regex->slice());
 485         auto pair = cache->insert(StaticCachePair(key, ent));
 486         if (pair.second) {
 487           // Inserted, container owns the pointer
 488           accessor = ent;
 489         } else {
 490           // Not inserted, caller needs to own the pointer
 491           if (key != regex) const_cast<StringData*>(key)->destructUncounted();
 492           accessor = EntryPtr(ent);
 493         }
 494       }
 495       break;
 496     case CacheKind::Lru:
 497     case CacheKind::Scalable:
 498       {
 499         if (!keyCache) {
 500           keyCache.reset(new LRUCacheKey(regex->data(), regex->size()));
 501         }
 502         // Pointer ownership is shared between container and caller
 503         accessor = EntryPtr(ent);
 504         if (m_kind == CacheKind::Lru) {
 505           m_lruCache->insert(*keyCache, accessor.entryPtr());
 506         } else {
 507           m_scalableCache->insert(*keyCache, accessor.entryPtr());
 508         }
 509       }
 510       break;
 511   }
 512 }
 513
 514 void PCRECache::dump(const std::string& filename) {
 515   std::ofstream out(filename.c_str());
 516   switch (m_kind) {
 517     case CacheKind::Static:
 518       for (auto& it : *m_staticCache) {
 519         out << it.first->data() << "\n";
 520       }
 521       break;
 522     case CacheKind::Lru:
 523     case CacheKind::Scalable:
 524       {
 525         std::vector<LRUCacheKey> keys;
 526         if (m_kind == CacheKind::Lru) {
 527           m_lruCache->snapshotKeys(keys);
 528         } else {
 529           m_scalableCache->snapshotKeys(keys);
 530         }
 531         for (auto& key: keys) {
 532           out << key.c_str() << "\n";
 533         }
 534       }
 535       break;
 536   }
 537   out.close();
 538 }
 539
 540 size_t PCRECache::size() const {
 541   switch (m_kind) {
 542     case CacheKind::Static:
 543       return m_staticCache.load(std::memory_order_acquire)->size();
 544     case CacheKind::Lru:
 545       return m_lruCache->size();
 546     case CacheKind::Scalable:
 547      return m_scalableCache->size();
 548   }
 549   always_assert(false);
 550 }
 551
 552 ///////////////////////////////////////////////////////////////////////////////
 553 // Public interface and helper functions
 554
 555 void pcre_reinit() {
 556   PCRECache::CacheKind kind;
 557   if (RuntimeOption::EvalPCRECacheType == "static") {
 558     kind = PCRECache::CacheKind::Static;
 559   } else if (RuntimeOption::EvalPCRECacheType == "lru") {
 560     kind = PCRECache::CacheKind::Lru;
 561   } else if (RuntimeOption::EvalPCRECacheType == "scalable") {
 562     kind = PCRECache::CacheKind::Scalable;
 563   } else {
 564     Logger::Warning("Eval.PCRECacheType should be either static, "
 565                     "lru or scalable");
 566     kind = PCRECache::CacheKind::Scalable;
 567   }
 568   s_pcreCache.reinit(kind);
 569 }
 570
 571 void pcre_init() {
 572 }
 573
 574 void pcre_dump_cache(const std::string& filename) {
 575   s_pcreCache.dump(filename);
 576 }
 577
 578 static pcre_jit_stack* alloc_jit_stack(void* /*data*/) {
 579   return tl_pcre_globals->jit_stack;
 580 }
 581
 582 namespace {
 583
 584 template<bool useSmartFree = false>
 585 struct FreeHelperImpl {
 586   explicit FreeHelperImpl(void* p) : p(p) {}
 587   ~FreeHelperImpl() {
 588     useSmartFree ? req::free(p) : free(p);
 589   }
 590
 591   FreeHelperImpl(const FreeHelperImpl&) = delete;
 592   FreeHelperImpl& operator=(const FreeHelperImpl&) = delete;
 593
 594 private:
 595   void* p;
 596 };
 597
 598 typedef FreeHelperImpl<true> SmartFreeHelper;
 599 }
 600
 601 static void init_local_extra(pcre_extra* local, pcre_extra* shared) {
 602   if (shared) {
 603     memcpy(local, shared, sizeof(pcre_extra));
 604   } else {
 605     memset(local, 0, sizeof(pcre_extra));
 606     local->flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 607   }
 608   local->match_limit = tl_pcre_globals->preg_backtrace_limit;
 609   local->match_limit_recursion = tl_pcre_globals->preg_recursion_limit;
 610 }
 611
 612 static const char* const*
 613 get_subpat_names(const pcre_cache_entry* pce) {
 614   char **subpat_names = pce->subpat_names.load(std::memory_order_relaxed);
 615   if (subpat_names) {
 616     return subpat_names;
 617   }
 618
 619   /*
 620   * Build a mapping from subpattern numbers to their names. We will always
 621   * allocate the table, even though there may be no named subpatterns. This
 622   * avoids somewhat more complicated logic in the inner loops.
 623   */
 624   pcre_extra extra;
 625   init_local_extra(&extra, pce->extra);
 626
 627   int name_count;
 628
 629   subpat_names = (char **)calloc(pce->num_subpats, sizeof(char *));
 630   int rc = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMECOUNT, &name_count);
 631   if (rc < 0) {
 632     raise_warning("Internal pcre_fullinfo() error %d", rc);
 633     return nullptr;
 634   }
 635   if (name_count > 0) {
 636     int name_size, ni = 0;
 637     unsigned short name_idx;
 638     char* name_table;
 639     int rc1, rc2;
 640
 641     rc1 = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMETABLE, &name_table);
 642     rc2 = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
 643     rc = rc2 ? rc2 : rc1;
 644     if (rc < 0) {
 645       raise_warning("Internal pcre_fullinfo() error %d", rc);
 646       return nullptr;
 647     }
 648     while (ni++ < name_count) {
 649       name_idx = 0xff * (unsigned char)name_table[0] +
 650                  (unsigned char)name_table[1];
 651       subpat_names[name_idx] = name_table + 2;
 652       if (is_numeric_string(subpat_names[name_idx],
 653                             strlen(subpat_names[name_idx]),
 654                             nullptr, nullptr, 0) != KindOfNull) {
 655         raise_warning("Numeric named subpatterns are not allowed");
 656         return nullptr;
 657       }
 658       name_table += name_size;
 659     }
 660   }
 661   // Store subpat_names into the cache entry
 662   char **expected = nullptr;
 663   if (!pce->subpat_names.compare_exchange_strong(expected, subpat_names)) {
 664     // Another thread stored subpat_names already. The array created by the
 665     // other thread is now in expected, return it instead and delete the one
 666     // we just made.
 667     free(subpat_names);
 668     return expected;
 669   }
 670   return subpat_names;
 671 }
 672
 673 static bool get_pcre_fullinfo(pcre_cache_entry* pce) {
 674   pcre_extra extra;
 675   init_local_extra(&extra, pce->extra);
 676
 677   /* Calculate the size of the offsets array*/
 678   int rc = pcre_fullinfo(pce->re, &extra, PCRE_INFO_CAPTURECOUNT,
 679                          &pce->num_subpats);
 680   if (rc < 0) {
 681     raise_warning("Internal pcre_fullinfo() error %d", rc);
 682     return false;
 683   }
 684   pce->num_subpats++;
 685   return true;
 686 }
 687
 688 static bool
 689 pcre_get_compiled_regex_cache(PCRECache::Accessor& accessor,
 690                               const StringData* regex) {
 691   PCRECache::TempKeyCache tkc;
 692
 693   /* Try to lookup the cached regex entry, and if successful, just pass
 694      back the compiled pattern, otherwise go on and compile it. */
 695   if (s_pcreCache.find(accessor, regex, tkc)) {
 696     return true;
 697   }
 698
 699   /* Parse through the leading whitespace, and display a warning if we
 700      get to the end without encountering a delimiter. */
 701   const char *p = regex->data();
 702   while (isspace((int)*(unsigned char *)p)) p++;
 703   if (*p == 0) {
 704     raise_warning("Empty regular expression");
 705     return false;
 706   }
 707
 708   /* Get the delimiter and display a warning if it is alphanumeric
 709      or a backslash. */
 710   char delimiter = *p++;
 711   if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
 712     raise_warning("Delimiter must not be alphanumeric or backslash");
 713     return false;
 714   }
 715
 716   char start_delimiter = delimiter;
 717   const char *pp = strchr("([{< )]}> )]}>", delimiter);
 718   if (pp) {
 719     delimiter = pp[5];
 720   }
 721   char end_delimiter = delimiter;
 722
 723   if (start_delimiter == end_delimiter) {
 724     /* We need to iterate through the pattern, searching for the ending
 725      * delimiter, but skipping the backslashed delimiters. If the ending
 726      * delimiter is not found, display a warning. */
 727     pp = p;
 728     while (*pp != 0) {
 729       if (*pp == '\\' && pp[1] != 0) pp++;
 730       else if (*pp == delimiter)
 731         break;
 732       pp++;
 733     }
 734     if (*pp == 0) {
 735       raise_warning("No ending delimiter '%c' found: [%s]", delimiter,
 736                       regex->data());
 737       return false;
 738     }
 739   } else {
 740     /* We iterate through the pattern, searching for the matching ending
 741      * delimiter. For each matching starting delimiter, we increment nesting
 742      * level, and decrement it for each matching ending delimiter. If we
 743      * reach the end of the pattern without matching, display a warning.
 744      */
 745     int brackets = 1; // brackets nesting level
 746     pp = p;
 747     while (*pp != 0) {
 748       if (*pp == '\\' && pp[1] != 0) pp++;
 749       else if (*pp == end_delimiter && --brackets <= 0)
 750         break;
 751       else if (*pp == start_delimiter)
 752         brackets++;
 753       pp++;
 754     }
 755     if (*pp == 0) {
 756       raise_warning("No ending matching delimiter '%c' found: [%s]",
 757                       end_delimiter, regex->data());
 758       return false;
 759     }
 760   }
 761
 762   /* Make a copy of the actual pattern. */
 763   String spattern(p, pp-p, CopyString);
 764   const char *pattern = spattern.data();
 765
 766   /* Move on to the options */
 767   pp++;
 768
 769   /* Parse through the options, setting appropriate flags.  Display
 770      a warning if we encounter an unknown modifier. */
 771   int coptions = 0;
 772   int poptions = 0;
 773   bool do_study = false;
 774   while (*pp != 0) {
 775     switch (*pp++) {
 776       /* Perl compatible options */
 777     case 'i':  coptions |= PCRE_CASELESS;       break;
 778     case 'm':  coptions |= PCRE_MULTILINE;      break;
 779     case 's':  coptions |= PCRE_DOTALL;         break;
 780     case 'x':  coptions |= PCRE_EXTENDED;       break;
 781
 782       /* PCRE specific options */
 783     case 'A':  coptions |= PCRE_ANCHORED;       break;
 784     case 'D':  coptions |= PCRE_DOLLAR_ENDONLY; break;
 785     case 'S':  do_study = true;                 break;
 786     case 'U':  coptions |= PCRE_UNGREEDY;       break;
 787     case 'X':  coptions |= PCRE_EXTRA;          break;
 788     case 'u':  coptions |= PCRE_UTF8;
 789   /* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
 790        characters, even in UTF-8 mode. However, this can be changed by setting
 791        the PCRE_UCP option. */
 792 #ifdef PCRE_UCP
 793             coptions |= PCRE_UCP;
 794 #endif
 795       break;
 796
 797       /* Custom preg options */
 798     case 'e':  poptions |= PREG_REPLACE_EVAL;   break;
 799
 800     case ' ':
 801     case '\n':
 802       break;
 803
 804     default:
 805       raise_warning("Unknown modifier '%c': [%s]", pp[-1], regex->data());
 806       return false;
 807     }
 808   }
 809
 810   /* We've reached a null byte, now check if we're actually at the end of the
 811      string.  If not this is a bad expression, and a potential security hole. */
 812   if (regex->size() != (pp - regex->data())) {
 813     raise_error("Error: Null byte found in pattern");
 814   }
 815
 816   /* Compile pattern and display a warning if compilation failed. */
 817   const char  *error;
 818   int erroffset;
 819   pcre *re = pcre_compile(pattern, coptions, &error, &erroffset, 0);
 820   if (re == nullptr) {
 821     raise_warning("Compilation failed: %s at offset %d", error, erroffset);
 822     return false;
 823   }
 824
 825   // Careful: from here 're' needs to be freed if something throws.
 826
 827   // TODO(t14969501): enable literal_data everywhere and skip the
 828   // pcre_compile above.
 829   auto const literal_data = pcre_literal_data(pattern, coptions);
 830
 831   /* If study option was specified, study the pattern and
 832      store the result in extra for passing to pcre_exec. */
 833   pcre_extra *extra = nullptr;
 834   if (!literal_data.isLiteral()) {
 835     if (do_study || PCRE_STUDY_JIT_COMPILE) {
 836       int soptions = PCRE_STUDY_JIT_COMPILE;
 837       extra = pcre_study(re, soptions, &error);
 838       if (extra) {
 839         extra->flags |= PCRE_EXTRA_MATCH_LIMIT |
 840           PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 841         pcre_assign_jit_stack(extra, alloc_jit_stack, nullptr);
 842       }
 843       if (error != nullptr) {
 844         try {
 845           raise_warning("Error while studying pattern");
 846         } catch (...) {
 847           pcre_free(re);
 848           throw;
 849         }
 850       }
 851       if ((!RuntimeOption::EvalJitNoGdb ||
 852            RuntimeOption::EvalJitUseVtuneAPI ||
 853            RuntimeOption::EvalPerfPidMap) &&
 854           extra &&
 855           extra->executable_jit != nullptr) {
 856         size_t size;
 857         pcre_fullinfo(re, extra, PCRE_INFO_JITSIZE, &size);
 858
 859         TCA start = *(TCA *)(extra->executable_jit);
 860         TCA end = start + size;
 861         std::string name = folly::sformat("HHVM::pcre_jit::{}", pattern);
 862
 863         if (!RuntimeOption::EvalJitNoGdb && jit::mcgen::initialized()) {
 864           Debug::DebugInfo::Get()->recordStub(Debug::TCRange(start, end, false),
 865                                               name);
 866         }
 867         if (RuntimeOption::EvalJitUseVtuneAPI) {
 868           HPHP::jit::reportHelperToVtune(name.c_str(), start, end);
 869         }
 870         if (RuntimeOption::EvalPerfPidMap && jit::mcgen::initialized()) {
 871           Debug::DebugInfo::Get()->recordPerfMap(
 872             Debug::TCRange(start, end, false),
 873             SrcKey{}, nullptr, false, false,
 874             HPHP::JSON::Escape(name.c_str())
 875           );
 876         }
 877       }
 878     }
 879   }
 880
 881   /* Store the compiled pattern and extra info in the cache. */
 882   pcre_cache_entry* new_entry = new pcre_cache_entry();
 883   new_entry->re = re;
 884   new_entry->extra = extra;
 885   if (literal_data.isLiteral()) {
 886     new_entry->literal_data =
 887       std::make_unique<pcre_literal_data>(std::move(literal_data));
 888   }
 889
 890   assert((poptions & ~0x1) == 0);
 891   new_entry->preg_options = poptions;
 892
 893   assert((coptions & 0x80000000) == 0);
 894   new_entry->compile_options = coptions;
 895
 896   /* Get pcre full info */
 897   if (!get_pcre_fullinfo(new_entry)) {
 898     delete new_entry;
 899     return false;
 900   }
 901
 902   s_pcreCache.insert(accessor, regex, tkc, new_entry);
 903   return true;
 904 }
 905
 906 static int* create_offset_array(const pcre_cache_entry* pce,
 907                                 int& size_offsets) {
 908   /* Allocate memory for the offsets array */
 909   size_offsets = pce->num_subpats * 3;
 910   return (int *)req::malloc_noptrs(size_offsets * sizeof(int));
 911 }
 912
 913 static inline void add_offset_pair(Array& result,
 914                                    const String& str,
 915                                    int offset,
 916                                    const char* name) {
 917   auto match_pair = make_packed_array(str, offset);
 918   if (name) result.set(String(name), match_pair);
 919   result.append(match_pair);
 920 }
 921
 922 static inline bool pcre_need_log_error(int pcre_code) {
 923   return RuntimeOption::EnablePregErrorLog &&
 924          (pcre_code == PCRE_ERROR_MATCHLIMIT ||
 925           pcre_code == PCRE_ERROR_RECURSIONLIMIT);
 926 }
 927
 928 static void pcre_log_error(const char* func, int line, int pcre_code,
 929                            const char* pattern, int pattern_size,
 930                            const char* subject, int subject_size,
 931                            const char* repl, int repl_size,
 932                            int arg1 = 0, int arg2 = 0,
 933                            int arg3 = 0, int arg4 = 0) {
 934   if (!RuntimeOption::EnableHipHopSyntax) {
 935     return;
 936   }
 937   const char* escapedPattern;
 938   const char* escapedSubject;
 939   const char* escapedRepl;
 940   std::string p(pattern, pattern_size);
 941   std::string s(subject, subject_size);
 942   std::string r(repl, repl_size);
 943   escapedPattern = Logger::EscapeString(p);
 944   escapedSubject = Logger::EscapeString(s);
 945   escapedRepl = Logger::EscapeString(r);
 946   const char* errString =
 947     (pcre_code == PCRE_ERROR_MATCHLIMIT) ? "PCRE_ERROR_MATCHLIMIT" :
 948     (pcre_code == PCRE_ERROR_RECURSIONLIMIT) ? "PCRE_ERROR_RECURSIONLIMIT" :
 949     "UNKNOWN";
 950   raise_warning_unsampled(
 951     "REGEXERR: %s/%d: err=%d(%s), pattern='%s', subject='%s', repl='%s', "
 952     "limits=(%" PRId64 ", %" PRId64 "), extra=(%d, %d, %d, %d)",
 953     func, line, pcre_code, errString,
 954     escapedPattern, escapedSubject, escapedRepl,
 955     tl_pcre_globals->preg_backtrace_limit,
 956     tl_pcre_globals->preg_recursion_limit,
 957     arg1, arg2, arg3, arg4);
 958   free((void *)escapedPattern);
 959   free((void *)escapedSubject);
 960   free((void *)escapedRepl);
 961 }
 962
 963 static void pcre_handle_exec_error(int pcre_code) {
 964   int preg_code = 0;
 965   switch (pcre_code) {
 966   case PCRE_ERROR_MATCHLIMIT:
 967     preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
 968     break;
 969   case PCRE_ERROR_RECURSIONLIMIT:
 970     preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
 971     break;
 972   case PCRE_ERROR_BADUTF8:
 973     preg_code = PHP_PCRE_BAD_UTF8_ERROR;
 974     break;
 975   case PCRE_ERROR_BADUTF8_OFFSET:
 976     preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
 977     break;
 978   default:
 979     preg_code = PHP_PCRE_INTERNAL_ERROR;
 980     break;
 981   }
 982   tl_last_error_code = preg_code;
 983 }
 984
 985 ///////////////////////////////////////////////////////////////////////////////
 986
 987 Variant preg_grep(const String& pattern, const Array& input, int flags /* = 0 */) {
 988   PCRECache::Accessor accessor;
 989   if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
 990     return false;
 991   }
 992   const pcre_cache_entry* pce = accessor.get();
 993
 994   int size_offsets = 0;
 995   int* offsets = create_offset_array(pce, size_offsets);
 996   if (offsets == nullptr) {
 997     return false;
 998   }
 999   SmartFreeHelper freer(offsets);
1000
1001   /* Initialize return array */
1002   Array ret = Array::Create();
1003   tl_last_error_code = PHP_PCRE_NO_ERROR;
1004
1005   /* Go through the input array */
1006   bool invert = (flags & PREG_GREP_INVERT);
1007   pcre_extra extra;
1008   init_local_extra(&extra, pce->extra);
1009
1010   for (ArrayIter iter(input); iter; ++iter) {
1011     String entry = iter.second().toString();
1012
1013     /* Perform the match */
1014     int count = pcre_exec(pce->re, &extra, entry.data(), entry.size(),
1015                           0, 0, offsets, size_offsets);
1016
1017     /* Check for too many substrings condition. */
1018     if (count == 0) {
1019       raise_warning("Matched, but too many substrings");
1020       count = size_offsets / 3;
1021     } else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1022       if (pcre_need_log_error(count)) {
1023         pcre_log_error(__FUNCTION__, __LINE__, count,
1024                        pattern.data(), pattern.size(),
1025                        entry.data(), entry.size(),
1026                        "", 0,
1027                        flags);
1028       }
1029       pcre_handle_exec_error(count);
1030       break;
1031     }
1032
1033     /* If the entry fits our requirements */
1034     if ((count > 0 && !invert) ||
1035         (count == PCRE_ERROR_NOMATCH && invert)) {
1036
1037       /* Add to return array */
1038       ret.set(iter.first(), entry);
1039     }
1040   }
1041
1042   return ret;
1043 }
1044
1045 ///////////////////////////////////////////////////////////////////////////////
1046
/*
 * Shared implementation behind preg_match() (global == false) and
 * preg_match_all() (global == true).
 *
 * pattern      - regex source, resolved through the compiled-regex cache.
 * subject      - string to search.
 * subpats      - optional output; when non-null it is reset to an empty
 *                array up front (before flags are validated) and then
 *                filled with the captured substrings.
 * flags        - PREG_OFFSET_CAPTURE plus, in the low byte, the result
 *                ordering (PREG_PATTERN_ORDER .. PREG_SET_ORDER).  A
 *                non-zero ordering is only accepted when `global` is set.
 * start_offset - where matching starts; a negative value counts back from
 *                the end of `subject` and is clamped to 0.
 * global       - keep matching until the subject is exhausted.
 *
 * Returns the number of matches found; false when the pattern cannot be
 * compiled, working storage is unavailable, substring extraction fails, or
 * pcre_exec() reports an error (tl_last_error_code is set in that case);
 * null when `flags` is invalid.
 */
static Variant preg_match_impl(const StringData* pattern,
                               const StringData* subject,
                               Variant* subpats, int flags, int start_offset,
                               bool global) {
  PCRECache::Accessor accessor;
  if (!pcre_get_compiled_regex_cache(accessor, pattern)) {
    return false;
  }
  const pcre_cache_entry* pce = accessor.get();

  pcre_extra extra;
  init_local_extra(&extra, pce->extra);
  if (subpats) {
    *subpats = Array::Create();
  }
  int exec_options = 0;

  int subpats_order = global ? PREG_PATTERN_ORDER : 0;
  bool offset_capture = false;
  if (flags) {
    offset_capture = flags & PREG_OFFSET_CAPTURE;

    /*
     * subpats_order is pre-set to pattern mode so we change it only if
     * necessary.
     */
    if (flags & 0xff) {
      subpats_order = flags & 0xff;
    }
    if ((global && (subpats_order < PREG_PATTERN_ORDER ||
                    subpats_order > PREG_SET_ORDER)) ||
        (!global && subpats_order != 0)) {
      raise_warning("Invalid flags specified");
      return init_null();
    }
  }

  /* Negative offset counts from the end of the string. */
  if (start_offset < 0) {
    start_offset = subject->size() + start_offset;
    if (start_offset < 0) {
      start_offset = 0;
    }
  }

  int size_offsets = 0;
  int* offsets = create_offset_array(pce, size_offsets);
  SmartFreeHelper offsetsFreer(offsets);
  /* The offsets vector holds three ints per subpattern. */
  int num_subpats = size_offsets / 3;
  if (offsets == nullptr) {
    return false;
  }

  const char* const* subpat_names = get_subpat_names(pce);
  if (subpat_names == nullptr) {
    return false;
  }

  /* Allocate match sets array and initialize the values. */
  Array match_sets; /* An array of sets of matches for each
                       subpattern after a global match */
  if (global && subpats_order == PREG_PATTERN_ORDER) {
    for (int i = 0; i < num_subpats; i++) {
      match_sets.set(i, Array::Create());
    }
  }

  int matched = 0;
  tl_last_error_code = PHP_PCRE_NO_ERROR;

  int g_notempty = 0; // If the match should not be empty
  const char** stringlist; // Holds list of subpatterns
  int i;
  /* One iteration per match attempt; runs exactly once unless `global`. */
  do {

    int count = 0;
    /*
     * Optimization: If the pattern defines a literal substring,
     * compare the strings directly (i.e. memcmp) instead of performing
     * the full regular expression evaluation.
     * Take the slow path if there are any special compile options.
     */
    if (pce->literal_data && !global) {
      assertx(pce->literal_data->isLiteral());
      /* TODO(t13140878): compare literal against multiple substrings
       * in the preg_match_all (global == true) case. */
      count = pce->literal_data->matches(subject, start_offset, offsets) ? 1
        : PCRE_ERROR_NOMATCH;
    } else {
      /* Execute the regular expression. */
      count = pcre_exec(pce->re, &extra, subject->data(), subject->size(),
                        start_offset,
                        exec_options | g_notempty,
                        offsets, size_offsets);

      /* The string was already proved to be valid UTF-8 */
      exec_options |= PCRE_NO_UTF8_CHECK;
    }
    /* Check for too many substrings condition. */
    if (count == 0) {
      raise_warning("Matched, but too many substrings");
      count = size_offsets / 3;
    }

    /* If something has matched */
    if (count > 0) {
      matched++;

      if (subpats) {
        // Try to get the list of substrings and display a warning if failed.
        // An inverted range for the whole match (end before start) is
        // rejected here, before any substring is extracted.
        if (offsets[1] < offsets[0] ||
            pcre_get_substring_list(subject->data(), offsets, count,
                                    &stringlist) < 0) {
          raise_warning("Get subpatterns list failed");
          return false;
        }

        if (global) {  /* global pattern matching */
          if (subpats_order == PREG_PATTERN_ORDER) {
            /* For each subpattern, insert it into the appropriate array. */
            for (i = 0; i < count; i++) {
              if (offset_capture) {
                auto& lval = match_sets.lvalAt(i);
                forceToArray(lval);
                add_offset_pair(lval.toArrRef(),
                                String(stringlist[i],
                                       offsets[(i<<1)+1] - offsets[i<<1],
                                       CopyString),
                                offsets[i<<1], nullptr);
              } else {
                auto& lval = match_sets.lvalAt(i);
                forceToArray(lval).append(
                  String(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
                    CopyString)
                );
              }
            }
            /*
             * If the number of captured subpatterns on this run is
             * less than the total possible number, pad the result
             * arrays with empty strings.
             */
            if (count < num_subpats) {
              for (; i < num_subpats; i++) {
                auto& lval = match_sets.lvalAt(i);
                forceToArray(lval).append("");
              }
            }
          } else {
            /* Any other accepted ordering: one result array per match. */
            Array result_set = Array::Create();

            /* Add all the subpatterns to it */
            for (i = 0; i < count; i++) {
              if (offset_capture) {
                add_offset_pair(result_set,
                                String(stringlist[i],
                                       offsets[(i<<1)+1] - offsets[i<<1],
                                       CopyString),
                                offsets[i<<1], subpat_names[i]);
              } else {
                String value(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
                             CopyString);
                if (subpat_names[i]) {
                  result_set.set(String(subpat_names[i]), value);
                }
                result_set.append(value);
              }
            }
            /* And add it to the output array */
            forceToArray(*subpats).append(std::move(result_set));
          }
        } else {      /* single pattern matching */
          /* For each subpattern, insert it into the subpatterns array. */
          for (i = 0; i < count; i++) {
            if (offset_capture) {
              add_offset_pair(forceToArray(*subpats),
                              String(stringlist[i],
                                     offsets[(i<<1)+1] - offsets[i<<1],
                                     CopyString),
                              offsets[i<<1], subpat_names[i]);
            } else {
              String value(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
                           CopyString);
              if (subpat_names[i]) {
                forceToArray(*subpats).set(String(subpat_names[i]), value);
              }
              forceToArray(*subpats).append(value);
            }
          }
        }
        /* The substring list was allocated by pcre_get_substring_list(). */
        pcre_free((void *) stringlist);
      }
    } else if (count == PCRE_ERROR_NOMATCH) {
      /* If we previously set PCRE_NOTEMPTY after a null match,
         this is not necessarily the end. We need to advance
         the start offset, and continue. Fudge the offset values
         to achieve this, unless we're already at the end of the string. */
      if (g_notempty && start_offset < subject->size()) {
        offsets[0] = start_offset;
        offsets[1] = start_offset + 1;
      } else
        break;
    } else {
      /* Any other negative code is an execution error. */
      if (pcre_need_log_error(count)) {
        pcre_log_error(__FUNCTION__, __LINE__, count,
                       pattern->data(), pattern->size(),
                       subject->data(), subject->size(),
                       "", 0,
                       flags, start_offset, g_notempty, global);
      }
      pcre_handle_exec_error(count);
      return false;
    }

    /* If we have matched an empty string, mimic what Perl's /g options does.
       This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
       the match again at the same point. If this fails (picked up above) we
       advance to the next character. */
    g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;

    /* Advance to the position right after the last full match */
    start_offset = offsets[1];
  } while (global);

  /* Add the match sets to the output array and clean up */
  if (subpats && global && subpats_order == PREG_PATTERN_ORDER) {
    for (i = 0; i < num_subpats; i++) {
      /* Named groups are exposed under their name and their index. */
      if (subpat_names[i]) {
        forceToArray(*subpats).set(String(subpat_names[i]), match_sets[i]);
      }
      forceToArray(*subpats).append(match_sets[i]);
    }
  }
  return matched;
}
1282
1283 Variant preg_match(const String& pattern, const String& subject,
1284                    Variant* matches /* = nullptr */, int flags /* = 0 */,
1285                    int offset /* = 0 */) {
1286   return preg_match(pattern.get(), subject.get(), matches, flags, offset);
1287 }
1288
1289 Variant preg_match(const StringData* pattern, const StringData* subject,
1290                    Variant* matches /* = nullptr */, int flags /* = 0 */,
1291                    int offset /* = 0 */) {
1292   return preg_match_impl(pattern, subject, matches, flags, offset, false);
1293 }
1294
1295 Variant preg_match_all(const String& pattern, const String& subject,
1296                        Variant* matches /* = nullptr */,
1297                        int flags /* = 0 */, int offset /* = 0 */) {
1298   return preg_match_all(pattern.get(), subject.get(), matches, flags, offset);
1299 }
1300
1301 Variant preg_match_all(const StringData* pattern, const StringData* subject,
1302                        Variant* matches /* = nullptr */,
1303                        int flags /* = 0 */, int offset /* = 0 */) {
1304   return preg_match_impl(pattern, subject, matches, flags, offset, true);
1305 }
1306
1307 ///////////////////////////////////////////////////////////////////////////////
1308
1309 static String preg_do_repl_func(const Variant& function, const String& subject,
1310                                 int* offsets, const char* const* subpat_names,
1311                                 int count) {
1312   Array subpats = Array::Create();
1313   for (int i = 0; i < count; i++) {
1314     auto off1 = offsets[i<<1];
1315     auto off2 = offsets[(i<<1)+1];
1316     auto sub = subject.substr(off1, off2 - off1);
1317
1318     if (subpat_names[i]) {
1319       subpats.set(String(subpat_names[i]), sub);
1320     }
1321     subpats.append(sub);
1322   }
1323
1324   Array args;
1325   args.set(0, subpats);
1326   return vm_call_user_func(function, args).toString();
1327 }
1328
/*
 * Parse a back-reference at the start of *str.
 *
 * *str must point at the introducer character (the callers pass '\' or
 * '$').  Accepted forms are the introducer followed by one or two decimal
 * digits, or "${" + one or two digits + "}".  On success the group number
 * is stored in *backref, *str is advanced past the reference and true is
 * returned.  On failure *str is left untouched and false is returned;
 * *backref may already have been overwritten with the digits seen so far.
 */
static bool preg_get_backref(const char** str, int* backref) {
  auto const is_digit = [](char c) { return c >= '0' && c <= '9'; };
  const char* cursor = *str;

  /* A lone introducer at the very end of the string is not a reference. */
  if (cursor[1] == '\0') {
    return false;
  }

  /* Only '$' may be followed by a braced group number. */
  const bool braced = cursor[0] == '$' && cursor[1] == '{';
  cursor += braced ? 2 : 1;

  if (!is_digit(*cursor)) {
    return false;
  }
  *backref = *cursor - '0';
  ++cursor;

  /* An optional second digit; anything further is ordinary text. */
  if (is_digit(*cursor)) {
    *backref = *backref * 10 + *cursor - '0';
    ++cursor;
  }

  if (braced) {
    if (*cursor != '}') {
      return false;
    }
    ++cursor;
  }

  *str = cursor;
  return true;
}
1365
1366 static Variant php_pcre_replace(const String& pattern, const String& subject,
1367                                 const Variant& replace_var, bool callable,
1368                                 int limit, int* replace_count) {
1369   PCRECache::Accessor accessor;
1370   if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1371     return false;
1372   }
1373   const pcre_cache_entry* pce = accessor.get();
1374   bool eval = pce->preg_options & PREG_REPLACE_EVAL;
1375   if (eval) {
1376     if (RuntimeOption::EvalAuthoritativeMode) {
1377       throw Exception(
1378         "You can't use eval in RepoAuthoritative mode. It breaks all sorts of "
1379         "assumptions we use for speed. Switch to using preg_replace_callback()."
1380       );
1381     }
1382     if (callable) {
1383       raise_warning(
1384         "Modifier /e cannot be used with replacement callback."
1385       );
1386       return init_null();
1387     }
1388     raise_deprecated(
1389       "preg_replace(): The /e modifier is deprecated, use "
1390       "preg_replace_callback instead"
1391     );
1392   }
1393
1394   int size_offsets;
1395   int* offsets = create_offset_array(pce, size_offsets);
1396   SmartFreeHelper offsetsFreer(offsets);
1397   if (offsets == nullptr) {
1398     return false;
1399   }
1400
1401   const char* const* subpat_names = get_subpat_names(pce);
1402   if (subpat_names == nullptr) {
1403     return false;
1404   }
1405
1406   const char* replace = nullptr;
1407   const char* replace_end = nullptr;
1408   int replace_len = 0;
1409   String replace_val;
1410
1411   if (!callable) {
1412     replace_val = replace_var.toString();
1413     replace = replace_val.data();
1414     replace_len = replace_val.size();
1415     replace_end = replace + replace_len;
1416   }
1417
1418   StringBuffer result(2 * subject.size());
1419
1420   try {
1421
1422     /* Initialize */
1423     const char* match = nullptr;
1424     int start_offset = 0;
1425     tl_last_error_code = PHP_PCRE_NO_ERROR;
1426     pcre_extra extra;
1427     init_local_extra(&extra, pce->extra);
1428
1429     const char* walk;     // Used to walk the replacement string
1430     char walk_last;       // Last walked character
1431     int match_len;        // Length of the current match
1432     int backref;          // Backreference number
1433     int g_notempty = 0;   // If the match should not be empty
1434     int exec_options = 0; // Options passed to pcre_exec
1435     while (1) {
1436       /* Execute the regular expression. */
1437       int count = pcre_exec(pce->re, &extra, subject.data(), subject.size(),
1438                             start_offset,
1439                             exec_options | g_notempty,
1440                             offsets, size_offsets);
1441
1442       /* The string was already proved to be valid UTF-8 */
1443       exec_options |= PCRE_NO_UTF8_CHECK;
1444
1445       /* Check for too many substrings condition. */
1446       if (count == 0) {
1447         raise_warning("Matched, but too many substrings");
1448         count = size_offsets / 3;
1449       }
1450
1451       const char* piece = subject.data() + start_offset;
1452       if (count > 0 && offsets[1] >= offsets[0] &&
1453           (limit == -1 || limit > 0)) {
1454         if (replace_count) {
1455           ++*replace_count;
1456         }
1457         /* Set the match location in subject */
1458         match = subject.data() + offsets[0];
1459
1460         /* If evaluating, do it and add the return string's length */
1461         String eval_result;
1462         if (callable) {
1463           /* Use custom function to get replacement string and its length. */
1464           eval_result = preg_do_repl_func(replace_var, subject, offsets,
1465                                           subpat_names, count);
1466         } else { /* do regular substitution */
1467           walk = replace;
1468           walk_last = 0;
1469           while (walk < replace_end) {
1470             if ('\\' == *walk || '$' == *walk) {
1471               if (walk_last == '\\') {
1472                 walk++;
1473                 walk_last = 0;
1474                 continue;
1475               }
1476               if (preg_get_backref(&walk, &backref)) {
1477                 if (backref < count) {
1478                   match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1479                   if (eval) {
1480                     String esc_match = HHVM_FN(addslashes)(
1481                       String(
1482                         subject.data() + offsets[backref<<1],
1483                         match_len,
1484                         CopyString
1485                       )
1486                     );
1487                     match_len = esc_match.length();
1488                   }
1489                 }
1490                 continue;
1491               }
1492             }
1493             walk++;
1494             walk_last = walk[-1];
1495           }
1496         }
1497
1498         /* copy the part of the string before the match */
1499         result.append(piece, match-piece);
1500
1501         /* copy replacement and backrefs */
1502         int result_len = result.size();
1503
1504         /* If evaluating or using custom function, copy result to the buffer
1505          * and clean up. */
1506         if (callable) {
1507           result.append(eval_result.data(), eval_result.size());
1508           result_len += eval_result.size();
1509         } else { /* do regular backreference copying */
1510           walk = replace;
1511           walk_last = 0;
1512           Array params;
1513           int lastStart = result.size();
1514           while (walk < replace_end) {
1515             bool handleQuote = eval && '"' == *walk && walk_last != '\\';
1516             if (handleQuote && lastStart != result.size()) {
1517               String str(result.data() + lastStart, result.size() - lastStart,
1518                          CopyString);
1519               params.append(str);
1520               lastStart = result.size();
1521               handleQuote = false;
1522             }
1523             if ('\\' == *walk || '$' == *walk) {
1524               if (walk_last == '\\') {
1525                 result.set(result.size() - 1, *walk++);
1526                 walk_last = 0;
1527                 continue;
1528               }
1529               if (preg_get_backref(&walk, &backref)) {
1530                 if (backref < count) {
1531                   match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1532                   if (eval) {
1533                     String esc_match = HHVM_FN(addslashes)(
1534                       String(
1535                         subject.data() + offsets[backref<<1],
1536                         match_len,
1537                         CopyString
1538                       )
1539                     );
1540                     match_len = esc_match.length();
1541                     result.append(esc_match.data(), match_len);
1542                   } else {
1543                     result.append(
1544                       subject.data() + offsets[backref<<1],
1545                       match_len
1546                     );
1547                   }
1548                 }
1549                 continue;
1550               }
1551             }
1552             result.append(*walk++);
1553             walk_last = walk[-1];
1554             if (handleQuote && lastStart != result.size()) {
1555               lastStart = result.size();
1556             }
1557           }
1558           auto full_len = result.size();
1559           auto data = result.data() + result_len;
1560           if (eval) {
1561             VMRegAnchor _;
1562             auto const ar = GetCallerFrame();
1563             // reserve space for "<?php return " + code + ";"
1564             String prefixedCode(full_len - result_len + 14, ReserveString);
1565             prefixedCode +=
1566               (ar->unit()->isHHFile() ? "<?hh return " : "<?php return ");
1567             prefixedCode += folly::StringPiece{data, full_len - result_len};
1568             prefixedCode += ";";
1569             auto const unit = g_context->compileEvalString(prefixedCode.get());
1570             auto const ctx = ar->func()->cls();
1571             auto const func = unit->getMain(ctx);
1572             ObjectData* thiz;
1573             Class* cls;
1574             if (ctx) {
1575               if (ar->hasThis()) {
1576                 thiz = ar->getThis();
1577                 cls = thiz->getVMClass();
1578               } else {
1579                 thiz = nullptr;
1580                 cls = ar->getClass();
1581               }
1582             } else {
1583               thiz = nullptr;
1584               cls = nullptr;
1585             }
1586             auto v = Variant::attach(
1587               g_context->invokeFunc(func, init_null_variant,
1588                                     thiz, cls, nullptr, nullptr,
1589                                     ExecutionContext::InvokePseudoMain)
1590             );
1591             eval_result = v.toString();
1592
1593             result.resize(result_len);
1594             result.append(eval_result.data(), eval_result.size());
1595           }
1596         }
1597
1598         if (limit != -1) {
1599           limit--;
1600         }
1601
1602       } else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1603         /* If we previously set PCRE_NOTEMPTY after a null match,
1604            this is not necessarily the end. We need to advance
1605            the start offset, and continue. Fudge the offset values
1606            to achieve this, unless we're already at the end of the string. */
1607         if (g_notempty != 0 && start_offset < subject.size()) {
1608           offsets[0] = start_offset;
1609           offsets[1] = start_offset + 1;
1610           result.append(piece, 1);
1611         } else {
1612           /* stick that last bit of string on our output */
1613           result.append(piece, subject.size() - start_offset);
1614           break;
1615         }
1616       } else {
1617         if (pcre_need_log_error(count)) {
1618           const char* s;
1619           int size;
1620           String stemp;
1621           if (callable) {
1622             if (replace_var.isObject()) {
1623               stemp = replace_var.asCObjRef()->getClassName().asString()
1624                     + "::__invoke";
1625             } else {
1626               stemp = replace_var.toString();
1627             }
1628             s = stemp.data();
1629             size = stemp.size();
1630           } else {
1631             s = replace_val.data();
1632             size = replace_val.size();
1633           }
1634           pcre_log_error(__FUNCTION__, __LINE__, count,
1635                          pattern.data(), pattern.size(),
1636                          subject.data(), subject.size(),
1637                          s, size,
1638                          callable, limit, start_offset, g_notempty);
1639         }
1640         pcre_handle_exec_error(count);
1641         return init_null();
1642       }
1643
1644       /* If we have matched an empty string, mimic what Perl's /g options does.
1645          This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1646          the match again at the same point. If this fails (picked up above) we
1647          advance to the next character. */
1648       g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1649
1650       /* Advance to the next piece. */
1651       start_offset = offsets[1];
1652     }
1653
1654     return result.detach();
1655   } catch (...) {
1656     throw;
1657   }
1658 }
1659
1660 static Variant php_replace_in_subject(const Variant& regex, const Variant& replace,
1661                                       String subject, int limit, bool callable,
1662                                       int* replace_count) {
1663   if (!regex.isArray()) {
1664     Variant ret = php_pcre_replace(regex.toString(), subject, replace,
1665                                    callable, limit, replace_count);
1666
1667     if (ret.isBoolean()) {
1668       assert(!ret.toBoolean());
1669       return init_null();
1670     }
1671
1672     return ret;
1673   }
1674
1675   if (callable || !replace.isArray()) {
1676     Array arr = regex.toArray();
1677     for (ArrayIter iterRegex(arr); iterRegex; ++iterRegex) {
1678       String regex_entry = iterRegex.second().toString();
1679       Variant ret = php_pcre_replace(regex_entry, subject, replace,
1680                                      callable, limit, replace_count);
1681       if (ret.isBoolean()) {
1682         assert(!ret.toBoolean());
1683         return init_null();
1684       }
1685       if (!ret.isString()) {
1686         return ret;
1687       }
1688       subject = ret.asStrRef();
1689       if (subject.isNull()) {
1690         return subject;
1691       }
1692     }
1693     return subject;
1694   }
1695
1696   Array arrReplace = replace.toArray();
1697   Array arrRegex = regex.toArray();
1698   ArrayIter iterReplace(arrReplace);
1699   for (ArrayIter iterRegex(arrRegex); iterRegex; ++iterRegex) {
1700     String regex_entry = iterRegex.second().toString();
1701     Variant replace_value;
1702     if (iterReplace) {
1703       replace_value = iterReplace.second();
1704       ++iterReplace;
1705     }
1706
1707     Variant ret = php_pcre_replace(regex_entry, subject, replace_value,
1708                                    callable, limit, replace_count);
1709
1710     if (ret.isBoolean()) {
1711       assert(!ret.toBoolean());
1712       return init_null();
1713     }
1714     if (!ret.isString()) {
1715       return ret;
1716     }
1717     subject = ret.asStrRef();
1718     if (subject.isNull()) {
1719       return subject;
1720     }
1721   }
1722   return subject;
1723 }
1724
1725 Variant preg_replace_impl(const Variant& pattern, const Variant& replacement,
1726                           const Variant& subject, int limit, Variant* count,
1727                           bool is_callable, bool is_filter) {
1728   assert(!(is_callable && is_filter));
1729   if (!is_callable &&
1730       replacement.isArray() && !pattern.isArray()) {
1731     raise_warning("Parameter mismatch, pattern is a string while "
1732                     "replacement is an array");
1733     return false;
1734   }
1735
1736   int replace_count = 0;
1737   if (!isContainer(subject)) {
1738     Variant ret = php_replace_in_subject(pattern, replacement,
1739                                          subject.toString(),
1740                                          limit, is_callable, &replace_count);
1741
1742     if (ret.isString()) {
1743       if (count) *count = replace_count;
1744       if (is_filter && replace_count == 0) {
1745         return init_null();
1746       } else {
1747         return ret.asStrRef();
1748       }
1749     }
1750
1751     return ret;
1752   }
1753
1754   Array return_value = Array::Create();
1755   Array arrSubject = subject.toArray();
1756   for (ArrayIter iter(arrSubject); iter; ++iter) {
1757     auto old_replace_count = replace_count;
1758     String subject_entry = iter.second().toString();
1759     Variant ret = php_replace_in_subject(pattern, replacement, subject_entry,
1760                                          limit, is_callable, &replace_count);
1761
1762     if (ret.isString() && !ret.isNull() &&
1763         (!is_filter || replace_count > old_replace_count)) {
1764       return_value.set(iter.first(), ret.asStrRef());
1765     }
1766   }
1767   if (count) *count = replace_count;
1768   return return_value;
1769 }
1770
1771 int preg_replace(Variant& result,
1772                  const Variant& pattern,
1773                  const Variant& replacement,
1774                  const Variant& subject,
1775                  int limit /* = -1 */) {
1776   Variant count;
1777   result = preg_replace_impl(pattern, replacement, subject,
1778                              limit, &count, false, false);
1779   return count.toInt32();
1780 }
1781
1782 int preg_replace_callback(Variant& result,
1783                           const Variant& pattern,
1784                           const Variant& callback,
1785                           const Variant& subject,
1786                           int limit /* = -1 */) {
1787   Variant count;
1788   result = preg_replace_impl(pattern, callback, subject,
1789                              limit, &count, true, false);
1790   return count.toInt32();
1791 }
1792
1793 int preg_filter(Variant& result,
1794                 const Variant& pattern,
1795                 const Variant& replacement,
1796                 const Variant& subject,
1797                 int limit /* = -1 */) {
1798   Variant count;
1799   result = preg_replace_impl(pattern, replacement, subject,
1800                              limit, &count, false, true);
1801   return count.toInt32();
1802 }
1803
1804 ///////////////////////////////////////////////////////////////////////////////
1805
1806 Variant preg_split(const String& pattern, const String& subject,
1807                    int limit /* = -1 */, int flags /* = 0 */) {
1808   PCRECache::Accessor accessor;
1809   if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1810     return false;
1811   }
1812   const pcre_cache_entry* pce = accessor.get();
1813
1814   int no_empty = flags & PREG_SPLIT_NO_EMPTY;
1815   bool delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1816   bool offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1817
1818   if (limit == 0) {
1819     limit = -1;
1820   }
1821
1822   int size_offsets = 0;
1823   int* offsets = create_offset_array(pce, size_offsets);
1824   SmartFreeHelper offsetsFreer(offsets);
1825   if (offsets == nullptr) {
1826     return false;
1827   }
1828
1829   /* Start at the beginning of the string */
1830   int start_offset = 0;
1831   int next_offset = 0;
1832   const char* last_match = subject.data();
1833   tl_last_error_code = PHP_PCRE_NO_ERROR;
1834   pcre_extra extra;
1835   init_local_extra(&extra, pce->extra);
1836
1837   // Get next piece if no limit or limit not yet reached and something matched
1838   Array return_value = Array::Create();
1839   int g_notempty = 0;   /* If the match should not be empty */
1840   int utf8_check = 0;
1841   PCRECache::Accessor bump_accessor;
1842   const pcre_cache_entry* bump_pce = nullptr; /* instance for empty matches */
1843   while ((limit == -1 || limit > 1)) {
1844     int count = pcre_exec(pce->re, &extra, subject.data(), subject.size(),
1845                           start_offset, g_notempty | utf8_check,
1846                           offsets, size_offsets);
1847
1848     /* Subsequent calls to pcre_exec don't need to bother with the
1849      * utf8 validity check: if the subject isn't valid, the first
1850      * call to pcre_exec will have failed, and as long as we only
1851      * set start_offset to known character boundaries we won't
1852      * supply an invalid offset. */
1853     utf8_check = PCRE_NO_UTF8_CHECK;
1854
1855     /* Check for too many substrings condition. */
1856     if (count == 0) {
1857       raise_warning("Matched, but too many substrings");
1858       count = size_offsets / 3;
1859     }
1860
1861     /* If something matched */
1862     if (count > 0 && offsets[1] >= offsets[0]) {
1863       if (!no_empty || subject.data() + offsets[0] != last_match) {
1864         if (offset_capture) {
1865           /* Add (match, offset) pair to the return value */
1866           add_offset_pair(return_value,
1867                           String(last_match,
1868                                  subject.data() + offsets[0] - last_match,
1869                                  CopyString),
1870                           next_offset, nullptr);
1871         } else {
1872           /* Add the piece to the return value */
1873           return_value.append(String(last_match,
1874                                      subject.data() + offsets[0] - last_match,
1875                                      CopyString));
1876         }
1877
1878         /* One less left to do */
1879         if (limit != -1)
1880           limit--;
1881       }
1882
1883       last_match = subject.data() + offsets[1];
1884       next_offset = offsets[1];
1885
1886       if (delim_capture) {
1887         int i, match_len;
1888         for (i = 1; i < count; i++) {
1889           match_len = offsets[(i<<1)+1] - offsets[i<<1];
1890           /* If we have matched a delimiter */
1891           if (!no_empty || match_len > 0) {
1892             if (offset_capture) {
1893               add_offset_pair(return_value,
1894                               String(subject.data() + offsets[i<<1],
1895                                      match_len, CopyString),
1896                               offsets[i<<1], nullptr);
1897             } else {
1898               return_value.append(subject.substr(offsets[i<<1], match_len));
1899             }
1900           }
1901         }
1902       }
1903     } else if (count == PCRE_ERROR_NOMATCH) {
1904       /* If we previously set PCRE_NOTEMPTY after a null match,
1905          this is not necessarily the end. We need to advance
1906          the start offset, and continue. Fudge the offset values
1907          to achieve this, unless we're already at the end of the string. */
1908       if (g_notempty != 0 && start_offset < subject.size()) {
1909         if (pce->compile_options & PCRE_UTF8) {
1910           if (bump_pce == nullptr) {
1911             if (!pcre_get_compiled_regex_cache(bump_accessor,
1912                                                String("/./us").get())) {
1913               return false;
1914             }
1915             bump_pce = bump_accessor.get();
1916           }
1917           pcre_extra bump_extra;
1918           init_local_extra(&bump_extra, bump_pce->extra);
1919           count = pcre_exec(bump_pce->re, &bump_extra, subject.data(),
1920                             subject.size(), start_offset,
1921                             utf8_check, offsets, size_offsets);
1922           if (count < 1) {
1923             raise_warning("Unknown error");
1924             offsets[0] = start_offset;
1925             offsets[1] = start_offset + 1;
1926             if (pcre_need_log_error(count)) {
1927               pcre_log_error(__FUNCTION__, __LINE__, count,
1928                              pattern.data(), pattern.size(),
1929                              subject.data(), subject.size(),
1930                              "", 0,
1931                              limit, flags, start_offset);
1932             }
1933           }
1934         } else {
1935           offsets[0] = start_offset;
1936           offsets[1] = start_offset + 1;
1937         }
1938       } else
1939         break;
1940     } else {
1941       if (pcre_need_log_error(count)) {
1942         pcre_log_error(__FUNCTION__, __LINE__, count,
1943                        pattern.data(), pattern.size(),
1944                        subject.data(), subject.size(),
1945                        "", 0,
1946                        limit, flags, start_offset, g_notempty);
1947       }
1948       pcre_handle_exec_error(count);
1949       break;
1950     }
1951
1952     /* If we have matched an empty string, mimic what Perl's /g options does.
1953        This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1954        the match again at the same point. If this fails (picked up above) we
1955        advance to the next character. */
1956     g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1957
1958     /* Advance to the position right after the last full match */
1959     start_offset = offsets[1];
1960   }
1961
1962   start_offset = last_match - subject.data(); /* offset might have
1963                                                 * been incremented,
1964                                                 * but without further
1965                                                 * successful matches */
1966   if (!no_empty || start_offset < subject.size()) {
1967     if (offset_capture) {
1968       /* Add the last (match, offset) pair to the return value */
1969       add_offset_pair(return_value,
1970                       subject.substr(start_offset),
1971                       start_offset, nullptr);
1972     } else {
1973       /* Add the last piece to the return value */
1974       return_value.append
1975         (String(last_match, subject.data() + subject.size() - last_match,
1976                 CopyString));
1977     }
1978   }
1979
1980   return return_value;
1981 }
1982
1983 ///////////////////////////////////////////////////////////////////////////////
1984
1985 String preg_quote(const String& str,
1986                   const String& delimiter /* = null_string */) {
1987   const char* in_str = str.data();
1988   const char* in_str_end = in_str + str.size();
1989
1990   /* Nothing to do if we got an empty string */
1991   if (in_str == in_str_end) {
1992     return str;
1993   }
1994
1995   char delim_char = 0;      /* Delimiter character to be quoted */
1996   bool quote_delim = false; /* Whether to quote additional delim char */
1997   if (!delimiter.empty()) {
1998     delim_char = delimiter.charAt(0);
1999     quote_delim = true;
2000   }
2001
2002   /* Allocate enough memory so that even if each character
2003      is quoted, we won't run out of room */
2004   String ret(4 * str.size() + 1, ReserveString);
2005   char* out_str = ret.mutableData();
2006
2007   /* Go through the string and quote necessary characters */
2008   const char* p;
2009   char* q;
2010   for (p = in_str, q = out_str; p != in_str_end; p++) {
2011     char c = *p;
2012     switch (c) {
2013     case '.': case '\\': case '+': case '*': case '?':
2014     case '[': case '^':  case ']': case '$': case '(':
2015     case ')': case '{':  case '}': case '=': case '!':
2016     case '>': case '<':  case '|': case ':': case '-':
2017       *q++ = '\\';
2018       *q++ = c;
2019       break;
2020
2021     case '\0':
2022       *q++ = '\\';
2023       *q++ = '0';
2024       *q++ = '0';
2025       *q++ = '0';
2026       break;
2027
2028     default:
2029       if (quote_delim && c == delim_char)
2030         *q++ = '\\';
2031       *q++ = c;
2032       break;
2033     }
2034   }
2035   *q = '\0';
2036
2037   return ret.setSize(q - out_str);
2038 }
2039
2040 int preg_last_error() {
2041   return tl_last_error_code;
2042 }
2043
2044 size_t preg_pcre_cache_size() {
2045   return s_pcreCache.size();
2046 }
2047
2048 ///////////////////////////////////////////////////////////////////////////////
2049 // regexec
2050
2051 static void php_reg_eprint(int err, regex_t* re) {
2052   char *buf = nullptr, *message = nullptr;
2053   size_t len;
2054   size_t buf_len;
2055
2056 #ifdef REG_ITOA
2057   /* get the length of the message */
2058   buf_len = regerror(REG_ITOA | err, re, nullptr, 0);
2059   if (buf_len) {
2060     buf = (char *)req::malloc_noptrs(buf_len);
2061     if (!buf) return; /* fail silently */
2062     /* finally, get the error message */
2063     regerror(REG_ITOA | err, re, buf, buf_len);
2064   }
2065 #else
2066   buf_len = 0;
2067 #endif
2068   len = regerror(err, re, nullptr, 0);
2069   if (len) {
2070     message = (char *)req::malloc_noptrs(buf_len + len + 2);
2071     if (!message) {
2072       return; /* fail silently */
2073     }
2074     if (buf_len) {
2075       snprintf(message, buf_len, "%s: ", buf);
2076       buf_len += 1; /* so pointer math below works */
2077     }
2078     /* drop the message into place */
2079     regerror(err, re, message + buf_len, len);
2080     raise_warning("%s", message);
2081   }
2082   req::free(buf);
2083   req::free(message);
2084 }
2085
2086 Variant php_split(const String& spliton, const String& str, int count,
2087                   bool icase) {
2088   const char* strp = str.data();
2089   const char* endp = strp + str.size();
2090
2091   regex_t re;
2092   int copts = icase ? REG_ICASE : 0;
2093   int err = regcomp(&re, spliton.data(), REG_EXTENDED | copts);
2094   if (err) {
2095     php_reg_eprint(err, &re);
2096     return false;
2097   }
2098
2099   Array return_value = Array::Create();
2100   regmatch_t subs[1];
2101
2102   /* churn through str, generating array entries as we go */
2103   while ((count == -1 || count > 1) &&
2104          !(err = regexec(&re, strp, 1, subs, 0))) {
2105     if (subs[0].rm_so == 0 && subs[0].rm_eo) {
2106       /* match is at start of string, return empty string */
2107       return_value.append("");
2108       /* skip ahead the length of the regex match */
2109       strp += subs[0].rm_eo;
2110     } else if (subs[0].rm_so == 0 && subs[0].rm_eo == 0) {
2111       /* No more matches */
2112       regfree(&re);
2113       raise_warning("Invalid Regular Expression to split()");
2114       return false;
2115     } else {
2116       /* On a real match */
2117
2118       /* make a copy of the substring */
2119       int size = subs[0].rm_so;
2120
2121       /* add it to the array */
2122       return_value.append(String(strp, size, CopyString));
2123
2124       /* point at our new starting point */
2125       strp = strp + subs[0].rm_eo;
2126     }
2127
2128     /* if we're only looking for a certain number of points,
2129        stop looking once we hit it */
2130     if (count != -1) {
2131       count--;
2132     }
2133   }
2134
2135   /* see if we encountered an error */
2136   if (err && err != REG_NOMATCH) {
2137     php_reg_eprint(err, &re);
2138     regfree(&re);
2139     return false;
2140   }
2141
2142   /* otherwise we just have one last element to add to the array */
2143   int size = endp - strp;
2144   return_value.append(String(strp, size, CopyString));
2145
2146   regfree(&re);
2147   return return_value;
2148 }
2149
2150 ///////////////////////////////////////////////////////////////////////////////
2151 }