de-dup THREAD_LOCAL macros
[hiphop-php.git] / hphp / runtime / base / preg.cpp
blobca719b14720fa0a321e3f664d553b302c7da33d4
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
17 #include "hphp/runtime/base/preg.h"
19 #include <atomic>
20 #include <fstream>
21 #include <mutex>
22 #include <pcre.h>
23 #include <onigposix.h>
24 #include <utility>
26 #include <folly/AtomicHashArray.h>
28 #include "hphp/runtime/base/array-init.h"
29 #include "hphp/runtime/base/array-iterator.h"
30 #include "hphp/runtime/base/builtin-functions.h"
31 #include "hphp/runtime/base/container-functions.h"
32 #include "hphp/runtime/base/execution-context.h"
33 #include "hphp/runtime/base/ini-setting.h"
34 #include "hphp/runtime/base/runtime-option.h"
35 #include "hphp/runtime/base/string-util.h"
36 #include "hphp/runtime/base/init-fini-node.h"
37 #include "hphp/runtime/base/zend-functions.h"
38 #include "hphp/runtime/vm/debug/debug.h"
39 #include "hphp/runtime/vm/treadmill.h"
40 #include "hphp/runtime/vm/vm-regs.h"
42 #include "hphp/runtime/ext/std/ext_std_function.h"
43 #include "hphp/runtime/ext/string/ext_string.h"
45 #include "hphp/runtime/vm/jit/mcgen.h"
46 #include "hphp/runtime/vm/jit/types.h"
47 #include "hphp/runtime/vm/jit/vtune-jit.h"
49 #include "hphp/compiler/json.h"
51 #include "hphp/util/logger.h"
52 #include "hphp/util/concurrent-scalable-cache.h"
54 /* Only defined in pcre >= 8.32 */
55 #ifndef PCRE_STUDY_JIT_COMPILE
56 # define PCRE_STUDY_JIT_COMPILE 0
57 #endif
59 namespace HPHP {
61 using jit::TCA;
63 ///////////////////////////////////////////////////////////////////////////////
64 // PCREglobals definition
66 PCREglobals::PCREglobals() {
67 jit_stack = pcre_jit_stack_alloc(32768, 524288);
68 // Set these to handle uses of pcre prior to PcreExtension::threadInit
69 // In particular, for matching tier overrides during RuntimeOption::Load
70 preg_backtrace_limit = RuntimeOption::PregBacktraceLimit;
71 preg_recursion_limit = RuntimeOption::PregRecursionLimit;
74 PCREglobals::~PCREglobals() {
75 pcre_jit_stack_free(jit_stack);
78 ///////////////////////////////////////////////////////////////////////////////
79 // PCRECache definition
81 struct PCRECache {
82 typedef std::shared_ptr<const pcre_cache_entry> EntryPtr;
83 typedef std::unique_ptr<LRUCacheKey> TempKeyCache;
85 enum class CacheKind {
86 Static,
87 Lru,
88 Scalable
91 private:
92 struct ahm_string_data_same {
93 bool operator()(const StringData* s1, const StringData* s2) {
94 // ahm uses -1, -2, -3 as magic values
95 return int64_t(s1) > 0 && (s1 == s2 || s1->same(s2));
99 typedef folly::AtomicHashArray<const StringData*, const pcre_cache_entry*,
100 string_data_hash, ahm_string_data_same> StaticCache;
101 typedef ConcurrentLRUCache<LRUCacheKey, EntryPtr,
102 LRUCacheKey::HashCompare> LRUCache;
103 typedef ConcurrentScalableCache<LRUCacheKey, EntryPtr,
104 LRUCacheKey::HashCompare> ScalableCache;
105 typedef StaticCache::value_type StaticCachePair;
107 public:
108 struct Accessor {
109 Accessor()
110 : m_kind(Kind::Empty)
113 ~Accessor() {
114 switch (m_kind) {
115 case Kind::Empty:
116 case Kind::Ptr:
117 break;
118 case Kind::SmartPtr:
119 m_u.smart_ptr.~EntryPtr();
120 break;
121 case Kind::AccessorKind:
122 m_u.accessor.~ConstAccessor();
123 break;
127 Accessor& operator=(const pcre_cache_entry* ptr) {
128 assert(m_kind == Kind::Empty || m_kind == Kind::Ptr);
129 m_kind = Kind::Ptr;
130 m_u.ptr = ptr;
131 return *this;
134 Accessor& operator=(EntryPtr&& ep) {
135 switch (m_kind) {
136 case Kind::AccessorKind:
137 m_u.accessor.~ConstAccessor();
138 case Kind::Empty:
139 case Kind::Ptr:
140 m_kind = Kind::SmartPtr;
141 new (&m_u.smart_ptr) EntryPtr(std::move(ep));
142 break;
143 case Kind::SmartPtr:
144 m_u.smart_ptr = std::move(ep);
145 break;
147 return *this;
150 // No assignment from LRUCache::ConstAccessor since it is non-copyable
151 // Use resetToLRU instead
152 LRUCache::ConstAccessor& resetToLRU() {
153 switch (m_kind) {
154 case Kind::SmartPtr:
155 m_u.smart_ptr.~EntryPtr();
156 case Kind::Empty:
157 case Kind::Ptr:
158 m_kind = Kind::AccessorKind;
159 new (&m_u.accessor) LRUCache::ConstAccessor();
160 break;
161 case Kind::AccessorKind:
162 break;
164 return m_u.accessor;
167 const pcre_cache_entry* get() {
168 switch (m_kind) {
169 case Kind::Empty: return nullptr;
170 case Kind::Ptr: return m_u.ptr;
171 case Kind::SmartPtr: return m_u.smart_ptr.get();
172 case Kind::AccessorKind: return m_u.accessor->get();
174 always_assert(false);
177 const EntryPtr& entryPtr() const {
178 assert(m_kind == Kind::SmartPtr);
179 return m_u.smart_ptr;
182 private:
183 enum class Kind : uint8_t {
184 Empty,
185 Ptr,
186 SmartPtr,
187 AccessorKind,
190 union Ptr {
191 Ptr() {}
192 ~Ptr() {}
194 const pcre_cache_entry* ptr;
195 EntryPtr smart_ptr;
196 LRUCache::ConstAccessor accessor;
199 Ptr m_u;
200 Kind m_kind;
203 PCRECache()
204 : m_kind(CacheKind::Static), m_staticCache(nullptr)
206 reinit(CacheKind::Static);
209 ~PCRECache() {
210 if (m_kind == CacheKind::Static && m_staticCache.load()) {
211 DestroyStatic(m_staticCache);
215 void reinit(CacheKind kind);
216 bool find(Accessor& accessor, const StringData* key,
217 TempKeyCache& keyCache);
218 void insert(Accessor& accessor, const StringData* regex,
219 TempKeyCache& keyCache, const pcre_cache_entry* ent);
220 void dump(const std::string& filename);
221 size_t size() const;
223 private:
224 void clearStatic();
226 static void DestroyStatic(StaticCache* cache);
227 static StaticCache* CreateStatic();
229 CacheKind m_kind;
230 std::atomic<StaticCache*> m_staticCache;
231 std::unique_ptr<LRUCache> m_lruCache;
232 std::unique_ptr<ScalableCache> m_scalableCache;
233 std::atomic<time_t> m_expire;
234 std::mutex m_clearMutex;
237 ///////////////////////////////////////////////////////////////////////////////
238 // Data
240 THREAD_LOCAL(PCREglobals, tl_pcre_globals);
242 static PCRECache s_pcreCache;
244 // The last pcre error code is available for the whole thread.
245 static __thread int tl_last_error_code;
247 ///////////////////////////////////////////////////////////////////////////////
248 // pcre_cache_entry implementation
250 pcre_cache_entry::~pcre_cache_entry() {
251 if (extra) {
252 #if PCRE_MAJOR < 8 || (PCRE_MAJOR == 8 && PCRE_MINOR < 20)
253 free(extra);
254 #else
255 pcre_free_study(extra);
256 #endif
258 free(subpat_names);
259 pcre_free(re);
262 pcre_literal_data::pcre_literal_data(const char* pattern, int coptions) {
263 if (coptions & ~PCRE_CASELESS) {
264 return;
267 auto p = pattern;
268 if (*p == '^') {
269 match_start = true;
270 p++;
273 std::string pattern_buffer;
274 while (isalnum((unsigned char)*p) || (*p && strchr("/\\ :-_", *p))) {
275 // backslash + alphanumeric character --> not a literal (i.e. \d).
276 // backslash + non-alphanumeric character --> literal symbol (i.e. \.)
277 if (*p == '\\') {
278 if (!p[1] || isalnum((unsigned char)p[1])) {
279 break;
280 } else {
281 p++;
284 pattern_buffer += *p++;
286 if (*p == '$') {
287 match_end = true;
288 p++;
290 if (!*p) {
291 /* This is an encoding of a literal string. */
292 case_insensitive = coptions & PCRE_CASELESS;
293 literal_str = std::move(pattern_buffer);
297 bool pcre_literal_data::isLiteral() const {
298 return literal_str.hasValue();
301 bool pcre_literal_data::matches(const StringData* subject,
302 int pos,
303 int* offsets) const {
304 assertx(isLiteral());
305 assertx(pos >= 0);
307 // Subject must be at least as long as the literal pattern
308 // for a match to occur.
309 if (subject->size() < literal_str->length() + pos) {
310 return false;
313 size_t literal_strlen = literal_str->length();
314 auto const subject_c = subject->data();
315 auto const literal_c = literal_str->c_str();
316 if (match_start) {
317 // Make sure an exact match has the right length.
318 if (pos || (match_end && subject->size() != literal_strlen)) {
319 return false;
321 // If only matching the start (^), compare the strings
322 // for the length of the literal pattern.
323 if (case_insensitive ?
324 bstrcaseeq(subject_c, literal_c, literal_strlen) :
325 memcmp(subject_c, literal_c, literal_strlen) == 0) {
326 offsets[0] = 0;
327 offsets[1] = literal_strlen * sizeof(char);
328 return true;
330 } else if (match_end) {
331 // Compare the literal pattern against the tail end of the subject.
332 auto const subject_tail = subject_c + (subject->size() - literal_strlen);
333 if (case_insensitive ?
334 bstrcaseeq(subject_tail, literal_c, literal_strlen) :
335 memcmp(subject_tail, literal_c, literal_strlen) == 0) {
336 offsets[0] = (subject->size() - literal_strlen) * sizeof(char);
337 offsets[1] = subject->size() * sizeof(char);
338 return true;
340 } else {
341 if (!literal_strlen) {
342 offsets[0] = offsets[1] = pos;
343 return true;
345 // Check if the literal pattern occurs as a substring of the subject.
346 auto const subject_str = StrNR(subject);
347 auto const find_response = subject_str.asString().find(
348 *literal_str, pos, !case_insensitive);
349 if (find_response >= 0) {
350 offsets[0] = find_response * sizeof(char);
351 offsets[1] = offsets[0] + literal_strlen * sizeof(char);
352 return true;
355 return false;
358 ///////////////////////////////////////////////////////////////////////////////
359 // PCRECache implementation
361 PCRECache::StaticCache* PCRECache::CreateStatic() {
362 StaticCache::Config config;
363 config.maxLoadFactor = 0.5;
364 return StaticCache::create(
365 RuntimeOption::EvalPCRETableSize, config).release();
368 void PCRECache::DestroyStatic(StaticCache* cache) {
369 // We delete uncounted keys while iterating the cache, which is OK for
370 // AtomicHashArray, but not OK for other containers, such as
371 // std::unordered_map. If you change the cache type make sure that property
372 // holds or fix this function.
373 static_assert(std::is_same<PCRECache::StaticCache,
374 folly::AtomicHashArray<const StringData*, const pcre_cache_entry*,
375 string_data_hash, ahm_string_data_same>>::value,
376 "StaticCache must be an AtomicHashArray or this destructor is wrong.");
377 for (auto& it : *cache) {
378 if (it.first->isUncounted()) {
379 const_cast<StringData*>(it.first)->destructUncounted();
381 delete it.second;
383 StaticCache::destroy(cache);
386 void PCRECache::reinit(CacheKind kind) {
387 switch (m_kind) {
388 case CacheKind::Static:
389 if (m_staticCache.load()) {
390 DestroyStatic(m_staticCache);
391 m_staticCache = nullptr;
393 break;
394 case CacheKind::Lru:
395 m_lruCache.reset();
396 break;
397 case CacheKind::Scalable:
398 m_scalableCache.reset();
399 break;
401 m_kind = kind;
403 switch (kind) {
404 case CacheKind::Static:
405 m_staticCache = CreateStatic();
406 m_expire = time(nullptr) + RuntimeOption::EvalPCREExpireInterval;
407 break;
408 case CacheKind::Lru:
409 m_lruCache.reset(new LRUCache(RuntimeOption::EvalPCRETableSize));
410 break;
411 case CacheKind::Scalable:
412 m_scalableCache.reset(
413 new ScalableCache(RuntimeOption::EvalPCRETableSize));
414 break;
418 bool PCRECache::find(Accessor& accessor,
419 const StringData* regex,
420 TempKeyCache& keyCache)
422 switch (m_kind) {
423 case CacheKind::Static:
425 assert(m_staticCache.load());
426 StaticCache::iterator it;
427 auto cache = m_staticCache.load(std::memory_order_acquire);
428 if ((it = cache->find(regex)) != cache->end()) {
429 accessor = it->second;
430 return true;
432 return false;
434 case CacheKind::Lru:
435 case CacheKind::Scalable:
437 if (!keyCache) {
438 keyCache.reset(new LRUCacheKey(regex->data(), regex->size()));
440 bool found;
441 if (m_kind == CacheKind::Lru) {
442 found = m_lruCache->find(accessor.resetToLRU(), *keyCache);
443 } else {
444 found = m_scalableCache->find(accessor.resetToLRU(), *keyCache);
446 return found;
449 always_assert(false);
452 void PCRECache::clearStatic() {
453 std::unique_lock<std::mutex> lock(m_clearMutex, std::try_to_lock);
454 if (!lock) return;
456 auto newExpire = time(nullptr) + RuntimeOption::EvalPCREExpireInterval;
457 m_expire.store(newExpire, std::memory_order_relaxed);
459 auto tmpMap = CreateStatic();
460 tmpMap = m_staticCache.exchange(tmpMap, std::memory_order_acq_rel);
462 Treadmill::enqueue([tmpMap]() {
463 DestroyStatic(tmpMap);
467 void PCRECache::insert(
468 Accessor& accessor,
469 const StringData* regex,
470 TempKeyCache& keyCache,
471 const pcre_cache_entry* ent
473 switch (m_kind) {
474 case CacheKind::Static:
476 assert(m_staticCache.load());
477 // Clear the cache if we haven't refreshed it in a while
478 if (time(nullptr) > m_expire) {
479 clearStatic();
481 auto cache = m_staticCache.load(std::memory_order_acquire);
482 auto key = regex->isStatic()
483 ? regex
484 : StringData::MakeUncounted(regex->slice());
485 auto pair = cache->insert(StaticCachePair(key, ent));
486 if (pair.second) {
487 // Inserted, container owns the pointer
488 accessor = ent;
489 } else {
490 // Not inserted, caller needs to own the pointer
491 if (key != regex) const_cast<StringData*>(key)->destructUncounted();
492 accessor = EntryPtr(ent);
495 break;
496 case CacheKind::Lru:
497 case CacheKind::Scalable:
499 if (!keyCache) {
500 keyCache.reset(new LRUCacheKey(regex->data(), regex->size()));
502 // Pointer ownership is shared between container and caller
503 accessor = EntryPtr(ent);
504 if (m_kind == CacheKind::Lru) {
505 m_lruCache->insert(*keyCache, accessor.entryPtr());
506 } else {
507 m_scalableCache->insert(*keyCache, accessor.entryPtr());
510 break;
514 void PCRECache::dump(const std::string& filename) {
515 std::ofstream out(filename.c_str());
516 switch (m_kind) {
517 case CacheKind::Static:
518 for (auto& it : *m_staticCache) {
519 out << it.first->data() << "\n";
521 break;
522 case CacheKind::Lru:
523 case CacheKind::Scalable:
525 std::vector<LRUCacheKey> keys;
526 if (m_kind == CacheKind::Lru) {
527 m_lruCache->snapshotKeys(keys);
528 } else {
529 m_scalableCache->snapshotKeys(keys);
531 for (auto& key: keys) {
532 out << key.c_str() << "\n";
535 break;
537 out.close();
540 size_t PCRECache::size() const {
541 switch (m_kind) {
542 case CacheKind::Static:
543 return m_staticCache.load(std::memory_order_acquire)->size();
544 case CacheKind::Lru:
545 return m_lruCache->size();
546 case CacheKind::Scalable:
547 return m_scalableCache->size();
549 always_assert(false);
552 ///////////////////////////////////////////////////////////////////////////////
553 // Public interface and helper functions
555 void pcre_reinit() {
556 PCRECache::CacheKind kind;
557 if (RuntimeOption::EvalPCRECacheType == "static") {
558 kind = PCRECache::CacheKind::Static;
559 } else if (RuntimeOption::EvalPCRECacheType == "lru") {
560 kind = PCRECache::CacheKind::Lru;
561 } else if (RuntimeOption::EvalPCRECacheType == "scalable") {
562 kind = PCRECache::CacheKind::Scalable;
563 } else {
564 Logger::Warning("Eval.PCRECacheType should be either static, "
565 "lru or scalable");
566 kind = PCRECache::CacheKind::Scalable;
568 s_pcreCache.reinit(kind);
/*
 * Intentionally empty: this function performs no work.
 */
void pcre_init() {}
574 void pcre_dump_cache(const std::string& filename) {
575 s_pcreCache.dump(filename);
578 static pcre_jit_stack* alloc_jit_stack(void* /*data*/) {
579 return tl_pcre_globals->jit_stack;
582 namespace {
584 template<bool useSmartFree = false>
585 struct FreeHelperImpl {
586 explicit FreeHelperImpl(void* p) : p(p) {}
587 ~FreeHelperImpl() {
588 useSmartFree ? req::free(p) : free(p);
591 FreeHelperImpl(const FreeHelperImpl&) = delete;
592 FreeHelperImpl& operator=(const FreeHelperImpl&) = delete;
594 private:
595 void* p;
598 typedef FreeHelperImpl<true> SmartFreeHelper;
601 static void init_local_extra(pcre_extra* local, pcre_extra* shared) {
602 if (shared) {
603 memcpy(local, shared, sizeof(pcre_extra));
604 } else {
605 memset(local, 0, sizeof(pcre_extra));
606 local->flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
608 local->match_limit = tl_pcre_globals->preg_backtrace_limit;
609 local->match_limit_recursion = tl_pcre_globals->preg_recursion_limit;
612 static const char* const*
613 get_subpat_names(const pcre_cache_entry* pce) {
614 char **subpat_names = pce->subpat_names.load(std::memory_order_relaxed);
615 if (subpat_names) {
616 return subpat_names;
620 * Build a mapping from subpattern numbers to their names. We will always
621 * allocate the table, even though there may be no named subpatterns. This
622 * avoids somewhat more complicated logic in the inner loops.
624 pcre_extra extra;
625 init_local_extra(&extra, pce->extra);
627 int name_count;
629 subpat_names = (char **)calloc(pce->num_subpats, sizeof(char *));
630 int rc = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMECOUNT, &name_count);
631 if (rc < 0) {
632 raise_warning("Internal pcre_fullinfo() error %d", rc);
633 return nullptr;
635 if (name_count > 0) {
636 int name_size, ni = 0;
637 unsigned short name_idx;
638 char* name_table;
639 int rc1, rc2;
641 rc1 = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMETABLE, &name_table);
642 rc2 = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
643 rc = rc2 ? rc2 : rc1;
644 if (rc < 0) {
645 raise_warning("Internal pcre_fullinfo() error %d", rc);
646 return nullptr;
648 while (ni++ < name_count) {
649 name_idx = 0xff * (unsigned char)name_table[0] +
650 (unsigned char)name_table[1];
651 subpat_names[name_idx] = name_table + 2;
652 if (is_numeric_string(subpat_names[name_idx],
653 strlen(subpat_names[name_idx]),
654 nullptr, nullptr, 0) != KindOfNull) {
655 raise_warning("Numeric named subpatterns are not allowed");
656 return nullptr;
658 name_table += name_size;
661 // Store subpat_names into the cache entry
662 char **expected = nullptr;
663 if (!pce->subpat_names.compare_exchange_strong(expected, subpat_names)) {
664 // Another thread stored subpat_names already. The array created by the
665 // other thread is now in expected, return it instead and delete the one
666 // we just made.
667 free(subpat_names);
668 return expected;
670 return subpat_names;
673 static bool get_pcre_fullinfo(pcre_cache_entry* pce) {
674 pcre_extra extra;
675 init_local_extra(&extra, pce->extra);
677 /* Calculate the size of the offsets array*/
678 int rc = pcre_fullinfo(pce->re, &extra, PCRE_INFO_CAPTURECOUNT,
679 &pce->num_subpats);
680 if (rc < 0) {
681 raise_warning("Internal pcre_fullinfo() error %d", rc);
682 return false;
684 pce->num_subpats++;
685 return true;
688 static bool
689 pcre_get_compiled_regex_cache(PCRECache::Accessor& accessor,
690 const StringData* regex) {
691 PCRECache::TempKeyCache tkc;
693 /* Try to lookup the cached regex entry, and if successful, just pass
694 back the compiled pattern, otherwise go on and compile it. */
695 if (s_pcreCache.find(accessor, regex, tkc)) {
696 return true;
699 /* Parse through the leading whitespace, and display a warning if we
700 get to the end without encountering a delimiter. */
701 const char *p = regex->data();
702 while (isspace((int)*(unsigned char *)p)) p++;
703 if (*p == 0) {
704 raise_warning("Empty regular expression");
705 return false;
708 /* Get the delimiter and display a warning if it is alphanumeric
709 or a backslash. */
710 char delimiter = *p++;
711 if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
712 raise_warning("Delimiter must not be alphanumeric or backslash");
713 return false;
716 char start_delimiter = delimiter;
717 const char *pp = strchr("([{< )]}> )]}>", delimiter);
718 if (pp) {
719 delimiter = pp[5];
721 char end_delimiter = delimiter;
723 if (start_delimiter == end_delimiter) {
724 /* We need to iterate through the pattern, searching for the ending
725 * delimiter, but skipping the backslashed delimiters. If the ending
726 * delimiter is not found, display a warning. */
727 pp = p;
728 while (*pp != 0) {
729 if (*pp == '\\' && pp[1] != 0) pp++;
730 else if (*pp == delimiter)
731 break;
732 pp++;
734 if (*pp == 0) {
735 raise_warning("No ending delimiter '%c' found: [%s]", delimiter,
736 regex->data());
737 return false;
739 } else {
740 /* We iterate through the pattern, searching for the matching ending
741 * delimiter. For each matching starting delimiter, we increment nesting
742 * level, and decrement it for each matching ending delimiter. If we
743 * reach the end of the pattern without matching, display a warning.
745 int brackets = 1; // brackets nesting level
746 pp = p;
747 while (*pp != 0) {
748 if (*pp == '\\' && pp[1] != 0) pp++;
749 else if (*pp == end_delimiter && --brackets <= 0)
750 break;
751 else if (*pp == start_delimiter)
752 brackets++;
753 pp++;
755 if (*pp == 0) {
756 raise_warning("No ending matching delimiter '%c' found: [%s]",
757 end_delimiter, regex->data());
758 return false;
762 /* Make a copy of the actual pattern. */
763 String spattern(p, pp-p, CopyString);
764 const char *pattern = spattern.data();
766 /* Move on to the options */
767 pp++;
769 /* Parse through the options, setting appropriate flags. Display
770 a warning if we encounter an unknown modifier. */
771 int coptions = 0;
772 int poptions = 0;
773 bool do_study = false;
774 while (*pp != 0) {
775 switch (*pp++) {
776 /* Perl compatible options */
777 case 'i': coptions |= PCRE_CASELESS; break;
778 case 'm': coptions |= PCRE_MULTILINE; break;
779 case 's': coptions |= PCRE_DOTALL; break;
780 case 'x': coptions |= PCRE_EXTENDED; break;
782 /* PCRE specific options */
783 case 'A': coptions |= PCRE_ANCHORED; break;
784 case 'D': coptions |= PCRE_DOLLAR_ENDONLY; break;
785 case 'S': do_study = true; break;
786 case 'U': coptions |= PCRE_UNGREEDY; break;
787 case 'X': coptions |= PCRE_EXTRA; break;
788 case 'u': coptions |= PCRE_UTF8;
789 /* In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII
790 characters, even in UTF-8 mode. However, this can be changed by setting
791 the PCRE_UCP option. */
792 #ifdef PCRE_UCP
793 coptions |= PCRE_UCP;
794 #endif
795 break;
797 /* Custom preg options */
798 case 'e': poptions |= PREG_REPLACE_EVAL; break;
800 case ' ':
801 case '\n':
802 break;
804 default:
805 raise_warning("Unknown modifier '%c': [%s]", pp[-1], regex->data());
806 return false;
810 /* We've reached a null byte, now check if we're actually at the end of the
811 string. If not this is a bad expression, and a potential security hole. */
812 if (regex->size() != (pp - regex->data())) {
813 raise_error("Error: Null byte found in pattern");
816 /* Compile pattern and display a warning if compilation failed. */
817 const char *error;
818 int erroffset;
819 pcre *re = pcre_compile(pattern, coptions, &error, &erroffset, 0);
820 if (re == nullptr) {
821 raise_warning("Compilation failed: %s at offset %d", error, erroffset);
822 return false;
825 // Careful: from here 're' needs to be freed if something throws.
827 // TODO(t14969501): enable literal_data everywhere and skip the
828 // pcre_compile above.
829 auto const literal_data = pcre_literal_data(pattern, coptions);
831 /* If study option was specified, study the pattern and
832 store the result in extra for passing to pcre_exec. */
833 pcre_extra *extra = nullptr;
834 if (!literal_data.isLiteral()) {
835 if (do_study || PCRE_STUDY_JIT_COMPILE) {
836 int soptions = PCRE_STUDY_JIT_COMPILE;
837 extra = pcre_study(re, soptions, &error);
838 if (extra) {
839 extra->flags |= PCRE_EXTRA_MATCH_LIMIT |
840 PCRE_EXTRA_MATCH_LIMIT_RECURSION;
841 pcre_assign_jit_stack(extra, alloc_jit_stack, nullptr);
843 if (error != nullptr) {
844 try {
845 raise_warning("Error while studying pattern");
846 } catch (...) {
847 pcre_free(re);
848 throw;
851 if ((!RuntimeOption::EvalJitNoGdb ||
852 RuntimeOption::EvalJitUseVtuneAPI ||
853 RuntimeOption::EvalPerfPidMap) &&
854 extra &&
855 extra->executable_jit != nullptr) {
856 size_t size;
857 pcre_fullinfo(re, extra, PCRE_INFO_JITSIZE, &size);
859 TCA start = *(TCA *)(extra->executable_jit);
860 TCA end = start + size;
861 std::string name = folly::sformat("HHVM::pcre_jit::{}", pattern);
863 if (!RuntimeOption::EvalJitNoGdb && jit::mcgen::initialized()) {
864 Debug::DebugInfo::Get()->recordStub(Debug::TCRange(start, end, false),
865 name);
867 if (RuntimeOption::EvalJitUseVtuneAPI) {
868 HPHP::jit::reportHelperToVtune(name.c_str(), start, end);
870 if (RuntimeOption::EvalPerfPidMap && jit::mcgen::initialized()) {
871 Debug::DebugInfo::Get()->recordPerfMap(
872 Debug::TCRange(start, end, false),
873 SrcKey{}, nullptr, false, false,
874 HPHP::JSON::Escape(name.c_str())
881 /* Store the compiled pattern and extra info in the cache. */
882 pcre_cache_entry* new_entry = new pcre_cache_entry();
883 new_entry->re = re;
884 new_entry->extra = extra;
885 if (literal_data.isLiteral()) {
886 new_entry->literal_data =
887 std::make_unique<pcre_literal_data>(std::move(literal_data));
890 assert((poptions & ~0x1) == 0);
891 new_entry->preg_options = poptions;
893 assert((coptions & 0x80000000) == 0);
894 new_entry->compile_options = coptions;
896 /* Get pcre full info */
897 if (!get_pcre_fullinfo(new_entry)) {
898 delete new_entry;
899 return false;
902 s_pcreCache.insert(accessor, regex, tkc, new_entry);
903 return true;
906 static int* create_offset_array(const pcre_cache_entry* pce,
907 int& size_offsets) {
908 /* Allocate memory for the offsets array */
909 size_offsets = pce->num_subpats * 3;
910 return (int *)req::malloc_noptrs(size_offsets * sizeof(int));
913 static inline void add_offset_pair(Array& result,
914 const String& str,
915 int offset,
916 const char* name) {
917 auto match_pair = make_packed_array(str, offset);
918 if (name) result.set(String(name), match_pair);
919 result.append(match_pair);
922 static inline bool pcre_need_log_error(int pcre_code) {
923 return RuntimeOption::EnablePregErrorLog &&
924 (pcre_code == PCRE_ERROR_MATCHLIMIT ||
925 pcre_code == PCRE_ERROR_RECURSIONLIMIT);
928 static void pcre_log_error(const char* func, int line, int pcre_code,
929 const char* pattern, int pattern_size,
930 const char* subject, int subject_size,
931 const char* repl, int repl_size,
932 int arg1 = 0, int arg2 = 0,
933 int arg3 = 0, int arg4 = 0) {
934 if (!RuntimeOption::EnableHipHopSyntax) {
935 return;
937 const char* escapedPattern;
938 const char* escapedSubject;
939 const char* escapedRepl;
940 std::string p(pattern, pattern_size);
941 std::string s(subject, subject_size);
942 std::string r(repl, repl_size);
943 escapedPattern = Logger::EscapeString(p);
944 escapedSubject = Logger::EscapeString(s);
945 escapedRepl = Logger::EscapeString(r);
946 const char* errString =
947 (pcre_code == PCRE_ERROR_MATCHLIMIT) ? "PCRE_ERROR_MATCHLIMIT" :
948 (pcre_code == PCRE_ERROR_RECURSIONLIMIT) ? "PCRE_ERROR_RECURSIONLIMIT" :
949 "UNKNOWN";
950 raise_warning_unsampled(
951 "REGEXERR: %s/%d: err=%d(%s), pattern='%s', subject='%s', repl='%s', "
952 "limits=(%" PRId64 ", %" PRId64 "), extra=(%d, %d, %d, %d)",
953 func, line, pcre_code, errString,
954 escapedPattern, escapedSubject, escapedRepl,
955 tl_pcre_globals->preg_backtrace_limit,
956 tl_pcre_globals->preg_recursion_limit,
957 arg1, arg2, arg3, arg4);
958 free((void *)escapedPattern);
959 free((void *)escapedSubject);
960 free((void *)escapedRepl);
963 static void pcre_handle_exec_error(int pcre_code) {
964 int preg_code = 0;
965 switch (pcre_code) {
966 case PCRE_ERROR_MATCHLIMIT:
967 preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
968 break;
969 case PCRE_ERROR_RECURSIONLIMIT:
970 preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
971 break;
972 case PCRE_ERROR_BADUTF8:
973 preg_code = PHP_PCRE_BAD_UTF8_ERROR;
974 break;
975 case PCRE_ERROR_BADUTF8_OFFSET:
976 preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
977 break;
978 default:
979 preg_code = PHP_PCRE_INTERNAL_ERROR;
980 break;
982 tl_last_error_code = preg_code;
985 ///////////////////////////////////////////////////////////////////////////////
987 Variant preg_grep(const String& pattern, const Array& input, int flags /* = 0 */) {
988 PCRECache::Accessor accessor;
989 if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
990 return false;
992 const pcre_cache_entry* pce = accessor.get();
994 int size_offsets = 0;
995 int* offsets = create_offset_array(pce, size_offsets);
996 if (offsets == nullptr) {
997 return false;
999 SmartFreeHelper freer(offsets);
1001 /* Initialize return array */
1002 Array ret = Array::Create();
1003 tl_last_error_code = PHP_PCRE_NO_ERROR;
1005 /* Go through the input array */
1006 bool invert = (flags & PREG_GREP_INVERT);
1007 pcre_extra extra;
1008 init_local_extra(&extra, pce->extra);
1010 for (ArrayIter iter(input); iter; ++iter) {
1011 String entry = iter.second().toString();
1013 /* Perform the match */
1014 int count = pcre_exec(pce->re, &extra, entry.data(), entry.size(),
1015 0, 0, offsets, size_offsets);
1017 /* Check for too many substrings condition. */
1018 if (count == 0) {
1019 raise_warning("Matched, but too many substrings");
1020 count = size_offsets / 3;
1021 } else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1022 if (pcre_need_log_error(count)) {
1023 pcre_log_error(__FUNCTION__, __LINE__, count,
1024 pattern.data(), pattern.size(),
1025 entry.data(), entry.size(),
1026 "", 0,
1027 flags);
1029 pcre_handle_exec_error(count);
1030 break;
1033 /* If the entry fits our requirements */
1034 if ((count > 0 && !invert) ||
1035 (count == PCRE_ERROR_NOMATCH && invert)) {
1037 /* Add to return array */
1038 ret.set(iter.first(), entry);
1042 return ret;
1045 ///////////////////////////////////////////////////////////////////////////////
1047 static Variant preg_match_impl(const StringData* pattern,
1048 const StringData* subject,
1049 Variant* subpats, int flags, int start_offset,
1050 bool global) {
1051 PCRECache::Accessor accessor;
1052 if (!pcre_get_compiled_regex_cache(accessor, pattern)) {
1053 return false;
1055 const pcre_cache_entry* pce = accessor.get();
1057 pcre_extra extra;
1058 init_local_extra(&extra, pce->extra);
1059 if (subpats) {
1060 *subpats = Array::Create();
1062 int exec_options = 0;
1064 int subpats_order = global ? PREG_PATTERN_ORDER : 0;
1065 bool offset_capture = false;
1066 if (flags) {
1067 offset_capture = flags & PREG_OFFSET_CAPTURE;
1070 * subpats_order is pre-set to pattern mode so we change it only if
1071 * necessary.
1073 if (flags & 0xff) {
1074 subpats_order = flags & 0xff;
1076 if ((global && (subpats_order < PREG_PATTERN_ORDER ||
1077 subpats_order > PREG_SET_ORDER)) ||
1078 (!global && subpats_order != 0)) {
1079 raise_warning("Invalid flags specified");
1080 return init_null();
1084 /* Negative offset counts from the end of the string. */
1085 if (start_offset < 0) {
1086 start_offset = subject->size() + start_offset;
1087 if (start_offset < 0) {
1088 start_offset = 0;
1092 int size_offsets = 0;
1093 int* offsets = create_offset_array(pce, size_offsets);
1094 SmartFreeHelper offsetsFreer(offsets);
1095 int num_subpats = size_offsets / 3;
1096 if (offsets == nullptr) {
1097 return false;
1100 const char* const* subpat_names = get_subpat_names(pce);
1101 if (subpat_names == nullptr) {
1102 return false;
1105 /* Allocate match sets array and initialize the values. */
1106 Array match_sets; /* An array of sets of matches for each
1107 subpattern after a global match */
1108 if (global && subpats_order == PREG_PATTERN_ORDER) {
1109 for (int i = 0; i < num_subpats; i++) {
1110 match_sets.set(i, Array::Create());
1114 int matched = 0;
1115 tl_last_error_code = PHP_PCRE_NO_ERROR;
1117 int g_notempty = 0; // If the match should not be empty
1118 const char** stringlist; // Holds list of subpatterns
1119 int i;
1120 do {
1122 int count = 0;
1124 * Optimization: If the pattern defines a literal substring,
1125 * compare the strings directly (i.e. memcmp) instead of performing
1126 * the full regular expression evaluation.
1127 * Take the slow path if there are any special compile options.
1129 if (pce->literal_data && !global) {
1130 assertx(pce->literal_data->isLiteral());
1131 /* TODO(t13140878): compare literal against multiple substrings
1132 * in the preg_match_all (global == true) case. */
1133 count = pce->literal_data->matches(subject, start_offset, offsets) ? 1
1134 : PCRE_ERROR_NOMATCH;
1135 } else {
1136 /* Execute the regular expression. */
1137 count = pcre_exec(pce->re, &extra, subject->data(), subject->size(),
1138 start_offset,
1139 exec_options | g_notempty,
1140 offsets, size_offsets);
1142 /* The string was already proved to be valid UTF-8 */
1143 exec_options |= PCRE_NO_UTF8_CHECK;
1145 /* Check for too many substrings condition. */
1146 if (count == 0) {
1147 raise_warning("Matched, but too many substrings");
1148 count = size_offsets / 3;
1151 /* If something has matched */
1152 if (count > 0) {
1153 matched++;
1155 if (subpats) {
1156 // Try to get the list of substrings and display a warning if failed.
1157 if (offsets[1] < offsets[0] ||
1158 pcre_get_substring_list(subject->data(), offsets, count,
1159 &stringlist) < 0) {
1160 raise_warning("Get subpatterns list failed");
1161 return false;
1164 if (global) { /* global pattern matching */
1165 if (subpats_order == PREG_PATTERN_ORDER) {
1166 /* For each subpattern, insert it into the appropriate array. */
1167 for (i = 0; i < count; i++) {
1168 if (offset_capture) {
1169 auto& lval = match_sets.lvalAt(i);
1170 forceToArray(lval);
1171 add_offset_pair(lval.toArrRef(),
1172 String(stringlist[i],
1173 offsets[(i<<1)+1] - offsets[i<<1],
1174 CopyString),
1175 offsets[i<<1], nullptr);
1176 } else {
1177 auto& lval = match_sets.lvalAt(i);
1178 forceToArray(lval).append(
1179 String(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
1180 CopyString)
1185 * If the number of captured subpatterns on this run is
1186 * less than the total possible number, pad the result
1187 * arrays with empty strings.
1189 if (count < num_subpats) {
1190 for (; i < num_subpats; i++) {
1191 auto& lval = match_sets.lvalAt(i);
1192 forceToArray(lval).append("");
1195 } else {
1196 Array result_set = Array::Create();
1198 /* Add all the subpatterns to it */
1199 for (i = 0; i < count; i++) {
1200 if (offset_capture) {
1201 add_offset_pair(result_set,
1202 String(stringlist[i],
1203 offsets[(i<<1)+1] - offsets[i<<1],
1204 CopyString),
1205 offsets[i<<1], subpat_names[i]);
1206 } else {
1207 String value(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
1208 CopyString);
1209 if (subpat_names[i]) {
1210 result_set.set(String(subpat_names[i]), value);
1212 result_set.append(value);
1215 /* And add it to the output array */
1216 forceToArray(*subpats).append(std::move(result_set));
1218 } else { /* single pattern matching */
1219 /* For each subpattern, insert it into the subpatterns array. */
1220 for (i = 0; i < count; i++) {
1221 if (offset_capture) {
1222 add_offset_pair(forceToArray(*subpats),
1223 String(stringlist[i],
1224 offsets[(i<<1)+1] - offsets[i<<1],
1225 CopyString),
1226 offsets[i<<1], subpat_names[i]);
1227 } else {
1228 String value(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
1229 CopyString);
1230 if (subpat_names[i]) {
1231 forceToArray(*subpats).set(String(subpat_names[i]), value);
1233 forceToArray(*subpats).append(value);
1237 pcre_free((void *) stringlist);
1239 } else if (count == PCRE_ERROR_NOMATCH) {
1240 /* If we previously set PCRE_NOTEMPTY after a null match,
1241 this is not necessarily the end. We need to advance
1242 the start offset, and continue. Fudge the offset values
1243 to achieve this, unless we're already at the end of the string. */
1244 if (g_notempty && start_offset < subject->size()) {
1245 offsets[0] = start_offset;
1246 offsets[1] = start_offset + 1;
1247 } else
1248 break;
1249 } else {
1250 if (pcre_need_log_error(count)) {
1251 pcre_log_error(__FUNCTION__, __LINE__, count,
1252 pattern->data(), pattern->size(),
1253 subject->data(), subject->size(),
1254 "", 0,
1255 flags, start_offset, g_notempty, global);
1257 pcre_handle_exec_error(count);
1258 return false;
1261 /* If we have matched an empty string, mimic what Perl's /g options does.
1262 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1263 the match again at the same point. If this fails (picked up above) we
1264 advance to the next character. */
1265 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1267 /* Advance to the position right after the last full match */
1268 start_offset = offsets[1];
1269 } while (global);
1271 /* Add the match sets to the output array and clean up */
1272 if (subpats && global && subpats_order == PREG_PATTERN_ORDER) {
1273 for (i = 0; i < num_subpats; i++) {
1274 if (subpat_names[i]) {
1275 forceToArray(*subpats).set(String(subpat_names[i]), match_sets[i]);
1277 forceToArray(*subpats).append(match_sets[i]);
1280 return matched;
1283 Variant preg_match(const String& pattern, const String& subject,
1284 Variant* matches /* = nullptr */, int flags /* = 0 */,
1285 int offset /* = 0 */) {
1286 return preg_match(pattern.get(), subject.get(), matches, flags, offset);
1289 Variant preg_match(const StringData* pattern, const StringData* subject,
1290 Variant* matches /* = nullptr */, int flags /* = 0 */,
1291 int offset /* = 0 */) {
1292 return preg_match_impl(pattern, subject, matches, flags, offset, false);
1295 Variant preg_match_all(const String& pattern, const String& subject,
1296 Variant* matches /* = nullptr */,
1297 int flags /* = 0 */, int offset /* = 0 */) {
1298 return preg_match_all(pattern.get(), subject.get(), matches, flags, offset);
1301 Variant preg_match_all(const StringData* pattern, const StringData* subject,
1302 Variant* matches /* = nullptr */,
1303 int flags /* = 0 */, int offset /* = 0 */) {
1304 return preg_match_impl(pattern, subject, matches, flags, offset, true);
1307 ///////////////////////////////////////////////////////////////////////////////
1309 static String preg_do_repl_func(const Variant& function, const String& subject,
1310 int* offsets, const char* const* subpat_names,
1311 int count) {
1312 Array subpats = Array::Create();
1313 for (int i = 0; i < count; i++) {
1314 auto off1 = offsets[i<<1];
1315 auto off2 = offsets[(i<<1)+1];
1316 auto sub = subject.substr(off1, off2 - off1);
1318 if (subpat_names[i]) {
1319 subpats.set(String(subpat_names[i]), sub);
1321 subpats.append(sub);
1324 Array args;
1325 args.set(0, subpats);
1326 return vm_call_user_func(function, args).toString();
/**
 * Parses a back-reference token at `*str`.
 *
 * `*str` must point at the introducer character (the callers in this file
 * pass a pointer to '\\' or '$'). Accepted forms are the introducer followed
 * by one or two decimal digits, or "${" + one or two digits + "}".
 *
 * @param str      in/out: on success advanced past the whole token;
 *                 left untouched on failure
 * @param backref  out: the parsed group number. May already have been
 *                 written when a braced reference fails on its closing '}'.
 * @return true when a well-formed reference was consumed
 */
static bool preg_get_backref(const char** str, int* backref) {
  const char* walk = *str;

  // An introducer that is the last character cannot start a reference.
  if (walk[1] == 0) {
    return false;
  }

  // Only '$' may introduce the braced form.
  bool in_brace = false;
  if (*walk == '$' && walk[1] == '{') {
    in_brace = true;
    walk++;
  }
  walk++;  // now at the first character after the introducer (or the '{')

  if (*walk < '0' || *walk > '9') {
    return false;
  }
  *backref = *walk - '0';
  walk++;

  // Optional second digit; a third digit is never consumed.
  if (*walk >= '0' && *walk <= '9') {
    *backref = *backref * 10 + *walk - '0';
    walk++;
  }

  if (in_brace) {
    if (*walk != '}') {
      return false;
    }
    walk++;
  }

  *str = walk;
  return true;
}
1366 static Variant php_pcre_replace(const String& pattern, const String& subject,
1367 const Variant& replace_var, bool callable,
1368 int limit, int* replace_count) {
1369 PCRECache::Accessor accessor;
1370 if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1371 return false;
1373 const pcre_cache_entry* pce = accessor.get();
1374 bool eval = pce->preg_options & PREG_REPLACE_EVAL;
1375 if (eval) {
1376 if (RuntimeOption::EvalAuthoritativeMode) {
1377 throw Exception(
1378 "You can't use eval in RepoAuthoritative mode. It breaks all sorts of "
1379 "assumptions we use for speed. Switch to using preg_replace_callback()."
1382 if (callable) {
1383 raise_warning(
1384 "Modifier /e cannot be used with replacement callback."
1386 return init_null();
1388 raise_deprecated(
1389 "preg_replace(): The /e modifier is deprecated, use "
1390 "preg_replace_callback instead"
1394 int size_offsets;
1395 int* offsets = create_offset_array(pce, size_offsets);
1396 SmartFreeHelper offsetsFreer(offsets);
1397 if (offsets == nullptr) {
1398 return false;
1401 const char* const* subpat_names = get_subpat_names(pce);
1402 if (subpat_names == nullptr) {
1403 return false;
1406 const char* replace = nullptr;
1407 const char* replace_end = nullptr;
1408 int replace_len = 0;
1409 String replace_val;
1411 if (!callable) {
1412 replace_val = replace_var.toString();
1413 replace = replace_val.data();
1414 replace_len = replace_val.size();
1415 replace_end = replace + replace_len;
1418 StringBuffer result(2 * subject.size());
1420 try {
1422 /* Initialize */
1423 const char* match = nullptr;
1424 int start_offset = 0;
1425 tl_last_error_code = PHP_PCRE_NO_ERROR;
1426 pcre_extra extra;
1427 init_local_extra(&extra, pce->extra);
1429 const char* walk; // Used to walk the replacement string
1430 char walk_last; // Last walked character
1431 int match_len; // Length of the current match
1432 int backref; // Backreference number
1433 int g_notempty = 0; // If the match should not be empty
1434 int exec_options = 0; // Options passed to pcre_exec
1435 while (1) {
1436 /* Execute the regular expression. */
1437 int count = pcre_exec(pce->re, &extra, subject.data(), subject.size(),
1438 start_offset,
1439 exec_options | g_notempty,
1440 offsets, size_offsets);
1442 /* The string was already proved to be valid UTF-8 */
1443 exec_options |= PCRE_NO_UTF8_CHECK;
1445 /* Check for too many substrings condition. */
1446 if (count == 0) {
1447 raise_warning("Matched, but too many substrings");
1448 count = size_offsets / 3;
1451 const char* piece = subject.data() + start_offset;
1452 if (count > 0 && offsets[1] >= offsets[0] &&
1453 (limit == -1 || limit > 0)) {
1454 if (replace_count) {
1455 ++*replace_count;
1457 /* Set the match location in subject */
1458 match = subject.data() + offsets[0];
1460 /* If evaluating, do it and add the return string's length */
1461 String eval_result;
1462 if (callable) {
1463 /* Use custom function to get replacement string and its length. */
1464 eval_result = preg_do_repl_func(replace_var, subject, offsets,
1465 subpat_names, count);
1466 } else { /* do regular substitution */
1467 walk = replace;
1468 walk_last = 0;
1469 while (walk < replace_end) {
1470 if ('\\' == *walk || '$' == *walk) {
1471 if (walk_last == '\\') {
1472 walk++;
1473 walk_last = 0;
1474 continue;
1476 if (preg_get_backref(&walk, &backref)) {
1477 if (backref < count) {
1478 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1479 if (eval) {
1480 String esc_match = HHVM_FN(addslashes)(
1481 String(
1482 subject.data() + offsets[backref<<1],
1483 match_len,
1484 CopyString
1487 match_len = esc_match.length();
1490 continue;
1493 walk++;
1494 walk_last = walk[-1];
1498 /* copy the part of the string before the match */
1499 result.append(piece, match-piece);
1501 /* copy replacement and backrefs */
1502 int result_len = result.size();
1504 /* If evaluating or using custom function, copy result to the buffer
1505 * and clean up. */
1506 if (callable) {
1507 result.append(eval_result.data(), eval_result.size());
1508 result_len += eval_result.size();
1509 } else { /* do regular backreference copying */
1510 walk = replace;
1511 walk_last = 0;
1512 Array params;
1513 int lastStart = result.size();
1514 while (walk < replace_end) {
1515 bool handleQuote = eval && '"' == *walk && walk_last != '\\';
1516 if (handleQuote && lastStart != result.size()) {
1517 String str(result.data() + lastStart, result.size() - lastStart,
1518 CopyString);
1519 params.append(str);
1520 lastStart = result.size();
1521 handleQuote = false;
1523 if ('\\' == *walk || '$' == *walk) {
1524 if (walk_last == '\\') {
1525 result.set(result.size() - 1, *walk++);
1526 walk_last = 0;
1527 continue;
1529 if (preg_get_backref(&walk, &backref)) {
1530 if (backref < count) {
1531 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1532 if (eval) {
1533 String esc_match = HHVM_FN(addslashes)(
1534 String(
1535 subject.data() + offsets[backref<<1],
1536 match_len,
1537 CopyString
1540 match_len = esc_match.length();
1541 result.append(esc_match.data(), match_len);
1542 } else {
1543 result.append(
1544 subject.data() + offsets[backref<<1],
1545 match_len
1549 continue;
1552 result.append(*walk++);
1553 walk_last = walk[-1];
1554 if (handleQuote && lastStart != result.size()) {
1555 lastStart = result.size();
1558 auto full_len = result.size();
1559 auto data = result.data() + result_len;
1560 if (eval) {
1561 VMRegAnchor _;
1562 auto const ar = GetCallerFrame();
1563 // reserve space for "<?php return " + code + ";"
1564 String prefixedCode(full_len - result_len + 14, ReserveString);
1565 prefixedCode +=
1566 (ar->unit()->isHHFile() ? "<?hh return " : "<?php return ");
1567 prefixedCode += folly::StringPiece{data, full_len - result_len};
1568 prefixedCode += ";";
1569 auto const unit = g_context->compileEvalString(prefixedCode.get());
1570 auto const ctx = ar->func()->cls();
1571 auto const func = unit->getMain(ctx);
1572 ObjectData* thiz;
1573 Class* cls;
1574 if (ctx) {
1575 if (ar->hasThis()) {
1576 thiz = ar->getThis();
1577 cls = thiz->getVMClass();
1578 } else {
1579 thiz = nullptr;
1580 cls = ar->getClass();
1582 } else {
1583 thiz = nullptr;
1584 cls = nullptr;
1586 auto v = Variant::attach(
1587 g_context->invokeFunc(func, init_null_variant,
1588 thiz, cls, nullptr, nullptr,
1589 ExecutionContext::InvokePseudoMain)
1591 eval_result = v.toString();
1593 result.resize(result_len);
1594 result.append(eval_result.data(), eval_result.size());
1598 if (limit != -1) {
1599 limit--;
1602 } else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1603 /* If we previously set PCRE_NOTEMPTY after a null match,
1604 this is not necessarily the end. We need to advance
1605 the start offset, and continue. Fudge the offset values
1606 to achieve this, unless we're already at the end of the string. */
1607 if (g_notempty != 0 && start_offset < subject.size()) {
1608 offsets[0] = start_offset;
1609 offsets[1] = start_offset + 1;
1610 result.append(piece, 1);
1611 } else {
1612 /* stick that last bit of string on our output */
1613 result.append(piece, subject.size() - start_offset);
1614 break;
1616 } else {
1617 if (pcre_need_log_error(count)) {
1618 const char* s;
1619 int size;
1620 String stemp;
1621 if (callable) {
1622 if (replace_var.isObject()) {
1623 stemp = replace_var.asCObjRef()->getClassName().asString()
1624 + "::__invoke";
1625 } else {
1626 stemp = replace_var.toString();
1628 s = stemp.data();
1629 size = stemp.size();
1630 } else {
1631 s = replace_val.data();
1632 size = replace_val.size();
1634 pcre_log_error(__FUNCTION__, __LINE__, count,
1635 pattern.data(), pattern.size(),
1636 subject.data(), subject.size(),
1637 s, size,
1638 callable, limit, start_offset, g_notempty);
1640 pcre_handle_exec_error(count);
1641 return init_null();
1644 /* If we have matched an empty string, mimic what Perl's /g options does.
1645 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1646 the match again at the same point. If this fails (picked up above) we
1647 advance to the next character. */
1648 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1650 /* Advance to the next piece. */
1651 start_offset = offsets[1];
1654 return result.detach();
1655 } catch (...) {
1656 throw;
1660 static Variant php_replace_in_subject(const Variant& regex, const Variant& replace,
1661 String subject, int limit, bool callable,
1662 int* replace_count) {
1663 if (!regex.isArray()) {
1664 Variant ret = php_pcre_replace(regex.toString(), subject, replace,
1665 callable, limit, replace_count);
1667 if (ret.isBoolean()) {
1668 assert(!ret.toBoolean());
1669 return init_null();
1672 return ret;
1675 if (callable || !replace.isArray()) {
1676 Array arr = regex.toArray();
1677 for (ArrayIter iterRegex(arr); iterRegex; ++iterRegex) {
1678 String regex_entry = iterRegex.second().toString();
1679 Variant ret = php_pcre_replace(regex_entry, subject, replace,
1680 callable, limit, replace_count);
1681 if (ret.isBoolean()) {
1682 assert(!ret.toBoolean());
1683 return init_null();
1685 if (!ret.isString()) {
1686 return ret;
1688 subject = ret.asStrRef();
1689 if (subject.isNull()) {
1690 return subject;
1693 return subject;
1696 Array arrReplace = replace.toArray();
1697 Array arrRegex = regex.toArray();
1698 ArrayIter iterReplace(arrReplace);
1699 for (ArrayIter iterRegex(arrRegex); iterRegex; ++iterRegex) {
1700 String regex_entry = iterRegex.second().toString();
1701 Variant replace_value;
1702 if (iterReplace) {
1703 replace_value = iterReplace.second();
1704 ++iterReplace;
1707 Variant ret = php_pcre_replace(regex_entry, subject, replace_value,
1708 callable, limit, replace_count);
1710 if (ret.isBoolean()) {
1711 assert(!ret.toBoolean());
1712 return init_null();
1714 if (!ret.isString()) {
1715 return ret;
1717 subject = ret.asStrRef();
1718 if (subject.isNull()) {
1719 return subject;
1722 return subject;
1725 Variant preg_replace_impl(const Variant& pattern, const Variant& replacement,
1726 const Variant& subject, int limit, Variant* count,
1727 bool is_callable, bool is_filter) {
1728 assert(!(is_callable && is_filter));
1729 if (!is_callable &&
1730 replacement.isArray() && !pattern.isArray()) {
1731 raise_warning("Parameter mismatch, pattern is a string while "
1732 "replacement is an array");
1733 return false;
1736 int replace_count = 0;
1737 if (!isContainer(subject)) {
1738 Variant ret = php_replace_in_subject(pattern, replacement,
1739 subject.toString(),
1740 limit, is_callable, &replace_count);
1742 if (ret.isString()) {
1743 if (count) *count = replace_count;
1744 if (is_filter && replace_count == 0) {
1745 return init_null();
1746 } else {
1747 return ret.asStrRef();
1751 return ret;
1754 Array return_value = Array::Create();
1755 Array arrSubject = subject.toArray();
1756 for (ArrayIter iter(arrSubject); iter; ++iter) {
1757 auto old_replace_count = replace_count;
1758 String subject_entry = iter.second().toString();
1759 Variant ret = php_replace_in_subject(pattern, replacement, subject_entry,
1760 limit, is_callable, &replace_count);
1762 if (ret.isString() && !ret.isNull() &&
1763 (!is_filter || replace_count > old_replace_count)) {
1764 return_value.set(iter.first(), ret.asStrRef());
1767 if (count) *count = replace_count;
1768 return return_value;
1771 int preg_replace(Variant& result,
1772 const Variant& pattern,
1773 const Variant& replacement,
1774 const Variant& subject,
1775 int limit /* = -1 */) {
1776 Variant count;
1777 result = preg_replace_impl(pattern, replacement, subject,
1778 limit, &count, false, false);
1779 return count.toInt32();
1782 int preg_replace_callback(Variant& result,
1783 const Variant& pattern,
1784 const Variant& callback,
1785 const Variant& subject,
1786 int limit /* = -1 */) {
1787 Variant count;
1788 result = preg_replace_impl(pattern, callback, subject,
1789 limit, &count, true, false);
1790 return count.toInt32();
1793 int preg_filter(Variant& result,
1794 const Variant& pattern,
1795 const Variant& replacement,
1796 const Variant& subject,
1797 int limit /* = -1 */) {
1798 Variant count;
1799 result = preg_replace_impl(pattern, replacement, subject,
1800 limit, &count, false, true);
1801 return count.toInt32();
1804 ///////////////////////////////////////////////////////////////////////////////
1806 Variant preg_split(const String& pattern, const String& subject,
1807 int limit /* = -1 */, int flags /* = 0 */) {
1808 PCRECache::Accessor accessor;
1809 if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1810 return false;
1812 const pcre_cache_entry* pce = accessor.get();
1814 int no_empty = flags & PREG_SPLIT_NO_EMPTY;
1815 bool delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1816 bool offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1818 if (limit == 0) {
1819 limit = -1;
1822 int size_offsets = 0;
1823 int* offsets = create_offset_array(pce, size_offsets);
1824 SmartFreeHelper offsetsFreer(offsets);
1825 if (offsets == nullptr) {
1826 return false;
1829 /* Start at the beginning of the string */
1830 int start_offset = 0;
1831 int next_offset = 0;
1832 const char* last_match = subject.data();
1833 tl_last_error_code = PHP_PCRE_NO_ERROR;
1834 pcre_extra extra;
1835 init_local_extra(&extra, pce->extra);
1837 // Get next piece if no limit or limit not yet reached and something matched
1838 Array return_value = Array::Create();
1839 int g_notempty = 0; /* If the match should not be empty */
1840 int utf8_check = 0;
1841 PCRECache::Accessor bump_accessor;
1842 const pcre_cache_entry* bump_pce = nullptr; /* instance for empty matches */
1843 while ((limit == -1 || limit > 1)) {
1844 int count = pcre_exec(pce->re, &extra, subject.data(), subject.size(),
1845 start_offset, g_notempty | utf8_check,
1846 offsets, size_offsets);
1848 /* Subsequent calls to pcre_exec don't need to bother with the
1849 * utf8 validity check: if the subject isn't valid, the first
1850 * call to pcre_exec will have failed, and as long as we only
1851 * set start_offset to known character boundaries we won't
1852 * supply an invalid offset. */
1853 utf8_check = PCRE_NO_UTF8_CHECK;
1855 /* Check for too many substrings condition. */
1856 if (count == 0) {
1857 raise_warning("Matched, but too many substrings");
1858 count = size_offsets / 3;
1861 /* If something matched */
1862 if (count > 0 && offsets[1] >= offsets[0]) {
1863 if (!no_empty || subject.data() + offsets[0] != last_match) {
1864 if (offset_capture) {
1865 /* Add (match, offset) pair to the return value */
1866 add_offset_pair(return_value,
1867 String(last_match,
1868 subject.data() + offsets[0] - last_match,
1869 CopyString),
1870 next_offset, nullptr);
1871 } else {
1872 /* Add the piece to the return value */
1873 return_value.append(String(last_match,
1874 subject.data() + offsets[0] - last_match,
1875 CopyString));
1878 /* One less left to do */
1879 if (limit != -1)
1880 limit--;
1883 last_match = subject.data() + offsets[1];
1884 next_offset = offsets[1];
1886 if (delim_capture) {
1887 int i, match_len;
1888 for (i = 1; i < count; i++) {
1889 match_len = offsets[(i<<1)+1] - offsets[i<<1];
1890 /* If we have matched a delimiter */
1891 if (!no_empty || match_len > 0) {
1892 if (offset_capture) {
1893 add_offset_pair(return_value,
1894 String(subject.data() + offsets[i<<1],
1895 match_len, CopyString),
1896 offsets[i<<1], nullptr);
1897 } else {
1898 return_value.append(subject.substr(offsets[i<<1], match_len));
1903 } else if (count == PCRE_ERROR_NOMATCH) {
1904 /* If we previously set PCRE_NOTEMPTY after a null match,
1905 this is not necessarily the end. We need to advance
1906 the start offset, and continue. Fudge the offset values
1907 to achieve this, unless we're already at the end of the string. */
1908 if (g_notempty != 0 && start_offset < subject.size()) {
1909 if (pce->compile_options & PCRE_UTF8) {
1910 if (bump_pce == nullptr) {
1911 if (!pcre_get_compiled_regex_cache(bump_accessor,
1912 String("/./us").get())) {
1913 return false;
1915 bump_pce = bump_accessor.get();
1917 pcre_extra bump_extra;
1918 init_local_extra(&bump_extra, bump_pce->extra);
1919 count = pcre_exec(bump_pce->re, &bump_extra, subject.data(),
1920 subject.size(), start_offset,
1921 utf8_check, offsets, size_offsets);
1922 if (count < 1) {
1923 raise_warning("Unknown error");
1924 offsets[0] = start_offset;
1925 offsets[1] = start_offset + 1;
1926 if (pcre_need_log_error(count)) {
1927 pcre_log_error(__FUNCTION__, __LINE__, count,
1928 pattern.data(), pattern.size(),
1929 subject.data(), subject.size(),
1930 "", 0,
1931 limit, flags, start_offset);
1934 } else {
1935 offsets[0] = start_offset;
1936 offsets[1] = start_offset + 1;
1938 } else
1939 break;
1940 } else {
1941 if (pcre_need_log_error(count)) {
1942 pcre_log_error(__FUNCTION__, __LINE__, count,
1943 pattern.data(), pattern.size(),
1944 subject.data(), subject.size(),
1945 "", 0,
1946 limit, flags, start_offset, g_notempty);
1948 pcre_handle_exec_error(count);
1949 break;
1952 /* If we have matched an empty string, mimic what Perl's /g options does.
1953 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1954 the match again at the same point. If this fails (picked up above) we
1955 advance to the next character. */
1956 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1958 /* Advance to the position right after the last full match */
1959 start_offset = offsets[1];
1962 start_offset = last_match - subject.data(); /* offset might have
1963 * been incremented,
1964 * but without further
1965 * successful matches */
1966 if (!no_empty || start_offset < subject.size()) {
1967 if (offset_capture) {
1968 /* Add the last (match, offset) pair to the return value */
1969 add_offset_pair(return_value,
1970 subject.substr(start_offset),
1971 start_offset, nullptr);
1972 } else {
1973 /* Add the last piece to the return value */
1974 return_value.append
1975 (String(last_match, subject.data() + subject.size() - last_match,
1976 CopyString));
1980 return return_value;
1983 ///////////////////////////////////////////////////////////////////////////////
1985 String preg_quote(const String& str,
1986 const String& delimiter /* = null_string */) {
1987 const char* in_str = str.data();
1988 const char* in_str_end = in_str + str.size();
1990 /* Nothing to do if we got an empty string */
1991 if (in_str == in_str_end) {
1992 return str;
1995 char delim_char = 0; /* Delimiter character to be quoted */
1996 bool quote_delim = false; /* Whether to quote additional delim char */
1997 if (!delimiter.empty()) {
1998 delim_char = delimiter.charAt(0);
1999 quote_delim = true;
2002 /* Allocate enough memory so that even if each character
2003 is quoted, we won't run out of room */
2004 String ret(4 * str.size() + 1, ReserveString);
2005 char* out_str = ret.mutableData();
2007 /* Go through the string and quote necessary characters */
2008 const char* p;
2009 char* q;
2010 for (p = in_str, q = out_str; p != in_str_end; p++) {
2011 char c = *p;
2012 switch (c) {
2013 case '.': case '\\': case '+': case '*': case '?':
2014 case '[': case '^': case ']': case '$': case '(':
2015 case ')': case '{': case '}': case '=': case '!':
2016 case '>': case '<': case '|': case ':': case '-':
2017 *q++ = '\\';
2018 *q++ = c;
2019 break;
2021 case '\0':
2022 *q++ = '\\';
2023 *q++ = '0';
2024 *q++ = '0';
2025 *q++ = '0';
2026 break;
2028 default:
2029 if (quote_delim && c == delim_char)
2030 *q++ = '\\';
2031 *q++ = c;
2032 break;
2035 *q = '\0';
2037 return ret.setSize(q - out_str);
2040 int preg_last_error() {
2041 return tl_last_error_code;
2044 size_t preg_pcre_cache_size() {
2045 return s_pcreCache.size();
2048 ///////////////////////////////////////////////////////////////////////////////
2049 // regexec
2051 static void php_reg_eprint(int err, regex_t* re) {
2052 char *buf = nullptr, *message = nullptr;
2053 size_t len;
2054 size_t buf_len;
2056 #ifdef REG_ITOA
2057 /* get the length of the message */
2058 buf_len = regerror(REG_ITOA | err, re, nullptr, 0);
2059 if (buf_len) {
2060 buf = (char *)req::malloc_noptrs(buf_len);
2061 if (!buf) return; /* fail silently */
2062 /* finally, get the error message */
2063 regerror(REG_ITOA | err, re, buf, buf_len);
2065 #else
2066 buf_len = 0;
2067 #endif
2068 len = regerror(err, re, nullptr, 0);
2069 if (len) {
2070 message = (char *)req::malloc_noptrs(buf_len + len + 2);
2071 if (!message) {
2072 return; /* fail silently */
2074 if (buf_len) {
2075 snprintf(message, buf_len, "%s: ", buf);
2076 buf_len += 1; /* so pointer math below works */
2078 /* drop the message into place */
2079 regerror(err, re, message + buf_len, len);
2080 raise_warning("%s", message);
2082 req::free(buf);
2083 req::free(message);
2086 Variant php_split(const String& spliton, const String& str, int count,
2087 bool icase) {
2088 const char* strp = str.data();
2089 const char* endp = strp + str.size();
2091 regex_t re;
2092 int copts = icase ? REG_ICASE : 0;
2093 int err = regcomp(&re, spliton.data(), REG_EXTENDED | copts);
2094 if (err) {
2095 php_reg_eprint(err, &re);
2096 return false;
2099 Array return_value = Array::Create();
2100 regmatch_t subs[1];
2102 /* churn through str, generating array entries as we go */
2103 while ((count == -1 || count > 1) &&
2104 !(err = regexec(&re, strp, 1, subs, 0))) {
2105 if (subs[0].rm_so == 0 && subs[0].rm_eo) {
2106 /* match is at start of string, return empty string */
2107 return_value.append("");
2108 /* skip ahead the length of the regex match */
2109 strp += subs[0].rm_eo;
2110 } else if (subs[0].rm_so == 0 && subs[0].rm_eo == 0) {
2111 /* No more matches */
2112 regfree(&re);
2113 raise_warning("Invalid Regular Expression to split()");
2114 return false;
2115 } else {
2116 /* On a real match */
2118 /* make a copy of the substring */
2119 int size = subs[0].rm_so;
2121 /* add it to the array */
2122 return_value.append(String(strp, size, CopyString));
2124 /* point at our new starting point */
2125 strp = strp + subs[0].rm_eo;
2128 /* if we're only looking for a certain number of points,
2129 stop looking once we hit it */
2130 if (count != -1) {
2131 count--;
2135 /* see if we encountered an error */
2136 if (err && err != REG_NOMATCH) {
2137 php_reg_eprint(err, &re);
2138 regfree(&re);
2139 return false;
2142 /* otherwise we just have one last element to add to the array */
2143 int size = endp - strp;
2144 return_value.append(String(strp, size, CopyString));
2146 regfree(&re);
2147 return return_value;
2150 ///////////////////////////////////////////////////////////////////////////////