EvalEmitDVArray: varray
[hiphop-php.git] / hphp / runtime / base / preg.cpp
blob8c3ec03e1755f902625c9f3901f6e79ef0560140
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
17 #include "hphp/runtime/base/preg.h"
19 #include <atomic>
20 #include <fstream>
21 #include <mutex>
22 #include <pcre.h>
23 #include <onigposix.h>
24 #include <utility>
26 #include <folly/AtomicHashArray.h>
28 #include "hphp/runtime/base/array-init.h"
29 #include "hphp/runtime/base/array-iterator.h"
30 #include "hphp/runtime/base/builtin-functions.h"
31 #include "hphp/runtime/base/container-functions.h"
32 #include "hphp/runtime/base/execution-context.h"
33 #include "hphp/runtime/base/ini-setting.h"
34 #include "hphp/runtime/base/runtime-option.h"
35 #include "hphp/runtime/base/string-util.h"
36 #include "hphp/runtime/base/init-fini-node.h"
37 #include "hphp/runtime/base/zend-functions.h"
38 #include "hphp/runtime/vm/debug/debug.h"
39 #include "hphp/runtime/vm/treadmill.h"
40 #include "hphp/runtime/vm/vm-regs.h"
42 #include "hphp/runtime/ext/std/ext_std_function.h"
43 #include "hphp/runtime/ext/string/ext_string.h"
45 #include "hphp/runtime/vm/jit/mcgen.h"
46 #include "hphp/runtime/vm/jit/types.h"
47 #include "hphp/runtime/vm/jit/vtune-jit.h"
49 #include "hphp/util/logger.h"
50 #include "hphp/util/concurrent-scalable-cache.h"
52 #include <folly/json.h>
54 /* Only defined in pcre >= 8.32 */
55 #ifndef PCRE_STUDY_JIT_COMPILE
56 # define PCRE_STUDY_JIT_COMPILE 0
57 #endif
59 namespace HPHP {
61 using jit::TCA;
63 ///////////////////////////////////////////////////////////////////////////////
64 // PCREglobals definition
66 PCREglobals::PCREglobals() {
67 jit_stack = pcre_jit_stack_alloc(32768, 524288);
68 // Set these to handle uses of pcre prior to PcreExtension::threadInit
69 // In particular, for matching tier overrides during RuntimeOption::Load
70 preg_backtrace_limit = RuntimeOption::PregBacktraceLimit;
71 preg_recursion_limit = RuntimeOption::PregRecursionLimit;
74 PCREglobals::~PCREglobals() {
75 pcre_jit_stack_free(jit_stack);
78 ///////////////////////////////////////////////////////////////////////////////
79 // PCRECache definition
// Cache mapping regex strings to compiled pcre_cache_entry objects. Exactly
// one of three backing containers is in use at a time, selected by m_kind:
// a folly::AtomicHashArray (Static), a ConcurrentLRUCache (Lru) or a
// ConcurrentScalableCache (Scalable).
81 struct PCRECache {
82 typedef std::shared_ptr<const pcre_cache_entry> EntryPtr;
83 typedef std::unique_ptr<LRUCacheKey> TempKeyCache;
// Which backing container the cache currently uses.
85 enum class CacheKind {
86 Static,
87 Lru,
88 Scalable
91 private:
// Key-equality functor for the static table. The first operand is checked
// against the table's magic sentinel values before it is dereferenced.
92 struct ahm_string_data_same {
93 bool operator()(const StringData* s1, const StringData* s2) {
94 // ahm uses -1, -2, -3 as magic values
95 return int64_t(s1) > 0 && (s1 == s2 || s1->same(s2));
99 typedef folly::AtomicHashArray<const StringData*, const pcre_cache_entry*,
100 string_data_hash, ahm_string_data_same> StaticCache;
101 typedef ConcurrentLRUCache<LRUCacheKey, EntryPtr,
102 LRUCacheKey::HashCompare> LRUCache;
103 typedef ConcurrentScalableCache<LRUCacheKey, EntryPtr,
104 LRUCacheKey::HashCompare> ScalableCache;
105 typedef StaticCache::value_type StaticCachePair;
107 public:
// Handle to a cache entry. It is a hand-rolled tagged union: m_kind records
// which member of m_u is live (nothing, a raw pointer, an EntryPtr, or an
// LRUCache::ConstAccessor), and the non-trivial members are constructed with
// placement new and destroyed with explicit destructor calls.
108 struct Accessor {
109 Accessor()
110 : m_kind(Kind::Empty)
// Destroys whichever non-trivial union member is live.
113 ~Accessor() {
114 switch (m_kind) {
115 case Kind::Empty:
116 case Kind::Ptr:
117 break;
118 case Kind::SmartPtr:
119 m_u.smart_ptr.~EntryPtr();
120 break;
121 case Kind::AccessorKind:
122 m_u.accessor.~ConstAccessor();
123 break;
// Stores a raw (non-owning) pointer. Only valid while the accessor is Empty
// or already holds a raw pointer; other states are rejected by the assert.
127 Accessor& operator=(const pcre_cache_entry* ptr) {
128 assertx(m_kind == Kind::Empty || m_kind == Kind::Ptr);
129 m_kind = Kind::Ptr;
130 m_u.ptr = ptr;
131 return *this;
// Takes over a shared pointer from any state. The AccessorKind case has no
// break: after destroying the live accessor it falls through to the
// Empty/Ptr case, which constructs smart_ptr in place.
134 Accessor& operator=(EntryPtr&& ep) {
135 switch (m_kind) {
136 case Kind::AccessorKind:
137 m_u.accessor.~ConstAccessor();
138 case Kind::Empty:
139 case Kind::Ptr:
140 m_kind = Kind::SmartPtr;
141 new (&m_u.smart_ptr) EntryPtr(std::move(ep));
142 break;
143 case Kind::SmartPtr:
144 m_u.smart_ptr = std::move(ep);
145 break;
147 return *this;
150 // No assignment from LRUCache::ConstAccessor since it is non-copyable
151 // Use resetToLRU instead
// Switches the union to the accessor member (constructing a fresh one unless
// an accessor is already live) and returns a reference to it. The SmartPtr
// case has no break: it destroys smart_ptr and falls through to construction.
152 LRUCache::ConstAccessor& resetToLRU() {
153 switch (m_kind) {
154 case Kind::SmartPtr:
155 m_u.smart_ptr.~EntryPtr();
156 case Kind::Empty:
157 case Kind::Ptr:
158 m_kind = Kind::AccessorKind;
159 new (&m_u.accessor) LRUCache::ConstAccessor();
160 break;
161 case Kind::AccessorKind:
162 break;
164 return m_u.accessor;
// Returns the entry as a raw pointer regardless of how it is held, or
// nullptr when the accessor is Empty.
167 const pcre_cache_entry* get() {
168 switch (m_kind) {
169 case Kind::Empty: return nullptr;
170 case Kind::Ptr: return m_u.ptr;
171 case Kind::SmartPtr: return m_u.smart_ptr.get();
172 case Kind::AccessorKind: return m_u.accessor->get();
174 always_assert(false);
// Returns the held shared pointer; only valid in the SmartPtr state.
177 const EntryPtr& entryPtr() const {
178 assertx(m_kind == Kind::SmartPtr);
179 return m_u.smart_ptr;
182 private:
183 enum class Kind : uint8_t {
184 Empty,
185 Ptr,
186 SmartPtr,
187 AccessorKind,
// Storage for the three representations. The empty constructor and destructor
// leave all lifetime management to Accessor.
190 union Ptr {
191 Ptr() {}
192 ~Ptr() {}
194 const pcre_cache_entry* ptr;
195 EntryPtr smart_ptr;
196 LRUCache::ConstAccessor accessor;
199 Ptr m_u;
200 Kind m_kind;
// Starts out as a Static cache; reinit() can later switch to another kind.
203 PCRECache()
204 : m_kind(CacheKind::Static), m_staticCache(nullptr)
206 reinit(CacheKind::Static);
// Only the static table needs explicit teardown here; the other two
// containers are held in unique_ptr members.
209 ~PCRECache() {
210 if (m_kind == CacheKind::Static && m_staticCache.load()) {
211 DestroyStatic(m_staticCache);
215 void reinit(CacheKind kind);
216 bool find(Accessor& accessor, const StringData* key,
217 TempKeyCache& keyCache);
218 void insert(Accessor& accessor, const StringData* regex,
219 TempKeyCache& keyCache, const pcre_cache_entry* ent);
220 void dump(const std::string& filename);
221 size_t size() const;
223 private:
224 void clearStatic();
226 static void DestroyStatic(StaticCache* cache);
227 static StaticCache* CreateStatic();
229 CacheKind m_kind;
230 std::atomic<StaticCache*> m_staticCache;
231 std::unique_ptr<LRUCache> m_lruCache;
232 std::unique_ptr<ScalableCache> m_scalableCache;
// Time after which insert() replaces the static table (see clearStatic()).
233 std::atomic<time_t> m_expire{};
// Taken with try_to_lock in clearStatic() so only one thread swaps the table.
234 std::mutex m_clearMutex;
237 ///////////////////////////////////////////////////////////////////////////////
238 // Data
// PCREglobals instance declared through RDS_LOCAL; the rest of this file
// reads its jit_stack and match limits via tl_pcre_globals->...
240 RDS_LOCAL(PCREglobals, tl_pcre_globals);
// The single PCRECache instance used by the functions in this file.
242 static PCRECache s_pcreCache;
244 // The last pcre error code is available for the whole thread.
245 static RDS_LOCAL(int, rl_last_error_code);
247 ///////////////////////////////////////////////////////////////////////////////
248 // pcre_cache_entry implementation
250 pcre_cache_entry::~pcre_cache_entry() {
251 if (extra) {
252 #if PCRE_MAJOR < 8 || (PCRE_MAJOR == 8 && PCRE_MINOR < 20)
253 free(extra);
254 #else
255 pcre_free_study(extra);
256 #endif
258 free(subpat_names);
259 pcre_free(re);
262 pcre_literal_data::pcre_literal_data(const char* pattern, int coptions) {
263 if (coptions & ~PCRE_CASELESS) {
264 return;
267 auto p = pattern;
268 if (*p == '^') {
269 match_start = true;
270 p++;
273 std::string pattern_buffer;
274 while (isalnum((unsigned char)*p) || (*p && strchr("/\\ :-_", *p))) {
275 // backslash + alphanumeric character --> not a literal (i.e. \d).
276 // backslash + non-alphanumeric character --> literal symbol (i.e. \.)
277 if (*p == '\\') {
278 if (!p[1] || isalnum((unsigned char)p[1])) {
279 break;
280 } else {
281 p++;
284 pattern_buffer += *p++;
286 if (*p == '$') {
287 match_end = true;
288 p++;
290 if (!*p) {
291 /* This is an encoding of a literal string. */
292 case_insensitive = coptions & PCRE_CASELESS;
293 literal_str = std::move(pattern_buffer);
297 bool pcre_literal_data::isLiteral() const {
298 return literal_str.has_value();
301 bool pcre_literal_data::matches(const StringData* subject,
302 int pos,
303 int* offsets) const {
304 assertx(isLiteral());
305 assertx(pos >= 0);
307 // Subject must be at least as long as the literal pattern
308 // for a match to occur.
309 if (subject->size() < literal_str->length() + pos) {
310 return false;
313 size_t literal_strlen = literal_str->length();
314 auto const subject_c = subject->data();
315 auto const literal_c = literal_str->c_str();
316 if (match_start) {
317 // Make sure an exact match has the right length.
318 if (pos || (match_end && subject->size() != literal_strlen)) {
319 return false;
321 // If only matching the start (^), compare the strings
322 // for the length of the literal pattern.
323 if (case_insensitive ?
324 bstrcaseeq(subject_c, literal_c, literal_strlen) :
325 memcmp(subject_c, literal_c, literal_strlen) == 0) {
326 offsets[0] = 0;
327 offsets[1] = literal_strlen * sizeof(char);
328 return true;
330 } else if (match_end) {
331 // Compare the literal pattern against the tail end of the subject.
332 auto const subject_tail = subject_c + (subject->size() - literal_strlen);
333 if (case_insensitive ?
334 bstrcaseeq(subject_tail, literal_c, literal_strlen) :
335 memcmp(subject_tail, literal_c, literal_strlen) == 0) {
336 offsets[0] = (subject->size() - literal_strlen) * sizeof(char);
337 offsets[1] = subject->size() * sizeof(char);
338 return true;
340 } else {
341 if (!literal_strlen) {
342 offsets[0] = offsets[1] = pos;
343 return true;
345 // Check if the literal pattern occurs as a substring of the subject.
346 auto const subject_str = StrNR(subject);
347 auto const find_response = subject_str.asString().find(
348 *literal_str, pos, !case_insensitive);
349 if (find_response >= 0) {
350 offsets[0] = find_response * sizeof(char);
351 offsets[1] = offsets[0] + literal_strlen * sizeof(char);
352 return true;
355 return false;
358 ///////////////////////////////////////////////////////////////////////////////
359 // PCRECache implementation
361 PCRECache::StaticCache* PCRECache::CreateStatic() {
362 StaticCache::Config config;
363 config.maxLoadFactor = 0.5;
364 return StaticCache::create(
365 RuntimeOption::EvalPCRETableSize, config).release();
368 void PCRECache::DestroyStatic(StaticCache* cache) {
369 // We delete uncounted keys while iterating the cache, which is OK for
370 // AtomicHashArray, but not OK for other containers, such as
371 // std::unordered_map. If you change the cache type make sure that property
372 // holds or fix this function.
373 static_assert(std::is_same<PCRECache::StaticCache,
374 folly::AtomicHashArray<const StringData*, const pcre_cache_entry*,
375 string_data_hash, ahm_string_data_same>>::value,
376 "StaticCache must be an AtomicHashArray or this destructor is wrong.");
377 for (auto& it : *cache) {
378 if (it.first->isUncounted()) {
379 StringData::ReleaseUncounted(it.first);
381 delete it.second;
383 StaticCache::destroy(cache);
386 void PCRECache::reinit(CacheKind kind) {
387 switch (m_kind) {
388 case CacheKind::Static:
389 if (m_staticCache.load()) {
390 DestroyStatic(m_staticCache);
391 m_staticCache = nullptr;
393 break;
394 case CacheKind::Lru:
395 m_lruCache.reset();
396 break;
397 case CacheKind::Scalable:
398 m_scalableCache.reset();
399 break;
401 m_kind = kind;
403 switch (kind) {
404 case CacheKind::Static:
405 m_staticCache = CreateStatic();
406 m_expire = time(nullptr) + RuntimeOption::EvalPCREExpireInterval;
407 break;
408 case CacheKind::Lru:
409 m_lruCache.reset(new LRUCache(RuntimeOption::EvalPCRETableSize));
410 break;
411 case CacheKind::Scalable:
412 m_scalableCache.reset(
413 new ScalableCache(RuntimeOption::EvalPCRETableSize));
414 break;
418 bool PCRECache::find(Accessor& accessor,
419 const StringData* regex,
420 TempKeyCache& keyCache)
422 switch (m_kind) {
423 case CacheKind::Static:
425 assertx(m_staticCache.load());
426 StaticCache::iterator it;
427 auto cache = m_staticCache.load(std::memory_order_acquire);
428 if ((it = cache->find(regex)) != cache->end()) {
429 accessor = it->second;
430 return true;
432 return false;
434 case CacheKind::Lru:
435 case CacheKind::Scalable:
437 if (!keyCache) {
438 keyCache.reset(new LRUCacheKey(regex->data(), regex->size()));
440 bool found;
441 if (m_kind == CacheKind::Lru) {
442 found = m_lruCache->find(accessor.resetToLRU(), *keyCache);
443 } else {
444 found = m_scalableCache->find(accessor.resetToLRU(), *keyCache);
446 return found;
449 always_assert(false);
452 void PCRECache::clearStatic() {
453 std::unique_lock<std::mutex> lock(m_clearMutex, std::try_to_lock);
454 if (!lock) return;
456 auto newExpire = time(nullptr) + RuntimeOption::EvalPCREExpireInterval;
457 m_expire.store(newExpire, std::memory_order_relaxed);
459 auto tmpMap = CreateStatic();
460 tmpMap = m_staticCache.exchange(tmpMap, std::memory_order_acq_rel);
462 Treadmill::enqueue([tmpMap]() {
463 DestroyStatic(tmpMap);
467 void PCRECache::insert(
468 Accessor& accessor,
469 const StringData* regex,
470 TempKeyCache& keyCache,
471 const pcre_cache_entry* ent
473 switch (m_kind) {
474 case CacheKind::Static:
476 assertx(m_staticCache.load());
477 // Clear the cache if we haven't refreshed it in a while
478 if (time(nullptr) > m_expire) {
479 clearStatic();
481 auto const cache = m_staticCache.load(std::memory_order_acquire);
482 auto const key =
483 regex->isStatic() ||
484 (regex->isUncounted() && regex->uncountedIncRef()) ?
485 regex : StringData::MakeUncounted(regex->slice());
486 auto pair = cache->insert(StaticCachePair(key, ent));
487 if (pair.second) {
488 // Inserted, container owns the pointer
489 accessor = ent;
490 } else {
491 // Not inserted, caller needs to own the pointer
492 if (regex->isUncounted()) StringData::ReleaseUncounted(key);
493 accessor = EntryPtr(ent);
496 break;
497 case CacheKind::Lru:
498 case CacheKind::Scalable:
500 if (!keyCache) {
501 keyCache.reset(new LRUCacheKey(regex->data(), regex->size()));
503 // Pointer ownership is shared between container and caller
504 accessor = EntryPtr(ent);
505 if (m_kind == CacheKind::Lru) {
506 m_lruCache->insert(*keyCache, accessor.entryPtr());
507 } else {
508 m_scalableCache->insert(*keyCache, accessor.entryPtr());
511 break;
515 void PCRECache::dump(const std::string& filename) {
516 std::ofstream out(filename.c_str());
517 switch (m_kind) {
518 case CacheKind::Static:
519 for (auto& it : *m_staticCache) {
520 out << it.first->data() << "\n";
522 break;
523 case CacheKind::Lru:
524 case CacheKind::Scalable:
526 std::vector<LRUCacheKey> keys;
527 if (m_kind == CacheKind::Lru) {
528 m_lruCache->snapshotKeys(keys);
529 } else {
530 m_scalableCache->snapshotKeys(keys);
532 for (auto& key: keys) {
533 out << key.c_str() << "\n";
536 break;
538 out.close();
541 size_t PCRECache::size() const {
542 switch (m_kind) {
543 case CacheKind::Static:
544 return m_staticCache.load(std::memory_order_acquire)->size();
545 case CacheKind::Lru:
546 return m_lruCache->size();
547 case CacheKind::Scalable:
548 return m_scalableCache->size();
550 always_assert(false);
553 ///////////////////////////////////////////////////////////////////////////////
554 // Public interface and helper functions
556 void pcre_reinit() {
557 PCRECache::CacheKind kind;
558 if (RuntimeOption::EvalPCRECacheType == "static") {
559 kind = PCRECache::CacheKind::Static;
560 } else if (RuntimeOption::EvalPCRECacheType == "lru") {
561 kind = PCRECache::CacheKind::Lru;
562 } else if (RuntimeOption::EvalPCRECacheType == "scalable") {
563 kind = PCRECache::CacheKind::Scalable;
564 } else {
565 Logger::Warning("Eval.PCRECacheType should be either static, "
566 "lru or scalable");
567 kind = PCRECache::CacheKind::Scalable;
569 s_pcreCache.reinit(kind);
/*
 * Intentionally empty: there is nothing to set up here.
 */
void pcre_init() {}
575 void pcre_dump_cache(const std::string& filename) {
576 s_pcreCache.dump(filename);
579 static pcre_jit_stack* alloc_jit_stack(void* /*data*/) {
580 return tl_pcre_globals->jit_stack;
583 namespace {
585 template<bool useSmartFree = false>
586 struct FreeHelperImpl {
587 explicit FreeHelperImpl(void* p) : p(p) {}
588 ~FreeHelperImpl() {
589 useSmartFree ? req::free(p) : free(p);
592 FreeHelperImpl(const FreeHelperImpl&) = delete;
593 FreeHelperImpl& operator=(const FreeHelperImpl&) = delete;
595 private:
596 void* p;
599 typedef FreeHelperImpl<true> SmartFreeHelper;
602 static void init_local_extra(pcre_extra* local, pcre_extra* shared) {
603 if (shared) {
604 memcpy(local, shared, sizeof(pcre_extra));
605 } else {
606 memset(local, 0, sizeof(pcre_extra));
607 local->flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
609 local->match_limit = tl_pcre_globals->preg_backtrace_limit;
610 local->match_limit_recursion = tl_pcre_globals->preg_recursion_limit;
613 static const char* const*
614 get_subpat_names(const pcre_cache_entry* pce) {
615 char **subpat_names = pce->subpat_names.load(std::memory_order_relaxed);
616 if (subpat_names) {
617 return subpat_names;
621 * Build a mapping from subpattern numbers to their names. We will always
622 * allocate the table, even though there may be no named subpatterns. This
623 * avoids somewhat more complicated logic in the inner loops.
625 pcre_extra extra;
626 init_local_extra(&extra, pce->extra);
628 int name_count;
630 subpat_names = (char **)calloc(pce->num_subpats, sizeof(char *));
631 int rc = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMECOUNT, &name_count);
632 if (rc < 0) {
633 raise_warning("Internal pcre_fullinfo() error %d", rc);
634 return nullptr;
636 if (name_count > 0) {
637 int name_size, ni = 0;
638 unsigned short name_idx;
639 char* name_table;
640 int rc1, rc2;
642 rc1 = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMETABLE, &name_table);
643 rc2 = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
644 rc = rc2 ? rc2 : rc1;
645 if (rc < 0) {
646 raise_warning("Internal pcre_fullinfo() error %d", rc);
647 return nullptr;
649 while (ni++ < name_count) {
650 name_idx = 0xff * (unsigned char)name_table[0] +
651 (unsigned char)name_table[1];
652 subpat_names[name_idx] = name_table + 2;
653 if (is_numeric_string(subpat_names[name_idx],
654 strlen(subpat_names[name_idx]),
655 nullptr, nullptr, 0) != KindOfNull) {
656 raise_warning("Numeric named subpatterns are not allowed");
657 return nullptr;
659 name_table += name_size;
662 // Store subpat_names into the cache entry
663 char **expected = nullptr;
664 if (!pce->subpat_names.compare_exchange_strong(expected, subpat_names)) {
665 // Another thread stored subpat_names already. The array created by the
666 // other thread is now in expected, return it instead and delete the one
667 // we just made.
668 free(subpat_names);
669 return expected;
671 return subpat_names;
674 static bool get_pcre_fullinfo(pcre_cache_entry* pce) {
675 pcre_extra extra;
676 init_local_extra(&extra, pce->extra);
678 /* Calculate the size of the offsets array*/
679 int rc = pcre_fullinfo(pce->re, &extra, PCRE_INFO_CAPTURECOUNT,
680 &pce->num_subpats);
681 if (rc < 0) {
682 raise_warning("Internal pcre_fullinfo() error %d", rc);
683 return false;
685 pce->num_subpats++;
686 return true;
// Obtain the compiled form of `regex`, compiling and caching it on a miss.
//
// accessor: receives the cache entry (found or newly inserted).
// regex:    the full pattern string including delimiters and modifiers.
//
// Returns true on success. Returns false after raising a warning when the
// pattern is empty, the delimiter is invalid or unterminated, a modifier is
// unknown, pcre_compile() fails, or pcre_fullinfo() fails. A NUL byte inside
// the pattern is reported through raise_error().
689 static bool
690 pcre_get_compiled_regex_cache(PCRECache::Accessor& accessor,
691 const StringData* regex) {
// Filled in lazily by find() and reused by insert() for the Lru/Scalable
// cache kinds.
692 PCRECache::TempKeyCache tkc;
694 /* Try to lookup the cached regex entry, and if successful, just pass
695 back the compiled pattern, otherwise go on and compile it. */
696 if (s_pcreCache.find(accessor, regex, tkc)) {
697 return true;
700 /* Parse through the leading whitespace, and display a warning if we
701 get to the end without encountering a delimiter. */
702 const char *p = regex->data();
703 while (isspace((int)*(unsigned char *)p)) p++;
704 if (*p == 0) {
705 raise_warning("Empty regular expression");
706 return false;
709 /* Get the delimiter and display a warning if it is alphanumeric
710 or a backslash. */
711 char delimiter = *p++;
712 if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
713 raise_warning("Delimiter must not be alphanumeric or backslash");
714 return false;
// Bracket lookup table: the character five positions after a bracket is its
// closing counterpart. Opening brackets map to the matching closing bracket;
// closing brackets map to themselves through the second copy, so they act as
// both the start and the end delimiter.
717 char start_delimiter = delimiter;
718 const char *pp = strchr("([{< )]}> )]}>", delimiter);
719 if (pp) {
720 delimiter = pp[5];
722 char end_delimiter = delimiter;
724 if (start_delimiter == end_delimiter) {
725 /* We need to iterate through the pattern, searching for the ending
726 * delimiter, but skipping the backslashed delimiters. If the ending
727 * delimiter is not found, display a warning. */
728 pp = p;
729 while (*pp != 0) {
730 if (*pp == '\\' && pp[1] != 0) pp++;
731 else if (*pp == delimiter)
732 break;
733 pp++;
735 if (*pp == 0) {
736 raise_warning("No ending delimiter '%c' found: [%s]", delimiter,
737 regex->data());
738 return false;
740 } else {
741 /* We iterate through the pattern, searching for the matching ending
742 * delimiter. For each matching starting delimiter, we increment nesting
743 * level, and decrement it for each matching ending delimiter. If we
744 * reach the end of the pattern without matching, display a warning.
746 int brackets = 1; // brackets nesting level
747 pp = p;
748 while (*pp != 0) {
749 if (*pp == '\\' && pp[1] != 0) pp++;
750 else if (*pp == end_delimiter && --brackets <= 0)
751 break;
752 else if (*pp == start_delimiter)
753 brackets++;
754 pp++;
756 if (*pp == 0) {
757 raise_warning("No ending matching delimiter '%c' found: [%s]",
758 end_delimiter, regex->data());
759 return false;
763 /* Make a copy of the actual pattern. */
764 String spattern(p, pp-p, CopyString);
765 const char *pattern = spattern.data();
767 /* Move on to the options */
768 pp++;
770 /* Parse through the options, setting appropriate flags. Display
771 a warning if we encounter an unknown modifier. */
// coptions collects pcre compile flags, poptions collects preg-level flags,
// and do_study records the 'S' modifier.
772 int coptions = 0;
773 int poptions = 0;
774 bool do_study = false;
775 while (*pp != 0) {
776 switch (*pp++) {
777 /* Perl compatible options */
778 case 'i': coptions |= PCRE_CASELESS; break;
779 case 'm': coptions |= PCRE_MULTILINE; break;
780 case 's': coptions |= PCRE_DOTALL; break;
781 case 'x': coptions |= PCRE_EXTENDED; break;
783 /* PCRE specific options */
784 case 'A': coptions |= PCRE_ANCHORED; break;
785 case 'D': coptions |= PCRE_DOLLAR_ENDONLY; break;
786 case 'S': do_study = true; break;
787 case 'U': coptions |= PCRE_UNGREEDY; break;
788 case 'X': coptions |= PCRE_EXTRA; break;
789 case 'u': coptions |= PCRE_UTF8;
790 /* In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII
791 characters, even in UTF-8 mode. However, this can be changed by setting
792 the PCRE_UCP option. */
793 #ifdef PCRE_UCP
794 coptions |= PCRE_UCP;
795 #endif
796 break;
798 /* Custom preg options */
799 case 'e': poptions |= PREG_REPLACE_EVAL; break;
// Spaces and newlines between modifiers are ignored.
801 case ' ':
802 case '\n':
803 break;
805 default:
806 raise_warning("Unknown modifier '%c': [%s]", pp[-1], regex->data());
807 return false;
811 /* We've reached a null byte, now check if we're actually at the end of the
812 string. If not this is a bad expression, and a potential security hole. */
813 if (regex->size() != (pp - regex->data())) {
814 raise_error("Error: Null byte found in pattern");
817 /* Compile pattern and display a warning if compilation failed. */
818 const char *error;
819 int erroffset;
820 pcre *re = pcre_compile(pattern, coptions, &error, &erroffset, 0);
821 if (re == nullptr) {
822 raise_warning("Compilation failed: %s at offset %d", error, erroffset);
823 return false;
826 // Careful: from here 're' needs to be freed if something throws.
828 // TODO(t14969501): enable literal_data everywhere and skip the
829 // pcre_compile above.
830 auto const literal_data = pcre_literal_data(pattern, coptions);
832 /* If study option was specified, study the pattern and
833 store the result in extra for passing to pcre_exec. */
// Patterns recognized as literals are never studied. The condition below is
// also true whenever PCRE_STUDY_JIT_COMPILE is non-zero, independent of 'S'.
834 pcre_extra *extra = nullptr;
835 if (!literal_data.isLiteral()) {
836 if (do_study || PCRE_STUDY_JIT_COMPILE) {
837 int soptions = PCRE_STUDY_JIT_COMPILE;
838 extra = pcre_study(re, soptions, &error);
839 if (extra) {
840 extra->flags |= PCRE_EXTRA_MATCH_LIMIT |
841 PCRE_EXTRA_MATCH_LIMIT_RECURSION;
842 pcre_assign_jit_stack(extra, alloc_jit_stack, nullptr);
// The warning is wrapped in try/catch so that `re` is released if
// raise_warning() throws.
844 if (error != nullptr) {
845 try {
846 raise_warning("Error while studying pattern");
847 } catch (...) {
848 pcre_free(re);
849 throw;
// When gdb, VTune or perf-map reporting is enabled and the study produced
// JIT code, register the generated code range under a name derived from the
// pattern.
852 if ((!RuntimeOption::EvalJitNoGdb ||
853 RuntimeOption::EvalJitUseVtuneAPI ||
854 RuntimeOption::EvalPerfPidMap) &&
855 extra &&
856 extra->executable_jit != nullptr) {
// NOTE(review): the return value of this pcre_fullinfo() call is not checked,
// so `size` would be left uninitialized if it failed.
857 size_t size;
858 pcre_fullinfo(re, extra, PCRE_INFO_JITSIZE, &size);
860 TCA start = *(TCA *)(extra->executable_jit);
861 TCA end = start + size;
862 std::string name = folly::sformat("HHVM::pcre_jit::{}", pattern);
864 if (!RuntimeOption::EvalJitNoGdb && jit::mcgen::initialized()) {
865 Debug::DebugInfo::Get()->recordStub(Debug::TCRange(start, end, false),
866 name);
868 if (RuntimeOption::EvalJitUseVtuneAPI) {
869 HPHP::jit::reportHelperToVtune(name.c_str(), start, end);
871 if (RuntimeOption::EvalPerfPidMap && jit::mcgen::initialized()) {
872 std::string escaped_name;
873 folly::json::escapeString(name, escaped_name,
874 folly::json::serialization_opts());
875 Debug::DebugInfo::Get()->recordPerfMap(
876 Debug::TCRange(start, end, false),
877 SrcKey{}, nullptr, false, false, escaped_name
884 /* Store the compiled pattern and extra info in the cache. */
// From here on the entry owns `re` and `extra`; its destructor frees them.
885 pcre_cache_entry* new_entry = new pcre_cache_entry();
886 new_entry->re = re;
887 new_entry->extra = extra;
// NOTE(review): literal_data is declared `auto const`, so std::move() here
// yields a const rvalue and the object is copied rather than moved.
888 if (literal_data.isLiteral()) {
889 new_entry->literal_data =
890 std::make_unique<pcre_literal_data>(std::move(literal_data));
893 assertx((poptions & ~0x1) == 0);
894 new_entry->preg_options = poptions;
896 assertx((coptions & 0x80000000) == 0);
897 new_entry->compile_options = coptions;
899 /* Get pcre full info */
900 if (!get_pcre_fullinfo(new_entry)) {
901 delete new_entry;
902 return false;
// insert() takes ownership of new_entry and points `accessor` at it.
905 s_pcreCache.insert(accessor, regex, tkc, new_entry);
906 return true;
909 static int* create_offset_array(const pcre_cache_entry* pce,
910 int& size_offsets) {
911 /* Allocate memory for the offsets array */
912 size_offsets = pce->num_subpats * 3;
913 return (int *)req::malloc_noptrs(size_offsets * sizeof(int));
916 static inline void add_offset_pair_split(Array& result,
917 const String& str,
918 int offset,
919 const char* name,
920 bool hackArrOutput) {
921 auto match_pair = hackArrOutput
922 ? make_vec_array(str, offset)
923 : make_varray(str, offset);
924 if (name) result.set(String(name), match_pair);
925 result.append(match_pair);
928 static inline void add_offset_pair_match(Array& result,
929 const String& str,
930 int offset,
931 const char* name,
932 bool hackArrOutput) {
933 auto match_pair = hackArrOutput
934 ? make_vec_array(str, offset)
935 : make_varray(str, offset);
936 if (name) result.set(String(name), match_pair);
937 result.append(match_pair);
940 static inline bool pcre_need_log_error(int pcre_code) {
941 return RuntimeOption::EnablePregErrorLog &&
942 (pcre_code == PCRE_ERROR_MATCHLIMIT ||
943 pcre_code == PCRE_ERROR_RECURSIONLIMIT);
946 static void pcre_log_error(const char* func, int line, int pcre_code,
947 const char* pattern, int pattern_size,
948 const char* subject, int subject_size,
949 const char* repl, int repl_size,
950 int arg1 = 0, int arg2 = 0,
951 int arg3 = 0, int arg4 = 0) {
952 const char* escapedPattern;
953 const char* escapedSubject;
954 const char* escapedRepl;
955 std::string p(pattern, pattern_size);
956 std::string s(subject, subject_size);
957 std::string r(repl, repl_size);
958 escapedPattern = Logger::EscapeString(p);
959 escapedSubject = Logger::EscapeString(s);
960 escapedRepl = Logger::EscapeString(r);
961 const char* errString =
962 (pcre_code == PCRE_ERROR_MATCHLIMIT) ? "PCRE_ERROR_MATCHLIMIT" :
963 (pcre_code == PCRE_ERROR_RECURSIONLIMIT) ? "PCRE_ERROR_RECURSIONLIMIT" :
964 "UNKNOWN";
965 raise_warning_unsampled(
966 "REGEXERR: %s/%d: err=%d(%s), pattern='%s', subject='%s', repl='%s', "
967 "limits=(%" PRId64 ", %" PRId64 "), extra=(%d, %d, %d, %d)",
968 func, line, pcre_code, errString,
969 escapedPattern, escapedSubject, escapedRepl,
970 tl_pcre_globals->preg_backtrace_limit,
971 tl_pcre_globals->preg_recursion_limit,
972 arg1, arg2, arg3, arg4);
973 free((void *)escapedPattern);
974 free((void *)escapedSubject);
975 free((void *)escapedRepl);
978 static void pcre_handle_exec_error(int pcre_code) {
979 int preg_code = 0;
980 switch (pcre_code) {
981 case PCRE_ERROR_MATCHLIMIT:
982 preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
983 break;
984 case PCRE_ERROR_RECURSIONLIMIT:
985 preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
986 break;
987 case PCRE_ERROR_BADUTF8:
988 preg_code = PHP_PCRE_BAD_UTF8_ERROR;
989 break;
990 case PCRE_ERROR_BADUTF8_OFFSET:
991 preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
992 break;
993 default:
994 preg_code = PHP_PCRE_INTERNAL_ERROR;
995 break;
997 *rl_last_error_code = preg_code;
1000 ///////////////////////////////////////////////////////////////////////////////
1002 Variant preg_grep(const String& pattern, const Array& input, int flags /* = 0 */) {
1003 PCRECache::Accessor accessor;
1004 if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1005 return false;
1007 const pcre_cache_entry* pce = accessor.get();
1009 int size_offsets = 0;
1010 int* offsets = create_offset_array(pce, size_offsets);
1011 if (offsets == nullptr) {
1012 return false;
1014 SmartFreeHelper freer(offsets);
1016 const bool hackArrOutput = flags & PREG_FB_HACK_ARRAYS;
1018 /* Initialize return array */
1019 auto ret = hackArrOutput ? Array::CreateDict() : Array::Create();
1020 *rl_last_error_code = PHP_PCRE_NO_ERROR;
1022 /* Go through the input array */
1023 bool invert = (flags & PREG_GREP_INVERT);
1024 pcre_extra extra;
1025 init_local_extra(&extra, pce->extra);
1027 for (ArrayIter iter(input); iter; ++iter) {
1028 String entry = iter.second().toString();
1030 /* Perform the match */
1031 int count = pcre_exec(pce->re, &extra, entry.data(), entry.size(),
1032 0, 0, offsets, size_offsets);
1034 /* Check for too many substrings condition. */
1035 if (count == 0) {
1036 raise_warning("Matched, but too many substrings");
1037 count = size_offsets / 3;
1038 } else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1039 if (pcre_need_log_error(count)) {
1040 pcre_log_error(__FUNCTION__, __LINE__, count,
1041 pattern.data(), pattern.size(),
1042 entry.data(), entry.size(),
1043 "", 0,
1044 flags);
1046 pcre_handle_exec_error(count);
1047 break;
1050 /* If the entry fits our requirements */
1051 if ((count > 0 && !invert) ||
1052 (count == PCRE_ERROR_NOMATCH && invert)) {
1054 /* Add to return array */
1055 ret.set(iter.first(), entry);
1059 return ret;
1062 ///////////////////////////////////////////////////////////////////////////////
1064 namespace {
1066 Array& forceToOutput(Variant& var, bool hackArrOutput) {
1067 return hackArrOutput ? forceToDict(var) : forceToDArray(var);
1070 Array& forceToOutput(tv_lval lval, bool hackArrOutput) {
1071 return hackArrOutput ? forceToDict(lval) : forceToDArray(lval);
1076 static Variant preg_match_impl(const StringData* pattern,
1077 const StringData* subject,
1078 Variant* subpats, int flags, int start_offset,
1079 bool global) {
1080 PCRECache::Accessor accessor;
1081 if (!pcre_get_compiled_regex_cache(accessor, pattern)) {
1082 return false;
1084 const pcre_cache_entry* pce = accessor.get();
1086 const bool hackArrOutput = flags & PREG_FB_HACK_ARRAYS;
1087 const bool includeNonMatchingCaptures = flags & PREG_FB__PRIVATE__HSL_IMPL;
1089 pcre_extra extra;
1090 init_local_extra(&extra, pce->extra);
1091 if (subpats) {
1092 *subpats = hackArrOutput ? Array::CreateDict() : Array::CreateDArray();
1094 int exec_options = 0;
1096 int subpats_order = global ? PREG_PATTERN_ORDER : 0;
1097 bool offset_capture = false;
1098 if (flags) {
1099 offset_capture = flags & PREG_OFFSET_CAPTURE;
1102 * subpats_order is pre-set to pattern mode so we change it only if
1103 * necessary.
1105 if (flags & 0xff) {
1106 subpats_order = flags & 0xff;
1108 if ((global && (subpats_order < PREG_PATTERN_ORDER ||
1109 subpats_order > PREG_SET_ORDER)) ||
1110 (!global && subpats_order != 0)) {
1111 raise_warning("Invalid flags specified");
1112 return init_null();
1116 /* Negative offset counts from the end of the string. */
1117 if (start_offset < 0) {
1118 start_offset = subject->size() + start_offset;
1119 if (start_offset < 0) {
1120 start_offset = 0;
1124 int size_offsets = 0;
1125 int* offsets = create_offset_array(pce, size_offsets);
1126 SmartFreeHelper offsetsFreer(offsets);
1127 int num_subpats = size_offsets / 3;
1128 if (offsets == nullptr) {
1129 return false;
1132 const char* const* subpat_names = get_subpat_names(pce);
1133 if (subpat_names == nullptr) {
1134 return false;
1137 /* Allocate match sets array and initialize the values. */
1139 /* An array of sets of matches for each subpattern after a global match */
1140 auto match_sets = hackArrOutput ? Array::CreateDict() : Array::CreateDArray();
1141 if (global && subpats_order == PREG_PATTERN_ORDER) {
1142 for (int i = 0; i < num_subpats; i++) {
1143 match_sets.set(i,
1144 hackArrOutput ? Array::CreateDict() : Array::CreateDArray());
1148 int matched = 0;
1149 *rl_last_error_code = PHP_PCRE_NO_ERROR;
1151 int g_notempty = 0; // If the match should not be empty
1152 const char** stringlist; // Holds list of subpatterns
1153 int i;
1154 do {
1156 int count = 0;
1158 * Optimization: If the pattern defines a literal substring,
1159 * compare the strings directly (i.e. memcmp) instead of performing
1160 * the full regular expression evaluation.
1161 * Take the slow path if there are any special compile options.
1163 if (pce->literal_data && !global) {
1164 assertx(pce->literal_data->isLiteral());
1165 /* TODO(t13140878): compare literal against multiple substrings
1166 * in the preg_match_all (global == true) case. */
1167 count = pce->literal_data->matches(subject, start_offset, offsets) ? 1
1168 : PCRE_ERROR_NOMATCH;
1169 } else {
1170 /* Execute the regular expression. */
1171 count = pcre_exec(pce->re, &extra, subject->data(), subject->size(),
1172 start_offset,
1173 exec_options | g_notempty,
1174 offsets, size_offsets);
1176 /* The string was already proved to be valid UTF-8 */
1177 exec_options |= PCRE_NO_UTF8_CHECK;
1179 /* Check for too many substrings condition. */
1180 if (count == 0) {
1181 raise_warning("Matched, but too many substrings");
1182 count = size_offsets / 3;
1185 /* If something has matched */
1186 if (count > 0) {
1187 matched++;
1189 if (subpats) {
1190 // Try to get the list of substrings and display a warning if failed.
1191 if (offsets[1] < offsets[0] ||
1192 pcre_get_substring_list(subject->data(), offsets, count,
1193 &stringlist) < 0) {
1194 raise_warning("Get subpatterns list failed");
1195 return false;
1198 if (global) { /* global pattern matching */
1199 if (subpats_order == PREG_PATTERN_ORDER) {
1200 /* For each subpattern, insert it into the appropriate array. */
1201 for (i = 0; i < count; i++) {
1202 if (offset_capture) {
1203 auto const lval = match_sets.lval(i);
1204 add_offset_pair_match(forceToOutput(lval, hackArrOutput),
1205 String(stringlist[i],
1206 offsets[(i<<1)+1] - offsets[i<<1],
1207 CopyString),
1208 offsets[i<<1],
1209 nullptr,
1210 hackArrOutput);
1211 } else {
1212 auto const lval = match_sets.lval(i);
1213 forceToOutput(lval, hackArrOutput).append(
1214 String(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
1215 CopyString)
1220 * If the number of captured subpatterns on this run is
1221 * less than the total possible number, pad the result
1222 * arrays with empty strings.
1224 if (count < num_subpats) {
1225 for (; i < num_subpats; i++) {
1226 auto const lval = match_sets.lval(i);
1227 forceToOutput(lval, hackArrOutput).append("");
1230 } else {
1231 auto result_set = hackArrOutput
1232 ? Array::CreateDict()
1233 : Array::CreateDArray();
1235 /* Add all the subpatterns to it */
1236 for (i = 0; i < count; i++) {
1237 if (offset_capture) {
1238 add_offset_pair_match(result_set,
1239 String(stringlist[i],
1240 offsets[(i<<1)+1] - offsets[i<<1],
1241 CopyString),
1242 offsets[i<<1],
1243 subpat_names[i],
1244 hackArrOutput);
1245 } else {
1246 String value(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
1247 CopyString);
1248 if (subpat_names[i]) {
1249 result_set.set(String(subpat_names[i]), value);
1251 result_set.append(value);
1254 if (includeNonMatchingCaptures && count < num_subpats) {
1255 for (; i < num_subpats; i++) {
1256 // We don't want to set the numeric key if there is a string
1257 // key, but we have do it usually to make migration from
1258 // preg_match() practical; given that existing code gets
1259 // nothing for unmatched captures, we don't need to set both
1260 // here.
1261 if (offset_capture) {
1262 add_offset_pair_match(
1263 forceToOutput(*subpats, hackArrOutput),
1264 empty_string(),
1265 offsets[i<<1],
1266 subpat_names[i],
1267 hackArrOutput
1269 } else {
1270 if (subpat_names[i]) {
1271 result_set.set(String(subpat_names[i]), empty_string_tv());
1273 result_set.append(empty_string());
1277 /* And add it to the output array */
1278 forceToOutput(*subpats, hackArrOutput).append(
1279 std::move(result_set)
1282 } else { /* single pattern matching */
1283 /* For each subpattern, insert it into the subpatterns array. */
1284 for (i = 0; i < count; i++) {
1285 if (offset_capture) {
1286 add_offset_pair_match(forceToOutput(*subpats, hackArrOutput),
1287 String(stringlist[i],
1288 offsets[(i<<1)+1] - offsets[i<<1],
1289 CopyString),
1290 offsets[i<<1],
1291 subpat_names[i],
1292 hackArrOutput);
1293 } else {
1294 String value(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
1295 CopyString);
1296 if (subpat_names[i]) {
1297 forceToOutput(*subpats, hackArrOutput).set(
1298 String(subpat_names[i]), value
1301 forceToOutput(*subpats, hackArrOutput).append(value);
1304 if (includeNonMatchingCaptures && count < num_subpats) {
1305 for (; i < num_subpats; i++) {
1306 if (offset_capture) {
1307 add_offset_pair_match(
1308 forceToOutput(*subpats, hackArrOutput),
1309 empty_string(),
1310 offsets[i<<1],
1311 subpat_names[i],
1312 hackArrOutput
1314 } else {
1315 if (subpat_names[i]) {
1316 forceToOutput(*subpats, hackArrOutput).set(
1317 String(subpat_names[i]), empty_string()
1320 forceToOutput(*subpats, hackArrOutput).append(empty_string());
1325 pcre_free((void *) stringlist);
1327 } else if (count == PCRE_ERROR_NOMATCH) {
1328 /* If we previously set PCRE_NOTEMPTY after a null match,
1329 this is not necessarily the end. We need to advance
1330 the start offset, and continue. Fudge the offset values
1331 to achieve this, unless we're already at the end of the string. */
1332 if (g_notempty && start_offset < subject->size()) {
1333 offsets[0] = start_offset;
1334 offsets[1] = start_offset + 1;
1335 } else
1336 break;
1337 } else {
1338 if (pcre_need_log_error(count)) {
1339 pcre_log_error(__FUNCTION__, __LINE__, count,
1340 pattern->data(), pattern->size(),
1341 subject->data(), subject->size(),
1342 "", 0,
1343 flags, start_offset, g_notempty, global);
1345 pcre_handle_exec_error(count);
1346 return false;
1349 /* If we have matched an empty string, mimic what Perl's /g options does.
1350 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1351 the match again at the same point. If this fails (picked up above) we
1352 advance to the next character. */
1353 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1355 /* Advance to the position right after the last full match */
1356 start_offset = offsets[1];
1357 } while (global);
1359 /* Add the match sets to the output array and clean up */
1360 if (subpats && global && subpats_order == PREG_PATTERN_ORDER) {
1361 for (i = 0; i < num_subpats; i++) {
1362 if (subpat_names[i]) {
1363 forceToOutput(*subpats, hackArrOutput).set(
1364 String(subpat_names[i]), match_sets[i]
1367 forceToOutput(*subpats, hackArrOutput).append(match_sets[i]);
1370 return matched;
1373 Variant preg_match(const String& pattern, const String& subject,
1374 Variant* matches /* = nullptr */, int flags /* = 0 */,
1375 int offset /* = 0 */) {
1376 return preg_match(pattern.get(), subject.get(), matches, flags, offset);
1379 Variant preg_match(const StringData* pattern, const StringData* subject,
1380 Variant* matches /* = nullptr */, int flags /* = 0 */,
1381 int offset /* = 0 */) {
1382 return preg_match_impl(pattern, subject, matches, flags, offset, false);
1385 Variant preg_match_all(const String& pattern, const String& subject,
1386 Variant* matches /* = nullptr */,
1387 int flags /* = 0 */, int offset /* = 0 */) {
1388 return preg_match_all(pattern.get(), subject.get(), matches, flags, offset);
1391 Variant preg_match_all(const StringData* pattern, const StringData* subject,
1392 Variant* matches /* = nullptr */,
1393 int flags /* = 0 */, int offset /* = 0 */) {
1394 return preg_match_impl(pattern, subject, matches, flags, offset, true);
1397 ///////////////////////////////////////////////////////////////////////////////
1399 static String preg_do_repl_func(const Variant& function, const String& subject,
1400 int* offsets, const char* const* subpat_names,
1401 int count) {
1402 Array subpats = Array::CreateDArray();
1403 for (int i = 0; i < count; i++) {
1404 auto off1 = offsets[i<<1];
1405 auto off2 = offsets[(i<<1)+1];
1406 auto sub = subject.substr(off1, off2 - off1);
1408 if (subpat_names[i]) {
1409 subpats.set(String(subpat_names[i]), sub);
1411 subpats.append(sub);
1414 return vm_call_user_func(function, make_varray(subpats)).toString();
/*
 * Parses a back-reference at *str, which must point at the introducer
 * character ('\\' or '$') of a NUL-terminated replacement string.
 *
 * Accepted shapes are <introducer>N, <introducer>NN and "${N}" / "${NN}",
 * where N is a decimal digit. On success the reference number is stored in
 * *backref, *str is advanced past the reference and true is returned. On
 * failure *str is left untouched and false is returned (*backref may already
 * have been written).
 */
static bool preg_get_backref(const char** str, int* backref) {
  auto const isDigit = [](char ch) { return ch >= '0' && ch <= '9'; };

  const char* cursor = *str;

  /* Nothing can follow the introducer. */
  if (cursor[1] == '\0') {
    return false;
  }

  /* Skip the introducer, plus the opening brace of the "${" form. */
  const bool braced = (cursor[0] == '$' && cursor[1] == '{');
  cursor += braced ? 2 : 1;

  /* At least one digit is mandatory. */
  if (!isDigit(*cursor)) {
    return false;
  }
  *backref = *cursor - '0';
  ++cursor;

  /* An optional second digit. */
  if (isDigit(*cursor)) {
    *backref = *backref * 10 + *cursor - '0';
    ++cursor;
  }

  /* The braced form has to be closed. */
  if (braced) {
    if (*cursor != '}') {
      return false;
    }
    ++cursor;
  }

  *str = cursor;
  return true;
}
1454 static Variant php_pcre_replace(const String& pattern, const String& subject,
1455 const Variant& replace_var, bool callable,
1456 int limit, int* replace_count) {
1457 PCRECache::Accessor accessor;
1458 if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1459 return false;
1461 const pcre_cache_entry* pce = accessor.get();
1462 if (pce->preg_options & PREG_REPLACE_EVAL) {
1463 throw Exception(
1464 "preg_replace(): Support for the /e modifier has been removed, use "
1465 "preg_replace_callback instead"
1469 int size_offsets;
1470 int* offsets = create_offset_array(pce, size_offsets);
1471 SmartFreeHelper offsetsFreer(offsets);
1472 if (offsets == nullptr) {
1473 return false;
1476 const char* const* subpat_names = get_subpat_names(pce);
1477 if (subpat_names == nullptr) {
1478 return false;
1481 const char* replace = nullptr;
1482 const char* replace_end = nullptr;
1483 int replace_len = 0;
1484 String replace_val;
1486 if (!callable) {
1487 replace_val = replace_var.toString();
1488 replace = replace_val.data();
1489 replace_len = replace_val.size();
1490 replace_end = replace + replace_len;
1493 StringBuffer result(2 * subject.size());
1495 try {
1497 /* Initialize */
1498 const char* match = nullptr;
1499 int start_offset = 0;
1500 *rl_last_error_code = PHP_PCRE_NO_ERROR;
1501 pcre_extra extra;
1502 init_local_extra(&extra, pce->extra);
1504 const char* walk; // Used to walk the replacement string
1505 char walk_last; // Last walked character
1506 int match_len; // Length of the current match
1507 int backref; // Backreference number
1508 int g_notempty = 0; // If the match should not be empty
1509 int exec_options = 0; // Options passed to pcre_exec
1510 while (1) {
1511 /* Execute the regular expression. */
1512 int count = pcre_exec(pce->re, &extra, subject.data(), subject.size(),
1513 start_offset,
1514 exec_options | g_notempty,
1515 offsets, size_offsets);
1517 /* The string was already proved to be valid UTF-8 */
1518 exec_options |= PCRE_NO_UTF8_CHECK;
1520 /* Check for too many substrings condition. */
1521 if (count == 0) {
1522 raise_warning("Matched, but too many substrings");
1523 count = size_offsets / 3;
1526 const char* piece = subject.data() + start_offset;
1527 if (count > 0 && offsets[1] >= offsets[0] &&
1528 (limit == -1 || limit > 0)) {
1529 if (replace_count) {
1530 ++*replace_count;
1532 /* Set the match location in subject */
1533 match = subject.data() + offsets[0];
1535 String callable_result;
1536 if (callable) {
1537 /* Use custom function to get replacement string and its length. */
1538 callable_result = preg_do_repl_func(replace_var, subject, offsets,
1539 subpat_names, count);
1540 } else { /* do regular substitution */
1541 walk = replace;
1542 walk_last = 0;
1543 while (walk < replace_end) {
1544 if ('\\' == *walk || '$' == *walk) {
1545 if (walk_last == '\\') {
1546 walk++;
1547 walk_last = 0;
1548 continue;
1550 if (preg_get_backref(&walk, &backref)) {
1551 if (backref < count) {
1552 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1554 continue;
1557 walk++;
1558 walk_last = walk[-1];
1562 /* copy the part of the string before the match */
1563 result.append(piece, match-piece);
1565 /* copy replacement and backrefs */
1566 int result_len = result.size();
1568 if (callable) {
1569 /* Copy result from custom function to buffer and clean up. */
1570 result.append(callable_result.data(), callable_result.size());
1571 result_len += callable_result.size();
1572 } else { /* do regular backreference copying */
1573 walk = replace;
1574 walk_last = 0;
1575 Array params;
1576 while (walk < replace_end) {
1577 if ('\\' == *walk || '$' == *walk) {
1578 if (walk_last == '\\') {
1579 result.set(result.size() - 1, *walk++);
1580 walk_last = 0;
1581 continue;
1583 if (preg_get_backref(&walk, &backref)) {
1584 if (backref < count) {
1585 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1586 result.append(
1587 subject.data() + offsets[backref<<1],
1588 match_len
1591 continue;
1594 result.append(*walk++);
1595 walk_last = walk[-1];
1599 if (limit != -1) {
1600 limit--;
1603 } else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1604 /* If we previously set PCRE_NOTEMPTY after a null match,
1605 this is not necessarily the end. We need to advance
1606 the start offset, and continue. Fudge the offset values
1607 to achieve this, unless we're already at the end of the string. */
1608 if (g_notempty != 0 && start_offset < subject.size()) {
1609 offsets[0] = start_offset;
1610 offsets[1] = start_offset + 1;
1611 result.append(piece, 1);
1612 } else {
1613 /* stick that last bit of string on our output */
1614 result.append(piece, subject.size() - start_offset);
1615 break;
1617 } else {
1618 if (pcre_need_log_error(count)) {
1619 const char* s;
1620 int size;
1621 String stemp;
1622 if (callable) {
1623 if (replace_var.isObject()) {
1624 stemp = replace_var.asCObjRef()->getClassName().asString()
1625 + "::__invoke";
1626 } else {
1627 stemp = replace_var.toString();
1629 s = stemp.data();
1630 size = stemp.size();
1631 } else {
1632 s = replace_val.data();
1633 size = replace_val.size();
1635 pcre_log_error(__FUNCTION__, __LINE__, count,
1636 pattern.data(), pattern.size(),
1637 subject.data(), subject.size(),
1638 s, size,
1639 callable, limit, start_offset, g_notempty);
1641 pcre_handle_exec_error(count);
1642 return init_null();
1645 /* If we have matched an empty string, mimic what Perl's /g options does.
1646 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1647 the match again at the same point. If this fails (picked up above) we
1648 advance to the next character. */
1649 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1651 /* Advance to the next piece. */
1652 start_offset = offsets[1];
1655 return result.detach();
1656 } catch (...) {
1657 throw;
1661 static Variant php_replace_in_subject(const Variant& regex, const Variant& replace,
1662 String subject, int limit, bool callable,
1663 int* replace_count) {
1664 if (!regex.isArray()) {
1665 Variant ret = php_pcre_replace(regex.toString(), subject, replace,
1666 callable, limit, replace_count);
1668 if (ret.isBoolean()) {
1669 assertx(!ret.toBoolean());
1670 return init_null();
1673 return ret;
1676 if (callable || !replace.isArray()) {
1677 Array arr = regex.toDArray();
1678 for (ArrayIter iterRegex(arr); iterRegex; ++iterRegex) {
1679 String regex_entry = iterRegex.second().toString();
1680 Variant ret = php_pcre_replace(regex_entry, subject, replace,
1681 callable, limit, replace_count);
1682 if (ret.isBoolean()) {
1683 assertx(!ret.toBoolean());
1684 return init_null();
1686 if (!ret.isString()) {
1687 return ret;
1689 subject = ret.asStrRef();
1690 if (subject.isNull()) {
1691 return subject;
1694 return subject;
1697 Array arrReplace = replace.toDArray();
1698 Array arrRegex = regex.toDArray();
1699 ArrayIter iterReplace(arrReplace);
1700 for (ArrayIter iterRegex(arrRegex); iterRegex; ++iterRegex) {
1701 String regex_entry = iterRegex.second().toString();
1702 Variant replace_value;
1703 if (iterReplace) {
1704 replace_value = iterReplace.second();
1705 ++iterReplace;
1708 Variant ret = php_pcre_replace(regex_entry, subject, replace_value,
1709 callable, limit, replace_count);
1711 if (ret.isBoolean()) {
1712 assertx(!ret.toBoolean());
1713 return init_null();
1715 if (!ret.isString()) {
1716 return ret;
1718 subject = ret.asStrRef();
1719 if (subject.isNull()) {
1720 return subject;
1723 return subject;
1726 Variant preg_replace_impl(const Variant& pattern, const Variant& replacement,
1727 const Variant& subject, int limit, int64_t* count,
1728 bool is_callable, bool is_filter) {
1729 assertx(!(is_callable && is_filter));
1730 if (!is_callable &&
1731 replacement.isArray() && !pattern.isArray()) {
1732 raise_warning("Parameter mismatch, pattern is a string while "
1733 "replacement is an array");
1734 return false;
1737 int replace_count = 0;
1738 if (!isContainer(subject)) {
1739 Variant ret = php_replace_in_subject(pattern, replacement,
1740 subject.toString(),
1741 limit, is_callable, &replace_count);
1743 if (ret.isString()) {
1744 if (count) *count = replace_count;
1745 if (is_filter && replace_count == 0) {
1746 return init_null();
1747 } else {
1748 return ret.asStrRef();
1752 return ret;
1755 Array return_value = Array::CreateDArray();
1756 Array arrSubject = subject.toDArray();
1757 for (ArrayIter iter(arrSubject); iter; ++iter) {
1758 auto old_replace_count = replace_count;
1759 String subject_entry = iter.second().toString();
1760 Variant ret = php_replace_in_subject(pattern, replacement, subject_entry,
1761 limit, is_callable, &replace_count);
1763 if (ret.isString() && !ret.isNull() &&
1764 (!is_filter || replace_count > old_replace_count)) {
1765 return_value.set(iter.first(), ret.asStrRef());
1768 if (count) *count = replace_count;
1769 return return_value;
1772 int preg_replace(Variant& result,
1773 const Variant& pattern,
1774 const Variant& replacement,
1775 const Variant& subject,
1776 int limit /* = -1 */) {
1777 int64_t count;
1778 result = preg_replace_impl(pattern, replacement, subject,
1779 limit, &count, false, false);
1780 return count;
1783 int preg_replace_callback(Variant& result,
1784 const Variant& pattern,
1785 const Variant& callback,
1786 const Variant& subject,
1787 int limit /* = -1 */) {
1788 int64_t count;
1789 result = preg_replace_impl(pattern, callback, subject,
1790 limit, &count, true, false);
1791 return count;
1794 int preg_filter(Variant& result,
1795 const Variant& pattern,
1796 const Variant& replacement,
1797 const Variant& subject,
1798 int limit /* = -1 */) {
1799 int64_t count;
1800 result = preg_replace_impl(pattern, replacement, subject,
1801 limit, &count, false, true);
1802 return count;
1805 ///////////////////////////////////////////////////////////////////////////////
1807 Variant preg_split(const String& pattern, const String& subject,
1808 int limit /* = -1 */, int flags /* = 0 */) {
1809 PCRECache::Accessor accessor;
1810 if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1811 return false;
1813 const pcre_cache_entry* pce = accessor.get();
1815 int no_empty = flags & PREG_SPLIT_NO_EMPTY;
1816 bool delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1817 bool offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1819 if (limit == 0) {
1820 limit = -1;
1823 int size_offsets = 0;
1824 int* offsets = create_offset_array(pce, size_offsets);
1825 SmartFreeHelper offsetsFreer(offsets);
1826 if (offsets == nullptr) {
1827 return false;
1830 /* Start at the beginning of the string */
1831 int start_offset = 0;
1832 int next_offset = 0;
1833 const char* last_match = subject.data();
1834 *rl_last_error_code = PHP_PCRE_NO_ERROR;
1835 pcre_extra extra;
1836 init_local_extra(&extra, pce->extra);
1838 const bool hackArrOutput = flags & PREG_FB_HACK_ARRAYS;
1840 // Get next piece if no limit or limit not yet reached and something matched
1841 Array return_value = hackArrOutput ? Array::CreateDict() : Array::Create();
1842 int g_notempty = 0; /* If the match should not be empty */
1843 int utf8_check = 0;
1844 PCRECache::Accessor bump_accessor;
1845 const pcre_cache_entry* bump_pce = nullptr; /* instance for empty matches */
1846 while ((limit == -1 || limit > 1)) {
1847 int count = pcre_exec(pce->re, &extra, subject.data(), subject.size(),
1848 start_offset, g_notempty | utf8_check,
1849 offsets, size_offsets);
1851 /* Subsequent calls to pcre_exec don't need to bother with the
1852 * utf8 validity check: if the subject isn't valid, the first
1853 * call to pcre_exec will have failed, and as long as we only
1854 * set start_offset to known character boundaries we won't
1855 * supply an invalid offset. */
1856 utf8_check = PCRE_NO_UTF8_CHECK;
1858 /* Check for too many substrings condition. */
1859 if (count == 0) {
1860 raise_warning("Matched, but too many substrings");
1861 count = size_offsets / 3;
1864 /* If something matched */
1865 if (count > 0 && offsets[1] >= offsets[0]) {
1866 if (!no_empty || subject.data() + offsets[0] != last_match) {
1867 if (offset_capture) {
1868 /* Add (match, offset) pair to the return value */
1869 add_offset_pair_split(return_value,
1870 String(last_match,
1871 subject.data() + offsets[0] - last_match,
1872 CopyString),
1873 next_offset,
1874 nullptr,
1875 hackArrOutput);
1876 } else {
1877 /* Add the piece to the return value */
1878 return_value.append(String(last_match,
1879 subject.data() + offsets[0] - last_match,
1880 CopyString));
1883 /* One less left to do */
1884 if (limit != -1)
1885 limit--;
1888 last_match = subject.data() + offsets[1];
1889 next_offset = offsets[1];
1891 if (delim_capture) {
1892 int i, match_len;
1893 for (i = 1; i < count; i++) {
1894 match_len = offsets[(i<<1)+1] - offsets[i<<1];
1895 /* If we have matched a delimiter */
1896 if (!no_empty || match_len > 0) {
1897 if (offset_capture) {
1898 add_offset_pair_split(return_value,
1899 String(subject.data() + offsets[i<<1],
1900 match_len, CopyString),
1901 offsets[i<<1],
1902 nullptr,
1903 hackArrOutput);
1904 } else {
1905 return_value.append(subject.substr(offsets[i<<1], match_len));
1910 } else if (count == PCRE_ERROR_NOMATCH) {
1911 /* If we previously set PCRE_NOTEMPTY after a null match,
1912 this is not necessarily the end. We need to advance
1913 the start offset, and continue. Fudge the offset values
1914 to achieve this, unless we're already at the end of the string. */
1915 if (g_notempty != 0 && start_offset < subject.size()) {
1916 if (pce->compile_options & PCRE_UTF8) {
1917 if (bump_pce == nullptr) {
1918 if (!pcre_get_compiled_regex_cache(bump_accessor,
1919 String("/./us").get())) {
1920 return false;
1922 bump_pce = bump_accessor.get();
1924 pcre_extra bump_extra;
1925 init_local_extra(&bump_extra, bump_pce->extra);
1926 count = pcre_exec(bump_pce->re, &bump_extra, subject.data(),
1927 subject.size(), start_offset,
1928 utf8_check, offsets, size_offsets);
1929 if (count < 1) {
1930 raise_warning("Unknown error");
1931 offsets[0] = start_offset;
1932 offsets[1] = start_offset + 1;
1933 if (pcre_need_log_error(count)) {
1934 pcre_log_error(__FUNCTION__, __LINE__, count,
1935 pattern.data(), pattern.size(),
1936 subject.data(), subject.size(),
1937 "", 0,
1938 limit, flags, start_offset);
1941 } else {
1942 offsets[0] = start_offset;
1943 offsets[1] = start_offset + 1;
1945 } else
1946 break;
1947 } else {
1948 if (pcre_need_log_error(count)) {
1949 pcre_log_error(__FUNCTION__, __LINE__, count,
1950 pattern.data(), pattern.size(),
1951 subject.data(), subject.size(),
1952 "", 0,
1953 limit, flags, start_offset, g_notempty);
1955 pcre_handle_exec_error(count);
1956 break;
1959 /* If we have matched an empty string, mimic what Perl's /g options does.
1960 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1961 the match again at the same point. If this fails (picked up above) we
1962 advance to the next character. */
1963 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1965 /* Advance to the position right after the last full match */
1966 start_offset = offsets[1];
1969 start_offset = last_match - subject.data(); /* offset might have
1970 * been incremented,
1971 * but without further
1972 * successful matches */
1973 if (!no_empty || start_offset < subject.size()) {
1974 if (offset_capture) {
1975 /* Add the last (match, offset) pair to the return value */
1976 add_offset_pair_split(return_value,
1977 subject.substr(start_offset),
1978 start_offset, nullptr, hackArrOutput);
1979 } else {
1980 /* Add the last piece to the return value */
1981 return_value.append
1982 (String(last_match, subject.data() + subject.size() - last_match,
1983 CopyString));
1987 return return_value;
1990 ///////////////////////////////////////////////////////////////////////////////
1992 String preg_quote(const String& str,
1993 const String& delimiter /* = null_string */) {
1994 const char* in_str = str.data();
1995 const char* in_str_end = in_str + str.size();
1997 /* Nothing to do if we got an empty string */
1998 if (in_str == in_str_end) {
1999 return str;
2002 char delim_char = 0; /* Delimiter character to be quoted */
2003 bool quote_delim = false; /* Whether to quote additional delim char */
2004 if (!delimiter.empty()) {
2005 delim_char = delimiter.charAt(0);
2006 quote_delim = true;
2009 /* Allocate enough memory so that even if each character
2010 is quoted, we won't run out of room */
2011 String ret(4 * str.size() + 1, ReserveString);
2012 char* out_str = ret.mutableData();
2014 /* Go through the string and quote necessary characters */
2015 const char* p;
2016 char* q;
2017 for (p = in_str, q = out_str; p != in_str_end; p++) {
2018 char c = *p;
2019 switch (c) {
2020 case '.': case '\\': case '+': case '*': case '?':
2021 case '[': case '^': case ']': case '$': case '(':
2022 case ')': case '{': case '}': case '=': case '!':
2023 case '>': case '<': case '|': case ':': case '-':
2024 case '#':
2025 *q++ = '\\';
2026 *q++ = c;
2027 break;
2029 case '\0':
2030 *q++ = '\\';
2031 *q++ = '0';
2032 *q++ = '0';
2033 *q++ = '0';
2034 break;
2036 default:
2037 if (quote_delim && c == delim_char)
2038 *q++ = '\\';
2039 *q++ = c;
2040 break;
2043 *q = '\0';
2045 return ret.setSize(q - out_str);
2048 int preg_last_error() {
2049 return *rl_last_error_code;
2052 size_t preg_pcre_cache_size() {
2053 return s_pcreCache.size();
2056 ///////////////////////////////////////////////////////////////////////////////
2057 // regexec
2059 static void php_reg_eprint(int err, regex_t* re) {
2060 char *buf = nullptr, *message = nullptr;
2061 size_t len;
2062 size_t buf_len;
2064 #ifdef REG_ITOA
2065 /* get the length of the message */
2066 buf_len = regerror(REG_ITOA | err, re, nullptr, 0);
2067 if (buf_len) {
2068 buf = (char *)req::malloc_noptrs(buf_len);
2069 if (!buf) return; /* fail silently */
2070 /* finally, get the error message */
2071 regerror(REG_ITOA | err, re, buf, buf_len);
2073 #else
2074 buf_len = 0;
2075 #endif
2076 len = regerror(err, re, nullptr, 0);
2077 if (len) {
2078 message = (char *)req::malloc_noptrs(buf_len + len + 2);
2079 if (!message) {
2080 return; /* fail silently */
2082 if (buf_len) {
2083 snprintf(message, buf_len, "%s: ", buf);
2084 buf_len += 1; /* so pointer math below works */
2086 /* drop the message into place */
2087 regerror(err, re, message + buf_len, len);
2088 raise_warning("%s", message);
2090 req::free(buf);
2091 req::free(message);
2094 Variant php_split(const String& spliton, const String& str, int count,
2095 bool icase) {
2096 const char* strp = str.data();
2097 const char* endp = strp + str.size();
2099 regex_t re;
2100 int copts = icase ? REG_ICASE : 0;
2101 int err = regcomp(&re, spliton.data(), REG_EXTENDED | copts);
2102 if (err) {
2103 php_reg_eprint(err, &re);
2104 return false;
2107 Array return_value = Array::Create();
2108 regmatch_t subs[1];
2110 /* churn through str, generating array entries as we go */
2111 while ((count == -1 || count > 1) &&
2112 !(err = regexec(&re, strp, 1, subs, 0))) {
2113 if (subs[0].rm_so == 0 && subs[0].rm_eo) {
2114 /* match is at start of string, return empty string */
2115 return_value.append("");
2116 /* skip ahead the length of the regex match */
2117 strp += subs[0].rm_eo;
2118 } else if (subs[0].rm_so == 0 && subs[0].rm_eo == 0) {
2119 /* No more matches */
2120 regfree(&re);
2121 raise_warning("Invalid Regular Expression to split()");
2122 return false;
2123 } else {
2124 /* On a real match */
2126 /* make a copy of the substring */
2127 int size = subs[0].rm_so;
2129 /* add it to the array */
2130 return_value.append(String(strp, size, CopyString));
2132 /* point at our new starting point */
2133 strp = strp + subs[0].rm_eo;
2136 /* if we're only looking for a certain number of points,
2137 stop looking once we hit it */
2138 if (count != -1) {
2139 count--;
2143 /* see if we encountered an error */
2144 if (err && err != REG_NOMATCH) {
2145 php_reg_eprint(err, &re);
2146 regfree(&re);
2147 return false;
2150 /* otherwise we just have one last element to add to the array */
2151 int size = endp - strp;
2152 return_value.append(String(strp, size, CopyString));
2154 regfree(&re);
2155 return return_value;
2158 ///////////////////////////////////////////////////////////////////////////////