Remove support for parsing the `elseif` keyword
[hiphop-php.git] / hphp / runtime / base / preg.cpp
blobcbf69acc8bdb654064f5b2397d6704d4a32bd359
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
17 #include "hphp/runtime/base/preg.h"
19 #include <atomic>
20 #include <fstream>
21 #include <mutex>
22 #include <pcre.h>
23 #include <onigposix.h>
24 #include <utility>
26 #include <folly/AtomicHashArray.h>
28 #include "hphp/runtime/base/array-init.h"
29 #include "hphp/runtime/base/array-iterator.h"
30 #include "hphp/runtime/base/builtin-functions.h"
31 #include "hphp/runtime/base/container-functions.h"
32 #include "hphp/runtime/base/execution-context.h"
33 #include "hphp/runtime/base/ini-setting.h"
34 #include "hphp/runtime/base/init-fini-node.h"
35 #include "hphp/runtime/base/runtime-option.h"
36 #include "hphp/runtime/base/string-util.h"
37 #include "hphp/runtime/base/tv-uncounted.h"
38 #include "hphp/runtime/base/zend-functions.h"
39 #include "hphp/runtime/vm/debug/debug.h"
40 #include "hphp/runtime/vm/treadmill.h"
41 #include "hphp/runtime/vm/vm-regs.h"
43 #include "hphp/runtime/ext/std/ext_std_function.h"
44 #include "hphp/runtime/ext/string/ext_string.h"
46 #include "hphp/runtime/vm/jit/mcgen.h"
47 #include "hphp/runtime/vm/jit/types.h"
48 #include "hphp/runtime/vm/jit/vtune-jit.h"
50 #include "hphp/util/logger.h"
51 #include "hphp/util/concurrent-scalable-cache.h"
53 #include <folly/FileUtil.h>
54 #include <folly/json.h>
56 /* Only defined in pcre >= 8.32 */
57 #ifndef PCRE_STUDY_JIT_COMPILE
58 # define PCRE_STUDY_JIT_COMPILE 0
59 #endif
61 namespace HPHP {
63 TRACE_SET_MOD(preg);
65 using jit::TCA;
67 ///////////////////////////////////////////////////////////////////////////////
68 // PCREglobals definition
70 PCREglobals::PCREglobals() {
71 jit_stack = pcre_jit_stack_alloc(32768, 524288);
72 // Set these to handle uses of pcre prior to PcreExtension::threadInit
73 // In particular, for matching tier overrides during RuntimeOption::Load
74 preg_backtrace_limit = RuntimeOption::PregBacktraceLimit;
75 preg_recursion_limit = RuntimeOption::PregRecursionLimit;
// Releases the JIT stack that the constructor obtained from
// pcre_jit_stack_alloc().
78 PCREglobals::~PCREglobals() {
79 pcre_jit_stack_free(jit_stack);
82 ///////////////////////////////////////////////////////////////////////////////
83 // PCRECache definition
85 struct PCRECache {
86 typedef std::shared_ptr<const pcre_cache_entry> EntryPtr;
87 typedef std::unique_ptr<LRUCacheKey> TempKeyCache;
89 enum class CacheKind {
90 Static,
91 Lru,
92 Scalable
95 private:
96 struct ahm_string_data_same {
97 bool operator()(const StringData* s1, const StringData* s2) {
98 // ahm uses -1, -2, -3 as magic values
99 return int64_t(s1) > 0 && (s1 == s2 || s1->same(s2));
103 typedef folly::AtomicHashArray<StringData*, const pcre_cache_entry*,
104 string_data_hash, ahm_string_data_same> StaticCache;
105 typedef ConcurrentLRUCache<LRUCacheKey, EntryPtr,
106 LRUCacheKey::HashCompare> LRUCache;
107 typedef ConcurrentScalableCache<LRUCacheKey, EntryPtr,
108 LRUCacheKey::HashCompare> ScalableCache;
109 typedef StaticCache::value_type StaticCachePair;
111 public:
112 struct Accessor {
113 Accessor()
114 : m_kind(Kind::Empty)
117 ~Accessor() {
118 switch (m_kind) {
119 case Kind::Empty:
120 case Kind::Ptr:
121 break;
122 case Kind::SmartPtr:
123 m_u.smart_ptr.~EntryPtr();
124 break;
125 case Kind::AccessorKind:
126 m_u.accessor.~ConstAccessor();
127 break;
131 Accessor& operator=(const pcre_cache_entry* ptr) {
132 assertx(m_kind == Kind::Empty || m_kind == Kind::Ptr);
133 m_kind = Kind::Ptr;
134 m_u.ptr = ptr;
135 return *this;
138 Accessor& operator=(EntryPtr&& ep) {
139 switch (m_kind) {
140 case Kind::AccessorKind:
141 m_u.accessor.~ConstAccessor();
142 case Kind::Empty:
143 case Kind::Ptr:
144 m_kind = Kind::SmartPtr;
145 new (&m_u.smart_ptr) EntryPtr(std::move(ep));
146 break;
147 case Kind::SmartPtr:
148 m_u.smart_ptr = std::move(ep);
149 break;
151 return *this;
154 // No assignment from LRUCache::ConstAccessor since it is non-copyable
155 // Use resetToLRU instead
156 LRUCache::ConstAccessor& resetToLRU() {
157 switch (m_kind) {
158 case Kind::SmartPtr:
159 m_u.smart_ptr.~EntryPtr();
160 case Kind::Empty:
161 case Kind::Ptr:
162 m_kind = Kind::AccessorKind;
163 new (&m_u.accessor) LRUCache::ConstAccessor();
164 break;
165 case Kind::AccessorKind:
166 break;
168 return m_u.accessor;
171 const pcre_cache_entry* get() {
172 switch (m_kind) {
173 case Kind::Empty: return nullptr;
174 case Kind::Ptr: return m_u.ptr;
175 case Kind::SmartPtr: return m_u.smart_ptr.get();
176 case Kind::AccessorKind: return m_u.accessor->get();
178 always_assert(false);
181 const EntryPtr& entryPtr() const {
182 assertx(m_kind == Kind::SmartPtr);
183 return m_u.smart_ptr;
186 private:
187 enum class Kind : uint8_t {
188 Empty,
189 Ptr,
190 SmartPtr,
191 AccessorKind,
194 union Ptr {
195 Ptr() {}
196 ~Ptr() {}
198 const pcre_cache_entry* ptr;
199 EntryPtr smart_ptr;
200 LRUCache::ConstAccessor accessor;
203 Ptr m_u;
204 Kind m_kind;
207 PCRECache()
208 : m_kind(CacheKind::Static), m_staticCache(nullptr)
210 reinit(CacheKind::Static);
213 ~PCRECache() {
214 if (m_kind == CacheKind::Static && m_staticCache.load()) {
215 DestroyStatic(m_staticCache);
219 void reinit(CacheKind kind);
220 bool find(Accessor& accessor, const StringData* key,
221 TempKeyCache& keyCache);
222 void insert(Accessor& accessor, StringData* regex,
223 TempKeyCache& keyCache, const pcre_cache_entry* ent);
224 void dump(folly::File& file);
225 size_t size() const;
227 private:
228 void clearStatic();
230 static void DestroyStatic(StaticCache* cache);
231 static StaticCache* CreateStatic();
233 CacheKind m_kind;
234 std::atomic<StaticCache*> m_staticCache;
235 std::unique_ptr<LRUCache> m_lruCache;
236 std::unique_ptr<ScalableCache> m_scalableCache;
237 std::atomic<time_t> m_expire{};
238 std::mutex m_clearMutex;
241 ///////////////////////////////////////////////////////////////////////////////
242 // Data
// PCRE state (JIT stack and match limits, see PCREglobals) declared through
// RDS_LOCAL.
244 RDS_LOCAL(PCREglobals, tl_pcre_globals);
// The one compiled-pattern cache used by every function in this file.
246 static PCRECache s_pcreCache;
248 // The last pcre error code is available for the whole thread.
249 static RDS_LOCAL(int, rl_last_error_code);
251 ///////////////////////////////////////////////////////////////////////////////
252 // pcre_cache_entry implementation
// Releases everything the entry holds: the study data (if any), the
// subpattern-name table and the compiled pattern.
254 pcre_cache_entry::~pcre_cache_entry() {
255 if (extra) {
// PCRE versions before 8.20 release study data with plain free(); later
// versions use pcre_free_study().
256 #if PCRE_MAJOR < 8 || (PCRE_MAJOR == 8 && PCRE_MINOR < 20)
257 free(extra);
258 #else
259 pcre_free_study(extra);
260 #endif
// The name table is allocated with calloc() in get_subpat_names(); it may
// still be null here, which free() accepts.
262 free(subpat_names);
263 pcre_free(re);
266 bool literalOptions(int options) {
267 constexpr int mask =
268 PCRE_ANCHORED | PCRE_CASELESS |
269 PCRE_DOLLAR_ENDONLY | PCRE_NOTEMPTY;
270 return !(options & ~mask);
273 pcre_literal_data::pcre_literal_data(const char* pattern, int coptions) {
274 if (!literalOptions(coptions)) return;
276 auto p = pattern;
277 options = coptions;
279 if (*p == '^') {
280 match_start_of_line = true;
281 p++;
284 std::string pattern_buffer;
285 while (isalnum((unsigned char)*p) || (*p && strchr("/\\ :-_", *p))) {
286 // backslash + alphanumeric character --> not a literal (i.e. \d).
287 // backslash + non-alphanumeric character --> literal symbol (i.e. \.)
288 if (*p == '\\') {
289 if (!p[1] || isalnum((unsigned char)p[1])) {
290 break;
291 } else {
292 p++;
295 pattern_buffer += *p++;
297 if (*p == '$') {
298 options |= PCRE_DOLLAR_ENDONLY;
299 p++;
301 if (!*p) {
302 /* This is an encoding of a literal string. */
303 ITRACE(2, "Literal pattern: {}\n", pattern_buffer);
304 literal_str = std::move(pattern_buffer);
// True when the constructor recognised the pattern as a plain literal, i.e.
// literal_str has been set.
308 bool pcre_literal_data::isLiteral() const {
309 return literal_str.has_value();
312 bool pcre_literal_data::matches(const StringData* subject,
313 int pos,
314 int* offsets,
315 int extra_options) const {
316 assertx(isLiteral() && literalOptions(extra_options));
317 assertx(pos >= 0);
319 // Subject must be at least as long as the literal pattern
320 // for a match to occur.
321 if (subject->size() < literal_str->length() + pos) {
322 return false;
325 size_t literal_strlen = literal_str->length();
326 auto const g_empty = (options | extra_options) & PCRE_NOTEMPTY;
327 if (g_empty && !literal_strlen) return false;
328 auto const subject_c = subject->data();
329 auto const literal_c = literal_str->c_str();
331 // Compare the literal pattern at an offset of the subject.
332 auto const subject_substr = subject_c + pos;
334 auto const match_start = [&]() {
335 if (match_end() && (subject->size() - pos) != literal_strlen) {
336 return false;
338 // If only matching the start (^), compare the strings
339 // for the length of the literal pattern.
340 if (case_insensitive() ?
341 bstrcaseeq(subject_substr, literal_c, literal_strlen) :
342 memcmp(subject_substr, literal_c, literal_strlen) == 0) {
343 offsets[0] = pos * sizeof(char);
344 offsets[1] = offsets[0] + literal_strlen * sizeof(char);
345 return true;
347 return false;
350 if (match_start_of_line) {
351 return !pos && match_start();
352 } else if (match_start_of_string()) {
353 return match_start();
354 } else if (match_end()) {
355 // Compare the literal pattern against the tail end of the subject.
356 auto const subject_tail = subject_c + (subject->size() - literal_strlen);
357 if (case_insensitive() ?
358 bstrcaseeq(subject_tail, literal_c, literal_strlen) :
359 memcmp(subject_tail, literal_c, literal_strlen) == 0) {
360 offsets[0] = (subject->size() - literal_strlen) * sizeof(char);
361 offsets[1] = subject->size() * sizeof(char);
362 return true;
364 } else {
365 if (!literal_strlen) {
366 offsets[0] = offsets[1] = pos;
367 return true;
369 // Check if the literal pattern occurs as a substring of the subject.
370 auto const subject_str = StrNR(subject);
371 auto const find_response = subject_str.asString().find(
372 *literal_str, pos, !case_insensitive());
373 if (find_response >= 0) {
374 offsets[0] = find_response * sizeof(char);
375 offsets[1] = offsets[0] + literal_strlen * sizeof(char);
376 return true;
379 return false;
382 ///////////////////////////////////////////////////////////////////////////////
383 // PCRECache implementation
385 PCRECache::StaticCache* PCRECache::CreateStatic() {
386 StaticCache::Config config;
387 config.maxLoadFactor = 0.5;
388 return StaticCache::create(
389 RuntimeOption::EvalPCRETableSize, config).release();
392 void PCRECache::DestroyStatic(StaticCache* cache) {
393 // We delete uncounted keys while iterating the cache, which is OK for
394 // AtomicHashArray, but not OK for other containers, such as
395 // std::unordered_map. If you change the cache type make sure that property
396 // holds or fix this function.
397 static_assert(std::is_same<PCRECache::StaticCache,
398 folly::AtomicHashArray<StringData*, const pcre_cache_entry*,
399 string_data_hash, ahm_string_data_same>>::value,
400 "StaticCache must be an AtomicHashArray or this destructor is wrong.");
401 for (auto& it : *cache) {
402 DecRefUncountedString(it.first);
403 delete it.second;
405 StaticCache::destroy(cache);
408 void PCRECache::reinit(CacheKind kind) {
409 switch (m_kind) {
410 case CacheKind::Static:
411 if (m_staticCache.load()) {
412 DestroyStatic(m_staticCache);
413 m_staticCache = nullptr;
415 break;
416 case CacheKind::Lru:
417 m_lruCache.reset();
418 break;
419 case CacheKind::Scalable:
420 m_scalableCache.reset();
421 break;
423 m_kind = kind;
425 switch (kind) {
426 case CacheKind::Static:
427 m_staticCache = CreateStatic();
428 m_expire = time(nullptr) + RuntimeOption::EvalPCREExpireInterval;
429 break;
430 case CacheKind::Lru:
431 m_lruCache.reset(new LRUCache(RuntimeOption::EvalPCRETableSize));
432 break;
433 case CacheKind::Scalable:
434 m_scalableCache.reset(
435 new ScalableCache(RuntimeOption::EvalPCRETableSize));
436 break;
440 bool PCRECache::find(Accessor& accessor,
441 const StringData* regex,
442 TempKeyCache& keyCache)
444 switch (m_kind) {
445 case CacheKind::Static:
447 assertx(m_staticCache.load());
448 StaticCache::iterator it;
449 auto cache = m_staticCache.load(std::memory_order_acquire);
450 if ((it = cache->find(regex)) != cache->end()) {
451 accessor = it->second;
452 return true;
454 return false;
456 case CacheKind::Lru:
457 case CacheKind::Scalable:
459 if (!keyCache) {
460 keyCache.reset(new LRUCacheKey(regex->data(), regex->size()));
462 bool found;
463 if (m_kind == CacheKind::Lru) {
464 found = m_lruCache->find(accessor.resetToLRU(), *keyCache);
465 } else {
466 found = m_scalableCache->find(accessor.resetToLRU(), *keyCache);
468 return found;
471 always_assert(false);
474 void PCRECache::clearStatic() {
475 std::unique_lock<std::mutex> lock(m_clearMutex, std::try_to_lock);
476 if (!lock) return;
478 auto newExpire = time(nullptr) + RuntimeOption::EvalPCREExpireInterval;
479 m_expire.store(newExpire, std::memory_order_relaxed);
481 auto tmpMap = CreateStatic();
482 tmpMap = m_staticCache.exchange(tmpMap, std::memory_order_acq_rel);
484 Treadmill::enqueue([tmpMap]() {
485 DestroyStatic(tmpMap);
489 void PCRECache::insert(
490 Accessor& accessor,
491 StringData* regex,
492 TempKeyCache& keyCache,
493 const pcre_cache_entry* ent
495 switch (m_kind) {
496 case CacheKind::Static:
498 assertx(m_staticCache.load());
499 // Clear the cache if we haven't refreshed it in a while
500 if (time(nullptr) > m_expire) {
501 clearStatic();
503 auto const cache = m_staticCache.load(std::memory_order_acquire);
504 auto const key = !regex->persistentIncRef()
505 ? StringData::MakeUncounted(regex->slice())
506 : regex;
507 auto pair = cache->insert(StaticCachePair(key, ent));
508 if (pair.second) {
509 // Inserted, container owns the pointer
510 accessor = ent;
511 } else {
512 // Not inserted, caller needs to own the pointer
513 DecRefUncountedString(key);
514 accessor = EntryPtr(ent);
517 break;
518 case CacheKind::Lru:
519 case CacheKind::Scalable:
521 if (!keyCache) {
522 keyCache.reset(new LRUCacheKey(regex->data(), regex->size()));
524 // Pointer ownership is shared between container and caller
525 accessor = EntryPtr(ent);
526 if (m_kind == CacheKind::Lru) {
527 m_lruCache->insert(*keyCache, accessor.entryPtr());
528 } else {
529 m_scalableCache->insert(*keyCache, accessor.entryPtr());
532 break;
536 void PCRECache::dump(folly::File& file) {
537 switch (m_kind) {
538 case CacheKind::Static:
539 for (auto& it : *m_staticCache) {
540 folly::writeFull(file.fd(), it.first->data(), it.first->size());
541 folly::writeFull(file.fd(), "\n", 1);
543 break;
544 case CacheKind::Lru:
545 case CacheKind::Scalable:
547 std::vector<LRUCacheKey> keys;
548 if (m_kind == CacheKind::Lru) {
549 m_lruCache->snapshotKeys(keys);
550 } else {
551 m_scalableCache->snapshotKeys(keys);
553 for (auto& key: keys) {
554 folly::writeFull(file.fd(), key.data(), key.size());
555 folly::writeFull(file.fd(), "\n", 1);
558 break;
// Returns the size() reported by whichever backing cache is currently
// active.
562 size_t PCRECache::size() const {
563 switch (m_kind) {
564 case CacheKind::Static:
565 return m_staticCache.load(std::memory_order_acquire)->size();
566 case CacheKind::Lru:
567 return m_lruCache->size();
568 case CacheKind::Scalable:
569 return m_scalableCache->size();
// Every enumerator returns above; reaching this point means m_kind holds an
// invalid value.
571 always_assert(false);
574 ///////////////////////////////////////////////////////////////////////////////
575 // Public interface and helper functions
577 void pcre_reinit() {
578 PCRECache::CacheKind kind;
579 if (RuntimeOption::EvalPCRECacheType == "static") {
580 kind = PCRECache::CacheKind::Static;
581 } else if (RuntimeOption::EvalPCRECacheType == "lru") {
582 kind = PCRECache::CacheKind::Lru;
583 } else if (RuntimeOption::EvalPCRECacheType == "scalable") {
584 kind = PCRECache::CacheKind::Scalable;
585 } else {
586 Logger::Warning("Eval.PCRECacheType should be either static, "
587 "lru or scalable");
588 kind = PCRECache::CacheKind::Scalable;
590 s_pcreCache.reinit(kind);
// No-op: this function has an empty body.
593 void pcre_init() {
// Writes the pattern strings held in the global cache to `file`; see
// PCRECache::dump().
596 void pcre_dump_cache(folly::File& file) {
597 s_pcreCache.dump(file);
// Callback registered through pcre_assign_jit_stack(): returns the JIT stack
// stored in tl_pcre_globals. The callback argument is ignored.
600 static pcre_jit_stack* alloc_jit_stack(void* /*data*/) {
601 return tl_pcre_globals->jit_stack;
604 namespace {
606 template<bool useSmartFree = false>
607 struct FreeHelperImpl {
608 explicit FreeHelperImpl(void* p) : p(p) {}
609 ~FreeHelperImpl() {
610 useSmartFree ? req::free(p) : free(p);
613 FreeHelperImpl(const FreeHelperImpl&) = delete;
614 FreeHelperImpl& operator=(const FreeHelperImpl&) = delete;
616 private:
617 void* p;
620 typedef FreeHelperImpl<true> SmartFreeHelper;
623 static void init_local_extra(pcre_extra* local, pcre_extra* shared) {
624 if (shared) {
625 memcpy(local, shared, sizeof(pcre_extra));
626 } else {
627 memset(local, 0, sizeof(pcre_extra));
628 local->flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
630 local->match_limit = tl_pcre_globals->preg_backtrace_limit;
631 local->match_limit_recursion = tl_pcre_globals->preg_recursion_limit;
634 static const char* const*
635 get_subpat_names(const pcre_cache_entry* pce) {
636 assertx(!pce->literal_data);
637 char **subpat_names = pce->subpat_names.load(std::memory_order_relaxed);
638 if (subpat_names) return subpat_names;
641 * Build a mapping from subpattern numbers to their names. We will always
642 * allocate the table, even though there may be no named subpatterns. This
643 * avoids somewhat more complicated logic in the inner loops.
645 pcre_extra extra;
646 init_local_extra(&extra, pce->extra);
648 int name_count;
650 subpat_names = (char **)calloc(pce->num_subpats, sizeof(char *));
651 int rc = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMECOUNT, &name_count);
652 if (rc < 0) {
653 raise_warning("Internal pcre_fullinfo() error %d", rc);
654 return nullptr;
656 if (name_count > 0) {
657 int name_size, ni = 0;
658 unsigned short name_idx;
659 char* name_table;
660 int rc1, rc2;
662 rc1 = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMETABLE, &name_table);
663 rc2 = pcre_fullinfo(pce->re, &extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
664 rc = rc2 ? rc2 : rc1;
665 if (rc < 0) {
666 raise_warning("Internal pcre_fullinfo() error %d", rc);
667 return nullptr;
669 // The table returned by PCRE_INFO_NAMETABLE is an array of fixed length
670 // strings of size PCRE_INFO_NAMEENTRYSIZE. The first two bytes are a
671 // big-endian uint16_t defining the array index followed by the
672 // zero-terminated name string.
673 // (See https://www.pcre.org/original/doc/html/pcreapi.html)
674 while (ni++ < name_count) {
675 name_idx = 0x100 * (unsigned char)name_table[0] +
676 (unsigned char)name_table[1];
677 subpat_names[name_idx] = name_table + 2;
678 if (is_numeric_string(subpat_names[name_idx],
679 strlen(subpat_names[name_idx]),
680 nullptr, nullptr, 0) != KindOfNull) {
681 raise_warning("Numeric named subpatterns are not allowed");
682 return nullptr;
684 name_table += name_size;
687 // Store subpat_names into the cache entry
688 char **expected = nullptr;
689 if (!pce->subpat_names.compare_exchange_strong(expected, subpat_names)) {
690 // Another thread stored subpat_names already. The array created by the
691 // other thread is now in expected, return it instead and delete the one
692 // we just made.
693 free(subpat_names);
694 return expected;
696 return subpat_names;
699 static bool get_pcre_fullinfo(pcre_cache_entry* pce) {
700 pcre_extra extra;
701 init_local_extra(&extra, pce->extra);
703 /* Calculate the size of the offsets array*/
704 int rc = pcre_fullinfo(pce->re, &extra, PCRE_INFO_CAPTURECOUNT,
705 &pce->num_subpats);
706 if (rc < 0) {
707 raise_warning("Internal pcre_fullinfo() error %d", rc);
708 return false;
710 pce->num_subpats++;
711 return true;
// Resolves `regex` to a cache entry, compiling it on a cache miss.
//
// The pattern string is parsed as <delimiter>pattern<delimiter>modifiers.
// On success `accessor` refers to the entry and true is returned. A malformed
// pattern string, an unknown modifier or a compilation failure raises a
// warning and returns false; an embedded NUL byte raises an error.
714 static bool
715 pcre_get_compiled_regex_cache(PCRECache::Accessor& accessor,
716 StringData* regex) {
// Scratch key shared by the find() here and the insert() further down.
717 PCRECache::TempKeyCache tkc;
719 /* Try to lookup the cached regex entry, and if successful, just pass
720 back the compiled pattern, otherwise go on and compile it. */
721 if (s_pcreCache.find(accessor, regex, tkc)) return true;
723 /* Parse through the leading whitespace, and display a warning if we
724 get to the end without encountering a delimiter. */
725 const char *p = regex->data();
726 while (isspace((int)*(unsigned char *)p)) p++;
727 if (*p == 0) {
728 raise_warning("Empty regular expression");
729 return false;
732 /* Get the delimiter and display a warning if it is alphanumeric
733 or a backslash. */
734 char delimiter = *p++;
735 if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
736 raise_warning("Delimiter must not be alphanumeric or backslash");
737 return false;
// An opening bracket maps to its closing counterpart, which sits five
// characters further along in the lookup string; a closing bracket maps to
// itself.
740 char start_delimiter = delimiter;
741 const char *pp = strchr("([{< )]}> )]}>", delimiter);
742 if (pp) {
743 delimiter = pp[5];
745 char end_delimiter = delimiter;
747 if (start_delimiter == end_delimiter) {
748 /* We need to iterate through the pattern, searching for the ending
749 * delimiter, but skipping the backslashed delimiters. If the ending
750 * delimiter is not found, display a warning. */
751 pp = p;
752 while (*pp != 0) {
753 if (*pp == '\\' && pp[1] != 0) pp++;
754 else if (*pp == delimiter)
755 break;
756 pp++;
758 if (*pp == 0) {
759 raise_warning("No ending delimiter '%c' found: [%s]", delimiter,
760 regex->data());
761 return false;
763 } else {
764 /* We iterate through the pattern, searching for the matching ending
765 * delimiter. For each matching starting delimiter, we increment nesting
766 * level, and decrement it for each matching ending delimiter. If we
767 * reach the end of the pattern without matching, display a warning.
769 int brackets = 1; // brackets nesting level
770 pp = p;
771 while (*pp != 0) {
772 if (*pp == '\\' && pp[1] != 0) pp++;
773 else if (*pp == end_delimiter && --brackets <= 0)
774 break;
775 else if (*pp == start_delimiter)
776 brackets++;
777 pp++;
779 if (*pp == 0) {
780 raise_warning("No ending matching delimiter '%c' found: [%s]",
781 end_delimiter, regex->data());
782 return false;
786 /* Make a copy of the actual pattern. */
787 String spattern(p, pp-p, CopyString);
788 const char *pattern = spattern.data();
790 /* Move on to the options */
791 pp++;
793 /* Parse through the options, setting appropriate flags. Display
794 a warning if we encounter an unknown modifier. */
// coptions collects PCRE compile flags, poptions collects preg-level flags.
795 int coptions = 0;
796 int poptions = 0;
797 bool do_study = false;
798 while (*pp != 0) {
799 switch (*pp++) {
800 /* Perl compatible options */
801 case 'i': coptions |= PCRE_CASELESS; break;
802 case 'm': coptions |= PCRE_MULTILINE; break;
803 case 's': coptions |= PCRE_DOTALL; break;
804 case 'x': coptions |= PCRE_EXTENDED; break;
806 /* PCRE specific options */
807 case 'A': coptions |= PCRE_ANCHORED; break;
808 case 'D': coptions |= PCRE_DOLLAR_ENDONLY; break;
809 case 'S': do_study = true; break;
810 case 'U': coptions |= PCRE_UNGREEDY; break;
811 case 'X': coptions |= PCRE_EXTRA; break;
812 case 'u': coptions |= PCRE_UTF8;
813 /* In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII
814 characters, even in UTF-8 mode. However, this can be changed by setting
815 the PCRE_UCP option. */
816 #ifdef PCRE_UCP
817 coptions |= PCRE_UCP;
818 #endif
819 break;
821 /* Custom preg options */
822 case 'e': poptions |= PREG_REPLACE_EVAL; break;
// Spaces and line breaks between modifiers are ignored.
824 case ' ':
825 case '\n':
826 case '\r':
827 break;
829 default:
830 raise_warning("Unknown modifier '%c': [%s]", pp[-1], regex->data());
831 return false;
835 /* We've reached a null byte, now check if we're actually at the end of the
836 string. If not this is a bad expression, and a potential security hole. */
837 if (regex->size() != (pp - regex->data())) {
838 raise_error("Error: Null byte found in pattern");
841 /* Store the compiled pattern and extra info in the cache. */
// The new entry takes ownership of `re` and `extra`; deleting the entry on
// failure releases them through ~pcre_cache_entry.
842 auto const store_pcre_entry =
843 [&](pcre_literal_data& pld, pcre* re=nullptr, pcre_extra* extra=nullptr) {
844 assertx((poptions & ~0x1) == 0);
845 assertx((coptions & 0x80000000) == 0);
846 pcre_cache_entry* new_entry = new pcre_cache_entry();
847 new_entry->re = re;
848 new_entry->extra = extra;
849 new_entry->preg_options = poptions;
850 new_entry->compile_options = coptions;
852 if (pld.isLiteral()) {
853 new_entry->literal_data =
854 std::make_unique<pcre_literal_data>(std::move(pld));
855 new_entry->num_subpats = 1;
856 } else {
857 /* Get pcre full info */
858 if (!get_pcre_fullinfo(new_entry)) {
859 delete new_entry;
860 return false;
864 s_pcreCache.insert(accessor, regex, tkc, new_entry);
865 return true;
868 // If the pattern is a literal, we can skip compiling it.
869 auto literal_data = pcre_literal_data(pattern, coptions);
870 if (literal_data.isLiteral()) return store_pcre_entry(literal_data);
872 /* Compile pattern and display a warning if compilation failed. */
873 const char *error;
874 int erroffset;
875 pcre *re = pcre_compile(pattern, coptions, &error, &erroffset, 0);
876 if (re == nullptr) {
877 raise_warning("Compilation failed: %s at offset %d", error, erroffset);
878 return false;
881 // Careful: from here 're' needs to be freed if something throws.
883 /* If study option was specified, study the pattern and
884 store the result in extra for passing to pcre_exec. */
885 pcre_extra *extra = nullptr;
// NOTE(review): literal patterns already returned above, so this condition
// looks like it is always true here.
886 if (!literal_data.isLiteral()) {
887 if (do_study || PCRE_STUDY_JIT_COMPILE) {
888 int soptions = PCRE_STUDY_JIT_COMPILE;
889 extra = pcre_study(re, soptions, &error);
890 if (extra) {
891 extra->flags |= PCRE_EXTRA_MATCH_LIMIT |
892 PCRE_EXTRA_MATCH_LIMIT_RECURSION;
893 pcre_assign_jit_stack(extra, alloc_jit_stack, nullptr);
895 if (error != nullptr) {
// raise_warning() may throw, so `re` is released before the exception
// propagates.
896 try {
897 raise_warning("Error while studying pattern");
898 } catch (...) {
899 pcre_free(re);
900 throw;
// When gdb, VTune or perf-map reporting is enabled and the pattern was
// JIT-compiled, report the generated code range under a name derived from
// the pattern.
903 if ((!RuntimeOption::EvalJitNoGdb ||
904 RuntimeOption::EvalJitUseVtuneAPI ||
905 RuntimeOption::EvalPerfPidMap) &&
906 extra &&
907 extra->executable_jit != nullptr) {
908 size_t size;
909 pcre_fullinfo(re, extra, PCRE_INFO_JITSIZE, &size);
911 TCA start = *(TCA *)(extra->executable_jit);
912 TCA end = start + size;
913 std::string name = folly::sformat("HHVM::pcre_jit::{}", pattern);
915 if (!RuntimeOption::EvalJitNoGdb && jit::mcgen::initialized()) {
916 Debug::DebugInfo::Get()->recordStub(Debug::TCRange(start, end, false),
917 name);
919 if (RuntimeOption::EvalJitUseVtuneAPI) {
920 HPHP::jit::reportHelperToVtune(name.c_str(), start, end);
922 if (RuntimeOption::EvalPerfPidMap && jit::mcgen::initialized()) {
923 std::string escaped_name;
924 folly::json::escapeString(name, escaped_name,
925 folly::json::serialization_opts());
926 Debug::DebugInfo::Get()->recordPerfMap(
927 Debug::TCRange(start, end, false),
928 SrcKey{}, escaped_name
935 return store_pcre_entry(literal_data, re, extra);
938 static int* create_offset_array(const pcre_cache_entry* pce,
939 int& size_offsets) {
940 /* Allocate memory for the offsets array */
941 size_offsets = pce->num_subpats * 3;
942 return (int *)req::malloc_noptrs(size_offsets * sizeof(int));
// Packs a string and an offset into a two-element vec array, in that order.
945 static Array str_offset_pair(const String& str, int offset) {
946 return make_vec_array(str, offset);
949 static inline bool pcre_need_log_error(int pcre_code) {
950 return RuntimeOption::EnablePregErrorLog &&
951 (pcre_code == PCRE_ERROR_MATCHLIMIT ||
952 pcre_code == PCRE_ERROR_RECURSIONLIMIT);
955 static void pcre_log_error(const char* func, int line, int pcre_code,
956 const char* pattern, int pattern_size,
957 const char* subject, int subject_size,
958 const char* repl, int repl_size,
959 int arg1 = 0, int arg2 = 0,
960 int arg3 = 0, int arg4 = 0) {
961 const char* escapedPattern;
962 const char* escapedSubject;
963 const char* escapedRepl;
964 std::string p(pattern, pattern_size);
965 std::string s(subject, subject_size);
966 std::string r(repl, repl_size);
967 escapedPattern = Logger::EscapeString(p);
968 escapedSubject = Logger::EscapeString(s);
969 escapedRepl = Logger::EscapeString(r);
970 const char* errString =
971 (pcre_code == PCRE_ERROR_MATCHLIMIT) ? "PCRE_ERROR_MATCHLIMIT" :
972 (pcre_code == PCRE_ERROR_RECURSIONLIMIT) ? "PCRE_ERROR_RECURSIONLIMIT" :
973 "UNKNOWN";
974 raise_warning_unsampled(
975 "REGEXERR: %s/%d: err=%d(%s), pattern='%s', subject='%s', repl='%s', "
976 "limits=(%" PRId64 ", %" PRId64 "), extra=(%d, %d, %d, %d)",
977 func, line, pcre_code, errString,
978 escapedPattern, escapedSubject, escapedRepl,
979 tl_pcre_globals->preg_backtrace_limit,
980 tl_pcre_globals->preg_recursion_limit,
981 arg1, arg2, arg3, arg4);
982 free((void *)escapedPattern);
983 free((void *)escapedSubject);
984 free((void *)escapedRepl);
987 namespace {
989 ALWAYS_INLINE Variant preg_return_internal_error(Variant&& return_value) {
990 *rl_last_error_code = PHP_PCRE_INTERNAL_ERROR;
991 return std::move(return_value);
994 ALWAYS_INLINE Variant preg_return_bad_regex_error(Variant&& return_value) {
995 *rl_last_error_code = PHP_PCRE_BAD_REGEX_ERROR;
996 return std::move(return_value);
999 void pcre_handle_exec_error(int pcre_code) {
1000 int preg_code = 0;
1001 switch (pcre_code) {
1002 case PCRE_ERROR_MATCHLIMIT:
1003 preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
1004 break;
1005 case PCRE_ERROR_RECURSIONLIMIT:
1006 preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
1007 break;
1008 case PCRE_ERROR_BADUTF8:
1009 preg_code = PHP_PCRE_BAD_UTF8_ERROR;
1010 break;
1011 case PCRE_ERROR_BADUTF8_OFFSET:
1012 preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
1013 break;
1014 default:
1015 preg_code = PHP_PCRE_INTERNAL_ERROR;
1016 break;
1018 *rl_last_error_code = preg_code;
1021 ALWAYS_INLINE Variant
1022 preg_return_pcre_error(int pcre_code, Variant&& return_value) {
1023 pcre_handle_exec_error(pcre_code);
1024 return std::move(return_value);
1027 ALWAYS_INLINE Variant preg_return_no_error(Variant&& return_value) {
1028 *rl_last_error_code = PHP_PCRE_NO_ERROR;
1029 return std::move(return_value);
1032 } // namespace
1034 ///////////////////////////////////////////////////////////////////////////////
1036 Variant preg_grep(const String& pattern, const Array& input, int flags /* = 0 */) {
1037 PCRECache::Accessor accessor;
1038 if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1039 return preg_return_bad_regex_error(false);
1041 const pcre_cache_entry* pce = accessor.get();
1043 int size_offsets = 0;
1044 int* offsets = create_offset_array(pce, size_offsets);
1045 if (offsets == nullptr) {
1046 return preg_return_internal_error(false);
1048 SmartFreeHelper freer(offsets);
1050 /* Initialize return array */
1051 auto ret = Array::CreateDict();
1053 /* Go through the input array */
1054 bool invert = (flags & PREG_GREP_INVERT);
1055 pcre_extra extra;
1056 init_local_extra(&extra, pce->extra);
1058 for (ArrayIter iter(input); iter; ++iter) {
1059 String entry = iter.second().toString();
1060 int count = 0;
1062 if (pce->literal_data) {
1063 assertx(pce->literal_data->isLiteral());
1064 count = pce->literal_data->matches(entry.get(), 0, offsets, 0)
1065 ? 1 : PCRE_ERROR_NOMATCH;
1066 } else {
1067 /* Perform the match */
1068 count = pcre_exec(pce->re, &extra, entry.data(), entry.size(),
1069 0, 0, offsets, size_offsets);
1071 /* Check for too many substrings condition. */
1072 if (count == 0) {
1073 raise_warning("Matched, but too many substrings");
1074 count = pce->num_subpats;
1075 } else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1076 if (pcre_need_log_error(count)) {
1077 pcre_log_error(__FUNCTION__, __LINE__, count,
1078 pattern.data(), pattern.size(),
1079 entry.data(), entry.size(),
1080 "", 0,
1081 flags);
1083 // NOTE: this returns an error together with a partial result :-(
1084 return preg_return_pcre_error(count, std::move(ret));
1087 /* If the entry fits our requirements */
1088 if ((count > 0 && !invert) ||
1089 (count == PCRE_ERROR_NOMATCH && invert)) {
1091 /* Add to return array */
1092 ret.set(iter.first(), entry);
1096 return preg_return_no_error(std::move(ret));
1099 ///////////////////////////////////////////////////////////////////////////////
1101 static Variant preg_match_impl(StringData* pattern,
1102 const StringData* subject,
1103 Variant* subpats, int flags, int start_offset,
1104 bool global) {
1105 PCRECache::Accessor accessor;
1106 if (!pcre_get_compiled_regex_cache(accessor, pattern)) {
1107 return preg_return_bad_regex_error(false);
1109 pcre_extra extra;
1110 const pcre_cache_entry* pce = accessor.get();
1111 init_local_extra(&extra, pce->extra);
1112 int subpats_order = global ? PREG_PATTERN_ORDER : 0;
1113 if (subpats) *subpats = Array::CreateDict();
1115 if (flags) {
1117 * subpats_order is pre-set to pattern mode so we change it only if
1118 * necessary.
1120 if (flags & 0xff) {
1121 subpats_order = flags & 0xff;
1123 if ((global && (subpats_order < PREG_PATTERN_ORDER ||
1124 subpats_order > PREG_SET_ORDER)) ||
1125 (!global && subpats_order != 0)) {
1126 raise_warning("Invalid flags specified");
1127 return preg_return_internal_error(init_null());
1131 /* Negative offset counts from the end of the string. */
1132 if (start_offset < 0) {
1133 start_offset = subject->size() + start_offset;
1134 if (start_offset < 0) {
1135 start_offset = 0;
1139 int size_offsets = 0;
1140 int* offsets = create_offset_array(pce, size_offsets);
1141 SmartFreeHelper offsetsFreer(offsets);
1142 int num_subpats = pce->num_subpats;
1143 if (offsets == nullptr) return preg_return_internal_error(false);
1145 /* Allocate match sets array and initialize the values. */
1147 /* An array of sets of matches for each subpattern after a global match */
1148 auto match_sets = Array::CreateDict();
1149 if (global && subpats_order == PREG_PATTERN_ORDER) {
1150 for (int i = 0; i < num_subpats; i++) {
1151 match_sets.set(i, Array::CreateDict());
1156 * If PREG_OFFSET_CAPTURE, each match, instead of being a string, will
1157 * be an array where the first element is a substring containing the
1158 * match and the second element is the position of the first character of
1159 * the substring in the input.
1161 bool offset_capture = flags & PREG_OFFSET_CAPTURE;
1162 const char** stringlist; // Holds list of subpatterns
1163 auto const get_value = [&](int i) {
1164 auto const length = offsets[(i<<1)+1] - offsets[i<<1];
1165 auto const match = String(stringlist[i], length, CopyString);
1166 return offset_capture
1167 ? Variant(str_offset_pair(match, offsets[i<<1]))
1168 : Variant(match);
1170 auto const get_value_empty = [&](int i) {
1171 auto const match = empty_string();
1172 return offset_capture
1173 ? Variant(str_offset_pair(match, offsets[i<<1]))
1174 : Variant(match);
1178 * Skip building name table when using literal_data. Name table is used
1179 * to add named subpatterns to result array. Literal data has none of these,
1180 * so we can skip this step.
1182 const char* const* subpat_names = nullptr;
1183 auto const is_literal = pce->literal_data != nullptr;
1184 if (!is_literal) {
1185 subpat_names = get_subpat_names(pce);
1186 if (subpat_names == nullptr) return preg_return_internal_error(false);
1188 auto const set_subpats = [&](auto& arr, int i, const Variant& value) {
1189 if (is_literal) return;
1190 if (subpat_names[i]) arr.set(String(subpat_names[i]), value);
1193 int i;
1194 const bool includeNonMatchingCaptures = flags & PREG_FB__PRIVATE__HSL_IMPL;
1196 // Add matches to result array for this run
1197 auto add_match_set = [&](auto& arr, int count) {
1198 for (i = 0; i < count; i++) {
1199 auto const value = get_value(i);
1200 set_subpats(arr, i, value);
1201 arr.set(i, value);
1203 if (includeNonMatchingCaptures) {
1204 for (; i < num_subpats; i++) {
1205 auto const value = get_value_empty(i);
1206 set_subpats(arr, i, value);
1207 arr.set(i, value);
1212 int matched = 0;
1213 int g_notempty = 0; // If the match should not be empty
1214 int exec_options = 0;
1216 do {
1217 int count = 0;
1218 int options = exec_options | g_notempty;
1219 if (is_literal) {
1220 assertx(literalOptions(options));
1221 count = pce->literal_data->matches(subject, start_offset, offsets, options)
1222 ? 1 : PCRE_ERROR_NOMATCH;
1223 } else {
1224 /* Execute the regular expression. */
1225 count = pcre_exec(pce->re, &extra, subject->data(), subject->size(),
1226 start_offset, options,
1227 offsets, size_offsets);
1229 /* The string was already proved to be valid UTF-8 */
1230 exec_options |= PCRE_NO_UTF8_CHECK;
1232 /* Check for too many substrings condition. */
1233 if (count == 0) {
1234 raise_warning("Matched, but too many substrings");
1235 count = num_subpats;
1238 /* If something has matched */
1239 if (count > 0) {
1240 matched++;
1242 if (subpats) {
1243 // Try to get the list of substrings and display a warning if failed.
1244 if (offsets[1] < offsets[0] ||
1245 pcre_get_substring_list(subject->data(), offsets, count,
1246 &stringlist) < 0) {
1247 raise_warning("Get subpatterns list failed");
1248 return preg_return_internal_error(false);
1251 if (global) {
1252 if (subpats_order == PREG_PATTERN_ORDER) {
1253 /* For each subpattern, insert it into the appropriate array. */
1254 for (i = 0; i < count; i++) {
1255 auto const value = get_value(i);
1256 auto& arr = asArrRef(match_sets.lval(i));
1257 assertx(arr->isVectorData());
1258 arr.set(safe_cast<int64_t>(arr.size()), value);
1261 * If the number of captured subpatterns on this run is
1262 * less than the total possible number, pad the result
1263 * arrays with empty strings.
1265 for (; i < num_subpats; i++) {
1266 auto& arr = asArrRef(match_sets.lval(i));
1267 assertx(arr->isVectorData());
1268 arr.set(safe_cast<int64_t>(arr.size()), empty_string());
1270 } else {
1271 auto result_set = Array::CreateDict();
1272 add_match_set(result_set, count);
1273 auto& arr = subpats->asArrRef();
1274 assertx(arr->isVectorData());
1275 arr.set(safe_cast<int64_t>(arr.size()), std::move(result_set));
1277 } else {
1278 auto& arr = subpats->asArrRef();
1279 add_match_set(arr, count);
1281 pcre_free((void *) stringlist);
1283 } else if (count == PCRE_ERROR_NOMATCH) {
1284 /* If we previously set PCRE_NOTEMPTY after a null match,
1285 this is not necessarily the end. We need to advance
1286 the start offset, and continue. Fudge the offset values
1287 to achieve this, unless we're already at the end of the string. */
1288 if (g_notempty && start_offset < subject->size()) {
1289 offsets[0] = start_offset;
1290 offsets[1] = start_offset + 1;
1291 } else
1292 break;
1293 } else {
1294 if (pcre_need_log_error(count)) {
1295 pcre_log_error(__FUNCTION__, __LINE__, count,
1296 pattern->data(), pattern->size(),
1297 subject->data(), subject->size(),
1298 "", 0,
1299 flags, start_offset, g_notempty, global);
1301 return preg_return_pcre_error(count, false);
1304 /* If we have matched an empty string, mimic what Perl's /g options does.
1305 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1306 the match again at the same point. If this fails (picked up above) we
1307 advance to the next character. */
1308 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1310 /* Advance to the position right after the last full match */
1311 start_offset = offsets[1];
1312 } while (global);
1314 /* Add the match sets to the output array and clean up */
1315 if (subpats && global && subpats_order == PREG_PATTERN_ORDER) {
1316 auto& arr = subpats->asArrRef();
1317 for (i = 0; i < num_subpats; i++) {
1318 auto const value = match_sets[i];
1319 set_subpats(arr, i, value);
1320 arr.set(i, match_sets[i]);
1323 return preg_return_no_error(std::move(matched));
1326 Variant preg_match(const String& pattern, const String& subject,
1327 Variant* matches /* = nullptr */, int flags /* = 0 */,
1328 int offset /* = 0 */) {
1329 return preg_match(pattern.get(), subject.get(), matches, flags, offset);
1332 Variant preg_match(StringData* pattern, const StringData* subject,
1333 Variant* matches /* = nullptr */, int flags /* = 0 */,
1334 int offset /* = 0 */) {
1335 return preg_match_impl(pattern, subject, matches, flags, offset, false);
1338 Variant preg_match_all(const String& pattern, const String& subject,
1339 Variant* matches /* = nullptr */,
1340 int flags /* = 0 */, int offset /* = 0 */) {
1341 return preg_match_all(pattern.get(), subject.get(), matches, flags, offset);
1344 Variant preg_match_all(StringData* pattern, const StringData* subject,
1345 Variant* matches /* = nullptr */,
1346 int flags /* = 0 */, int offset /* = 0 */) {
1347 return preg_match_impl(pattern, subject, matches, flags, offset, true);
1350 ///////////////////////////////////////////////////////////////////////////////
1352 static String preg_do_repl_func(const Variant& function, const String& subject,
1353 int* offsets, const char* const* subpat_names,
1354 int count) {
1355 Array subpats = Array::CreateDict();
1356 for (int i = 0; i < count; i++) {
1357 auto off1 = offsets[i<<1];
1358 auto off2 = offsets[(i<<1)+1];
1359 auto sub = subject.substr(off1, off2 - off1);
1361 if (subpat_names && subpat_names[i]) {
1362 subpats.set(String(subpat_names[i]), sub);
1364 subpats.set(i, sub);
1367 return vm_call_user_func(function, make_vec_array(subpats)).toString();
/*
 * Parse a back-reference at *str. *str must point at the introducer
 * character; accepted shapes are <c>N, <c>NN and ${N} / ${NN}, where N is a
 * decimal digit and <c> is the single introducer character.
 *
 * On success, stores the reference number in *backref, advances *str past
 * the reference and returns true. On failure returns false and leaves *str
 * untouched (*backref may already have been written).
 */
static bool preg_get_backref(const char** str, int* backref) {
  const char* cur = *str;

  // An introducer at the very end of the string cannot start a reference.
  if (cur[1] == '\0') {
    return false;
  }

  bool const braced = (cur[0] == '$' && cur[1] == '{');
  cur += braced ? 2 : 1;

  auto const is_digit = [](char c) { return c >= '0' && c <= '9'; };

  if (!is_digit(*cur)) {
    return false;
  }
  *backref = *cur - '0';
  ++cur;

  // At most two digits are consumed.
  if (is_digit(*cur)) {
    *backref = *backref * 10 + *cur - '0';
    ++cur;
  }

  if (braced) {
    if (*cur != '}') {
      return false;
    }
    ++cur;
  }

  *str = cur;
  return true;
}
1407 static Variant php_pcre_replace(const String& pattern, const String& subject,
1408 const Variant& replace_var, bool callable,
1409 int limit, int* replace_count) {
1410 PCRECache::Accessor accessor;
1411 if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1412 return preg_return_bad_regex_error(init_null());
1414 const pcre_cache_entry* pce = accessor.get();
1415 if (pce->preg_options & PREG_REPLACE_EVAL) {
1416 raise_error("preg_replace(): Support for the /e modifier has been removed, use "
1417 "preg_replace_callback instead");
1420 int size_offsets;
1421 int* offsets = create_offset_array(pce, size_offsets);
1422 SmartFreeHelper offsetsFreer(offsets);
1423 if (offsets == nullptr) {
1424 return preg_return_internal_error(init_null());
1426 auto const is_literal = pce->literal_data != nullptr;
1427 const char* const* subpat_names = nullptr;
1428 if (!is_literal) {
1429 subpat_names = get_subpat_names(pce);
1430 if (subpat_names == nullptr) return preg_return_internal_error(init_null());
1433 const char* replace = nullptr;
1434 const char* replace_end = nullptr;
1435 int replace_len = 0;
1436 String replace_val;
1438 if (!callable) {
1439 replace_val = replace_var.toString();
1440 replace = replace_val.data();
1441 replace_len = replace_val.size();
1442 replace_end = replace + replace_len;
1445 StringBuffer result(2 * subject.size());
1447 try {
1449 /* Initialize */
1450 const char* match = nullptr;
1451 int start_offset = 0;
1452 pcre_extra extra;
1453 init_local_extra(&extra, pce->extra);
1455 const char* walk; // Used to walk the replacement string
1456 char walk_last; // Last walked character
1457 int match_len; // Length of the current match
1458 int backref; // Backreference number
1459 int g_notempty = 0; // If the match should not be empty
1460 int exec_options = 0; // Options passed to pcre_exec
1461 while (1) {
1462 int count = 0;
1463 int options = exec_options | g_notempty;
1464 if (pce->literal_data && literalOptions(options)) {
1465 assertx(pce->literal_data->isLiteral());
1466 count =
1467 pce->literal_data->matches(subject.get(), start_offset, offsets, options)
1468 ? 1 : PCRE_ERROR_NOMATCH;
1469 } else {
1470 /* Execute the regular expression. */
1471 count = pcre_exec(pce->re, &extra, subject.data(), subject.size(),
1472 start_offset, options, offsets, size_offsets);
1474 /* The string was already proved to be valid UTF-8 */
1475 exec_options |= PCRE_NO_UTF8_CHECK;
1478 /* Check for too many substrings condition. */
1479 if (count == 0) {
1480 raise_warning("Matched, but too many substrings");
1481 count = pce->num_subpats;
1484 const char* piece = subject.data() + start_offset;
1485 if (count > 0 && offsets[1] >= offsets[0] &&
1486 (limit == -1 || limit > 0)) {
1487 if (replace_count) {
1488 ++*replace_count;
1490 /* Set the match location in subject */
1491 match = subject.data() + offsets[0];
1493 String callable_result;
1494 if (callable) {
1495 /* Use custom function to get replacement string and its length. */
1496 callable_result = preg_do_repl_func(replace_var, subject, offsets,
1497 subpat_names, count);
1498 } else { /* do regular substitution */
1499 walk = replace;
1500 walk_last = 0;
1501 while (walk < replace_end) {
1502 if ('\\' == *walk || '$' == *walk) {
1503 if (walk_last == '\\') {
1504 walk++;
1505 walk_last = 0;
1506 continue;
1508 if (preg_get_backref(&walk, &backref)) {
1509 if (backref < count) {
1510 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1512 continue;
1515 walk++;
1516 walk_last = walk[-1];
1520 /* copy the part of the string before the match */
1521 result.append(piece, match-piece);
1523 /* copy replacement and backrefs */
1524 int result_len = result.size();
1526 if (callable) {
1527 /* Copy result from custom function to buffer and clean up. */
1528 result.append(callable_result.data(), callable_result.size());
1529 result_len += callable_result.size();
1530 } else { /* do regular backreference copying */
1531 walk = replace;
1532 walk_last = 0;
1533 Array params;
1534 while (walk < replace_end) {
1535 if ('\\' == *walk || '$' == *walk) {
1536 if (walk_last == '\\') {
1537 result.set(result.size() - 1, *walk++);
1538 walk_last = 0;
1539 continue;
1541 if (preg_get_backref(&walk, &backref)) {
1542 if (backref < count) {
1543 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1544 result.append(
1545 subject.data() + offsets[backref<<1],
1546 match_len
1549 continue;
1552 result.append(*walk++);
1553 walk_last = walk[-1];
1557 if (limit != -1) {
1558 limit--;
1561 } else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1562 /* If we previously set PCRE_NOTEMPTY after a null match,
1563 this is not necessarily the end. We need to advance
1564 the start offset, and continue. Fudge the offset values
1565 to achieve this, unless we're already at the end of the string. */
1566 if (g_notempty != 0 && start_offset < subject.size()) {
1567 offsets[0] = start_offset;
1568 offsets[1] = start_offset + 1;
1569 result.append(piece, 1);
1570 } else {
1571 /* stick that last bit of string on our output */
1572 result.append(piece, subject.size() - start_offset);
1573 break;
1575 } else {
1576 if (pcre_need_log_error(count)) {
1577 const char* s;
1578 int size;
1579 String stemp;
1580 if (callable) {
1581 if (replace_var.isObject()) {
1582 stemp = replace_var.asCObjRef()->getClassName().asString()
1583 + "::__invoke";
1584 } else {
1585 stemp = replace_var.toString();
1587 s = stemp.data();
1588 size = stemp.size();
1589 } else {
1590 s = replace_val.data();
1591 size = replace_val.size();
1593 pcre_log_error(__FUNCTION__, __LINE__, count,
1594 pattern.data(), pattern.size(),
1595 subject.data(), subject.size(),
1596 s, size,
1597 callable, limit, start_offset, g_notempty);
1599 return preg_return_pcre_error(count, init_null());
1602 /* If we have matched an empty string, mimic what Perl's /g options does.
1603 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1604 the match again at the same point. If this fails (picked up above) we
1605 advance to the next character. */
1606 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1608 /* Advance to the next piece. */
1609 start_offset = offsets[1];
1612 return preg_return_no_error(result.detach());
1613 } catch (...) {
1614 throw;
1618 static Variant php_replace_in_subject(const Variant& regex, const Variant& replace,
1619 String subject, int limit, bool callable,
1620 int* replace_count) {
1621 if (!regex.isArray()) {
1622 return php_pcre_replace(regex.toString(), subject, replace, callable,
1623 limit, replace_count);
1626 if (callable || !replace.isArray()) {
1627 Array arr = regex.toDict();
1628 for (ArrayIter iterRegex(arr); iterRegex; ++iterRegex) {
1629 String regex_entry = iterRegex.second().toString();
1630 auto ret = php_pcre_replace(regex_entry, subject, replace, callable,
1631 limit, replace_count);
1632 if (!ret.isString()) {
1633 assertx(ret.isNull());
1634 return ret; // php_pcre_replace already set error
1636 subject = ret.asStrRef();
1637 assertx(!subject.isNull());
1639 return preg_return_no_error(std::move(subject));
1642 Array arrReplace = replace.toDict();
1643 Array arrRegex = regex.toDict();
1644 ArrayIter iterReplace(arrReplace);
1645 for (ArrayIter iterRegex(arrRegex); iterRegex; ++iterRegex) {
1646 String regex_entry = iterRegex.second().toString();
1647 Variant replace_value;
1648 if (iterReplace) {
1649 replace_value = iterReplace.second();
1650 ++iterReplace;
1653 auto ret = php_pcre_replace(regex_entry, subject, replace_value, callable,
1654 limit, replace_count);
1655 if (!ret.isString()) {
1656 assertx(ret.isNull());
1657 return ret; // php_pcre_replace already set error
1659 subject = ret.asStrRef();
1660 assertx(!subject.isNull());
1662 return preg_return_no_error(std::move(subject));
1665 Variant preg_replace_impl(const Variant& pattern, const Variant& replacement,
1666 const Variant& subject, int limit, int64_t* count,
1667 bool is_callable, bool is_filter) {
1668 assertx(!(is_callable && is_filter));
1669 if (!is_callable &&
1670 replacement.isArray() && !pattern.isArray()) {
1671 raise_warning("Parameter mismatch, pattern is a string while "
1672 "replacement is an array");
1673 return preg_return_internal_error(false);
1676 int replace_count = 0;
1677 if (!isContainer(subject)) {
1678 auto ret = php_replace_in_subject(pattern, replacement, subject.toString(),
1679 limit, is_callable, &replace_count);
1681 if (ret.isNull()) return ret; // php_replace_in_subject already set error
1682 assertx(ret.isString());
1683 if (count) *count = replace_count;
1684 if (is_filter && replace_count == 0) {
1685 return preg_return_internal_error(init_null());
1687 return preg_return_no_error(std::move(ret));
1690 Array return_value = Array::CreateDict();
1691 Array arrSubject = subject.toDict();
1692 for (ArrayIter iter(arrSubject); iter; ++iter) {
1693 auto old_replace_count = replace_count;
1694 String subject_entry = iter.second().toString();
1695 auto ret = php_replace_in_subject(pattern, replacement, subject_entry,
1696 limit, is_callable, &replace_count);
1698 if (ret.isString() && (!is_filter || replace_count > old_replace_count)) {
1699 return_value.set(iter.first(), ret.asStrRef());
1702 if (count) *count = replace_count;
1703 return preg_return_no_error(std::move(return_value));
1706 int preg_replace(Variant& result,
1707 const Variant& pattern,
1708 const Variant& replacement,
1709 const Variant& subject,
1710 int limit /* = -1 */) {
1711 int64_t count;
1712 result = preg_replace_impl(pattern, replacement, subject,
1713 limit, &count, false, false);
1714 return count;
1717 int preg_replace_callback(Variant& result,
1718 const Variant& pattern,
1719 const Variant& callback,
1720 const Variant& subject,
1721 int limit /* = -1 */) {
1722 int64_t count;
1723 result = preg_replace_impl(pattern, callback, subject,
1724 limit, &count, true, false);
1725 return count;
1728 ///////////////////////////////////////////////////////////////////////////////
1730 namespace {
1732 const StaticString s_OneUnicodeCharPattern("/./us");
1734 } // namespace
1736 Variant preg_split(const String& pattern, const String& subject,
1737 int limit /* = -1 */, int flags /* = 0 */) {
1738 PCRECache::Accessor accessor;
1739 if (!pcre_get_compiled_regex_cache(accessor, pattern.get())) {
1740 return preg_return_bad_regex_error(false);
1742 const pcre_cache_entry* pce = accessor.get();
1744 int no_empty = flags & PREG_SPLIT_NO_EMPTY;
1745 bool delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1746 bool offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1748 if (limit == 0) {
1749 limit = -1;
1752 int size_offsets = 0;
1753 int* offsets = create_offset_array(pce, size_offsets);
1754 SmartFreeHelper offsetsFreer(offsets);
1755 if (offsets == nullptr) {
1756 return preg_return_internal_error(false);
1759 /* Start at the beginning of the string */
1760 int start_offset = 0;
1761 int next_offset = 0;
1762 const char* last_match = subject.data();
1763 pcre_extra extra;
1764 init_local_extra(&extra, pce->extra);
1766 // Get next piece if no limit or limit not yet reached and something matched
1767 Array result = Array::CreateDict();
1768 int g_notempty = 0; /* If the match should not be empty */
1769 int utf8_check = 0;
1770 PCRECache::Accessor bump_accessor;
1771 const pcre_cache_entry* bump_pce = nullptr; /* instance for empty matches */
1772 while ((limit == -1 || limit > 1)) {
1773 int count = 0;
1774 int options = g_notempty | utf8_check;
1775 if (pce->literal_data && literalOptions(options)) {
1776 assertx(pce->literal_data->isLiteral());
1777 count =
1778 pce->literal_data->matches(subject.get(), start_offset, offsets, options)
1779 ? 1 : PCRE_ERROR_NOMATCH;
1780 } else {
1781 count = pcre_exec(pce->re, &extra, subject.data(), subject.size(),
1782 start_offset, options, offsets, size_offsets);
1783 /* Subsequent calls to pcre_exec don't need to bother with the
1784 * utf8 validity check: if the subject isn't valid, the first
1785 * call to pcre_exec will have failed, and as long as we only
1786 * set start_offset to known character boundaries we won't
1787 * supply an invalid offset. */
1788 utf8_check = PCRE_NO_UTF8_CHECK;
1791 /* Check for too many substrings condition. */
1792 if (count == 0) {
1793 raise_warning("Matched, but too many substrings");
1794 count = pce->num_subpats;
1797 /* If something matched */
1798 if (count > 0 && offsets[1] >= offsets[0]) {
1799 if (!no_empty || subject.data() + offsets[0] != last_match) {
1800 auto const length = subject.data() + offsets[0] - last_match;
1801 auto const match = String(last_match, length, CopyString);
1802 auto const value = offset_capture
1803 ? Variant(str_offset_pair(match, next_offset))
1804 : Variant(match);
1805 assertx(result->isVectorData());
1806 result.set(safe_cast<int64_t>(result.size()), value);
1808 /* One less left to do */
1809 if (limit != -1) limit--;
1812 last_match = subject.data() + offsets[1];
1813 next_offset = offsets[1];
1815 if (delim_capture) {
1816 int i, match_len;
1817 for (i = 1; i < count; i++) {
1818 match_len = offsets[(i<<1)+1] - offsets[i<<1];
1819 /* If we have matched a delimiter */
1820 if (!no_empty || match_len > 0) {
1821 auto const match = subject.substr(offsets[i<<1], match_len);
1822 auto const value = offset_capture
1823 ? Variant(str_offset_pair(match, offsets[i<<1]))
1824 : Variant(match);
1825 assertx(result->isVectorData());
1826 result.set(safe_cast<int64_t>(result.size()), value);
1830 } else if (count == PCRE_ERROR_NOMATCH) {
1831 /* If we previously set PCRE_NOTEMPTY after a null match,
1832 this is not necessarily the end. We need to advance
1833 the start offset, and continue. Fudge the offset values
1834 to achieve this, unless we're already at the end of the string. */
1835 if (g_notempty != 0 && start_offset < subject.size()) {
1836 if (pce->compile_options & PCRE_UTF8) {
1837 if (bump_pce == nullptr) {
1838 auto const DEBUG_ONLY ok = pcre_get_compiled_regex_cache(
1839 bump_accessor, s_OneUnicodeCharPattern.get());
1840 assertx(ok);
1841 bump_pce = bump_accessor.get();
1843 pcre_extra bump_extra;
1844 init_local_extra(&bump_extra, bump_pce->extra);
1845 count = pcre_exec(bump_pce->re, &bump_extra, subject.data(),
1846 subject.size(), start_offset,
1847 utf8_check, offsets, size_offsets);
1848 if (count < 1) {
1849 raise_warning("Unknown error");
1850 offsets[0] = start_offset;
1851 offsets[1] = start_offset + 1;
1852 if (pcre_need_log_error(count)) {
1853 pcre_log_error(__FUNCTION__, __LINE__, count,
1854 pattern.data(), pattern.size(),
1855 subject.data(), subject.size(),
1856 "", 0,
1857 limit, flags, start_offset);
1860 } else {
1861 offsets[0] = start_offset;
1862 offsets[1] = start_offset + 1;
1864 } else
1865 break;
1866 } else {
1867 if (pcre_need_log_error(count)) {
1868 pcre_log_error(__FUNCTION__, __LINE__, count,
1869 pattern.data(), pattern.size(),
1870 subject.data(), subject.size(),
1871 "", 0,
1872 limit, flags, start_offset, g_notempty);
1874 // NOTE: this returns an error together with a partial result :-(
1875 start_offset = last_match - subject.data(); /* offset might have
1876 * been incremented,
1877 * but without further
1878 * successful matches */
1879 if (!no_empty || start_offset < subject.size()) {
1880 auto const match = subject.substr(start_offset);
1881 auto const value = offset_capture
1882 ? Variant(str_offset_pair(match, start_offset))
1883 : Variant(match);
1884 assertx(result->isVectorData());
1885 result.set(safe_cast<int64_t>(result.size()), value);
1887 return preg_return_pcre_error(count, std::move(result));
1890 /* If we have matched an empty string, mimic what Perl's /g options does.
1891 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1892 the match again at the same point. If this fails (picked up above) we
1893 advance to the next character. */
1894 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1896 /* Advance to the position right after the last full match */
1897 start_offset = offsets[1];
1900 start_offset = last_match - subject.data(); /* offset might have
1901 * been incremented,
1902 * but without further
1903 * successful matches */
1904 if (!no_empty || start_offset < subject.size()) {
1905 auto const match = subject.substr(start_offset);
1906 auto const value = offset_capture
1907 ? Variant(str_offset_pair(match, start_offset))
1908 : Variant(match);
1909 assertx(result->isVectorData());
1910 result.set(safe_cast<int64_t>(result.size()), value);
1913 return preg_return_no_error(std::move(result));
1916 ///////////////////////////////////////////////////////////////////////////////
1918 String preg_quote(const String& str,
1919 const String& delimiter /* = null_string */) {
1920 const char* in_str = str.data();
1921 const char* in_str_end = in_str + str.size();
1923 /* Nothing to do if we got an empty string */
1924 if (in_str == in_str_end) {
1925 return str;
1928 char delim_char = 0; /* Delimiter character to be quoted */
1929 bool quote_delim = false; /* Whether to quote additional delim char */
1930 if (!delimiter.empty()) {
1931 delim_char = delimiter.charAt(0);
1932 quote_delim = true;
1935 /* Allocate enough memory so that even if each character
1936 is quoted, we won't run out of room */
1937 static_assert(
1938 (StringData::MaxSize * 4 + 1) < std::numeric_limits<int64_t>::max()
1940 String ret(4 * str.size() + 1, ReserveString);
1941 char* out_str = ret.mutableData();
1943 /* Go through the string and quote necessary characters */
1944 const char* p;
1945 char* q;
1946 for (p = in_str, q = out_str; p != in_str_end; p++) {
1947 char c = *p;
1948 switch (c) {
1949 case '.': case '\\': case '+': case '*': case '?':
1950 case '[': case '^': case ']': case '$': case '(':
1951 case ')': case '{': case '}': case '=': case '!':
1952 case '>': case '<': case '|': case ':': case '-':
1953 case '#':
1954 *q++ = '\\';
1955 *q++ = c;
1956 break;
1958 case '\0':
1959 *q++ = '\\';
1960 *q++ = '0';
1961 *q++ = '0';
1962 *q++ = '0';
1963 break;
1965 default:
1966 if (quote_delim && c == delim_char)
1967 *q++ = '\\';
1968 *q++ = c;
1969 break;
1972 *q = '\0';
1974 return ret.setSize(q - out_str);
1977 ///////////////////////////////////////////////////////////////////////////////
1978 // last_error
1980 int preg_last_error() {
1981 return *rl_last_error_code;
1984 PregWithErrorGuard::~PregWithErrorGuard() {
1985 if (*rl_last_error_code == PHP_PCRE_NO_ERROR) {
1986 error.setNull();
1987 } else {
1988 error = *rl_last_error_code;
1990 *rl_last_error_code = prior_error;
1993 size_t preg_pcre_cache_size() {
1994 return s_pcreCache.size();
1997 ///////////////////////////////////////////////////////////////////////////////
1998 // regexec
2000 static void php_reg_eprint(int err, regex_t* re) {
2001 char *buf = nullptr, *message = nullptr;
2002 size_t len;
2003 size_t buf_len;
2005 #ifdef REG_ITOA
2006 /* get the length of the message */
2007 buf_len = regerror(REG_ITOA | err, re, nullptr, 0);
2008 if (buf_len) {
2009 buf = (char *)req::malloc_noptrs(buf_len);
2010 if (!buf) return; /* fail silently */
2011 /* finally, get the error message */
2012 regerror(REG_ITOA | err, re, buf, buf_len);
2014 #else
2015 buf_len = 0;
2016 #endif
2017 len = regerror(err, re, nullptr, 0);
2018 if (len) {
2019 message = (char *)req::malloc_noptrs(buf_len + len + 2);
2020 if (!message) {
2021 return; /* fail silently */
2023 if (buf_len) {
2024 snprintf(message, buf_len, "%s: ", buf);
2025 buf_len += 1; /* so pointer math below works */
2027 /* drop the message into place */
2028 regerror(err, re, message + buf_len, len);
2029 raise_warning("%s", message);
2031 req::free(buf);
2032 req::free(message);
2035 Variant php_split(const String& spliton, const String& str, int count,
2036 bool icase) {
2037 const char* strp = str.data();
2038 const char* endp = strp + str.size();
2040 regex_t re;
2041 int copts = icase ? REG_ICASE : 0;
2042 int err = regcomp(&re, spliton.data(), REG_EXTENDED | copts);
2043 if (err) {
2044 php_reg_eprint(err, &re);
2045 return false;
2048 Array return_value = Array::CreateVec();
2049 regmatch_t subs[1];
2051 /* churn through str, generating array entries as we go */
2052 while ((count == -1 || count > 1) &&
2053 !(err = regexec(&re, strp, 1, subs, 0))) {
2054 if (subs[0].rm_so == 0 && subs[0].rm_eo) {
2055 /* match is at start of string, return empty string */
2056 return_value.append("");
2057 /* skip ahead the length of the regex match */
2058 strp += subs[0].rm_eo;
2059 } else if (subs[0].rm_so == 0 && subs[0].rm_eo == 0) {
2060 /* No more matches */
2061 regfree(&re);
2062 raise_warning("Invalid Regular Expression to split()");
2063 return false;
2064 } else {
2065 /* On a real match */
2067 /* make a copy of the substring */
2068 int size = subs[0].rm_so;
2070 /* add it to the array */
2071 return_value.append(String(strp, size, CopyString));
2073 /* point at our new starting point */
2074 strp = strp + subs[0].rm_eo;
2077 /* if we're only looking for a certain number of points,
2078 stop looking once we hit it */
2079 if (count != -1) {
2080 count--;
2084 /* see if we encountered an error */
2085 if (err && err != REG_NOMATCH) {
2086 php_reg_eprint(err, &re);
2087 regfree(&re);
2088 return false;
2091 /* otherwise we just have one last element to add to the array */
2092 int size = endp - strp;
2093 return_value.append(String(strp, size, CopyString));
2095 regfree(&re);
2096 return return_value;
2099 ///////////////////////////////////////////////////////////////////////////////