2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
17 #include "hphp/runtime/base/preg.h"
23 #include <onigposix.h>
26 #include <folly/AtomicHashArray.h>
28 #include "hphp/runtime/base/array-init.h"
29 #include "hphp/runtime/base/array-iterator.h"
30 #include "hphp/runtime/base/builtin-functions.h"
31 #include "hphp/runtime/base/container-functions.h"
32 #include "hphp/runtime/base/execution-context.h"
33 #include "hphp/runtime/base/ini-setting.h"
34 #include "hphp/runtime/base/init-fini-node.h"
35 #include "hphp/runtime/base/runtime-option.h"
36 #include "hphp/runtime/base/string-util.h"
37 #include "hphp/runtime/base/tv-uncounted.h"
38 #include "hphp/runtime/base/zend-functions.h"
39 #include "hphp/runtime/vm/debug/debug.h"
40 #include "hphp/runtime/vm/treadmill.h"
41 #include "hphp/runtime/vm/vm-regs.h"
43 #include "hphp/runtime/ext/std/ext_std_function.h"
44 #include "hphp/runtime/ext/string/ext_string.h"
46 #include "hphp/runtime/vm/jit/mcgen.h"
47 #include "hphp/runtime/vm/jit/types.h"
48 #include "hphp/runtime/vm/jit/vtune-jit.h"
50 #include "hphp/util/logger.h"
51 #include "hphp/util/concurrent-scalable-cache.h"
53 #include <folly/FileUtil.h>
54 #include <folly/json.h>
56 /* Only defined in pcre >= 8.32 */
57 #ifndef PCRE_STUDY_JIT_COMPILE
58 # define PCRE_STUDY_JIT_COMPILE 0
67 ///////////////////////////////////////////////////////////////////////////////
68 // PCREglobals definition
70 PCREglobals::PCREglobals() {
71 jit_stack
= pcre_jit_stack_alloc(32768, 524288);
72 // Set these to handle uses of pcre prior to PcreExtension::threadInit
73 // In particular, for matching tier overrides during RuntimeOption::Load
74 preg_backtrace_limit
= RuntimeOption::PregBacktraceLimit
;
75 preg_recursion_limit
= RuntimeOption::PregRecursionLimit
;
78 PCREglobals::~PCREglobals() {
79 pcre_jit_stack_free(jit_stack
);
82 ///////////////////////////////////////////////////////////////////////////////
83 // PCRECache definition
86 typedef std::shared_ptr
<const pcre_cache_entry
> EntryPtr
;
87 typedef std::unique_ptr
<LRUCacheKey
> TempKeyCache
;
89 enum class CacheKind
{
96 struct ahm_string_data_same
{
97 bool operator()(const StringData
* s1
, const StringData
* s2
) {
98 // ahm uses -1, -2, -3 as magic values
99 return int64_t(s1
) > 0 && (s1
== s2
|| s1
->same(s2
));
103 typedef folly::AtomicHashArray
<StringData
*, const pcre_cache_entry
*,
104 string_data_hash
, ahm_string_data_same
> StaticCache
;
105 typedef ConcurrentLRUCache
<LRUCacheKey
, EntryPtr
,
106 LRUCacheKey::HashCompare
> LRUCache
;
107 typedef ConcurrentScalableCache
<LRUCacheKey
, EntryPtr
,
108 LRUCacheKey::HashCompare
> ScalableCache
;
109 typedef StaticCache::value_type StaticCachePair
;
114 : m_kind(Kind::Empty
)
123 m_u
.smart_ptr
.~EntryPtr();
125 case Kind::AccessorKind
:
126 m_u
.accessor
.~ConstAccessor();
131 Accessor
& operator=(const pcre_cache_entry
* ptr
) {
132 assertx(m_kind
== Kind::Empty
|| m_kind
== Kind::Ptr
);
138 Accessor
& operator=(EntryPtr
&& ep
) {
140 case Kind::AccessorKind
:
141 m_u
.accessor
.~ConstAccessor();
144 m_kind
= Kind::SmartPtr
;
145 new (&m_u
.smart_ptr
) EntryPtr(std::move(ep
));
148 m_u
.smart_ptr
= std::move(ep
);
154 // No assignment from LRUCache::ConstAccessor since it is non-copyable
155 // Use resetToLRU instead
156 LRUCache::ConstAccessor
& resetToLRU() {
159 m_u
.smart_ptr
.~EntryPtr();
162 m_kind
= Kind::AccessorKind
;
163 new (&m_u
.accessor
) LRUCache::ConstAccessor();
165 case Kind::AccessorKind
:
171 const pcre_cache_entry
* get() {
173 case Kind::Empty
: return nullptr;
174 case Kind::Ptr
: return m_u
.ptr
;
175 case Kind::SmartPtr
: return m_u
.smart_ptr
.get();
176 case Kind::AccessorKind
: return m_u
.accessor
->get();
178 always_assert(false);
181 const EntryPtr
& entryPtr() const {
182 assertx(m_kind
== Kind::SmartPtr
);
183 return m_u
.smart_ptr
;
187 enum class Kind
: uint8_t {
198 const pcre_cache_entry
* ptr
;
200 LRUCache::ConstAccessor accessor
;
208 : m_kind(CacheKind::Static
), m_staticCache(nullptr)
210 reinit(CacheKind::Static
);
214 if (m_kind
== CacheKind::Static
&& m_staticCache
.load()) {
215 DestroyStatic(m_staticCache
);
219 void reinit(CacheKind kind
);
220 bool find(Accessor
& accessor
, const StringData
* key
,
221 TempKeyCache
& keyCache
);
222 void insert(Accessor
& accessor
, StringData
* regex
,
223 TempKeyCache
& keyCache
, const pcre_cache_entry
* ent
);
224 void dump(folly::File
& file
);
230 static void DestroyStatic(StaticCache
* cache
);
231 static StaticCache
* CreateStatic();
234 std::atomic
<StaticCache
*> m_staticCache
;
235 std::unique_ptr
<LRUCache
> m_lruCache
;
236 std::unique_ptr
<ScalableCache
> m_scalableCache
;
237 std::atomic
<time_t> m_expire
{};
238 std::mutex m_clearMutex
;
241 ///////////////////////////////////////////////////////////////////////////////
244 RDS_LOCAL(PCREglobals
, tl_pcre_globals
);
246 static PCRECache s_pcreCache
;
248 // The last pcre error code is available for the whole thread.
249 static RDS_LOCAL(int, rl_last_error_code
);
251 ///////////////////////////////////////////////////////////////////////////////
252 // pcre_cache_entry implementation
254 pcre_cache_entry::~pcre_cache_entry() {
256 #if PCRE_MAJOR < 8 || (PCRE_MAJOR == 8 && PCRE_MINOR < 20)
259 pcre_free_study(extra
);
266 bool literalOptions(int options
) {
268 PCRE_ANCHORED
| PCRE_CASELESS
|
269 PCRE_DOLLAR_ENDONLY
| PCRE_NOTEMPTY
;
270 return !(options
& ~mask
);
273 pcre_literal_data::pcre_literal_data(const char* pattern
, int coptions
) {
274 if (!literalOptions(coptions
)) return;
280 match_start_of_line
= true;
284 std::string pattern_buffer
;
285 while (isalnum((unsigned char)*p
) || (*p
&& strchr("/\\ :-_", *p
))) {
286 // backslash + alphanumeric character --> not a literal (i.e. \d).
287 // backslash + non-alphanumeric character --> literal symbol (i.e. \.)
289 if (!p
[1] || isalnum((unsigned char)p
[1])) {
295 pattern_buffer
+= *p
++;
298 options
|= PCRE_DOLLAR_ENDONLY
;
302 /* This is an encoding of a literal string. */
303 ITRACE(2, "Literal pattern: {}\n", pattern_buffer
);
304 literal_str
= std::move(pattern_buffer
);
308 bool pcre_literal_data::isLiteral() const {
309 return literal_str
.has_value();
312 bool pcre_literal_data::matches(const StringData
* subject
,
315 int extra_options
) const {
316 assertx(isLiteral() && literalOptions(extra_options
));
319 // Subject must be at least as long as the literal pattern
320 // for a match to occur.
321 if (subject
->size() < literal_str
->length() + pos
) {
325 size_t literal_strlen
= literal_str
->length();
326 auto const g_empty
= (options
| extra_options
) & PCRE_NOTEMPTY
;
327 if (g_empty
&& !literal_strlen
) return false;
328 auto const subject_c
= subject
->data();
329 auto const literal_c
= literal_str
->c_str();
331 // Compare the literal pattern at an offset of the subject.
332 auto const subject_substr
= subject_c
+ pos
;
334 auto const match_start
= [&]() {
335 if (match_end() && (subject
->size() - pos
) != literal_strlen
) {
338 // If only matching the start (^), compare the strings
339 // for the length of the literal pattern.
340 if (case_insensitive() ?
341 bstrcaseeq(subject_substr
, literal_c
, literal_strlen
) :
342 memcmp(subject_substr
, literal_c
, literal_strlen
) == 0) {
343 offsets
[0] = pos
* sizeof(char);
344 offsets
[1] = offsets
[0] + literal_strlen
* sizeof(char);
350 if (match_start_of_line
) {
351 return !pos
&& match_start();
352 } else if (match_start_of_string()) {
353 return match_start();
354 } else if (match_end()) {
355 // Compare the literal pattern against the tail end of the subject.
356 auto const subject_tail
= subject_c
+ (subject
->size() - literal_strlen
);
357 if (case_insensitive() ?
358 bstrcaseeq(subject_tail
, literal_c
, literal_strlen
) :
359 memcmp(subject_tail
, literal_c
, literal_strlen
) == 0) {
360 offsets
[0] = (subject
->size() - literal_strlen
) * sizeof(char);
361 offsets
[1] = subject
->size() * sizeof(char);
365 if (!literal_strlen
) {
366 offsets
[0] = offsets
[1] = pos
;
369 // Check if the literal pattern occurs as a substring of the subject.
370 auto const subject_str
= StrNR(subject
);
371 auto const find_response
= subject_str
.asString().find(
372 *literal_str
, pos
, !case_insensitive());
373 if (find_response
>= 0) {
374 offsets
[0] = find_response
* sizeof(char);
375 offsets
[1] = offsets
[0] + literal_strlen
* sizeof(char);
382 ///////////////////////////////////////////////////////////////////////////////
383 // PCRECache implementation
385 PCRECache::StaticCache
* PCRECache::CreateStatic() {
386 StaticCache::Config config
;
387 config
.maxLoadFactor
= 0.5;
388 return StaticCache::create(
389 RuntimeOption::EvalPCRETableSize
, config
).release();
392 void PCRECache::DestroyStatic(StaticCache
* cache
) {
393 // We delete uncounted keys while iterating the cache, which is OK for
394 // AtomicHashArray, but not OK for other containers, such as
395 // std::unordered_map. If you change the cache type make sure that property
396 // holds or fix this function.
397 static_assert(std::is_same
<PCRECache::StaticCache
,
398 folly::AtomicHashArray
<StringData
*, const pcre_cache_entry
*,
399 string_data_hash
, ahm_string_data_same
>>::value
,
400 "StaticCache must be an AtomicHashArray or this destructor is wrong.");
401 for (auto& it
: *cache
) {
402 DecRefUncountedString(it
.first
);
405 StaticCache::destroy(cache
);
408 void PCRECache::reinit(CacheKind kind
) {
410 case CacheKind::Static
:
411 if (m_staticCache
.load()) {
412 DestroyStatic(m_staticCache
);
413 m_staticCache
= nullptr;
419 case CacheKind::Scalable
:
420 m_scalableCache
.reset();
426 case CacheKind::Static
:
427 m_staticCache
= CreateStatic();
428 m_expire
= time(nullptr) + RuntimeOption::EvalPCREExpireInterval
;
431 m_lruCache
.reset(new LRUCache(RuntimeOption::EvalPCRETableSize
));
433 case CacheKind::Scalable
:
434 m_scalableCache
.reset(
435 new ScalableCache(RuntimeOption::EvalPCRETableSize
));
440 bool PCRECache::find(Accessor
& accessor
,
441 const StringData
* regex
,
442 TempKeyCache
& keyCache
)
445 case CacheKind::Static
:
447 assertx(m_staticCache
.load());
448 StaticCache::iterator it
;
449 auto cache
= m_staticCache
.load(std::memory_order_acquire
);
450 if ((it
= cache
->find(regex
)) != cache
->end()) {
451 accessor
= it
->second
;
457 case CacheKind::Scalable
:
460 keyCache
.reset(new LRUCacheKey(regex
->data(), regex
->size()));
463 if (m_kind
== CacheKind::Lru
) {
464 found
= m_lruCache
->find(accessor
.resetToLRU(), *keyCache
);
466 found
= m_scalableCache
->find(accessor
.resetToLRU(), *keyCache
);
471 always_assert(false);
474 void PCRECache::clearStatic() {
475 std::unique_lock
<std::mutex
> lock(m_clearMutex
, std::try_to_lock
);
478 auto newExpire
= time(nullptr) + RuntimeOption::EvalPCREExpireInterval
;
479 m_expire
.store(newExpire
, std::memory_order_relaxed
);
481 auto tmpMap
= CreateStatic();
482 tmpMap
= m_staticCache
.exchange(tmpMap
, std::memory_order_acq_rel
);
484 Treadmill::enqueue([tmpMap
]() {
485 DestroyStatic(tmpMap
);
489 void PCRECache::insert(
492 TempKeyCache
& keyCache
,
493 const pcre_cache_entry
* ent
496 case CacheKind::Static
:
498 assertx(m_staticCache
.load());
499 // Clear the cache if we haven't refreshed it in a while
500 if (time(nullptr) > m_expire
) {
503 auto const cache
= m_staticCache
.load(std::memory_order_acquire
);
504 auto const key
= !regex
->persistentIncRef()
505 ? StringData::MakeUncounted(regex
->slice())
507 auto pair
= cache
->insert(StaticCachePair(key
, ent
));
509 // Inserted, container owns the pointer
512 // Not inserted, caller needs to own the pointer
513 DecRefUncountedString(key
);
514 accessor
= EntryPtr(ent
);
519 case CacheKind::Scalable
:
522 keyCache
.reset(new LRUCacheKey(regex
->data(), regex
->size()));
524 // Pointer ownership is shared between container and caller
525 accessor
= EntryPtr(ent
);
526 if (m_kind
== CacheKind::Lru
) {
527 m_lruCache
->insert(*keyCache
, accessor
.entryPtr());
529 m_scalableCache
->insert(*keyCache
, accessor
.entryPtr());
536 void PCRECache::dump(folly::File
& file
) {
538 case CacheKind::Static
:
539 for (auto& it
: *m_staticCache
) {
540 folly::writeFull(file
.fd(), it
.first
->data(), it
.first
->size());
541 folly::writeFull(file
.fd(), "\n", 1);
545 case CacheKind::Scalable
:
547 std::vector
<LRUCacheKey
> keys
;
548 if (m_kind
== CacheKind::Lru
) {
549 m_lruCache
->snapshotKeys(keys
);
551 m_scalableCache
->snapshotKeys(keys
);
553 for (auto& key
: keys
) {
554 folly::writeFull(file
.fd(), key
.data(), key
.size());
555 folly::writeFull(file
.fd(), "\n", 1);
562 size_t PCRECache::size() const {
564 case CacheKind::Static
:
565 return m_staticCache
.load(std::memory_order_acquire
)->size();
567 return m_lruCache
->size();
568 case CacheKind::Scalable
:
569 return m_scalableCache
->size();
571 always_assert(false);
574 ///////////////////////////////////////////////////////////////////////////////
575 // Public interface and helper functions
578 PCRECache::CacheKind kind
;
579 if (RuntimeOption::EvalPCRECacheType
== "static") {
580 kind
= PCRECache::CacheKind::Static
;
581 } else if (RuntimeOption::EvalPCRECacheType
== "lru") {
582 kind
= PCRECache::CacheKind::Lru
;
583 } else if (RuntimeOption::EvalPCRECacheType
== "scalable") {
584 kind
= PCRECache::CacheKind::Scalable
;
586 Logger::Warning("Eval.PCRECacheType should be either static, "
588 kind
= PCRECache::CacheKind::Scalable
;
590 s_pcreCache
.reinit(kind
);
596 void pcre_dump_cache(folly::File
& file
) {
597 s_pcreCache
.dump(file
);
600 static pcre_jit_stack
* alloc_jit_stack(void* /*data*/) {
601 return tl_pcre_globals
->jit_stack
;
606 template<bool useSmartFree
= false>
607 struct FreeHelperImpl
{
608 explicit FreeHelperImpl(void* p
) : p(p
) {}
610 useSmartFree
? req::free(p
) : free(p
);
613 FreeHelperImpl(const FreeHelperImpl
&) = delete;
614 FreeHelperImpl
& operator=(const FreeHelperImpl
&) = delete;
620 typedef FreeHelperImpl
<true> SmartFreeHelper
;
623 static void init_local_extra(pcre_extra
* local
, pcre_extra
* shared
) {
625 memcpy(local
, shared
, sizeof(pcre_extra
));
627 memset(local
, 0, sizeof(pcre_extra
));
628 local
->flags
= PCRE_EXTRA_MATCH_LIMIT
| PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
630 local
->match_limit
= tl_pcre_globals
->preg_backtrace_limit
;
631 local
->match_limit_recursion
= tl_pcre_globals
->preg_recursion_limit
;
634 static const char* const*
635 get_subpat_names(const pcre_cache_entry
* pce
) {
636 assertx(!pce
->literal_data
);
637 char **subpat_names
= pce
->subpat_names
.load(std::memory_order_relaxed
);
638 if (subpat_names
) return subpat_names
;
641 * Build a mapping from subpattern numbers to their names. We will always
642 * allocate the table, even though there may be no named subpatterns. This
643 * avoids somewhat more complicated logic in the inner loops.
646 init_local_extra(&extra
, pce
->extra
);
650 subpat_names
= (char **)calloc(pce
->num_subpats
, sizeof(char *));
651 int rc
= pcre_fullinfo(pce
->re
, &extra
, PCRE_INFO_NAMECOUNT
, &name_count
);
653 raise_warning("Internal pcre_fullinfo() error %d", rc
);
656 if (name_count
> 0) {
657 int name_size
, ni
= 0;
658 unsigned short name_idx
;
662 rc1
= pcre_fullinfo(pce
->re
, &extra
, PCRE_INFO_NAMETABLE
, &name_table
);
663 rc2
= pcre_fullinfo(pce
->re
, &extra
, PCRE_INFO_NAMEENTRYSIZE
, &name_size
);
664 rc
= rc2
? rc2
: rc1
;
666 raise_warning("Internal pcre_fullinfo() error %d", rc
);
669 // The table returned by PCRE_INFO_NAMETABLE is an array of fixed length
670 // strings of size PCRE_INFO_NAMEENTRYSIZE. The first two bytes are a
671 // big-endian uint16_t defining the array index followed by the
672 // zero-terminated name string.
673 // (See https://www.pcre.org/original/doc/html/pcreapi.html)
674 while (ni
++ < name_count
) {
675 name_idx
= 0x100 * (unsigned char)name_table
[0] +
676 (unsigned char)name_table
[1];
677 subpat_names
[name_idx
] = name_table
+ 2;
678 if (is_numeric_string(subpat_names
[name_idx
],
679 strlen(subpat_names
[name_idx
]),
680 nullptr, nullptr, 0) != KindOfNull
) {
681 raise_warning("Numeric named subpatterns are not allowed");
684 name_table
+= name_size
;
687 // Store subpat_names into the cache entry
688 char **expected
= nullptr;
689 if (!pce
->subpat_names
.compare_exchange_strong(expected
, subpat_names
)) {
690 // Another thread stored subpat_names already. The array created by the
691 // other thread is now in expected, return it instead and delete the one
699 static bool get_pcre_fullinfo(pcre_cache_entry
* pce
) {
701 init_local_extra(&extra
, pce
->extra
);
703 /* Calculate the size of the offsets array*/
704 int rc
= pcre_fullinfo(pce
->re
, &extra
, PCRE_INFO_CAPTURECOUNT
,
707 raise_warning("Internal pcre_fullinfo() error %d", rc
);
715 pcre_get_compiled_regex_cache(PCRECache::Accessor
& accessor
,
717 PCRECache::TempKeyCache tkc
;
719 /* Try to lookup the cached regex entry, and if successful, just pass
720 back the compiled pattern, otherwise go on and compile it. */
721 if (s_pcreCache
.find(accessor
, regex
, tkc
)) return true;
723 /* Parse through the leading whitespace, and display a warning if we
724 get to the end without encountering a delimiter. */
725 const char *p
= regex
->data();
726 while (isspace((int)*(unsigned char *)p
)) p
++;
728 raise_warning("Empty regular expression");
732 /* Get the delimiter and display a warning if it is alphanumeric
734 char delimiter
= *p
++;
735 if (isalnum((int)*(unsigned char *)&delimiter
) || delimiter
== '\\') {
736 raise_warning("Delimiter must not be alphanumeric or backslash");
740 char start_delimiter
= delimiter
;
741 const char *pp
= strchr("([{< )]}> )]}>", delimiter
);
745 char end_delimiter
= delimiter
;
747 if (start_delimiter
== end_delimiter
) {
748 /* We need to iterate through the pattern, searching for the ending
749 * delimiter, but skipping the backslashed delimiters. If the ending
750 * delimiter is not found, display a warning. */
753 if (*pp
== '\\' && pp
[1] != 0) pp
++;
754 else if (*pp
== delimiter
)
759 raise_warning("No ending delimiter '%c' found: [%s]", delimiter
,
764 /* We iterate through the pattern, searching for the matching ending
765 * delimiter. For each matching starting delimiter, we increment nesting
766 * level, and decrement it for each matching ending delimiter. If we
767 * reach the end of the pattern without matching, display a warning.
769 int brackets
= 1; // brackets nesting level
772 if (*pp
== '\\' && pp
[1] != 0) pp
++;
773 else if (*pp
== end_delimiter
&& --brackets
<= 0)
775 else if (*pp
== start_delimiter
)
780 raise_warning("No ending matching delimiter '%c' found: [%s]",
781 end_delimiter
, regex
->data());
786 /* Make a copy of the actual pattern. */
787 String
spattern(p
, pp
-p
, CopyString
);
788 const char *pattern
= spattern
.data();
790 /* Move on to the options */
793 /* Parse through the options, setting appropriate flags. Display
794 a warning if we encounter an unknown modifier. */
797 bool do_study
= false;
800 /* Perl compatible options */
801 case 'i': coptions
|= PCRE_CASELESS
; break;
802 case 'm': coptions
|= PCRE_MULTILINE
; break;
803 case 's': coptions
|= PCRE_DOTALL
; break;
804 case 'x': coptions
|= PCRE_EXTENDED
; break;
806 /* PCRE specific options */
807 case 'A': coptions
|= PCRE_ANCHORED
; break;
808 case 'D': coptions
|= PCRE_DOLLAR_ENDONLY
; break;
809 case 'S': do_study
= true; break;
810 case 'U': coptions
|= PCRE_UNGREEDY
; break;
811 case 'X': coptions
|= PCRE_EXTRA
; break;
812 case 'u': coptions
|= PCRE_UTF8
;
813 /* In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII
814 characters, even in UTF-8 mode. However, this can be changed by setting
815 the PCRE_UCP option. */
817 coptions
|= PCRE_UCP
;
821 /* Custom preg options */
822 case 'e': poptions
|= PREG_REPLACE_EVAL
; break;
830 raise_warning("Unknown modifier '%c': [%s]", pp
[-1], regex
->data());
835 /* We've reached a null byte, now check if we're actually at the end of the
836 string. If not this is a bad expression, and a potential security hole. */
837 if (regex
->size() != (pp
- regex
->data())) {
838 raise_error("Error: Null byte found in pattern");
841 /* Store the compiled pattern and extra info in the cache. */
842 auto const store_pcre_entry
=
843 [&](pcre_literal_data
& pld
, pcre
* re
=nullptr, pcre_extra
* extra
=nullptr) {
844 assertx((poptions
& ~0x1) == 0);
845 assertx((coptions
& 0x80000000) == 0);
846 pcre_cache_entry
* new_entry
= new pcre_cache_entry();
848 new_entry
->extra
= extra
;
849 new_entry
->preg_options
= poptions
;
850 new_entry
->compile_options
= coptions
;
852 if (pld
.isLiteral()) {
853 new_entry
->literal_data
=
854 std::make_unique
<pcre_literal_data
>(std::move(pld
));
855 new_entry
->num_subpats
= 1;
857 /* Get pcre full info */
858 if (!get_pcre_fullinfo(new_entry
)) {
864 s_pcreCache
.insert(accessor
, regex
, tkc
, new_entry
);
868 // If the pattern is a literal, we can skip compiling it.
869 auto literal_data
= pcre_literal_data(pattern
, coptions
);
870 if (literal_data
.isLiteral()) return store_pcre_entry(literal_data
);
872 /* Compile pattern and display a warning if compilation failed. */
875 pcre
*re
= pcre_compile(pattern
, coptions
, &error
, &erroffset
, 0);
877 raise_warning("Compilation failed: %s at offset %d", error
, erroffset
);
881 // Careful: from here 're' needs to be freed if something throws.
883 /* If study option was specified, study the pattern and
884 store the result in extra for passing to pcre_exec. */
885 pcre_extra
*extra
= nullptr;
886 if (!literal_data
.isLiteral()) {
887 if (do_study
|| PCRE_STUDY_JIT_COMPILE
) {
888 int soptions
= PCRE_STUDY_JIT_COMPILE
;
889 extra
= pcre_study(re
, soptions
, &error
);
891 extra
->flags
|= PCRE_EXTRA_MATCH_LIMIT
|
892 PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
893 pcre_assign_jit_stack(extra
, alloc_jit_stack
, nullptr);
895 if (error
!= nullptr) {
897 raise_warning("Error while studying pattern");
903 if ((!RuntimeOption::EvalJitNoGdb
||
904 RuntimeOption::EvalJitUseVtuneAPI
||
905 RuntimeOption::EvalPerfPidMap
) &&
907 extra
->executable_jit
!= nullptr) {
909 pcre_fullinfo(re
, extra
, PCRE_INFO_JITSIZE
, &size
);
911 TCA start
= *(TCA
*)(extra
->executable_jit
);
912 TCA end
= start
+ size
;
913 std::string name
= folly::sformat("HHVM::pcre_jit::{}", pattern
);
915 if (!RuntimeOption::EvalJitNoGdb
&& jit::mcgen::initialized()) {
916 Debug::DebugInfo::Get()->recordStub(Debug::TCRange(start
, end
, false),
919 if (RuntimeOption::EvalJitUseVtuneAPI
) {
920 HPHP::jit::reportHelperToVtune(name
.c_str(), start
, end
);
922 if (RuntimeOption::EvalPerfPidMap
&& jit::mcgen::initialized()) {
923 std::string escaped_name
;
924 folly::json::escapeString(name
, escaped_name
,
925 folly::json::serialization_opts());
926 Debug::DebugInfo::Get()->recordPerfMap(
927 Debug::TCRange(start
, end
, false),
928 SrcKey
{}, escaped_name
935 return store_pcre_entry(literal_data
, re
, extra
);
938 static int* create_offset_array(const pcre_cache_entry
* pce
,
940 /* Allocate memory for the offsets array */
941 size_offsets
= pce
->num_subpats
* 3;
942 return (int *)req::malloc_noptrs(size_offsets
* sizeof(int));
945 static Array
str_offset_pair(const String
& str
, int offset
) {
946 return make_vec_array(str
, offset
);
949 static inline bool pcre_need_log_error(int pcre_code
) {
950 return RuntimeOption::EnablePregErrorLog
&&
951 (pcre_code
== PCRE_ERROR_MATCHLIMIT
||
952 pcre_code
== PCRE_ERROR_RECURSIONLIMIT
);
955 static void pcre_log_error(const char* func
, int line
, int pcre_code
,
956 const char* pattern
, int pattern_size
,
957 const char* subject
, int subject_size
,
958 const char* repl
, int repl_size
,
959 int arg1
= 0, int arg2
= 0,
960 int arg3
= 0, int arg4
= 0) {
961 const char* escapedPattern
;
962 const char* escapedSubject
;
963 const char* escapedRepl
;
964 std::string
p(pattern
, pattern_size
);
965 std::string
s(subject
, subject_size
);
966 std::string
r(repl
, repl_size
);
967 escapedPattern
= Logger::EscapeString(p
);
968 escapedSubject
= Logger::EscapeString(s
);
969 escapedRepl
= Logger::EscapeString(r
);
970 const char* errString
=
971 (pcre_code
== PCRE_ERROR_MATCHLIMIT
) ? "PCRE_ERROR_MATCHLIMIT" :
972 (pcre_code
== PCRE_ERROR_RECURSIONLIMIT
) ? "PCRE_ERROR_RECURSIONLIMIT" :
974 raise_warning_unsampled(
975 "REGEXERR: %s/%d: err=%d(%s), pattern='%s', subject='%s', repl='%s', "
976 "limits=(%" PRId64
", %" PRId64
"), extra=(%d, %d, %d, %d)",
977 func
, line
, pcre_code
, errString
,
978 escapedPattern
, escapedSubject
, escapedRepl
,
979 tl_pcre_globals
->preg_backtrace_limit
,
980 tl_pcre_globals
->preg_recursion_limit
,
981 arg1
, arg2
, arg3
, arg4
);
982 free((void *)escapedPattern
);
983 free((void *)escapedSubject
);
984 free((void *)escapedRepl
);
989 ALWAYS_INLINE Variant
preg_return_internal_error(Variant
&& return_value
) {
990 *rl_last_error_code
= PHP_PCRE_INTERNAL_ERROR
;
991 return std::move(return_value
);
994 ALWAYS_INLINE Variant
preg_return_bad_regex_error(Variant
&& return_value
) {
995 *rl_last_error_code
= PHP_PCRE_BAD_REGEX_ERROR
;
996 return std::move(return_value
);
999 void pcre_handle_exec_error(int pcre_code
) {
1001 switch (pcre_code
) {
1002 case PCRE_ERROR_MATCHLIMIT
:
1003 preg_code
= PHP_PCRE_BACKTRACK_LIMIT_ERROR
;
1005 case PCRE_ERROR_RECURSIONLIMIT
:
1006 preg_code
= PHP_PCRE_RECURSION_LIMIT_ERROR
;
1008 case PCRE_ERROR_BADUTF8
:
1009 preg_code
= PHP_PCRE_BAD_UTF8_ERROR
;
1011 case PCRE_ERROR_BADUTF8_OFFSET
:
1012 preg_code
= PHP_PCRE_BAD_UTF8_OFFSET_ERROR
;
1015 preg_code
= PHP_PCRE_INTERNAL_ERROR
;
1018 *rl_last_error_code
= preg_code
;
1021 ALWAYS_INLINE Variant
1022 preg_return_pcre_error(int pcre_code
, Variant
&& return_value
) {
1023 pcre_handle_exec_error(pcre_code
);
1024 return std::move(return_value
);
1027 ALWAYS_INLINE Variant
preg_return_no_error(Variant
&& return_value
) {
1028 *rl_last_error_code
= PHP_PCRE_NO_ERROR
;
1029 return std::move(return_value
);
1034 ///////////////////////////////////////////////////////////////////////////////
1036 Variant
preg_grep(const String
& pattern
, const Array
& input
, int flags
/* = 0 */) {
1037 PCRECache::Accessor accessor
;
1038 if (!pcre_get_compiled_regex_cache(accessor
, pattern
.get())) {
1039 return preg_return_bad_regex_error(false);
1041 const pcre_cache_entry
* pce
= accessor
.get();
1043 int size_offsets
= 0;
1044 int* offsets
= create_offset_array(pce
, size_offsets
);
1045 if (offsets
== nullptr) {
1046 return preg_return_internal_error(false);
1048 SmartFreeHelper
freer(offsets
);
1050 /* Initialize return array */
1051 auto ret
= Array::CreateDict();
1053 /* Go through the input array */
1054 bool invert
= (flags
& PREG_GREP_INVERT
);
1056 init_local_extra(&extra
, pce
->extra
);
1058 for (ArrayIter
iter(input
); iter
; ++iter
) {
1059 String entry
= iter
.second().toString();
1062 if (pce
->literal_data
) {
1063 assertx(pce
->literal_data
->isLiteral());
1064 count
= pce
->literal_data
->matches(entry
.get(), 0, offsets
, 0)
1065 ? 1 : PCRE_ERROR_NOMATCH
;
1067 /* Perform the match */
1068 count
= pcre_exec(pce
->re
, &extra
, entry
.data(), entry
.size(),
1069 0, 0, offsets
, size_offsets
);
1071 /* Check for too many substrings condition. */
1073 raise_warning("Matched, but too many substrings");
1074 count
= pce
->num_subpats
;
1075 } else if (count
< 0 && count
!= PCRE_ERROR_NOMATCH
) {
1076 if (pcre_need_log_error(count
)) {
1077 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1078 pattern
.data(), pattern
.size(),
1079 entry
.data(), entry
.size(),
1083 // NOTE: this returns an error together with a partial result :-(
1084 return preg_return_pcre_error(count
, std::move(ret
));
1087 /* If the entry fits our requirements */
1088 if ((count
> 0 && !invert
) ||
1089 (count
== PCRE_ERROR_NOMATCH
&& invert
)) {
1091 /* Add to return array */
1092 ret
.set(iter
.first(), entry
);
1096 return preg_return_no_error(std::move(ret
));
1099 ///////////////////////////////////////////////////////////////////////////////
1101 static Variant
preg_match_impl(StringData
* pattern
,
1102 const StringData
* subject
,
1103 Variant
* subpats
, int flags
, int start_offset
,
1105 PCRECache::Accessor accessor
;
1106 if (!pcre_get_compiled_regex_cache(accessor
, pattern
)) {
1107 return preg_return_bad_regex_error(false);
1110 const pcre_cache_entry
* pce
= accessor
.get();
1111 init_local_extra(&extra
, pce
->extra
);
1112 int subpats_order
= global
? PREG_PATTERN_ORDER
: 0;
1113 if (subpats
) *subpats
= Array::CreateDict();
1117 * subpats_order is pre-set to pattern mode so we change it only if
1121 subpats_order
= flags
& 0xff;
1123 if ((global
&& (subpats_order
< PREG_PATTERN_ORDER
||
1124 subpats_order
> PREG_SET_ORDER
)) ||
1125 (!global
&& subpats_order
!= 0)) {
1126 raise_warning("Invalid flags specified");
1127 return preg_return_internal_error(init_null());
1131 /* Negative offset counts from the end of the string. */
1132 if (start_offset
< 0) {
1133 start_offset
= subject
->size() + start_offset
;
1134 if (start_offset
< 0) {
1139 int size_offsets
= 0;
1140 int* offsets
= create_offset_array(pce
, size_offsets
);
1141 SmartFreeHelper
offsetsFreer(offsets
);
1142 int num_subpats
= pce
->num_subpats
;
1143 if (offsets
== nullptr) return preg_return_internal_error(false);
1145 /* Allocate match sets array and initialize the values. */
1147 /* An array of sets of matches for each subpattern after a global match */
1148 auto match_sets
= Array::CreateDict();
1149 if (global
&& subpats_order
== PREG_PATTERN_ORDER
) {
1150 for (int i
= 0; i
< num_subpats
; i
++) {
1151 match_sets
.set(i
, Array::CreateDict());
1156 * If PREG_OFFSET_CAPTURE, each match, instead of being a string, will
1157 * be an array where the first element is a substring containing the
1158 * match and the second element is the position of the first character of
1159 * the substring in the input.
1161 bool offset_capture
= flags
& PREG_OFFSET_CAPTURE
;
1162 const char** stringlist
; // Holds list of subpatterns
1163 auto const get_value
= [&](int i
) {
1164 auto const length
= offsets
[(i
<<1)+1] - offsets
[i
<<1];
1165 auto const match
= String(stringlist
[i
], length
, CopyString
);
1166 return offset_capture
1167 ? Variant(str_offset_pair(match
, offsets
[i
<<1]))
1170 auto const get_value_empty
= [&](int i
) {
1171 auto const match
= empty_string();
1172 return offset_capture
1173 ? Variant(str_offset_pair(match
, offsets
[i
<<1]))
1178 * Skip building name table when using literal_data. Name table is used
1179 * to add named subpatterns to result array. Literal data has none of these,
1180 * so we can skip this step.
1182 const char* const* subpat_names
= nullptr;
1183 auto const is_literal
= pce
->literal_data
!= nullptr;
1185 subpat_names
= get_subpat_names(pce
);
1186 if (subpat_names
== nullptr) return preg_return_internal_error(false);
1188 auto const set_subpats
= [&](auto& arr
, int i
, const Variant
& value
) {
1189 if (is_literal
) return;
1190 if (subpat_names
[i
]) arr
.set(String(subpat_names
[i
]), value
);
1194 const bool includeNonMatchingCaptures
= flags
& PREG_FB__PRIVATE__HSL_IMPL
;
1196 // Add matches to result array for this run
1197 auto add_match_set
= [&](auto& arr
, int count
) {
1198 for (i
= 0; i
< count
; i
++) {
1199 auto const value
= get_value(i
);
1200 set_subpats(arr
, i
, value
);
1203 if (includeNonMatchingCaptures
) {
1204 for (; i
< num_subpats
; i
++) {
1205 auto const value
= get_value_empty(i
);
1206 set_subpats(arr
, i
, value
);
1213 int g_notempty
= 0; // If the match should not be empty
1214 int exec_options
= 0;
1218 int options
= exec_options
| g_notempty
;
1220 assertx(literalOptions(options
));
1221 count
= pce
->literal_data
->matches(subject
, start_offset
, offsets
, options
)
1222 ? 1 : PCRE_ERROR_NOMATCH
;
1224 /* Execute the regular expression. */
1225 count
= pcre_exec(pce
->re
, &extra
, subject
->data(), subject
->size(),
1226 start_offset
, options
,
1227 offsets
, size_offsets
);
1229 /* The string was already proved to be valid UTF-8 */
1230 exec_options
|= PCRE_NO_UTF8_CHECK
;
1232 /* Check for too many substrings condition. */
1234 raise_warning("Matched, but too many substrings");
1235 count
= num_subpats
;
1238 /* If something has matched */
1243 // Try to get the list of substrings and display a warning if failed.
1244 if (offsets
[1] < offsets
[0] ||
1245 pcre_get_substring_list(subject
->data(), offsets
, count
,
1247 raise_warning("Get subpatterns list failed");
1248 return preg_return_internal_error(false);
1252 if (subpats_order
== PREG_PATTERN_ORDER
) {
1253 /* For each subpattern, insert it into the appropriate array. */
1254 for (i
= 0; i
< count
; i
++) {
1255 auto const value
= get_value(i
);
1256 auto& arr
= asArrRef(match_sets
.lval(i
));
1257 assertx(arr
->isVectorData());
1258 arr
.set(safe_cast
<int64_t>(arr
.size()), value
);
1261 * If the number of captured subpatterns on this run is
1262 * less than the total possible number, pad the result
1263 * arrays with empty strings.
1265 for (; i
< num_subpats
; i
++) {
1266 auto& arr
= asArrRef(match_sets
.lval(i
));
1267 assertx(arr
->isVectorData());
1268 arr
.set(safe_cast
<int64_t>(arr
.size()), empty_string());
1271 auto result_set
= Array::CreateDict();
1272 add_match_set(result_set
, count
);
1273 auto& arr
= subpats
->asArrRef();
1274 assertx(arr
->isVectorData());
1275 arr
.set(safe_cast
<int64_t>(arr
.size()), std::move(result_set
));
1278 auto& arr
= subpats
->asArrRef();
1279 add_match_set(arr
, count
);
1281 pcre_free((void *) stringlist
);
1283 } else if (count
== PCRE_ERROR_NOMATCH
) {
1284 /* If we previously set PCRE_NOTEMPTY after a null match,
1285 this is not necessarily the end. We need to advance
1286 the start offset, and continue. Fudge the offset values
1287 to achieve this, unless we're already at the end of the string. */
1288 if (g_notempty
&& start_offset
< subject
->size()) {
1289 offsets
[0] = start_offset
;
1290 offsets
[1] = start_offset
+ 1;
1294 if (pcre_need_log_error(count
)) {
1295 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1296 pattern
->data(), pattern
->size(),
1297 subject
->data(), subject
->size(),
1299 flags
, start_offset
, g_notempty
, global
);
1301 return preg_return_pcre_error(count
, false);
1304 /* If we have matched an empty string, mimic what Perl's /g options does.
1305 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1306 the match again at the same point. If this fails (picked up above) we
1307 advance to the next character. */
1308 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1310 /* Advance to the position right after the last full match */
1311 start_offset
= offsets
[1];
1314 /* Add the match sets to the output array and clean up */
1315 if (subpats
&& global
&& subpats_order
== PREG_PATTERN_ORDER
) {
1316 auto& arr
= subpats
->asArrRef();
1317 for (i
= 0; i
< num_subpats
; i
++) {
1318 auto const value
= match_sets
[i
];
1319 set_subpats(arr
, i
, value
);
1320 arr
.set(i
, match_sets
[i
]);
1323 return preg_return_no_error(std::move(matched
));
1326 Variant
preg_match(const String
& pattern
, const String
& subject
,
1327 Variant
* matches
/* = nullptr */, int flags
/* = 0 */,
1328 int offset
/* = 0 */) {
1329 return preg_match(pattern
.get(), subject
.get(), matches
, flags
, offset
);
1332 Variant
preg_match(StringData
* pattern
, const StringData
* subject
,
1333 Variant
* matches
/* = nullptr */, int flags
/* = 0 */,
1334 int offset
/* = 0 */) {
1335 return preg_match_impl(pattern
, subject
, matches
, flags
, offset
, false);
1338 Variant
preg_match_all(const String
& pattern
, const String
& subject
,
1339 Variant
* matches
/* = nullptr */,
1340 int flags
/* = 0 */, int offset
/* = 0 */) {
1341 return preg_match_all(pattern
.get(), subject
.get(), matches
, flags
, offset
);
1344 Variant
preg_match_all(StringData
* pattern
, const StringData
* subject
,
1345 Variant
* matches
/* = nullptr */,
1346 int flags
/* = 0 */, int offset
/* = 0 */) {
1347 return preg_match_impl(pattern
, subject
, matches
, flags
, offset
, true);
1350 ///////////////////////////////////////////////////////////////////////////////
1352 static String
preg_do_repl_func(const Variant
& function
, const String
& subject
,
1353 int* offsets
, const char* const* subpat_names
,
1355 Array subpats
= Array::CreateDict();
1356 for (int i
= 0; i
< count
; i
++) {
1357 auto off1
= offsets
[i
<<1];
1358 auto off2
= offsets
[(i
<<1)+1];
1359 auto sub
= subject
.substr(off1
, off2
- off1
);
1361 if (subpat_names
&& subpat_names
[i
]) {
1362 subpats
.set(String(subpat_names
[i
]), sub
);
1364 subpats
.set(i
, sub
);
1367 return vm_call_user_func(function
, make_vec_array(subpats
)).toString();
/*
 * Parses a back-reference at the current position of a replacement template.
 *
 * On entry *str points at the introducer character ('\\' or '$'). The
 * accepted shapes are an introducer followed by one or two decimal digits,
 * or "${" followed by one or two decimal digits and a closing '}'.
 *
 * On success the parsed group number is stored in *backref, *str is advanced
 * past the whole reference, and true is returned. On failure false is
 * returned and *str is left untouched; *backref may already have been
 * written if at least one digit was seen.
 */
static bool preg_get_backref(const char** str, int* backref) {
  auto const is_digit = [](char c) { return c >= '0' && c <= '9'; };
  const char* walk = *str;

  // An introducer that is the last character of the template cannot start a
  // reference (and this keeps the walk[1] reads below in bounds).
  if (walk[1] == 0) {
    return false;
  }

  bool in_brace = false;
  if (*walk == '$' && walk[1] == '{') {
    in_brace = true;
    walk++;
  }
  walk++;

  // First digit is mandatory.
  if (!is_digit(*walk)) {
    return false;
  }
  *backref = *walk - '0';
  walk++;

  // Second digit is optional; a NUL terminator is simply "not a digit".
  if (is_digit(*walk)) {
    *backref = *backref * 10 + *walk - '0';
    walk++;
  }

  // The braced form must be closed right after the digits.
  if (in_brace) {
    if (*walk != '}') {
      return false;
    }
    walk++;
  }

  *str = walk;
  return true;
}
/*
 * Replaces matches of one pattern inside one subject string.
 *
 * pattern       - regex source; its compiled form is fetched through
 *                 pcre_get_compiled_regex_cache().
 * subject       - text to search.
 * replace_var   - the replacement: passed to preg_do_repl_func() as the
 *                 callback on the callable path, otherwise converted to a
 *                 string template that may contain back-references (see
 *                 preg_get_backref()).
 * callable      - forwarded to pcre_log_error(); presumably also selects
 *                 between the callback and template paths (the selecting
 *                 conditions are not part of this listing -- confirm).
 * limit         - -1 for unlimited; a match is only replaced while
 *                 limit == -1 || limit > 0.
 * replace_count - optional counter, consulted for every replaced match.
 *
 * Returns preg_return_bad_regex_error(init_null()) when the pattern cannot be
 * compiled, preg_return_internal_error(init_null()) when the offset array or
 * the sub-pattern names cannot be obtained, preg_return_pcre_error(count,
 * init_null()) on a matching failure other than "no match", and otherwise
 * preg_return_no_error() carrying the rebuilt string.
 */
1407 static Variant
php_pcre_replace(const String
& pattern
, const String
& subject
,
1408 const Variant
& replace_var
, bool callable
,
1409 int limit
, int* replace_count
) {
1410 PCRECache::Accessor accessor
;
1411 if (!pcre_get_compiled_regex_cache(accessor
, pattern
.get())) {
1412 return preg_return_bad_regex_error(init_null());
1414 const pcre_cache_entry
* pce
= accessor
.get();
// The /e (eval) modifier is rejected with a hard error.
1415 if (pce
->preg_options
& PREG_REPLACE_EVAL
) {
1416 raise_error("preg_replace(): Support for the /e modifier has been removed, use "
1417 "preg_replace_callback instead");
1421 int* offsets
= create_offset_array(pce
, size_offsets
);
1422 SmartFreeHelper
offsetsFreer(offsets
);
1423 if (offsets
== nullptr) {
1424 return preg_return_internal_error(init_null());
1426 auto const is_literal
= pce
->literal_data
!= nullptr;
1427 const char* const* subpat_names
= nullptr;
1429 subpat_names
= get_subpat_names(pce
);
1430 if (subpat_names
== nullptr) return preg_return_internal_error(init_null());
// Raw view of the replacement template: start pointer, one-past-the-end
// pointer and length.
1433 const char* replace
= nullptr;
1434 const char* replace_end
= nullptr;
1435 int replace_len
= 0;
1439 replace_val
= replace_var
.toString();
1440 replace
= replace_val
.data();
1441 replace_len
= replace_val
.size();
1442 replace_end
= replace
+ replace_len
;
// Output buffer, constructed with twice the subject length (presumably an
// initial capacity hint -- confirm against StringBuffer).
1445 StringBuffer
result(2 * subject
.size());
1450 const char* match
= nullptr;
1451 int start_offset
= 0;
1453 init_local_extra(&extra
, pce
->extra
);
1455 const char* walk
; // Used to walk the replacement string
1456 char walk_last
; // Last walked character
1457 int match_len
; // Length of the current match
1458 int backref
; // Backreference number
1459 int g_notempty
= 0; // If the match should not be empty
1460 int exec_options
= 0; // Options passed to pcre_exec
1463 int options
= exec_options
| g_notempty
;
// Patterns that carry literal_data are matched through
// literal_data->matches() whenever the current options pass
// literalOptions(); the pcre_exec() call below is the other path.
1464 if (pce
->literal_data
&& literalOptions(options
)) {
1465 assertx(pce
->literal_data
->isLiteral());
1467 pce
->literal_data
->matches(subject
.get(), start_offset
, offsets
, options
)
1468 ? 1 : PCRE_ERROR_NOMATCH
;
1470 /* Execute the regular expression. */
1471 count
= pcre_exec(pce
->re
, &extra
, subject
.data(), subject
.size(),
1472 start_offset
, options
, offsets
, size_offsets
);
1474 /* The string was already proved to be valid UTF-8 */
1475 exec_options
|= PCRE_NO_UTF8_CHECK
;
1478 /* Check for too many substrings condition. */
1480 raise_warning("Matched, but too many substrings");
1481 count
= pce
->num_subpats
;
// `piece` is the not-yet-copied remainder of the subject.
1484 const char* piece
= subject
.data() + start_offset
;
// A usable match: positive count, sane offsets, and the limit not used up.
1485 if (count
> 0 && offsets
[1] >= offsets
[0] &&
1486 (limit
== -1 || limit
> 0)) {
1487 if (replace_count
) {
1490 /* Set the match location in subject */
1491 match
= subject
.data() + offsets
[0];
1493 String callable_result
;
1495 /* Use custom function to get replacement string and its length. */
1496 callable_result
= preg_do_repl_func(replace_var
, subject
, offsets
,
1497 subpat_names
, count
);
1498 } else { /* do regular substitution */
1501 while (walk
< replace_end
) {
1502 if ('\\' == *walk
|| '$' == *walk
) {
1503 if (walk_last
== '\\') {
1508 if (preg_get_backref(&walk
, &backref
)) {
1509 if (backref
< count
) {
1510 match_len
= offsets
[(backref
<<1)+1] - offsets
[backref
<<1];
1516 walk_last
= walk
[-1];
1520 /* copy the part of the string before the match */
1521 result
.append(piece
, match
-piece
);
1523 /* copy replacement and backrefs */
1524 int result_len
= result
.size();
1527 /* Copy result from custom function to buffer and clean up. */
1528 result
.append(callable_result
.data(), callable_result
.size());
1529 result_len
+= callable_result
.size();
1530 } else { /* do regular backreference copying */
1534 while (walk
< replace_end
) {
1535 if ('\\' == *walk
|| '$' == *walk
) {
// A '\\' or '$' that directly follows a backslash is an escaped literal:
// it overwrites the backslash that was appended last.
1536 if (walk_last
== '\\') {
1537 result
.set(result
.size() - 1, *walk
++);
1541 if (preg_get_backref(&walk
, &backref
)) {
1542 if (backref
< count
) {
1543 match_len
= offsets
[(backref
<<1)+1] - offsets
[backref
<<1];
1545 subject
.data() + offsets
[backref
<<1],
1552 result
.append(*walk
++);
1553 walk_last
= walk
[-1];
// No (further) match, or the replacement limit has reached zero.
1561 } else if (count
== PCRE_ERROR_NOMATCH
|| limit
== 0) {
1562 /* If we previously set PCRE_NOTEMPTY after a null match,
1563 this is not necessarily the end. We need to advance
1564 the start offset, and continue. Fudge the offset values
1565 to achieve this, unless we're already at the end of the string. */
1566 if (g_notempty
!= 0 && start_offset
< subject
.size()) {
1567 offsets
[0] = start_offset
;
1568 offsets
[1] = start_offset
+ 1;
1569 result
.append(piece
, 1);
1571 /* stick that last bit of string on our output */
1572 result
.append(piece
, subject
.size() - start_offset
);
// Any other count is a PCRE failure: optionally log it (the logged
// replacement text is the class name for object replacements, otherwise a
// string form of the replacement), then return the PCRE error.
1576 if (pcre_need_log_error(count
)) {
1581 if (replace_var
.isObject()) {
1582 stemp
= replace_var
.asCObjRef()->getClassName().asString()
1585 stemp
= replace_var
.toString();
1588 size
= stemp
.size();
1590 s
= replace_val
.data();
1591 size
= replace_val
.size();
1593 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1594 pattern
.data(), pattern
.size(),
1595 subject
.data(), subject
.size(),
1597 callable
, limit
, start_offset
, g_notempty
);
1599 return preg_return_pcre_error(count
, init_null());
1602 /* If we have matched an empty string, mimic what Perl's /g option does.
1603 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1604 the match again at the same point. If this fails (picked up above) we
1605 advance to the next character. */
1606 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1608 /* Advance to the next piece. */
1609 start_offset
= offsets
[1];
1612 return preg_return_no_error(result
.detach());
/*
 * Applies one pattern, or every pattern of a pattern array, to one subject.
 *
 * - regex is not an array: a single php_pcre_replace() call with
 *   regex.toString().
 * - callable is set, or replace is not an array: every pattern in the array
 *   is applied in iteration order with the same `replace` value; the output
 *   of each step becomes the subject of the next.
 * - otherwise: patterns are applied in iteration order, each with a
 *   replacement value read from a second iterator over the replacement
 *   array. NOTE(review): the statements that advance that iterator and
 *   handle a replacement array shorter than the pattern array are not
 *   visible here; replace_value starts out as a default-constructed Variant
 *   on every iteration.
 *
 * A non-string result from php_pcre_replace() (asserted to be null) is
 * returned unchanged; on success the final subject is wrapped with
 * preg_return_no_error().
 */
1618 static Variant
php_replace_in_subject(const Variant
& regex
, const Variant
& replace
,
1619 String subject
, int limit
, bool callable
,
1620 int* replace_count
) {
1621 if (!regex
.isArray()) {
1622 return php_pcre_replace(regex
.toString(), subject
, replace
, callable
,
1623 limit
, replace_count
);
// Many patterns, one shared replacement (or callback).
1626 if (callable
|| !replace
.isArray()) {
1627 Array arr
= regex
.toDict();
1628 for (ArrayIter
iterRegex(arr
); iterRegex
; ++iterRegex
) {
1629 String regex_entry
= iterRegex
.second().toString();
1630 auto ret
= php_pcre_replace(regex_entry
, subject
, replace
, callable
,
1631 limit
, replace_count
);
1632 if (!ret
.isString()) {
1633 assertx(ret
.isNull());
1634 return ret
; // php_pcre_replace already set error
1636 subject
= ret
.asStrRef();
1637 assertx(!subject
.isNull());
1639 return preg_return_no_error(std::move(subject
));
// Many patterns, each with its own replacement entry.
1642 Array arrReplace
= replace
.toDict();
1643 Array arrRegex
= regex
.toDict();
1644 ArrayIter
iterReplace(arrReplace
);
1645 for (ArrayIter
iterRegex(arrRegex
); iterRegex
; ++iterRegex
) {
1646 String regex_entry
= iterRegex
.second().toString();
1647 Variant replace_value
;
1649 replace_value
= iterReplace
.second();
1653 auto ret
= php_pcre_replace(regex_entry
, subject
, replace_value
, callable
,
1654 limit
, replace_count
);
1655 if (!ret
.isString()) {
1656 assertx(ret
.isNull());
1657 return ret
; // php_pcre_replace already set error
1659 subject
= ret
.asStrRef();
1660 assertx(!subject
.isNull());
1662 return preg_return_no_error(std::move(subject
));
/*
 * Shared implementation behind the replace entry points in this file.
 *
 * pattern / replacement - a single value or an array each; a string pattern
 *                         combined with an array replacement raises a
 *                         warning and yields
 *                         preg_return_internal_error(false).
 * subject               - a single value or a container of subjects.
 * limit                 - forwarded unchanged to php_replace_in_subject().
 * count                 - optional out-parameter receiving the number of
 *                         replacements. NOTE: it is not written on the
 *                         early-return error paths below.
 * is_callable           - replacement is a callback.
 * is_filter             - only subjects that had at least one replacement
 *                         are reported; must not be combined with
 *                         is_callable (asserted).
 *
 * Scalar subject: returns the replaced string, the null result of
 * php_replace_in_subject() on failure, or
 * preg_return_internal_error(init_null()) when is_filter is set and nothing
 * was replaced.
 * Container subject: returns a dict keyed by the original keys that holds
 * every string result (with is_filter: only entries whose replacement count
 * increased); entries whose replacement failed are left out.
 */
1665 Variant
preg_replace_impl(const Variant
& pattern
, const Variant
& replacement
,
1666 const Variant
& subject
, int limit
, int64_t* count
,
1667 bool is_callable
, bool is_filter
) {
1668 assertx(!(is_callable
&& is_filter
));
1670 replacement
.isArray() && !pattern
.isArray()) {
1671 raise_warning("Parameter mismatch, pattern is a string while "
1672 "replacement is an array");
1673 return preg_return_internal_error(false);
1676 int replace_count
= 0;
// Single subject.
1677 if (!isContainer(subject
)) {
1678 auto ret
= php_replace_in_subject(pattern
, replacement
, subject
.toString(),
1679 limit
, is_callable
, &replace_count
);
1681 if (ret
.isNull()) return ret
; // php_replace_in_subject already set error
1682 assertx(ret
.isString());
1683 if (count
) *count
= replace_count
;
1684 if (is_filter
&& replace_count
== 0) {
1685 return preg_return_internal_error(init_null());
1687 return preg_return_no_error(std::move(ret
));
// Container of subjects: replace_count accumulates across entries, so the
// value captured before each call tells whether this entry changed.
1690 Array return_value
= Array::CreateDict();
1691 Array arrSubject
= subject
.toDict();
1692 for (ArrayIter
iter(arrSubject
); iter
; ++iter
) {
1693 auto old_replace_count
= replace_count
;
1694 String subject_entry
= iter
.second().toString();
1695 auto ret
= php_replace_in_subject(pattern
, replacement
, subject_entry
,
1696 limit
, is_callable
, &replace_count
);
1698 if (ret
.isString() && (!is_filter
|| replace_count
> old_replace_count
)) {
1699 return_value
.set(iter
.first(), ret
.asStrRef());
1702 if (count
) *count
= replace_count
;
1703 return preg_return_no_error(std::move(return_value
));
1706 int preg_replace(Variant
& result
,
1707 const Variant
& pattern
,
1708 const Variant
& replacement
,
1709 const Variant
& subject
,
1710 int limit
/* = -1 */) {
1712 result
= preg_replace_impl(pattern
, replacement
, subject
,
1713 limit
, &count
, false, false);
1717 int preg_replace_callback(Variant
& result
,
1718 const Variant
& pattern
,
1719 const Variant
& callback
,
1720 const Variant
& subject
,
1721 int limit
/* = -1 */) {
1723 result
= preg_replace_impl(pattern
, callback
, subject
,
1724 limit
, &count
, true, false);
1728 ///////////////////////////////////////////////////////////////////////////////
// Pattern "/./us": one character, with the u (UTF-8) and s (dot matches
// newline) modifiers. preg_split() compiles and runs it to step over one
// whole character after an empty match when the user's pattern was compiled
// with PCRE_UTF8.
1732 const StaticString
s_OneUnicodeCharPattern("/./us");
/*
 * Splits `subject` around the matches of `pattern`.
 *
 * limit - -1 for unlimited; otherwise matching continues only while
 *         limit > 1, and limit is decremented as leading pieces are emitted.
 * flags - PREG_SPLIT_NO_EMPTY drops empty pieces,
 *         PREG_SPLIT_DELIM_CAPTURE also emits the captured groups (index 1
 *         and up) of every delimiter match,
 *         PREG_SPLIT_OFFSET_CAPTURE emits str_offset_pair(piece, offset)
 *         instead of plain strings.
 *
 * Pieces are stored under consecutive integer keys (the current size of the
 * result array).
 *
 * Returns preg_return_bad_regex_error(false) when the pattern cannot be
 * compiled, preg_return_internal_error(false) when the offset array cannot
 * be allocated, preg_return_pcre_error(count, partial result) when matching
 * fails with an error other than "no match", and otherwise
 * preg_return_no_error(result).
 */
1736 Variant
preg_split(const String
& pattern
, const String
& subject
,
1737 int limit
/* = -1 */, int flags
/* = 0 */) {
1738 PCRECache::Accessor accessor
;
1739 if (!pcre_get_compiled_regex_cache(accessor
, pattern
.get())) {
1740 return preg_return_bad_regex_error(false);
1742 const pcre_cache_entry
* pce
= accessor
.get();
1744 int no_empty
= flags
& PREG_SPLIT_NO_EMPTY
;
1745 bool delim_capture
= flags
& PREG_SPLIT_DELIM_CAPTURE
;
1746 bool offset_capture
= flags
& PREG_SPLIT_OFFSET_CAPTURE
;
1752 int size_offsets
= 0;
1753 int* offsets
= create_offset_array(pce
, size_offsets
);
1754 SmartFreeHelper
offsetsFreer(offsets
);
1755 if (offsets
== nullptr) {
1756 return preg_return_internal_error(false);
1759 /* Start at the beginning of the string */
1760 int start_offset
= 0;
1761 int next_offset
= 0;
1762 const char* last_match
= subject
.data();
1764 init_local_extra(&extra
, pce
->extra
);
1766 // Get next piece if no limit or limit not yet reached and something matched
1767 Array result
= Array::CreateDict();
1768 int g_notempty
= 0; /* If the match should not be empty */
1770 PCRECache::Accessor bump_accessor
;
1771 const pcre_cache_entry
* bump_pce
= nullptr; /* instance for empty matches */
1772 while ((limit
== -1 || limit
> 1)) {
1774 int options
= g_notempty
| utf8_check
;
// Patterns that carry literal_data are matched through
// literal_data->matches() whenever the current options pass
// literalOptions(); the pcre_exec() call below is the other path.
1775 if (pce
->literal_data
&& literalOptions(options
)) {
1776 assertx(pce
->literal_data
->isLiteral());
1778 pce
->literal_data
->matches(subject
.get(), start_offset
, offsets
, options
)
1779 ? 1 : PCRE_ERROR_NOMATCH
;
1781 count
= pcre_exec(pce
->re
, &extra
, subject
.data(), subject
.size(),
1782 start_offset
, options
, offsets
, size_offsets
);
1783 /* Subsequent calls to pcre_exec don't need to bother with the
1784 * utf8 validity check: if the subject isn't valid, the first
1785 * call to pcre_exec will have failed, and as long as we only
1786 * set start_offset to known character boundaries we won't
1787 * supply an invalid offset. */
1788 utf8_check
= PCRE_NO_UTF8_CHECK
;
1791 /* Check for too many substrings condition. */
1793 raise_warning("Matched, but too many substrings");
1794 count
= pce
->num_subpats
;
1797 /* If something matched */
1798 if (count
> 0 && offsets
[1] >= offsets
[0]) {
// Emit the text between the previous delimiter and this one, unless it is
// empty and PREG_SPLIT_NO_EMPTY was requested.
1799 if (!no_empty
|| subject
.data() + offsets
[0] != last_match
) {
1800 auto const length
= subject
.data() + offsets
[0] - last_match
;
1801 auto const match
= String(last_match
, length
, CopyString
);
1802 auto const value
= offset_capture
1803 ? Variant(str_offset_pair(match
, next_offset
))
1805 assertx(result
->isVectorData());
1806 result
.set(safe_cast
<int64_t>(result
.size()), value
);
1808 /* One less left to do */
1809 if (limit
!= -1) limit
--;
1812 last_match
= subject
.data() + offsets
[1];
1813 next_offset
= offsets
[1];
// With PREG_SPLIT_DELIM_CAPTURE the captured groups of the delimiter
// (index 1 and up) are emitted as pieces too.
1815 if (delim_capture
) {
1817 for (i
= 1; i
< count
; i
++) {
1818 match_len
= offsets
[(i
<<1)+1] - offsets
[i
<<1];
1819 /* If we have matched a delimiter */
1820 if (!no_empty
|| match_len
> 0) {
1821 auto const match
= subject
.substr(offsets
[i
<<1], match_len
);
1822 auto const value
= offset_capture
1823 ? Variant(str_offset_pair(match
, offsets
[i
<<1]))
1825 assertx(result
->isVectorData());
1826 result
.set(safe_cast
<int64_t>(result
.size()), value
);
1830 } else if (count
== PCRE_ERROR_NOMATCH
) {
1831 /* If we previously set PCRE_NOTEMPTY after a null match,
1832 this is not necessarily the end. We need to advance
1833 the start offset, and continue. Fudge the offset values
1834 to achieve this, unless we're already at the end of the string. */
1835 if (g_notempty
!= 0 && start_offset
< subject
.size()) {
// In UTF-8 mode a one-character pattern is run at start_offset so the
// resulting offsets cover a whole character instead of a single byte.
1836 if (pce
->compile_options
& PCRE_UTF8
) {
1837 if (bump_pce
== nullptr) {
1838 auto const DEBUG_ONLY ok
= pcre_get_compiled_regex_cache(
1839 bump_accessor
, s_OneUnicodeCharPattern
.get());
1841 bump_pce
= bump_accessor
.get();
1843 pcre_extra bump_extra
;
1844 init_local_extra(&bump_extra
, bump_pce
->extra
);
1845 count
= pcre_exec(bump_pce
->re
, &bump_extra
, subject
.data(),
1846 subject
.size(), start_offset
,
1847 utf8_check
, offsets
, size_offsets
);
1849 raise_warning("Unknown error");
1850 offsets
[0] = start_offset
;
1851 offsets
[1] = start_offset
+ 1;
1852 if (pcre_need_log_error(count
)) {
1853 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1854 pattern
.data(), pattern
.size(),
1855 subject
.data(), subject
.size(),
1857 limit
, flags
, start_offset
);
1861 offsets
[0] = start_offset
;
1862 offsets
[1] = start_offset
+ 1;
1867 if (pcre_need_log_error(count
)) {
1868 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1869 pattern
.data(), pattern
.size(),
1870 subject
.data(), subject
.size(),
1872 limit
, flags
, start_offset
, g_notempty
);
1874 // NOTE: this returns an error together with a partial result :-(
1875 start_offset
= last_match
- subject
.data(); /* offset might have
1877 * but without further
1878 * successful matches */
1879 if (!no_empty
|| start_offset
< subject
.size()) {
1880 auto const match
= subject
.substr(start_offset
);
1881 auto const value
= offset_capture
1882 ? Variant(str_offset_pair(match
, start_offset
))
1884 assertx(result
->isVectorData());
1885 result
.set(safe_cast
<int64_t>(result
.size()), value
);
1887 return preg_return_pcre_error(count
, std::move(result
));
1890 /* If we have matched an empty string, mimic what Perl's /g option does.
1891 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1892 the match again at the same point. If this fails (picked up above) we
1893 advance to the next character. */
1894 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1896 /* Advance to the position right after the last full match */
1897 start_offset
= offsets
[1];
// After the loop: emit whatever follows the last delimiter, unless it is
// empty and PREG_SPLIT_NO_EMPTY was requested.
1900 start_offset
= last_match
- subject
.data(); /* offset might have
1902 * but without further
1903 * successful matches */
1904 if (!no_empty
|| start_offset
< subject
.size()) {
1905 auto const match
= subject
.substr(start_offset
);
1906 auto const value
= offset_capture
1907 ? Variant(str_offset_pair(match
, start_offset
))
1909 assertx(result
->isVectorData());
1910 result
.set(safe_cast
<int64_t>(result
.size()), value
);
1913 return preg_return_no_error(std::move(result
));
1916 ///////////////////////////////////////////////////////////////////////////////
/*
 * Escapes regex metacharacters in `str`.
 *
 * str       - text to quote.
 * delimiter - optional; when non-empty its first character is taken as an
 *             additional character to quote (see quote_delim / delim_char).
 *
 * The output buffer is reserved at 4 * str.size() + 1 bytes and the result
 * is trimmed to the number of bytes actually written (q - out_str).
 * NOTE(review): the per-character emit logic and the early return for an
 * empty input are not visible in this listing; only the case labels of the
 * character switch are.
 */
1918 String
preg_quote(const String
& str
,
1919 const String
& delimiter
/* = null_string */) {
1920 const char* in_str
= str
.data();
1921 const char* in_str_end
= in_str
+ str
.size();
1923 /* Nothing to do if we got an empty string */
1924 if (in_str
== in_str_end
) {
1928 char delim_char
= 0; /* Delimiter character to be quoted */
1929 bool quote_delim
= false; /* Whether to quote additional delim char */
1930 if (!delimiter
.empty()) {
1931 delim_char
= delimiter
.charAt(0);
1935 /* Allocate enough memory so that even if each character
1936 is quoted, we won't run out of room */
1938 (StringData::MaxSize
* 4 + 1) < std::numeric_limits
<int64_t>::max()
1940 String
ret(4 * str
.size() + 1, ReserveString
);
1941 char* out_str
= ret
.mutableData();
1943 /* Go through the string and quote necessary characters */
// p walks the input, q is the write cursor into out_str.
1946 for (p
= in_str
, q
= out_str
; p
!= in_str_end
; p
++) {
1949 case '.': case '\\': case '+': case '*': case '?':
1950 case '[': case '^': case ']': case '$': case '(':
1951 case ')': case '{': case '}': case '=': case '!':
1952 case '>': case '<': case '|': case ':': case '-':
1966 if (quote_delim
&& c
== delim_char
)
1974 return ret
.setSize(q
- out_str
);
1977 ///////////////////////////////////////////////////////////////////////////////
1980 int preg_last_error() {
1981 return *rl_last_error_code
;
/*
 * On scope exit, publishes the error state accumulated while the guard was
 * alive and restores the previous one: when rl_last_error_code is anything
 * other than PHP_PCRE_NO_ERROR it is copied into `error`, and in every case
 * rl_last_error_code is reset to `prior_error` afterwards.
 * NOTE(review): what `error` receives in the no-error case is not visible
 * in this listing.
 */
1984 PregWithErrorGuard::~PregWithErrorGuard() {
1985 if (*rl_last_error_code
== PHP_PCRE_NO_ERROR
) {
1988 error
= *rl_last_error_code
;
1990 *rl_last_error_code
= prior_error
;
1993 size_t preg_pcre_cache_size() {
1994 return s_pcreCache
.size();
1997 ///////////////////////////////////////////////////////////////////////////////
/*
 * Raises a warning describing a POSIX regex error.
 *
 * err - error code from regcomp()/regexec().
 * re  - the regex_t the error belongs to.
 *
 * Two texts are requested from regerror(): one for REG_ITOA | err and the
 * regular message for err. Both are meant to be combined in `message` as
 * "<first>: <second>" and handed to raise_warning(). Allocation failures
 * make the function return without reporting anything.
 */
2000 static void php_reg_eprint(int err
, regex_t
* re
) {
2001 char *buf
= nullptr, *message
= nullptr;
2006 /* get the length of the message */
2007 buf_len
= regerror(REG_ITOA
| err
, re
, nullptr, 0);
2009 buf
= (char *)req::malloc_noptrs(buf_len
);
2010 if (!buf
) return; /* fail silently */
2011 /* finally, get the error message */
2012 regerror(REG_ITOA
| err
, re
, buf
, buf_len
);
2017 len
= regerror(err
, re
, nullptr, 0);
// Room for both texts plus the ": " separator.
2019 message
= (char *)req::malloc_noptrs(buf_len
+ len
+ 2);
2021 return; /* fail silently */
// NOTE(review): the size handed to snprintf() is buf_len, the value
// regerror() returned for the first text. If that value already counts the
// terminating NUL (as POSIX specifies for regerror), the ": " suffix cannot
// fit, and the second text written at message + buf_len below ends up
// behind a NUL terminator, so the warning would show only the first text.
// Confirm, and consider passing buf_len + 2 instead.
2024 snprintf(message
, buf_len
, "%s: ", buf
);
2025 buf_len
+= 1; /* so pointer math below works */
2027 /* drop the message into place */
2028 regerror(err
, re
, message
+ buf_len
, len
);
2029 raise_warning("%s", message
);
/*
 * Splits `str` on matches of the POSIX extended regex `spliton`.
 *
 * spliton - pattern, compiled with REG_EXTENDED (plus REG_ICASE when the
 *           `icase` value is set).
 * str     - text to split.
 * count   - -1 for unlimited; otherwise splitting continues only while
 *           count > 1.
 *
 * Pieces are appended to a vec array: an empty string when a non-empty
 * match sits at the very start of the remaining text, otherwise the text in
 * front of the match. The remaining tail is appended after the loop and the
 * array is returned. Compile and match errors are reported through
 * php_reg_eprint(); a match that is empty at position 0 raises the
 * "Invalid Regular Expression to split()" warning.
 * NOTE(review): the values returned on the error paths and the release of
 * the compiled regex are not visible in this listing.
 */
2035 Variant
php_split(const String
& spliton
, const String
& str
, int count
,
2037 const char* strp
= str
.data();
2038 const char* endp
= strp
+ str
.size();
2041 int copts
= icase
? REG_ICASE
: 0;
2042 int err
= regcomp(&re
, spliton
.data(), REG_EXTENDED
| copts
);
2044 php_reg_eprint(err
, &re
);
2048 Array return_value
= Array::CreateVec();
2051 /* churn through str, generating array entries as we go */
2052 while ((count
== -1 || count
> 1) &&
2053 !(err
= regexec(&re
, strp
, 1, subs
, 0))) {
2054 if (subs
[0].rm_so
== 0 && subs
[0].rm_eo
) {
2055 /* match is at start of string, return empty string */
2056 return_value
.append("");
2057 /* skip ahead the length of the regex match */
2058 strp
+= subs
[0].rm_eo
;
2059 } else if (subs
[0].rm_so
== 0 && subs
[0].rm_eo
== 0) {
2060 /* No more matches */
2062 raise_warning("Invalid Regular Expression to split()");
2065 /* On a real match */
2067 /* make a copy of the substring */
2068 int size
= subs
[0].rm_so
;
2070 /* add it to the array */
2071 return_value
.append(String(strp
, size
, CopyString
));
2073 /* point at our new starting point */
2074 strp
= strp
+ subs
[0].rm_eo
;
2077 /* if we're only looking for a certain number of points,
2078 stop looking once we hit it */
2084 /* see if we encountered an error */
2085 if (err
&& err
!= REG_NOMATCH
) {
2086 php_reg_eprint(err
, &re
);
2091 /* otherwise we just have one last element to add to the array */
2092 int size
= endp
- strp
;
2093 return_value
.append(String(strp
, size
, CopyString
));
2096 return return_value
;
2099 ///////////////////////////////////////////////////////////////////////////////