2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
17 #include "hphp/runtime/base/preg.h"
23 #include <onigposix.h>
26 #include <folly/AtomicHashArray.h>
28 #include "hphp/runtime/base/array-init.h"
29 #include "hphp/runtime/base/array-iterator.h"
30 #include "hphp/runtime/base/builtin-functions.h"
31 #include "hphp/runtime/base/container-functions.h"
32 #include "hphp/runtime/base/execution-context.h"
33 #include "hphp/runtime/base/ini-setting.h"
34 #include "hphp/runtime/base/runtime-option.h"
35 #include "hphp/runtime/base/string-util.h"
36 #include "hphp/runtime/base/init-fini-node.h"
37 #include "hphp/runtime/base/zend-functions.h"
38 #include "hphp/runtime/vm/debug/debug.h"
39 #include "hphp/runtime/vm/treadmill.h"
40 #include "hphp/runtime/vm/vm-regs.h"
42 #include "hphp/runtime/ext/std/ext_std_function.h"
43 #include "hphp/runtime/ext/string/ext_string.h"
45 #include "hphp/runtime/vm/jit/mcgen.h"
46 #include "hphp/runtime/vm/jit/types.h"
47 #include "hphp/runtime/vm/jit/vtune-jit.h"
49 #include "hphp/util/logger.h"
50 #include "hphp/util/concurrent-scalable-cache.h"
52 #include <folly/json.h>
54 /* Only defined in pcre >= 8.32 */
55 #ifndef PCRE_STUDY_JIT_COMPILE
56 # define PCRE_STUDY_JIT_COMPILE 0
63 ///////////////////////////////////////////////////////////////////////////////
64 // PCREglobals definition
66 PCREglobals::PCREglobals() {
67 jit_stack
= pcre_jit_stack_alloc(32768, 524288);
68 // Set these to handle uses of pcre prior to PcreExtension::threadInit
69 // In particular, for matching tier overrides during RuntimeOption::Load
70 preg_backtrace_limit
= RuntimeOption::PregBacktraceLimit
;
71 preg_recursion_limit
= RuntimeOption::PregRecursionLimit
;
74 PCREglobals::~PCREglobals() {
75 pcre_jit_stack_free(jit_stack
);
78 ///////////////////////////////////////////////////////////////////////////////
79 // PCRECache definition
82 typedef std::shared_ptr
<const pcre_cache_entry
> EntryPtr
;
83 typedef std::unique_ptr
<LRUCacheKey
> TempKeyCache
;
// Selects which cache implementation backs PCRECache. The enumerators used
// elsewhere in this file are Static, Lru and Scalable.
85 enum class CacheKind
{
92 struct ahm_string_data_same
{
93 bool operator()(const StringData
* s1
, const StringData
* s2
) {
94 // ahm uses -1, -2, -3 as magic values
95 return int64_t(s1
) > 0 && (s1
== s2
|| s1
->same(s2
));
99 typedef folly::AtomicHashArray
<const StringData
*, const pcre_cache_entry
*,
100 string_data_hash
, ahm_string_data_same
> StaticCache
;
101 typedef ConcurrentLRUCache
<LRUCacheKey
, EntryPtr
,
102 LRUCacheKey::HashCompare
> LRUCache
;
103 typedef ConcurrentScalableCache
<LRUCacheKey
, EntryPtr
,
104 LRUCacheKey::HashCompare
> ScalableCache
;
105 typedef StaticCache::value_type StaticCachePair
;
// Accessor construction/destruction (fragments). A new Accessor starts in
// the Empty state; on teardown the active member of m_u is destroyed by an
// explicit destructor call (EntryPtr or LRUCache::ConstAccessor).
110 : m_kind(Kind::Empty
)
119 m_u
.smart_ptr
.~EntryPtr();
121 case Kind::AccessorKind
:
122 m_u
.accessor
.~ConstAccessor();
// Assignment from a raw cache-entry pointer. The assertion below only allows
// this while the accessor is Empty or already holds a raw pointer.
127 Accessor
& operator=(const pcre_cache_entry
* ptr
) {
128 assertx(m_kind
== Kind::Empty
|| m_kind
== Kind::Ptr
);
// Assignment from an EntryPtr rvalue. Visible behaviour: a held LRU accessor
// is destroyed first; the slot then becomes a SmartPtr via placement-new, or
// (presumably when it already was a SmartPtr) the existing EntryPtr is
// move-assigned.
134 Accessor
& operator=(EntryPtr
&& ep
) {
136 case Kind::AccessorKind
:
137 m_u
.accessor
.~ConstAccessor();
140 m_kind
= Kind::SmartPtr
;
141 new (&m_u
.smart_ptr
) EntryPtr(std::move(ep
));
144 m_u
.smart_ptr
= std::move(ep
);
150 // No assignment from LRUCache::ConstAccessor since it is non-copyable
151 // Use resetToLRU instead
// Visible steps: destroy a held EntryPtr, switch m_kind to AccessorKind and
// placement-construct a fresh LRUCache::ConstAccessor in m_u.accessor. The
// return type is a reference to an LRUCache::ConstAccessor.
152 LRUCache::ConstAccessor
& resetToLRU() {
155 m_u
.smart_ptr
.~EntryPtr();
158 m_kind
= Kind::AccessorKind
;
159 new (&m_u
.accessor
) LRUCache::ConstAccessor();
161 case Kind::AccessorKind
:
// Returns the cache entry this accessor currently refers to, whatever the
// storage kind: nullptr when Empty, the raw pointer for Ptr, and the
// pointee of the shared pointer / LRU accessor otherwise.
167 const pcre_cache_entry
* get() {
169 case Kind::Empty
: return nullptr;
170 case Kind::Ptr
: return m_u
.ptr
;
171 case Kind::SmartPtr
: return m_u
.smart_ptr
.get();
172 case Kind::AccessorKind
: return m_u
.accessor
->get();
174 always_assert(false);
// Returns the held EntryPtr by const reference. Asserts that the accessor
// is in the SmartPtr state.
177 const EntryPtr
& entryPtr() const {
178 assertx(m_kind
== Kind::SmartPtr
);
179 return m_u
.smart_ptr
;
// Discriminator for m_u (enumerators used in this file: Empty, Ptr,
// SmartPtr, AccessorKind), followed by two of the storage alternatives it
// selects between: a raw entry pointer and an LRU cache accessor.
183 enum class Kind
: uint8_t {
194 const pcre_cache_entry
* ptr
;
196 LRUCache::ConstAccessor accessor
;
// PCRECache construction/destruction (fragments). The cache starts out as a
// Static cache with no table and is then set up through reinit(); on
// teardown a live static table is released through DestroyStatic().
204 : m_kind(CacheKind::Static
), m_staticCache(nullptr)
206 reinit(CacheKind::Static
);
210 if (m_kind
== CacheKind::Static
&& m_staticCache
.load()) {
211 DestroyStatic(m_staticCache
);
// Cache operations (defined out of line further down in this file):
//   reinit - drop the current backing store and create one of `kind`
//   find   - look up a regex; on a hit `accessor` refers to the entry
//   insert - add a freshly built entry for `regex`
//   dump   - write the cached keys to `filename`, one per line
215 void reinit(CacheKind kind
);
216 bool find(Accessor
& accessor
, const StringData
* key
,
217 TempKeyCache
& keyCache
);
218 void insert(Accessor
& accessor
, const StringData
* regex
,
219 TempKeyCache
& keyCache
, const pcre_cache_entry
* ent
);
220 void dump(const std::string
& filename
);
// Helpers that create and tear down the static (AtomicHashArray) table.
226 static void DestroyStatic(StaticCache
* cache
);
227 static StaticCache
* CreateStatic();
// Backing stores, one per CacheKind. The static table is held in an atomic
// pointer because clearStatic() swaps it with exchange().
230 std::atomic
<StaticCache
*> m_staticCache
;
231 std::unique_ptr
<LRUCache
> m_lruCache
;
232 std::unique_ptr
<ScalableCache
> m_scalableCache
;
// Time after which insert() treats the static cache as due for a refresh.
233 std::atomic
<time_t> m_expire
{};
// Taken with try_to_lock in clearStatic().
234 std::mutex m_clearMutex
;
237 ///////////////////////////////////////////////////////////////////////////////
// PCRE globals (JIT stack and match limits), declared via RDS_LOCAL.
240 RDS_LOCAL(PCREglobals
, tl_pcre_globals
);
// The single compiled-regex cache used by the functions in this file.
242 static PCRECache s_pcreCache
;
244 // The last pcre error code is available for the whole thread.
245 static RDS_LOCAL(int, rl_last_error_code
);
247 ///////////////////////////////////////////////////////////////////////////////
248 // pcre_cache_entry implementation
// Destructor (fragment). The PCRE version check selects how the study data
// in `extra` is released; only the pcre_free_study() call is visible here.
250 pcre_cache_entry::~pcre_cache_entry() {
252 #if PCRE_MAJOR < 8 || (PCRE_MAJOR == 8 && PCRE_MINOR < 20)
255 pcre_free_study(extra
);
// Tries to recognise `pattern` as a plain literal string (fragment).
// - Compile options other than PCRE_CASELESS are tested up front.
// - Characters are accumulated while they are alphanumeric or one of
//   "/\\ :-_"; a backslash is only accepted in front of a non-alphanumeric
//   character (see the comments inside the loop).
// - When the pattern qualifies, literal_str receives the accumulated text
//   and case_insensitive mirrors the PCRE_CASELESS bit.
262 pcre_literal_data::pcre_literal_data(const char* pattern
, int coptions
) {
263 if (coptions
& ~PCRE_CASELESS
) {
273 std::string pattern_buffer
;
274 while (isalnum((unsigned char)*p
) || (*p
&& strchr("/\\ :-_", *p
))) {
275 // backslash + alphanumeric character --> not a literal (i.e. \d).
276 // backslash + non-alphanumeric character --> literal symbol (i.e. \.)
278 if (!p
[1] || isalnum((unsigned char)p
[1])) {
284 pattern_buffer
+= *p
++;
291 /* This is an encoding of a literal string. */
292 case_insensitive
= coptions
& PCRE_CASELESS
;
293 literal_str
= std::move(pattern_buffer
);
297 bool pcre_literal_data::isLiteral() const {
298 return literal_str
.has_value();
// Matches the stored literal against `subject` without running PCRE
// (fragment). Requires isLiteral(). On a match the begin/end byte offsets
// are written to offsets[0] / offsets[1].
// Three strategies are visible below:
//   1. anchored at the start: compare the first literal_strlen bytes;
//   2. anchored at the end only (match_end): compare the subject's tail;
//   3. otherwise: substring search starting at `pos`.
// case_insensitive selects bstrcaseeq() over memcmp() and a
// case-insensitive find().
301 bool pcre_literal_data::matches(const StringData
* subject
,
303 int* offsets
) const {
304 assertx(isLiteral());
307 // Subject must be at least as long as the literal pattern
308 // for a match to occur.
309 if (subject
->size() < literal_str
->length() + pos
) {
313 size_t literal_strlen
= literal_str
->length();
314 auto const subject_c
= subject
->data();
315 auto const literal_c
= literal_str
->c_str();
317 // Make sure an exact match has the right length.
318 if (pos
|| (match_end
&& subject
->size() != literal_strlen
)) {
321 // If only matching the start (^), compare the strings
322 // for the length of the literal pattern.
323 if (case_insensitive
?
324 bstrcaseeq(subject_c
, literal_c
, literal_strlen
) :
325 memcmp(subject_c
, literal_c
, literal_strlen
) == 0) {
327 offsets
[1] = literal_strlen
* sizeof(char);
330 } else if (match_end
) {
331 // Compare the literal pattern against the tail end of the subject.
332 auto const subject_tail
= subject_c
+ (subject
->size() - literal_strlen
);
333 if (case_insensitive
?
334 bstrcaseeq(subject_tail
, literal_c
, literal_strlen
) :
335 memcmp(subject_tail
, literal_c
, literal_strlen
) == 0) {
336 offsets
[0] = (subject
->size() - literal_strlen
) * sizeof(char);
337 offsets
[1] = subject
->size() * sizeof(char);
341 if (!literal_strlen
) {
342 offsets
[0] = offsets
[1] = pos
;
345 // Check if the literal pattern occurs as a substring of the subject.
346 auto const subject_str
= StrNR(subject
);
347 auto const find_response
= subject_str
.asString().find(
348 *literal_str
, pos
, !case_insensitive
);
349 if (find_response
>= 0) {
350 offsets
[0] = find_response
* sizeof(char);
351 offsets
[1] = offsets
[0] + literal_strlen
* sizeof(char);
358 ///////////////////////////////////////////////////////////////////////////////
359 // PCRECache implementation
361 PCRECache::StaticCache
* PCRECache::CreateStatic() {
362 StaticCache::Config config
;
363 config
.maxLoadFactor
= 0.5;
364 return StaticCache::create(
365 RuntimeOption::EvalPCRETableSize
, config
).release();
// Tears down a static cache table (fragment): releases every uncounted key
// string while iterating, then destroys the table itself.
// NOTE(review): a line inside the loop appears to be missing from this
// view; confirm how the cached entry values are released.
368 void PCRECache::DestroyStatic(StaticCache
* cache
) {
369 // We delete uncounted keys while iterating the cache, which is OK for
370 // AtomicHashArray, but not OK for other containers, such as
371 // std::unordered_map. If you change the cache type make sure that property
372 // holds or fix this function.
373 static_assert(std::is_same
<PCRECache::StaticCache
,
374 folly::AtomicHashArray
<const StringData
*, const pcre_cache_entry
*,
375 string_data_hash
, ahm_string_data_same
>>::value
,
376 "StaticCache must be an AtomicHashArray or this destructor is wrong.");
377 for (auto& it
: *cache
) {
378 if (it
.first
->isUncounted()) {
379 StringData::ReleaseUncounted(it
.first
);
383 StaticCache::destroy(cache
);
// Switches the cache to `kind` (fragment). The first group of cases tears
// down the existing backing store (static table destroyed and nulled,
// scalable cache reset); the second group creates the new one, sized by
// Eval.PCRETableSize. For the static kind the expiry time is also set to
// now + Eval.PCREExpireInterval.
386 void PCRECache::reinit(CacheKind kind
) {
388 case CacheKind::Static
:
389 if (m_staticCache
.load()) {
390 DestroyStatic(m_staticCache
);
391 m_staticCache
= nullptr;
397 case CacheKind::Scalable
:
398 m_scalableCache
.reset();
404 case CacheKind::Static
:
405 m_staticCache
= CreateStatic();
406 m_expire
= time(nullptr) + RuntimeOption::EvalPCREExpireInterval
;
409 m_lruCache
.reset(new LRUCache(RuntimeOption::EvalPCRETableSize
));
411 case CacheKind::Scalable
:
412 m_scalableCache
.reset(
413 new ScalableCache(RuntimeOption::EvalPCRETableSize
));
// Looks `regex` up in the active cache (fragment).
// - Static: searches the atomic hash array; on a hit `accessor` is pointed
//   at the stored raw entry.
// - Lru / Scalable: builds an LRUCacheKey from the regex bytes into
//   `keyCache` and queries the cache through accessor.resetToLRU().
418 bool PCRECache::find(Accessor
& accessor
,
419 const StringData
* regex
,
420 TempKeyCache
& keyCache
)
423 case CacheKind::Static
:
425 assertx(m_staticCache
.load());
426 StaticCache::iterator it
;
427 auto cache
= m_staticCache
.load(std::memory_order_acquire
);
428 if ((it
= cache
->find(regex
)) != cache
->end()) {
429 accessor
= it
->second
;
435 case CacheKind::Scalable
:
438 keyCache
.reset(new LRUCacheKey(regex
->data(), regex
->size()));
441 if (m_kind
== CacheKind::Lru
) {
442 found
= m_lruCache
->find(accessor
.resetToLRU(), *keyCache
);
444 found
= m_scalableCache
->find(accessor
.resetToLRU(), *keyCache
);
449 always_assert(false);
// Replaces the static cache table with a fresh one (fragment).
// Visible steps: try to take m_clearMutex without blocking, push the expiry
// time forward by Eval.PCREExpireInterval, atomically exchange in a newly
// created table, and hand the old table to Treadmill::enqueue() so that it
// is destroyed later rather than immediately.
452 void PCRECache::clearStatic() {
453 std::unique_lock
<std::mutex
> lock(m_clearMutex
, std::try_to_lock
);
456 auto newExpire
= time(nullptr) + RuntimeOption::EvalPCREExpireInterval
;
457 m_expire
.store(newExpire
, std::memory_order_relaxed
);
459 auto tmpMap
= CreateStatic();
460 tmpMap
= m_staticCache
.exchange(tmpMap
, std::memory_order_acq_rel
);
462 Treadmill::enqueue([tmpMap
]() {
463 DestroyStatic(tmpMap
);
// Adds `ent` to the active cache under `regex` (fragment).
// - Static: once the expiry time has passed the cache is due for clearing.
//   The key is the regex itself when it is uncounted and could be
//   inc-ref'd, otherwise an uncounted copy. If the insert does not take
//   place, the caller keeps ownership through `accessor` (see the comments
//   below).
// - Lru / Scalable: ownership is shared; `accessor` holds an EntryPtr that
//   is also stored in the cache under a key built from the regex bytes.
467 void PCRECache::insert(
469 const StringData
* regex
,
470 TempKeyCache
& keyCache
,
471 const pcre_cache_entry
* ent
474 case CacheKind::Static
:
476 assertx(m_staticCache
.load());
477 // Clear the cache if we haven't refreshed it in a while
478 if (time(nullptr) > m_expire
) {
481 auto const cache
= m_staticCache
.load(std::memory_order_acquire
);
484 (regex
->isUncounted() && regex
->uncountedIncRef()) ?
485 regex
: StringData::MakeUncounted(regex
->slice());
486 auto pair
= cache
->insert(StaticCachePair(key
, ent
));
488 // Inserted, container owns the pointer
491 // Not inserted, caller needs to own the pointer
492 if (regex
->isUncounted()) StringData::ReleaseUncounted(key
);
493 accessor
= EntryPtr(ent
);
498 case CacheKind::Scalable
:
501 keyCache
.reset(new LRUCacheKey(regex
->data(), regex
->size()));
503 // Pointer ownership is shared between container and caller
504 accessor
= EntryPtr(ent
);
505 if (m_kind
== CacheKind::Lru
) {
506 m_lruCache
->insert(*keyCache
, accessor
.entryPtr());
508 m_scalableCache
->insert(*keyCache
, accessor
.entryPtr());
// Writes every cached regex to `filename`, one per line (fragment). The
// static cache is iterated directly; for the Lru / Scalable caches a
// snapshot of the keys is taken first.
515 void PCRECache::dump(const std::string
& filename
) {
516 std::ofstream
out(filename
.c_str());
518 case CacheKind::Static
:
519 for (auto& it
: *m_staticCache
) {
520 out
<< it
.first
->data() << "\n";
524 case CacheKind::Scalable
:
526 std::vector
<LRUCacheKey
> keys
;
527 if (m_kind
== CacheKind::Lru
) {
528 m_lruCache
->snapshotKeys(keys
);
530 m_scalableCache
->snapshotKeys(keys
);
532 for (auto& key
: keys
) {
533 out
<< key
.c_str() << "\n";
541 size_t PCRECache::size() const {
543 case CacheKind::Static
:
544 return m_staticCache
.load(std::memory_order_acquire
)->size();
546 return m_lruCache
->size();
547 case CacheKind::Scalable
:
548 return m_scalableCache
->size();
550 always_assert(false);
553 ///////////////////////////////////////////////////////////////////////////////
554 // Public interface and helper functions
// Maps the Eval.PCRECacheType setting ("static", "lru" or "scalable") to a
// CacheKind and re-initialises the global cache with it (fragment; the
// enclosing function header is not visible here). Any other value logs a
// warning and falls back to the scalable cache.
557 PCRECache::CacheKind kind
;
558 if (RuntimeOption::EvalPCRECacheType
== "static") {
559 kind
= PCRECache::CacheKind::Static
;
560 } else if (RuntimeOption::EvalPCRECacheType
== "lru") {
561 kind
= PCRECache::CacheKind::Lru
;
562 } else if (RuntimeOption::EvalPCRECacheType
== "scalable") {
563 kind
= PCRECache::CacheKind::Scalable
;
565 Logger::Warning("Eval.PCRECacheType should be either static, "
567 kind
= PCRECache::CacheKind::Scalable
;
569 s_pcreCache
.reinit(kind
);
575 void pcre_dump_cache(const std::string
& filename
) {
576 s_pcreCache
.dump(filename
);
579 static pcre_jit_stack
* alloc_jit_stack(void* /*data*/) {
580 return tl_pcre_globals
->jit_stack
;
// Scope guard that frees a raw buffer (fragment). The template flag picks
// the deallocator: req::free() when useSmartFree is true, plain free()
// otherwise. Copying is disabled. SmartFreeHelper is the req::free flavour.
585 template<bool useSmartFree
= false>
586 struct FreeHelperImpl
{
587 explicit FreeHelperImpl(void* p
) : p(p
) {}
589 useSmartFree
? req::free(p
) : free(p
);
592 FreeHelperImpl(const FreeHelperImpl
&) = delete;
593 FreeHelperImpl
& operator=(const FreeHelperImpl
&) = delete;
599 typedef FreeHelperImpl
<true> SmartFreeHelper
;
602 static void init_local_extra(pcre_extra
* local
, pcre_extra
* shared
) {
604 memcpy(local
, shared
, sizeof(pcre_extra
));
606 memset(local
, 0, sizeof(pcre_extra
));
607 local
->flags
= PCRE_EXTRA_MATCH_LIMIT
| PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
609 local
->match_limit
= tl_pcre_globals
->preg_backtrace_limit
;
610 local
->match_limit_recursion
= tl_pcre_globals
->preg_recursion_limit
;
613 static const char* const*
614 get_subpat_names(const pcre_cache_entry
* pce
) {
615 char **subpat_names
= pce
->subpat_names
.load(std::memory_order_relaxed
);
621 * Build a mapping from subpattern numbers to their names. We will always
622 * allocate the table, even though there may be no named subpatterns. This
623 * avoids somewhat more complicated logic in the inner loops.
626 init_local_extra(&extra
, pce
->extra
);
630 subpat_names
= (char **)calloc(pce
->num_subpats
, sizeof(char *));
631 int rc
= pcre_fullinfo(pce
->re
, &extra
, PCRE_INFO_NAMECOUNT
, &name_count
);
633 raise_warning("Internal pcre_fullinfo() error %d", rc
);
636 if (name_count
> 0) {
637 int name_size
, ni
= 0;
638 unsigned short name_idx
;
642 rc1
= pcre_fullinfo(pce
->re
, &extra
, PCRE_INFO_NAMETABLE
, &name_table
);
643 rc2
= pcre_fullinfo(pce
->re
, &extra
, PCRE_INFO_NAMEENTRYSIZE
, &name_size
);
644 rc
= rc2
? rc2
: rc1
;
646 raise_warning("Internal pcre_fullinfo() error %d", rc
);
649 while (ni
++ < name_count
) {
650 name_idx
= 0xff * (unsigned char)name_table
[0] +
651 (unsigned char)name_table
[1];
652 subpat_names
[name_idx
] = name_table
+ 2;
653 if (is_numeric_string(subpat_names
[name_idx
],
654 strlen(subpat_names
[name_idx
]),
655 nullptr, nullptr, 0) != KindOfNull
) {
656 raise_warning("Numeric named subpatterns are not allowed");
659 name_table
+= name_size
;
662 // Store subpat_names into the cache entry
663 char **expected
= nullptr;
664 if (!pce
->subpat_names
.compare_exchange_strong(expected
, subpat_names
)) {
665 // Another thread stored subpat_names already. The array created by the
666 // other thread is now in expected, return it instead and delete the one
// Queries PCRE for the capture count of the compiled pattern in `pce`
// (fragment); this is what the offsets array is sized from. A
// pcre_fullinfo() failure is reported with a warning.
674 static bool get_pcre_fullinfo(pcre_cache_entry
* pce
) {
676 init_local_extra(&extra
, pce
->extra
);
678 /* Calculate the size of the offsets array*/
679 int rc
= pcre_fullinfo(pce
->re
, &extra
, PCRE_INFO_CAPTURECOUNT
,
682 raise_warning("Internal pcre_fullinfo() error %d", rc
);
// Resolves `regex` (a delimited pattern plus modifiers) to a compiled cache
// entry reachable through `accessor` (fragment). Visible phases:
//   1. cache lookup in s_pcreCache;
//   2. skip leading whitespace, validate and locate the delimiters;
//   3. parse the trailing modifiers into compile / preg option bits;
//   4. reject patterns that contain an embedded null byte;
//   5. pcre_compile(), then pcre_study() with JIT unless the pattern is a
//      plain literal, optionally reporting the JIT code range to the
//      debug-info / VTune / perf-map facilities;
//   6. build a pcre_cache_entry and insert it into the cache.
// Problems are reported through raise_warning() / raise_error().
690 pcre_get_compiled_regex_cache(PCRECache::Accessor
& accessor
,
691 const StringData
* regex
) {
692 PCRECache::TempKeyCache tkc
;
694 /* Try to lookup the cached regex entry, and if successful, just pass
695 back the compiled pattern, otherwise go on and compile it. */
696 if (s_pcreCache
.find(accessor
, regex
, tkc
)) {
700 /* Parse through the leading whitespace, and display a warning if we
701 get to the end without encountering a delimiter. */
702 const char *p
= regex
->data();
703 while (isspace((int)*(unsigned char *)p
)) p
++;
705 raise_warning("Empty regular expression");
709 /* Get the delimiter and display a warning if it is alphanumeric
711 char delimiter
= *p
++;
712 if (isalnum((int)*(unsigned char *)&delimiter
) || delimiter
== '\\') {
713 raise_warning("Delimiter must not be alphanumeric or backslash");
717 char start_delimiter
= delimiter
;
718 const char *pp
= strchr("([{< )]}> )]}>", delimiter
);
722 char end_delimiter
= delimiter
;
724 if (start_delimiter
== end_delimiter
) {
725 /* We need to iterate through the pattern, searching for the ending
726 * delimiter, but skipping the backslashed delimiters. If the ending
727 * delimiter is not found, display a warning. */
730 if (*pp
== '\\' && pp
[1] != 0) pp
++;
731 else if (*pp
== delimiter
)
736 raise_warning("No ending delimiter '%c' found: [%s]", delimiter
,
741 /* We iterate through the pattern, searching for the matching ending
742 * delimiter. For each matching starting delimiter, we increment nesting
743 * level, and decrement it for each matching ending delimiter. If we
744 * reach the end of the pattern without matching, display a warning.
746 int brackets
= 1; // brackets nesting level
749 if (*pp
== '\\' && pp
[1] != 0) pp
++;
750 else if (*pp
== end_delimiter
&& --brackets
<= 0)
752 else if (*pp
== start_delimiter
)
757 raise_warning("No ending matching delimiter '%c' found: [%s]",
758 end_delimiter
, regex
->data());
763 /* Make a copy of the actual pattern. */
764 String
spattern(p
, pp
-p
, CopyString
);
765 const char *pattern
= spattern
.data();
767 /* Move on to the options */
770 /* Parse through the options, setting appropriate flags. Display
771 a warning if we encounter an unknown modifier. */
774 bool do_study
= false;
777 /* Perl compatible options */
778 case 'i': coptions
|= PCRE_CASELESS
; break;
779 case 'm': coptions
|= PCRE_MULTILINE
; break;
780 case 's': coptions
|= PCRE_DOTALL
; break;
781 case 'x': coptions
|= PCRE_EXTENDED
; break;
783 /* PCRE specific options */
784 case 'A': coptions
|= PCRE_ANCHORED
; break;
785 case 'D': coptions
|= PCRE_DOLLAR_ENDONLY
; break;
786 case 'S': do_study
= true; break;
787 case 'U': coptions
|= PCRE_UNGREEDY
; break;
788 case 'X': coptions
|= PCRE_EXTRA
; break;
789 case 'u': coptions
|= PCRE_UTF8
;
790 /* In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII
791 characters, even in UTF-8 mode. However, this can be changed by setting
792 the PCRE_UCP option. */
794 coptions
|= PCRE_UCP
;
798 /* Custom preg options */
799 case 'e': poptions
|= PREG_REPLACE_EVAL
; break;
806 raise_warning("Unknown modifier '%c': [%s]", pp
[-1], regex
->data());
811 /* We've reached a null byte, now check if we're actually at the end of the
812 string. If not this is a bad expression, and a potential security hole. */
813 if (regex
->size() != (pp
- regex
->data())) {
814 raise_error("Error: Null byte found in pattern");
817 /* Compile pattern and display a warning if compilation failed. */
820 pcre
*re
= pcre_compile(pattern
, coptions
, &error
, &erroffset
, 0);
822 raise_warning("Compilation failed: %s at offset %d", error
, erroffset
);
826 // Careful: from here 're' needs to be freed if something throws.
828 // TODO(t14969501): enable literal_data everywhere and skip the
829 // pcre_compile above.
830 auto const literal_data
= pcre_literal_data(pattern
, coptions
);
832 /* If study option was specified, study the pattern and
833 store the result in extra for passing to pcre_exec. */
834 pcre_extra
*extra
= nullptr;
835 if (!literal_data
.isLiteral()) {
836 if (do_study
|| PCRE_STUDY_JIT_COMPILE
) {
837 int soptions
= PCRE_STUDY_JIT_COMPILE
;
838 extra
= pcre_study(re
, soptions
, &error
);
840 extra
->flags
|= PCRE_EXTRA_MATCH_LIMIT
|
841 PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
842 pcre_assign_jit_stack(extra
, alloc_jit_stack
, nullptr);
844 if (error
!= nullptr) {
846 raise_warning("Error while studying pattern");
852 if ((!RuntimeOption::EvalJitNoGdb
||
853 RuntimeOption::EvalJitUseVtuneAPI
||
854 RuntimeOption::EvalPerfPidMap
) &&
856 extra
->executable_jit
!= nullptr) {
858 pcre_fullinfo(re
, extra
, PCRE_INFO_JITSIZE
, &size
);
860 TCA start
= *(TCA
*)(extra
->executable_jit
);
861 TCA end
= start
+ size
;
862 std::string name
= folly::sformat("HHVM::pcre_jit::{}", pattern
);
864 if (!RuntimeOption::EvalJitNoGdb
&& jit::mcgen::initialized()) {
865 Debug::DebugInfo::Get()->recordStub(Debug::TCRange(start
, end
, false),
868 if (RuntimeOption::EvalJitUseVtuneAPI
) {
869 HPHP::jit::reportHelperToVtune(name
.c_str(), start
, end
);
871 if (RuntimeOption::EvalPerfPidMap
&& jit::mcgen::initialized()) {
872 std::string escaped_name
;
873 folly::json::escapeString(name
, escaped_name
,
874 folly::json::serialization_opts());
875 Debug::DebugInfo::Get()->recordPerfMap(
876 Debug::TCRange(start
, end
, false),
877 SrcKey
{}, nullptr, false, false, escaped_name
884 /* Store the compiled pattern and extra info in the cache. */
885 pcre_cache_entry
* new_entry
= new pcre_cache_entry();
887 new_entry
->extra
= extra
;
888 if (literal_data
.isLiteral()) {
889 new_entry
->literal_data
=
890 std::make_unique
<pcre_literal_data
>(std::move(literal_data
));
893 assertx((poptions
& ~0x1) == 0);
894 new_entry
->preg_options
= poptions
;
896 assertx((coptions
& 0x80000000) == 0);
897 new_entry
->compile_options
= coptions
;
899 /* Get pcre full info */
900 if (!get_pcre_fullinfo(new_entry
)) {
905 s_pcreCache
.insert(accessor
, regex
, tkc
, new_entry
);
// Allocates the offsets buffer for matching against `pce` (fragment).
// size_offsets is set to three ints per subpattern and the buffer comes
// from req::malloc_noptrs(); callers in this file release it through
// SmartFreeHelper.
909 static int* create_offset_array(const pcre_cache_entry
* pce
,
911 /* Allocate memory for the offsets array */
912 size_offsets
= pce
->num_subpats
* 3;
913 return (int *)req::malloc_noptrs(size_offsets
* sizeof(int));
// Appends a [str, offset] pair to `result` (fragment; part of the parameter
// list is not visible here). The pair is a vec when hackArrOutput is set,
// a varray otherwise. When `name` is non-null the pair is additionally
// stored under that key.
916 static inline void add_offset_pair_split(Array
& result
,
920 bool hackArrOutput
) {
921 auto match_pair
= hackArrOutput
922 ? make_vec_array(str
, offset
)
923 : make_varray(str
, offset
);
924 if (name
) result
.set(String(name
), match_pair
);
925 result
.append(match_pair
);
// Appends a [str, offset] pair to `result` (fragment; part of the parameter
// list is not visible here). The pair is a vec when hackArrOutput is set,
// a varray otherwise. When `name` is non-null the pair is additionally
// stored under that key.
928 static inline void add_offset_pair_match(Array
& result
,
932 bool hackArrOutput
) {
933 auto match_pair
= hackArrOutput
934 ? make_vec_array(str
, offset
)
935 : make_varray(str
, offset
);
936 if (name
) result
.set(String(name
), match_pair
);
937 result
.append(match_pair
);
940 static inline bool pcre_need_log_error(int pcre_code
) {
941 return RuntimeOption::EnablePregErrorLog
&&
942 (pcre_code
== PCRE_ERROR_MATCHLIMIT
||
943 pcre_code
== PCRE_ERROR_RECURSIONLIMIT
);
// Emits an unsampled "REGEXERR" warning describing a failed pcre call
// (fragment).
//   func, line   - call site being reported
//   pcre_code    - error code returned by pcre
//   pattern/subject/repl (+ sizes) - inputs; each is escaped with
//                  Logger::EscapeString() before being logged
//   arg1..arg4   - extra integers printed in the "extra=(...)" part
// The current backtrack and recursion limits are included in the message.
// The escaped strings are released with free() once the warning is raised.
946 static void pcre_log_error(const char* func
, int line
, int pcre_code
,
947 const char* pattern
, int pattern_size
,
948 const char* subject
, int subject_size
,
949 const char* repl
, int repl_size
,
950 int arg1
= 0, int arg2
= 0,
951 int arg3
= 0, int arg4
= 0) {
952 const char* escapedPattern
;
953 const char* escapedSubject
;
954 const char* escapedRepl
;
955 std::string
p(pattern
, pattern_size
);
956 std::string
s(subject
, subject_size
);
957 std::string
r(repl
, repl_size
);
958 escapedPattern
= Logger::EscapeString(p
);
959 escapedSubject
= Logger::EscapeString(s
);
960 escapedRepl
= Logger::EscapeString(r
);
961 const char* errString
=
962 (pcre_code
== PCRE_ERROR_MATCHLIMIT
) ? "PCRE_ERROR_MATCHLIMIT" :
963 (pcre_code
== PCRE_ERROR_RECURSIONLIMIT
) ? "PCRE_ERROR_RECURSIONLIMIT" :
965 raise_warning_unsampled(
966 "REGEXERR: %s/%d: err=%d(%s), pattern='%s', subject='%s', repl='%s', "
967 "limits=(%" PRId64
", %" PRId64
"), extra=(%d, %d, %d, %d)",
968 func
, line
, pcre_code
, errString
,
969 escapedPattern
, escapedSubject
, escapedRepl
,
970 tl_pcre_globals
->preg_backtrace_limit
,
971 tl_pcre_globals
->preg_recursion_limit
,
972 arg1
, arg2
, arg3
, arg4
);
973 free((void *)escapedPattern
);
974 free((void *)escapedSubject
);
975 free((void *)escapedRepl
);
// Translates a pcre_exec() error code into the corresponding PHP_PCRE_*
// error constant and records it in rl_last_error_code (fragment). Codes
// without a dedicated mapping become PHP_PCRE_INTERNAL_ERROR.
978 static void pcre_handle_exec_error(int pcre_code
) {
981 case PCRE_ERROR_MATCHLIMIT
:
982 preg_code
= PHP_PCRE_BACKTRACK_LIMIT_ERROR
;
984 case PCRE_ERROR_RECURSIONLIMIT
:
985 preg_code
= PHP_PCRE_RECURSION_LIMIT_ERROR
;
987 case PCRE_ERROR_BADUTF8
:
988 preg_code
= PHP_PCRE_BAD_UTF8_ERROR
;
990 case PCRE_ERROR_BADUTF8_OFFSET
:
991 preg_code
= PHP_PCRE_BAD_UTF8_OFFSET_ERROR
;
994 preg_code
= PHP_PCRE_INTERNAL_ERROR
;
997 *rl_last_error_code
= preg_code
;
1000 ///////////////////////////////////////////////////////////////////////////////
// Filters `input` by `pattern` (fragment).
//   pattern - delimited regular expression
//   input   - array whose values are converted to strings and matched
//   flags   - PREG_GREP_INVERT keeps the entries that do NOT match;
//             PREG_FB_HACK_ARRAYS makes the result a dict
// Kept entries are stored in the result under their original keys. The
// last-error code is reset to PHP_PCRE_NO_ERROR before matching; pcre_exec
// failures other than "no match" are optionally logged and then recorded
// through pcre_handle_exec_error().
1002 Variant
preg_grep(const String
& pattern
, const Array
& input
, int flags
/* = 0 */) {
1003 PCRECache::Accessor accessor
;
1004 if (!pcre_get_compiled_regex_cache(accessor
, pattern
.get())) {
1007 const pcre_cache_entry
* pce
= accessor
.get();
1009 int size_offsets
= 0;
1010 int* offsets
= create_offset_array(pce
, size_offsets
);
1011 if (offsets
== nullptr) {
1014 SmartFreeHelper
freer(offsets
);
1016 const bool hackArrOutput
= flags
& PREG_FB_HACK_ARRAYS
;
1018 /* Initialize return array */
1019 auto ret
= hackArrOutput
? Array::CreateDict() : Array::Create();
1020 *rl_last_error_code
= PHP_PCRE_NO_ERROR
;
1022 /* Go through the input array */
1023 bool invert
= (flags
& PREG_GREP_INVERT
);
1025 init_local_extra(&extra
, pce
->extra
);
1027 for (ArrayIter
iter(input
); iter
; ++iter
) {
1028 String entry
= iter
.second().toString();
1030 /* Perform the match */
1031 int count
= pcre_exec(pce
->re
, &extra
, entry
.data(), entry
.size(),
1032 0, 0, offsets
, size_offsets
);
1034 /* Check for too many substrings condition. */
1036 raise_warning("Matched, but too many substrings");
1037 count
= size_offsets
/ 3;
1038 } else if (count
< 0 && count
!= PCRE_ERROR_NOMATCH
) {
1039 if (pcre_need_log_error(count
)) {
1040 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1041 pattern
.data(), pattern
.size(),
1042 entry
.data(), entry
.size(),
1046 pcre_handle_exec_error(count
);
1050 /* If the entry fits our requirements */
1051 if ((count
> 0 && !invert
) ||
1052 (count
== PCRE_ERROR_NOMATCH
&& invert
)) {
1054 /* Add to return array */
1055 ret
.set(iter
.first(), entry
);
1062 ///////////////////////////////////////////////////////////////////////////////
1066 Array
& forceToOutput(Variant
& var
, bool hackArrOutput
) {
1067 return hackArrOutput
? forceToDict(var
) : forceToDArray(var
);
1070 Array
& forceToOutput(tv_lval lval
, bool hackArrOutput
) {
1071 return hackArrOutput
? forceToDict(lval
) : forceToDArray(lval
);
1076 static Variant
preg_match_impl(const StringData
* pattern
,
1077 const StringData
* subject
,
1078 Variant
* subpats
, int flags
, int start_offset
,
1080 PCRECache::Accessor accessor
;
1081 if (!pcre_get_compiled_regex_cache(accessor
, pattern
)) {
1084 const pcre_cache_entry
* pce
= accessor
.get();
1086 const bool hackArrOutput
= flags
& PREG_FB_HACK_ARRAYS
;
1087 const bool includeNonMatchingCaptures
= flags
& PREG_FB__PRIVATE__HSL_IMPL
;
1090 init_local_extra(&extra
, pce
->extra
);
1092 *subpats
= hackArrOutput
? Array::CreateDict() : Array::CreateDArray();
1094 int exec_options
= 0;
1096 int subpats_order
= global
? PREG_PATTERN_ORDER
: 0;
1097 bool offset_capture
= false;
1099 offset_capture
= flags
& PREG_OFFSET_CAPTURE
;
1102 * subpats_order is pre-set to pattern mode so we change it only if
1106 subpats_order
= flags
& 0xff;
1108 if ((global
&& (subpats_order
< PREG_PATTERN_ORDER
||
1109 subpats_order
> PREG_SET_ORDER
)) ||
1110 (!global
&& subpats_order
!= 0)) {
1111 raise_warning("Invalid flags specified");
1116 /* Negative offset counts from the end of the string. */
1117 if (start_offset
< 0) {
1118 start_offset
= subject
->size() + start_offset
;
1119 if (start_offset
< 0) {
1124 int size_offsets
= 0;
1125 int* offsets
= create_offset_array(pce
, size_offsets
);
1126 SmartFreeHelper
offsetsFreer(offsets
);
1127 int num_subpats
= size_offsets
/ 3;
1128 if (offsets
== nullptr) {
1132 const char* const* subpat_names
= get_subpat_names(pce
);
1133 if (subpat_names
== nullptr) {
1137 /* Allocate match sets array and initialize the values. */
1139 /* An array of sets of matches for each subpattern after a global match */
1140 auto match_sets
= hackArrOutput
? Array::CreateDict() : Array::CreateDArray();
1141 if (global
&& subpats_order
== PREG_PATTERN_ORDER
) {
1142 for (int i
= 0; i
< num_subpats
; i
++) {
1144 hackArrOutput
? Array::CreateDict() : Array::CreateDArray());
1149 *rl_last_error_code
= PHP_PCRE_NO_ERROR
;
1151 int g_notempty
= 0; // If the match should not be empty
1152 const char** stringlist
; // Holds list of subpatterns
1158 * Optimization: If the pattern defines a literal substring,
1159 * compare the strings directly (i.e. memcmp) instead of performing
1160 * the full regular expression evaluation.
1161 * Take the slow path if there are any special compile options.
1163 if (pce
->literal_data
&& !global
) {
1164 assertx(pce
->literal_data
->isLiteral());
1165 /* TODO(t13140878): compare literal against multiple substrings
1166 * in the preg_match_all (global == true) case. */
1167 count
= pce
->literal_data
->matches(subject
, start_offset
, offsets
) ? 1
1168 : PCRE_ERROR_NOMATCH
;
1170 /* Execute the regular expression. */
1171 count
= pcre_exec(pce
->re
, &extra
, subject
->data(), subject
->size(),
1173 exec_options
| g_notempty
,
1174 offsets
, size_offsets
);
1176 /* The string was already proved to be valid UTF-8 */
1177 exec_options
|= PCRE_NO_UTF8_CHECK
;
1179 /* Check for too many substrings condition. */
1181 raise_warning("Matched, but too many substrings");
1182 count
= size_offsets
/ 3;
1185 /* If something has matched */
1190 // Try to get the list of substrings and display a warning if failed.
1191 if (offsets
[1] < offsets
[0] ||
1192 pcre_get_substring_list(subject
->data(), offsets
, count
,
1194 raise_warning("Get subpatterns list failed");
1198 if (global
) { /* global pattern matching */
1199 if (subpats_order
== PREG_PATTERN_ORDER
) {
1200 /* For each subpattern, insert it into the appropriate array. */
1201 for (i
= 0; i
< count
; i
++) {
1202 if (offset_capture
) {
1203 auto const lval
= match_sets
.lval(i
);
1204 add_offset_pair_match(forceToOutput(lval
, hackArrOutput
),
1205 String(stringlist
[i
],
1206 offsets
[(i
<<1)+1] - offsets
[i
<<1],
1212 auto const lval
= match_sets
.lval(i
);
1213 forceToOutput(lval
, hackArrOutput
).append(
1214 String(stringlist
[i
], offsets
[(i
<<1)+1] - offsets
[i
<<1],
1220 * If the number of captured subpatterns on this run is
1221 * less than the total possible number, pad the result
1222 * arrays with empty strings.
1224 if (count
< num_subpats
) {
1225 for (; i
< num_subpats
; i
++) {
1226 auto const lval
= match_sets
.lval(i
);
1227 forceToOutput(lval
, hackArrOutput
).append("");
1231 auto result_set
= hackArrOutput
1232 ? Array::CreateDict()
1233 : Array::CreateDArray();
1235 /* Add all the subpatterns to it */
1236 for (i
= 0; i
< count
; i
++) {
1237 if (offset_capture
) {
1238 add_offset_pair_match(result_set
,
1239 String(stringlist
[i
],
1240 offsets
[(i
<<1)+1] - offsets
[i
<<1],
1246 String
value(stringlist
[i
], offsets
[(i
<<1)+1] - offsets
[i
<<1],
1248 if (subpat_names
[i
]) {
1249 result_set
.set(String(subpat_names
[i
]), value
);
1251 result_set
.append(value
);
1254 if (includeNonMatchingCaptures
&& count
< num_subpats
) {
1255 for (; i
< num_subpats
; i
++) {
1256 // We don't want to set the numeric key if there is a string
1257 // key, but we have do it usually to make migration from
1258 // preg_match() practical; given that existing code gets
1259 // nothing for unmatched captures, we don't need to set both
1261 if (offset_capture
) {
1262 add_offset_pair_match(
1263 forceToOutput(*subpats
, hackArrOutput
),
1270 if (subpat_names
[i
]) {
1271 result_set
.set(String(subpat_names
[i
]), empty_string_tv());
1273 result_set
.append(empty_string());
1277 /* And add it to the output array */
1278 forceToOutput(*subpats
, hackArrOutput
).append(
1279 std::move(result_set
)
1282 } else { /* single pattern matching */
1283 /* For each subpattern, insert it into the subpatterns array. */
1284 for (i
= 0; i
< count
; i
++) {
1285 if (offset_capture
) {
1286 add_offset_pair_match(forceToOutput(*subpats
, hackArrOutput
),
1287 String(stringlist
[i
],
1288 offsets
[(i
<<1)+1] - offsets
[i
<<1],
1294 String
value(stringlist
[i
], offsets
[(i
<<1)+1] - offsets
[i
<<1],
1296 if (subpat_names
[i
]) {
1297 forceToOutput(*subpats
, hackArrOutput
).set(
1298 String(subpat_names
[i
]), value
1301 forceToOutput(*subpats
, hackArrOutput
).append(value
);
1304 if (includeNonMatchingCaptures
&& count
< num_subpats
) {
1305 for (; i
< num_subpats
; i
++) {
1306 if (offset_capture
) {
1307 add_offset_pair_match(
1308 forceToOutput(*subpats
, hackArrOutput
),
1315 if (subpat_names
[i
]) {
1316 forceToOutput(*subpats
, hackArrOutput
).set(
1317 String(subpat_names
[i
]), empty_string()
1320 forceToOutput(*subpats
, hackArrOutput
).append(empty_string());
1325 pcre_free((void *) stringlist
);
1327 } else if (count
== PCRE_ERROR_NOMATCH
) {
1328 /* If we previously set PCRE_NOTEMPTY after a null match,
1329 this is not necessarily the end. We need to advance
1330 the start offset, and continue. Fudge the offset values
1331 to achieve this, unless we're already at the end of the string. */
1332 if (g_notempty
&& start_offset
< subject
->size()) {
1333 offsets
[0] = start_offset
;
1334 offsets
[1] = start_offset
+ 1;
1338 if (pcre_need_log_error(count
)) {
1339 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1340 pattern
->data(), pattern
->size(),
1341 subject
->data(), subject
->size(),
1343 flags
, start_offset
, g_notempty
, global
);
1345 pcre_handle_exec_error(count
);
1349 /* If we have matched an empty string, mimic what Perl's /g options does.
1350 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1351 the match again at the same point. If this fails (picked up above) we
1352 advance to the next character. */
1353 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1355 /* Advance to the position right after the last full match */
1356 start_offset
= offsets
[1];
1359 /* Add the match sets to the output array and clean up */
1360 if (subpats
&& global
&& subpats_order
== PREG_PATTERN_ORDER
) {
1361 for (i
= 0; i
< num_subpats
; i
++) {
1362 if (subpat_names
[i
]) {
1363 forceToOutput(*subpats
, hackArrOutput
).set(
1364 String(subpat_names
[i
]), match_sets
[i
]
1367 forceToOutput(*subpats
, hackArrOutput
).append(match_sets
[i
]);
1373 Variant
preg_match(const String
& pattern
, const String
& subject
,
1374 Variant
* matches
/* = nullptr */, int flags
/* = 0 */,
1375 int offset
/* = 0 */) {
1376 return preg_match(pattern
.get(), subject
.get(), matches
, flags
, offset
);
1379 Variant
preg_match(const StringData
* pattern
, const StringData
* subject
,
1380 Variant
* matches
/* = nullptr */, int flags
/* = 0 */,
1381 int offset
/* = 0 */) {
1382 return preg_match_impl(pattern
, subject
, matches
, flags
, offset
, false);
1385 Variant
preg_match_all(const String
& pattern
, const String
& subject
,
1386 Variant
* matches
/* = nullptr */,
1387 int flags
/* = 0 */, int offset
/* = 0 */) {
1388 return preg_match_all(pattern
.get(), subject
.get(), matches
, flags
, offset
);
1391 Variant
preg_match_all(const StringData
* pattern
, const StringData
* subject
,
1392 Variant
* matches
/* = nullptr */,
1393 int flags
/* = 0 */, int offset
/* = 0 */) {
1394 return preg_match_impl(pattern
, subject
, matches
, flags
, offset
, true);
1397 ///////////////////////////////////////////////////////////////////////////////
/*
 * Calls the user-supplied replacement callback `function` for one match and
 * returns its result converted to a string.
 *
 * A fresh darray is filled with one substring of `subject` per capture group
 * i in [0, count): group i spans offsets[2*i] .. offsets[2*i+1]. A group that
 * has a non-null entry in `subpat_names` is stored under that name. The array
 * is wrapped in a varray and handed to vm_call_user_func.
 *
 * NOTE(review): the line between the named set() and the append() is not
 * shown here, so confirm whether named groups are also appended by index.
 */
1399 static String
preg_do_repl_func(const Variant
& function
, const String
& subject
,
1400 int* offsets
, const char* const* subpat_names
,
1402 Array subpats
= Array::CreateDArray();
1403 for (int i
= 0; i
< count
; i
++) {
// offsets holds (start, end) pairs, so group i lives at indexes 2*i and 2*i+1.
1404 auto off1
= offsets
[i
<<1];
1405 auto off2
= offsets
[(i
<<1)+1];
1406 auto sub
= subject
.substr(off1
, off2
- off1
);
1408 if (subpat_names
[i
]) {
1409 subpats
.set(String(subpat_names
[i
]), sub
);
1411 subpats
.append(sub
);
// The callback receives the match array through a varray built from it.
1414 return vm_call_user_func(function
, make_varray(subpats
)).toString();
/*
 * Tries to read a back-reference number from the replacement text at *str
 * and stores it in *backref.
 *
 * Checks visible here: a "${" opener test, a first decimal digit that seeds
 * *backref, an optional second digit folded in as *backref * 10 + digit (so
 * at most two digits are consumed), and a test for a closing '}'.
 *
 * NOTE(review): the return statements and the write-back to *str are not
 * shown here; confirm the exact success/failure contract against the full
 * definition.
 */
1417 static bool preg_get_backref(const char** str
, int* backref
) {
1419 const char* walk
= *str
;
1425 if (*walk
== '$' && walk
[1] == '{') {
// First digit of the reference number.
1431 if (*walk
>= '0' && *walk
<= '9') {
1432 *backref
= *walk
- '0';
// Optional second digit.
1438 if (*walk
&& *walk
>= '0' && *walk
<= '9') {
1439 *backref
= *backref
* 10 + *walk
- '0';
1444 if (*walk
== 0 || *walk
!= '}') {
/*
 * Replaces matches of the regex `pattern` inside `subject` and returns the
 * rebuilt string (result.detach()).
 *
 * pattern       - regex source; its compiled form is fetched through
 *                 pcre_get_compiled_regex_cache.
 * replace_var   - replacement; its toString() value is walked as a template
 *                 for regular substitution, or it is handed to
 *                 preg_do_repl_func as the callback.
 * callable      - forwarded to pcre_log_error on failure.
 *                 NOTE(review): presumably also selects the callback path;
 *                 the branch conditions are not shown here, so confirm.
 * limit         - a match is replaced only while limit is -1 or > 0;
 *                 limit == 0 takes the same branch as PCRE_ERROR_NOMATCH.
 * replace_count - optional out-counter, tested for null before use.
 *
 * Patterns carrying PREG_REPLACE_EVAL are rejected with the "/e modifier has
 * been removed" message. On a pcre_exec result that is neither a match nor
 * PCRE_ERROR_NOMATCH the error is logged (pcre_need_log_error /
 * pcre_log_error) and passed to pcre_handle_exec_error.
 */
1454 static Variant
php_pcre_replace(const String
& pattern
, const String
& subject
,
1455 const Variant
& replace_var
, bool callable
,
1456 int limit
, int* replace_count
) {
1457 PCRECache::Accessor accessor
;
1458 if (!pcre_get_compiled_regex_cache(accessor
, pattern
.get())) {
1461 const pcre_cache_entry
* pce
= accessor
.get();
1462 if (pce
->preg_options
& PREG_REPLACE_EVAL
) {
1464 "preg_replace(): Support for the /e modifier has been removed, use "
1465 "preg_replace_callback instead"
1470 int* offsets
= create_offset_array(pce
, size_offsets
);
1471 SmartFreeHelper
offsetsFreer(offsets
);
1472 if (offsets
== nullptr) {
1476 const char* const* subpat_names
= get_subpat_names(pce
);
1477 if (subpat_names
== nullptr) {
1481 const char* replace
= nullptr;
1482 const char* replace_end
= nullptr;
1483 int replace_len
= 0;
1487 replace_val
= replace_var
.toString();
1488 replace
= replace_val
.data();
1489 replace_len
= replace_val
.size();
1490 replace_end
= replace
+ replace_len
;
// Output buffer starts at twice the subject length.
1493 StringBuffer
result(2 * subject
.size());
1498 const char* match
= nullptr;
1499 int start_offset
= 0;
1500 *rl_last_error_code
= PHP_PCRE_NO_ERROR
;
1502 init_local_extra(&extra
, pce
->extra
);
1504 const char* walk
; // Used to walk the replacement string
1505 char walk_last
; // Last walked character
1506 int match_len
; // Length of the current match
1507 int backref
; // Backreference number
1508 int g_notempty
= 0; // If the match should not be empty
1509 int exec_options
= 0; // Options passed to pcre_exec
1511 /* Execute the regular expression. */
1512 int count
= pcre_exec(pce
->re
, &extra
, subject
.data(), subject
.size(),
1514 exec_options
| g_notempty
,
1515 offsets
, size_offsets
);
1517 /* The string was already proved to be valid UTF-8 */
1518 exec_options
|= PCRE_NO_UTF8_CHECK
;
1520 /* Check for too many substrings condition. */
1522 raise_warning("Matched, but too many substrings");
1523 count
= size_offsets
/ 3;
// `piece` marks the start of the not-yet-copied part of the subject.
1526 const char* piece
= subject
.data() + start_offset
;
1527 if (count
> 0 && offsets
[1] >= offsets
[0] &&
1528 (limit
== -1 || limit
> 0)) {
1529 if (replace_count
) {
1532 /* Set the match location in subject */
1533 match
= subject
.data() + offsets
[0];
1535 String callable_result
;
1537 /* Use custom function to get replacement string and its length. */
1538 callable_result
= preg_do_repl_func(replace_var
, subject
, offsets
,
1539 subpat_names
, count
);
1540 } else { /* do regular substitution */
// First walk over the replacement template: only lengths are looked up here.
1543 while (walk
< replace_end
) {
1544 if ('\\' == *walk
|| '$' == *walk
) {
1545 if (walk_last
== '\\') {
1550 if (preg_get_backref(&walk
, &backref
)) {
1551 if (backref
< count
) {
1552 match_len
= offsets
[(backref
<<1)+1] - offsets
[backref
<<1];
1558 walk_last
= walk
[-1];
1562 /* copy the part of the string before the match */
1563 result
.append(piece
, match
-piece
);
1565 /* copy replacement and backrefs */
1566 int result_len
= result
.size();
1569 /* Copy result from custom function to buffer and clean up. */
1570 result
.append(callable_result
.data(), callable_result
.size());
1571 result_len
+= callable_result
.size();
1572 } else { /* do regular backreference copying */
// Second walk over the template: text and referenced groups are appended.
1576 while (walk
< replace_end
) {
1577 if ('\\' == *walk
|| '$' == *walk
) {
1578 if (walk_last
== '\\') {
// An escaped '\' or '$' overwrites the backslash already placed in result.
1579 result
.set(result
.size() - 1, *walk
++);
1583 if (preg_get_backref(&walk
, &backref
)) {
1584 if (backref
< count
) {
1585 match_len
= offsets
[(backref
<<1)+1] - offsets
[backref
<<1];
1587 subject
.data() + offsets
[backref
<<1],
1594 result
.append(*walk
++);
1595 walk_last
= walk
[-1];
1603 } else if (count
== PCRE_ERROR_NOMATCH
|| limit
== 0) {
1604 /* If we previously set PCRE_NOTEMPTY after a null match,
1605 this is not necessarily the end. We need to advance
1606 the start offset, and continue. Fudge the offset values
1607 to achieve this, unless we're already at the end of the string. */
1608 if (g_notempty
!= 0 && start_offset
< subject
.size()) {
1609 offsets
[0] = start_offset
;
1610 offsets
[1] = start_offset
+ 1;
1611 result
.append(piece
, 1);
1613 /* stick that last bit of string on our output */
1614 result
.append(piece
, subject
.size() - start_offset
);
1618 if (pcre_need_log_error(count
)) {
1623 if (replace_var
.isObject()) {
1624 stemp
= replace_var
.asCObjRef()->getClassName().asString()
1627 stemp
= replace_var
.toString();
1630 size
= stemp
.size();
1632 s
= replace_val
.data();
1633 size
= replace_val
.size();
1635 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1636 pattern
.data(), pattern
.size(),
1637 subject
.data(), subject
.size(),
1639 callable
, limit
, start_offset
, g_notempty
);
1641 pcre_handle_exec_error(count
);
1645 /* If we have matched an empty string, mimic what Perl's /g options does.
1646 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1647 the match again at the same point. If this fails (picked up above) we
1648 advance to the next character. */
1649 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1651 /* Advance to the next piece. */
1652 start_offset
= offsets
[1];
1655 return result
.detach();
/*
 * Applies one or several regex replacements to a single `subject` string by
 * calling php_pcre_replace.
 *
 * Three shapes are handled:
 *  - `regex` is not an array: one php_pcre_replace call on regex.toString().
 *  - `regex` is an array and (`callable` is set or `replace` is not an
 *    array): every regex entry is applied in turn with the same `replace`;
 *    each string result becomes the subject for the next entry.
 *  - both are arrays: regex entries are walked alongside an iterator over
 *    the replacement array, taking replace_value from iterReplace.second().
 *
 * A boolean result from php_pcre_replace is asserted to be false.
 *
 * NOTE(review): the return statements of every branch, and what
 * replace_value holds once the replacement array is exhausted, are not shown
 * here; confirm against the full definition.
 */
1661 static Variant
php_replace_in_subject(const Variant
& regex
, const Variant
& replace
,
1662 String subject
, int limit
, bool callable
,
1663 int* replace_count
) {
1664 if (!regex
.isArray()) {
1665 Variant ret
= php_pcre_replace(regex
.toString(), subject
, replace
,
1666 callable
, limit
, replace_count
);
1668 if (ret
.isBoolean()) {
1669 assertx(!ret
.toBoolean());
// Array of patterns sharing a single replacement (or callback).
1676 if (callable
|| !replace
.isArray()) {
1677 Array arr
= regex
.toDArray();
1678 for (ArrayIter
iterRegex(arr
); iterRegex
; ++iterRegex
) {
1679 String regex_entry
= iterRegex
.second().toString();
1680 Variant ret
= php_pcre_replace(regex_entry
, subject
, replace
,
1681 callable
, limit
, replace_count
);
1682 if (ret
.isBoolean()) {
1683 assertx(!ret
.toBoolean());
1686 if (!ret
.isString()) {
// Feed the replaced text into the next pattern.
1689 subject
= ret
.asStrRef();
1690 if (subject
.isNull()) {
// Array of patterns paired with an array of replacements.
1697 Array arrReplace
= replace
.toDArray();
1698 Array arrRegex
= regex
.toDArray();
1699 ArrayIter
iterReplace(arrReplace
);
1700 for (ArrayIter
iterRegex(arrRegex
); iterRegex
; ++iterRegex
) {
1701 String regex_entry
= iterRegex
.second().toString();
1702 Variant replace_value
;
1704 replace_value
= iterReplace
.second();
1708 Variant ret
= php_pcre_replace(regex_entry
, subject
, replace_value
,
1709 callable
, limit
, replace_count
);
1711 if (ret
.isBoolean()) {
1712 assertx(!ret
.toBoolean());
1715 if (!ret
.isString()) {
1718 subject
= ret
.asStrRef();
1719 if (subject
.isNull()) {
/*
 * Shared implementation behind preg_replace, preg_replace_callback and
 * preg_filter.
 *
 * pattern / replacement - forwarded to php_replace_in_subject; a warning is
 *                         raised when the replacement is an array while the
 *                         pattern is not.
 * subject     - a single value, or a container whose entries are each
 *               converted to string and processed separately.
 * limit       - forwarded to php_replace_in_subject.
 * count       - optional out-parameter; receives the accumulated number of
 *               replacements when non-null.
 * is_callable - forwarded to php_replace_in_subject as its `callable` flag.
 * is_filter   - for container subjects, an entry is kept only if its own
 *               replacement count grew; must not be combined with
 *               is_callable (asserted).
 *
 * For a container subject the result is a darray keyed by the original keys.
 *
 * NOTE(review): the first part of the parameter-mismatch condition and the
 * return values for the is_filter / non-string cases of a scalar subject are
 * not shown here; confirm against the full definition.
 */
1726 Variant
preg_replace_impl(const Variant
& pattern
, const Variant
& replacement
,
1727 const Variant
& subject
, int limit
, int64_t* count
,
1728 bool is_callable
, bool is_filter
) {
1729 assertx(!(is_callable
&& is_filter
));
1731 replacement
.isArray() && !pattern
.isArray()) {
1732 raise_warning("Parameter mismatch, pattern is a string while "
1733 "replacement is an array");
// Running total across all subjects; copied to *count on the way out.
1737 int replace_count
= 0;
1738 if (!isContainer(subject
)) {
1739 Variant ret
= php_replace_in_subject(pattern
, replacement
,
1741 limit
, is_callable
, &replace_count
);
1743 if (ret
.isString()) {
1744 if (count
) *count
= replace_count
;
1745 if (is_filter
&& replace_count
== 0) {
1748 return ret
.asStrRef();
// Container subject: process every entry and keep the original keys.
1755 Array return_value
= Array::CreateDArray();
1756 Array arrSubject
= subject
.toDArray();
1757 for (ArrayIter
iter(arrSubject
); iter
; ++iter
) {
// Snapshot so a per-entry change in the counter can be detected below.
1758 auto old_replace_count
= replace_count
;
1759 String subject_entry
= iter
.second().toString();
1760 Variant ret
= php_replace_in_subject(pattern
, replacement
, subject_entry
,
1761 limit
, is_callable
, &replace_count
);
1763 if (ret
.isString() && !ret
.isNull() &&
1764 (!is_filter
|| replace_count
> old_replace_count
)) {
1765 return_value
.set(iter
.first(), ret
.asStrRef());
1768 if (count
) *count
= replace_count
;
1769 return return_value
;
/*
 * Plain regex replacement entry point: stores the outcome of
 * preg_replace_impl in `result`, calling it with is_callable = false and
 * is_filter = false and passing &count as the replacement counter.
 *
 * NOTE(review): the declaration of `count` and the return statement are not
 * shown here; presumably the replacement count is returned. Confirm.
 */
1772 int preg_replace(Variant
& result
,
1773 const Variant
& pattern
,
1774 const Variant
& replacement
,
1775 const Variant
& subject
,
1776 int limit
/* = -1 */) {
1778 result
= preg_replace_impl(pattern
, replacement
, subject
,
1779 limit
, &count
, false, false);
/*
 * Callback-based regex replacement entry point: stores the outcome of
 * preg_replace_impl in `result`, passing `callback` in the replacement slot
 * with is_callable = true and is_filter = false, and &count as the
 * replacement counter.
 *
 * NOTE(review): the declaration of `count` and the return statement are not
 * shown here; presumably the replacement count is returned. Confirm.
 */
1783 int preg_replace_callback(Variant
& result
,
1784 const Variant
& pattern
,
1785 const Variant
& callback
,
1786 const Variant
& subject
,
1787 int limit
/* = -1 */) {
1789 result
= preg_replace_impl(pattern
, callback
, subject
,
1790 limit
, &count
, true, false);
/*
 * Filtering regex replacement entry point: stores the outcome of
 * preg_replace_impl in `result`, calling it with is_callable = false and
 * is_filter = true and passing &count as the replacement counter.
 *
 * NOTE(review): the declaration of `count` and the return statement are not
 * shown here; presumably the replacement count is returned. Confirm.
 */
1794 int preg_filter(Variant
& result
,
1795 const Variant
& pattern
,
1796 const Variant
& replacement
,
1797 const Variant
& subject
,
1798 int limit
/* = -1 */) {
1800 result
= preg_replace_impl(pattern
, replacement
, subject
,
1801 limit
, &count
, false, true);
1805 ///////////////////////////////////////////////////////////////////////////////
/*
 * Splits `subject` around matches of the regex `pattern` and returns the
 * pieces as an array (a dict when PREG_FB_HACK_ARRAYS is set in `flags`).
 *
 * flags: PREG_SPLIT_NO_EMPTY keeps empty pieces out of the result,
 *        PREG_SPLIT_DELIM_CAPTURE also adds capture groups 1..count-1 of
 *        each delimiter match, PREG_SPLIT_OFFSET_CAPTURE records pieces
 *        through add_offset_pair_split instead of appending bare strings.
 * limit: matching continues only while limit is -1 or greater than 1; the
 *        text after the last split point is added as the final piece.
 *
 * The compiled pattern comes from pcre_get_compiled_regex_cache. After an
 * empty match the next pcre_exec runs with PCRE_NOTEMPTY | PCRE_ANCHORED; if
 * that yields PCRE_ERROR_NOMATCH before the end of the subject, the start
 * offset is advanced. For patterns compiled with PCRE_UTF8 this uses a
 * cached "/./us" pattern; the offsets are also fudged to a one-byte step.
 * Other pcre_exec failures are logged and passed to pcre_handle_exec_error.
 *
 * NOTE(review): early-return values, the limit decrement and several branch
 * closers are not shown here; confirm against the full definition.
 */
1807 Variant
preg_split(const String
& pattern
, const String
& subject
,
1808 int limit
/* = -1 */, int flags
/* = 0 */) {
1809 PCRECache::Accessor accessor
;
1810 if (!pcre_get_compiled_regex_cache(accessor
, pattern
.get())) {
1813 const pcre_cache_entry
* pce
= accessor
.get();
1815 int no_empty
= flags
& PREG_SPLIT_NO_EMPTY
;
1816 bool delim_capture
= flags
& PREG_SPLIT_DELIM_CAPTURE
;
1817 bool offset_capture
= flags
& PREG_SPLIT_OFFSET_CAPTURE
;
1823 int size_offsets
= 0;
1824 int* offsets
= create_offset_array(pce
, size_offsets
);
1825 SmartFreeHelper
offsetsFreer(offsets
);
1826 if (offsets
== nullptr) {
1830 /* Start at the beginning of the string */
1831 int start_offset
= 0;
1832 int next_offset
= 0;
1833 const char* last_match
= subject
.data();
1834 *rl_last_error_code
= PHP_PCRE_NO_ERROR
;
1836 init_local_extra(&extra
, pce
->extra
);
1838 const bool hackArrOutput
= flags
& PREG_FB_HACK_ARRAYS
;
1840 // Get next piece if no limit or limit not yet reached and something matched
1841 Array return_value
= hackArrOutput
? Array::CreateDict() : Array::Create();
1842 int g_notempty
= 0; /* If the match should not be empty */
1844 PCRECache::Accessor bump_accessor
;
1845 const pcre_cache_entry
* bump_pce
= nullptr; /* instance for empty matches */
1846 while ((limit
== -1 || limit
> 1)) {
1847 int count
= pcre_exec(pce
->re
, &extra
, subject
.data(), subject
.size(),
1848 start_offset
, g_notempty
| utf8_check
,
1849 offsets
, size_offsets
);
1851 /* Subsequent calls to pcre_exec don't need to bother with the
1852 * utf8 validity check: if the subject isn't valid, the first
1853 * call to pcre_exec will have failed, and as long as we only
1854 * set start_offset to known character boundaries we won't
1855 * supply an invalid offset. */
1856 utf8_check
= PCRE_NO_UTF8_CHECK
;
1858 /* Check for too many substrings condition. */
1860 raise_warning("Matched, but too many substrings");
1861 count
= size_offsets
/ 3;
1864 /* If something matched */
1865 if (count
> 0 && offsets
[1] >= offsets
[0]) {
// With no_empty set, a match starting exactly at last_match adds no piece.
1866 if (!no_empty
|| subject
.data() + offsets
[0] != last_match
) {
1867 if (offset_capture
) {
1868 /* Add (match, offset) pair to the return value */
1869 add_offset_pair_split(return_value
,
1871 subject
.data() + offsets
[0] - last_match
,
1877 /* Add the piece to the return value */
1878 return_value
.append(String(last_match
,
1879 subject
.data() + offsets
[0] - last_match
,
1883 /* One less left to do */
1888 last_match
= subject
.data() + offsets
[1];
1889 next_offset
= offsets
[1];
1891 if (delim_capture
) {
// Group 0 is the whole delimiter, so captured groups start at index 1.
1893 for (i
= 1; i
< count
; i
++) {
1894 match_len
= offsets
[(i
<<1)+1] - offsets
[i
<<1];
1895 /* If we have matched a delimiter */
1896 if (!no_empty
|| match_len
> 0) {
1897 if (offset_capture
) {
1898 add_offset_pair_split(return_value
,
1899 String(subject
.data() + offsets
[i
<<1],
1900 match_len
, CopyString
),
1905 return_value
.append(subject
.substr(offsets
[i
<<1], match_len
));
1910 } else if (count
== PCRE_ERROR_NOMATCH
) {
1911 /* If we previously set PCRE_NOTEMPTY after a null match,
1912 this is not necessarily the end. We need to advance
1913 the start offset, and continue. Fudge the offset values
1914 to achieve this, unless we're already at the end of the string. */
1915 if (g_notempty
!= 0 && start_offset
< subject
.size()) {
1916 if (pce
->compile_options
& PCRE_UTF8
) {
// The helper pattern is fetched from the cache once and reused.
1917 if (bump_pce
== nullptr) {
1918 if (!pcre_get_compiled_regex_cache(bump_accessor
,
1919 String("/./us").get())) {
1922 bump_pce
= bump_accessor
.get();
1924 pcre_extra bump_extra
;
1925 init_local_extra(&bump_extra
, bump_pce
->extra
);
1926 count
= pcre_exec(bump_pce
->re
, &bump_extra
, subject
.data(),
1927 subject
.size(), start_offset
,
1928 utf8_check
, offsets
, size_offsets
);
1930 raise_warning("Unknown error");
1931 offsets
[0] = start_offset
;
1932 offsets
[1] = start_offset
+ 1;
1933 if (pcre_need_log_error(count
)) {
1934 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1935 pattern
.data(), pattern
.size(),
1936 subject
.data(), subject
.size(),
1938 limit
, flags
, start_offset
);
1942 offsets
[0] = start_offset
;
1943 offsets
[1] = start_offset
+ 1;
1948 if (pcre_need_log_error(count
)) {
1949 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1950 pattern
.data(), pattern
.size(),
1951 subject
.data(), subject
.size(),
1953 limit
, flags
, start_offset
, g_notempty
);
1955 pcre_handle_exec_error(count
);
1959 /* If we have matched an empty string, mimic what Perl's /g options does.
1960 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1961 the match again at the same point. If this fails (picked up above) we
1962 advance to the next character. */
1963 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1965 /* Advance to the position right after the last full match */
1966 start_offset
= offsets
[1];
1969 start_offset
= last_match
- subject
.data(); /* offset might have
1971 * but without further
1972 * successful matches */
1973 if (!no_empty
|| start_offset
< subject
.size()) {
1974 if (offset_capture
) {
1975 /* Add the last (match, offset) pair to the return value */
1976 add_offset_pair_split(return_value
,
1977 subject
.substr(start_offset
),
1978 start_offset
, nullptr, hackArrOutput
);
1980 /* Add the last piece to the return value */
1982 (String(last_match
, subject
.data() + subject
.size() - last_match
,
1987 return return_value
;
1990 ///////////////////////////////////////////////////////////////////////////////
/*
 * Builds a copy of `str` for use inside a regex, giving special treatment to
 * the characters listed in the switch below and, when `delimiter` is
 * non-empty, to its first character as well.
 *
 * The output buffer is reserved at 4 * str.size() + 1 bytes and the result
 * is trimmed to the number of bytes actually written (q - out_str).
 *
 * NOTE(review): the value returned for an empty input, the assignment of
 * quote_delim and the bytes emitted per case are not shown here; confirm
 * against the full definition.
 */
1992 String
preg_quote(const String
& str
,
1993 const String
& delimiter
/* = null_string */) {
1994 const char* in_str
= str
.data();
1995 const char* in_str_end
= in_str
+ str
.size();
1997 /* Nothing to do if we got an empty string */
1998 if (in_str
== in_str_end
) {
2002 char delim_char
= 0; /* Delimiter character to be quoted */
2003 bool quote_delim
= false; /* Whether to quote additional delim char */
2004 if (!delimiter
.empty()) {
// Only the first character of `delimiter` is consulted.
2005 delim_char
= delimiter
.charAt(0);
2009 /* Allocate enough memory so that even if each character
2010 is quoted, we won't run out of room */
2011 String
ret(4 * str
.size() + 1, ReserveString
);
2012 char* out_str
= ret
.mutableData();
2014 /* Go through the string and quote necessary characters */
2017 for (p
= in_str
, q
= out_str
; p
!= in_str_end
; p
++) {
2020 case '.': case '\\': case '+': case '*': case '?':
2021 case '[': case '^': case ']': case '$': case '(':
2022 case ')': case '{': case '}': case '=': case '!':
2023 case '>': case '<': case '|': case ':': case '-':
2037 if (quote_delim
&& c
== delim_char
)
2045 return ret
.setSize(q
- out_str
);
2048 int preg_last_error() {
2049 return *rl_last_error_code
;
2052 size_t preg_pcre_cache_size() {
2053 return s_pcreCache
.size();
2056 ///////////////////////////////////////////////////////////////////////////////
/*
 * Raises a warning describing the POSIX regex error `err` for the compiled
 * expression `re`.
 *
 * Two regerror() texts are combined: the REG_ITOA form of the error is
 * fetched into `buf`, and the regular message is written into `message`
 * after a "<buf>: " prefix. Each text is sized by a first regerror() call
 * with a null buffer. Allocation failures return without a warning.
 *
 * NOTE(review): snprintf is limited to buf_len bytes, and buf_len is the
 * size regerror reported for `buf` alone. If that size already counts the
 * terminating NUL, the ": " separator is cut off and the message text placed
 * at message + buf_len lands after a NUL, so only `buf` would be printed.
 * Confirm, and consider passing buf_len + 2 as the snprintf size.
 * NOTE(review): the guards around these statements and the release of
 * `buf`/`message` are not shown here; confirm that `buf` is not leaked on
 * the early return taken when `message` cannot be allocated.
 */
2059 static void php_reg_eprint(int err
, regex_t
* re
) {
2060 char *buf
= nullptr, *message
= nullptr;
2065 /* get the length of the message */
2066 buf_len
= regerror(REG_ITOA
| err
, re
, nullptr, 0);
2068 buf
= (char *)req::malloc_noptrs(buf_len
);
2069 if (!buf
) return; /* fail silently */
2070 /* finally, get the error message */
2071 regerror(REG_ITOA
| err
, re
, buf
, buf_len
);
// Size of the regular error text.
2076 len
= regerror(err
, re
, nullptr, 0);
// Room for both texts plus two extra bytes.
2078 message
= (char *)req::malloc_noptrs(buf_len
+ len
+ 2);
2080 return; /* fail silently */
2083 snprintf(message
, buf_len
, "%s: ", buf
);
2084 buf_len
+= 1; /* so pointer math below works */
2086 /* drop the message into place */
2087 regerror(err
, re
, message
+ buf_len
, len
);
2088 raise_warning("%s", message
);
/*
 * Splits `str` on the POSIX extended regular expression `spliton` and
 * returns the pieces as an array.
 *
 * spliton - pattern, compiled with regcomp(REG_EXTENDED), plus REG_ICASE
 *           when `icase` is set.
 * count   - matching continues only while count is -1 or greater than 1;
 *           whatever remains of the input is appended as the last element.
 *
 * A match at the very start of the remaining text appends an empty string.
 * A zero-length match at the start raises "Invalid Regular Expression to
 * split()". Compile errors and regexec errors other than REG_NOMATCH are
 * reported through php_reg_eprint.
 *
 * NOTE(review): the remaining parameter(s), the declarations of `re` and
 * `subs`, the count decrement, the regfree calls and the error-path return
 * values are not shown here; confirm against the full definition.
 */
2094 Variant
php_split(const String
& spliton
, const String
& str
, int count
,
2096 const char* strp
= str
.data();
2097 const char* endp
= strp
+ str
.size();
2100 int copts
= icase
? REG_ICASE
: 0;
2101 int err
= regcomp(&re
, spliton
.data(), REG_EXTENDED
| copts
);
2103 php_reg_eprint(err
, &re
);
2107 Array return_value
= Array::Create();
2110 /* churn through str, generating array entries as we go */
2111 while ((count
== -1 || count
> 1) &&
2112 !(err
= regexec(&re
, strp
, 1, subs
, 0))) {
2113 if (subs
[0].rm_so
== 0 && subs
[0].rm_eo
) {
2114 /* match is at start of string, return empty string */
2115 return_value
.append("");
2116 /* skip ahead the length of the regex match */
2117 strp
+= subs
[0].rm_eo
;
2118 } else if (subs
[0].rm_so
== 0 && subs
[0].rm_eo
== 0) {
2119 /* No more matches */
2121 raise_warning("Invalid Regular Expression to split()");
2124 /* On a real match */
2126 /* make a copy of the substring */
// rm_so is relative to strp, so it equals the length of the piece before the match.
2127 int size
= subs
[0].rm_so
;
2129 /* add it to the array */
2130 return_value
.append(String(strp
, size
, CopyString
));
2132 /* point at our new starting point */
2133 strp
= strp
+ subs
[0].rm_eo
;
2136 /* if we're only looking for a certain number of points,
2137 stop looking once we hit it */
2143 /* see if we encountered an error */
2144 if (err
&& err
!= REG_NOMATCH
) {
2145 php_reg_eprint(err
, &re
);
2150 /* otherwise we just have one last element to add to the array */
2151 int size
= endp
- strp
;
2152 return_value
.append(String(strp
, size
, CopyString
));
2155 return return_value
;
2158 ///////////////////////////////////////////////////////////////////////////////