2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
17 #include "hphp/runtime/base/preg.h"
23 #include <onigposix.h>
26 #include <folly/AtomicHashArray.h>
28 #include "hphp/runtime/base/array-init.h"
29 #include "hphp/runtime/base/array-iterator.h"
30 #include "hphp/runtime/base/builtin-functions.h"
31 #include "hphp/runtime/base/container-functions.h"
32 #include "hphp/runtime/base/execution-context.h"
33 #include "hphp/runtime/base/ini-setting.h"
34 #include "hphp/runtime/base/runtime-option.h"
35 #include "hphp/runtime/base/string-util.h"
36 #include "hphp/runtime/base/init-fini-node.h"
37 #include "hphp/runtime/base/zend-functions.h"
38 #include "hphp/runtime/vm/debug/debug.h"
39 #include "hphp/runtime/vm/treadmill.h"
40 #include "hphp/runtime/vm/vm-regs.h"
42 #include "hphp/runtime/ext/std/ext_std_function.h"
43 #include "hphp/runtime/ext/string/ext_string.h"
45 #include "hphp/runtime/vm/jit/mcgen.h"
46 #include "hphp/runtime/vm/jit/types.h"
47 #include "hphp/runtime/vm/jit/vtune-jit.h"
49 #include "hphp/compiler/json.h"
51 #include "hphp/util/logger.h"
52 #include "hphp/util/concurrent-scalable-cache.h"
54 /* Only defined in pcre >= 8.32 */
55 #ifndef PCRE_STUDY_JIT_COMPILE
56 # define PCRE_STUDY_JIT_COMPILE 0
63 ///////////////////////////////////////////////////////////////////////////////
64 // PCREglobals definition
66 PCREglobals::PCREglobals() {
67 jit_stack
= pcre_jit_stack_alloc(32768, 524288);
68 // Set these to handle uses of pcre prior to PcreExtension::threadInit
69 // In particular, for matching tier overrides during RuntimeOption::Load
70 preg_backtrace_limit
= RuntimeOption::PregBacktraceLimit
;
71 preg_recursion_limit
= RuntimeOption::PregRecursionLimit
;
74 PCREglobals::~PCREglobals() {
75 pcre_jit_stack_free(jit_stack
);
78 ///////////////////////////////////////////////////////////////////////////////
79 // PCRECache definition
82 typedef std::shared_ptr
<const pcre_cache_entry
> EntryPtr
;
83 typedef std::unique_ptr
<LRUCacheKey
> TempKeyCache
;
85 enum class CacheKind
{
92 struct ahm_string_data_same
{
93 bool operator()(const StringData
* s1
, const StringData
* s2
) {
94 // ahm uses -1, -2, -3 as magic values
95 return int64_t(s1
) > 0 && (s1
== s2
|| s1
->same(s2
));
99 typedef folly::AtomicHashArray
<const StringData
*, const pcre_cache_entry
*,
100 string_data_hash
, ahm_string_data_same
> StaticCache
;
101 typedef ConcurrentLRUCache
<LRUCacheKey
, EntryPtr
,
102 LRUCacheKey::HashCompare
> LRUCache
;
103 typedef ConcurrentScalableCache
<LRUCacheKey
, EntryPtr
,
104 LRUCacheKey::HashCompare
> ScalableCache
;
105 typedef StaticCache::value_type StaticCachePair
;
110 : m_kind(Kind::Empty
)
119 m_u
.smart_ptr
.~EntryPtr();
121 case Kind::AccessorKind
:
122 m_u
.accessor
.~ConstAccessor();
127 Accessor
& operator=(const pcre_cache_entry
* ptr
) {
128 assert(m_kind
== Kind::Empty
|| m_kind
== Kind::Ptr
);
134 Accessor
& operator=(EntryPtr
&& ep
) {
136 case Kind::AccessorKind
:
137 m_u
.accessor
.~ConstAccessor();
140 m_kind
= Kind::SmartPtr
;
141 new (&m_u
.smart_ptr
) EntryPtr(std::move(ep
));
144 m_u
.smart_ptr
= std::move(ep
);
150 // No assignment from LRUCache::ConstAccessor since it is non-copyable
151 // Use resetToLRU instead
152 LRUCache::ConstAccessor
& resetToLRU() {
155 m_u
.smart_ptr
.~EntryPtr();
158 m_kind
= Kind::AccessorKind
;
159 new (&m_u
.accessor
) LRUCache::ConstAccessor();
161 case Kind::AccessorKind
:
167 const pcre_cache_entry
* get() {
169 case Kind::Empty
: return nullptr;
170 case Kind::Ptr
: return m_u
.ptr
;
171 case Kind::SmartPtr
: return m_u
.smart_ptr
.get();
172 case Kind::AccessorKind
: return m_u
.accessor
->get();
174 always_assert(false);
177 const EntryPtr
& entryPtr() const {
178 assert(m_kind
== Kind::SmartPtr
);
179 return m_u
.smart_ptr
;
183 enum class Kind
: uint8_t {
194 const pcre_cache_entry
* ptr
;
196 LRUCache::ConstAccessor accessor
;
204 : m_kind(CacheKind::Static
), m_staticCache(nullptr)
206 reinit(CacheKind::Static
);
210 if (m_kind
== CacheKind::Static
&& m_staticCache
.load()) {
211 DestroyStatic(m_staticCache
);
215 void reinit(CacheKind kind
);
216 bool find(Accessor
& accessor
, const StringData
* key
,
217 TempKeyCache
& keyCache
);
218 void insert(Accessor
& accessor
, const StringData
* regex
,
219 TempKeyCache
& keyCache
, const pcre_cache_entry
* ent
);
220 void dump(const std::string
& filename
);
226 static void DestroyStatic(StaticCache
* cache
);
227 static StaticCache
* CreateStatic();
230 std::atomic
<StaticCache
*> m_staticCache
;
231 std::unique_ptr
<LRUCache
> m_lruCache
;
232 std::unique_ptr
<ScalableCache
> m_scalableCache
;
233 std::atomic
<time_t> m_expire
;
234 std::mutex m_clearMutex
;
237 ///////////////////////////////////////////////////////////////////////////////
// Thread-local PCREglobals instance; supplies the JIT stack and the
// backtrack/recursion limits read by the helpers in this file.
240 THREAD_LOCAL(PCREglobals
, tl_pcre_globals
);
// The single regex cache instance used by the functions in this file.
242 static PCRECache s_pcreCache
;
244 // The last pcre error code is available for the whole thread.
// Written by pcre_handle_exec_error() and reset to PHP_PCRE_NO_ERROR at the
// start of a match/grep call.
245 static __thread
int tl_last_error_code
;
247 ///////////////////////////////////////////////////////////////////////////////
248 // pcre_cache_entry implementation
250 pcre_cache_entry::~pcre_cache_entry() {
252 #if PCRE_MAJOR < 8 || (PCRE_MAJOR == 8 && PCRE_MINOR < 20)
255 pcre_free_study(extra
);
262 pcre_literal_data::pcre_literal_data(const char* pattern
, int coptions
) {
263 if (coptions
& ~PCRE_CASELESS
) {
273 std::string pattern_buffer
;
274 while (isalnum((unsigned char)*p
) || (*p
&& strchr("/\\ :-_", *p
))) {
275 // backslash + alphanumeric character --> not a literal (i.e. \d).
276 // backslash + non-alphanumeric character --> literal symbol (i.e. \.)
278 if (!p
[1] || isalnum((unsigned char)p
[1])) {
284 pattern_buffer
+= *p
++;
291 /* This is an encoding of a literal string. */
292 case_insensitive
= coptions
& PCRE_CASELESS
;
293 literal_str
= std::move(pattern_buffer
);
297 bool pcre_literal_data::isLiteral() const {
298 return literal_str
.hasValue();
301 bool pcre_literal_data::matches(const StringData
* subject
,
303 int* offsets
) const {
304 assertx(isLiteral());
307 // Subject must be at least as long as the literal pattern
308 // for a match to occur.
309 if (subject
->size() < literal_str
->length() + pos
) {
313 size_t literal_strlen
= literal_str
->length();
314 auto const subject_c
= subject
->data();
315 auto const literal_c
= literal_str
->c_str();
317 // Make sure an exact match has the right length.
318 if (pos
|| (match_end
&& subject
->size() != literal_strlen
)) {
321 // If only matching the start (^), compare the strings
322 // for the length of the literal pattern.
323 if (case_insensitive
?
324 bstrcaseeq(subject_c
, literal_c
, literal_strlen
) :
325 memcmp(subject_c
, literal_c
, literal_strlen
) == 0) {
327 offsets
[1] = literal_strlen
* sizeof(char);
330 } else if (match_end
) {
331 // Compare the literal pattern against the tail end of the subject.
332 auto const subject_tail
= subject_c
+ (subject
->size() - literal_strlen
);
333 if (case_insensitive
?
334 bstrcaseeq(subject_tail
, literal_c
, literal_strlen
) :
335 memcmp(subject_tail
, literal_c
, literal_strlen
) == 0) {
336 offsets
[0] = (subject
->size() - literal_strlen
) * sizeof(char);
337 offsets
[1] = subject
->size() * sizeof(char);
341 if (!literal_strlen
) {
342 offsets
[0] = offsets
[1] = pos
;
345 // Check if the literal pattern occurs as a substring of the subject.
346 auto const subject_str
= StrNR(subject
);
347 auto const find_response
= subject_str
.asString().find(
348 *literal_str
, pos
, !case_insensitive
);
349 if (find_response
>= 0) {
350 offsets
[0] = find_response
* sizeof(char);
351 offsets
[1] = offsets
[0] + literal_strlen
* sizeof(char);
358 ///////////////////////////////////////////////////////////////////////////////
359 // PCRECache implementation
361 PCRECache::StaticCache
* PCRECache::CreateStatic() {
362 StaticCache::Config config
;
363 config
.maxLoadFactor
= 0.5;
364 return StaticCache::create(
365 RuntimeOption::EvalPCRETableSize
, config
).release();
// Tear down a static cache obtained from CreateStatic(): walk every slot,
// destroy keys that are uncounted strings, then destroy the array itself.
// NOTE(review): only the key cleanup is visible in this view -- confirm that
// the cached pcre_cache_entry values are released here as well.
368 void PCRECache::DestroyStatic(StaticCache
* cache
) {
369 // We delete uncounted keys while iterating the cache, which is OK for
370 // AtomicHashArray, but not OK for other containers, such as
371 // std::unordered_map. If you change the cache type make sure that property
372 // holds or fix this function.
373 static_assert(std::is_same
<PCRECache::StaticCache
,
374 folly::AtomicHashArray
<const StringData
*, const pcre_cache_entry
*,
375 string_data_hash
, ahm_string_data_same
>>::value
,
376 "StaticCache must be an AtomicHashArray or this destructor is wrong.");
// Keys are stored as const StringData*; the const_cast is needed because
// destructUncounted() is called through a non-const pointer.
377 for (auto& it
: *cache
) {
378 if (it
.first
->isUncounted()) {
379 const_cast<StringData
*>(it
.first
)->destructUncounted();
// Finally release the array storage.
383 StaticCache::destroy(cache
);
386 void PCRECache::reinit(CacheKind kind
) {
388 case CacheKind::Static
:
389 if (m_staticCache
.load()) {
390 DestroyStatic(m_staticCache
);
391 m_staticCache
= nullptr;
397 case CacheKind::Scalable
:
398 m_scalableCache
.reset();
404 case CacheKind::Static
:
405 m_staticCache
= CreateStatic();
406 m_expire
= time(nullptr) + RuntimeOption::EvalPCREExpireInterval
;
409 m_lruCache
.reset(new LRUCache(RuntimeOption::EvalPCRETableSize
));
411 case CacheKind::Scalable
:
412 m_scalableCache
.reset(
413 new ScalableCache(RuntimeOption::EvalPCRETableSize
));
418 bool PCRECache::find(Accessor
& accessor
,
419 const StringData
* regex
,
420 TempKeyCache
& keyCache
)
423 case CacheKind::Static
:
425 assert(m_staticCache
.load());
426 StaticCache::iterator it
;
427 auto cache
= m_staticCache
.load(std::memory_order_acquire
);
428 if ((it
= cache
->find(regex
)) != cache
->end()) {
429 accessor
= it
->second
;
435 case CacheKind::Scalable
:
438 keyCache
.reset(new LRUCacheKey(regex
->data(), regex
->size()));
441 if (m_kind
== CacheKind::Lru
) {
442 found
= m_lruCache
->find(accessor
.resetToLRU(), *keyCache
);
444 found
= m_scalableCache
->find(accessor
.resetToLRU(), *keyCache
);
449 always_assert(false);
// Replace the static cache with a fresh, empty one and push the expiry time
// forward by RuntimeOption::EvalPCREExpireInterval.
// The mutex is taken with try_to_lock, so this call never blocks on it.
// NOTE(review): the handling of a failed try-lock is not visible in this
// view -- presumably the call returns early; confirm.
452 void PCRECache::clearStatic() {
453 std::unique_lock
<std::mutex
> lock(m_clearMutex
, std::try_to_lock
);
456 auto newExpire
= time(nullptr) + RuntimeOption::EvalPCREExpireInterval
;
457 m_expire
.store(newExpire
, std::memory_order_relaxed
);
// Publish the new table and get the old one back in a single atomic exchange.
459 auto tmpMap
= CreateStatic();
460 tmpMap
= m_staticCache
.exchange(tmpMap
, std::memory_order_acq_rel
);
// The old table is not destroyed inline; its destruction is handed to
// Treadmill::enqueue.
462 Treadmill::enqueue([tmpMap
]() {
463 DestroyStatic(tmpMap
);
467 void PCRECache::insert(
469 const StringData
* regex
,
470 TempKeyCache
& keyCache
,
471 const pcre_cache_entry
* ent
474 case CacheKind::Static
:
476 assert(m_staticCache
.load());
477 // Clear the cache if we haven't refreshed it in a while
478 if (time(nullptr) > m_expire
) {
481 auto cache
= m_staticCache
.load(std::memory_order_acquire
);
482 auto key
= regex
->isStatic()
484 : StringData::MakeUncounted(regex
->slice());
485 auto pair
= cache
->insert(StaticCachePair(key
, ent
));
487 // Inserted, container owns the pointer
490 // Not inserted, caller needs to own the pointer
491 if (key
!= regex
) const_cast<StringData
*>(key
)->destructUncounted();
492 accessor
= EntryPtr(ent
);
497 case CacheKind::Scalable
:
500 keyCache
.reset(new LRUCacheKey(regex
->data(), regex
->size()));
502 // Pointer ownership is shared between container and caller
503 accessor
= EntryPtr(ent
);
504 if (m_kind
== CacheKind::Lru
) {
505 m_lruCache
->insert(*keyCache
, accessor
.entryPtr());
507 m_scalableCache
->insert(*keyCache
, accessor
.entryPtr());
514 void PCRECache::dump(const std::string
& filename
) {
515 std::ofstream
out(filename
.c_str());
517 case CacheKind::Static
:
518 for (auto& it
: *m_staticCache
) {
519 out
<< it
.first
->data() << "\n";
523 case CacheKind::Scalable
:
525 std::vector
<LRUCacheKey
> keys
;
526 if (m_kind
== CacheKind::Lru
) {
527 m_lruCache
->snapshotKeys(keys
);
529 m_scalableCache
->snapshotKeys(keys
);
531 for (auto& key
: keys
) {
532 out
<< key
.c_str() << "\n";
// Number of entries currently held by whichever backing cache is active.
// NOTE(review): the switch header and the Lru case label are not visible in
// this view; the m_lruCache return below presumably belongs to
// CacheKind::Lru.
540 size_t PCRECache::size() const {
542 case CacheKind::Static
:
543 return m_staticCache
.load(std::memory_order_acquire
)->size();
545 return m_lruCache
->size();
546 case CacheKind::Scalable
:
547 return m_scalableCache
->size();
// Reached only if no case above returned.
549 always_assert(false);
552 ///////////////////////////////////////////////////////////////////////////////
553 // Public interface and helper functions
556 PCRECache::CacheKind kind
;
557 if (RuntimeOption::EvalPCRECacheType
== "static") {
558 kind
= PCRECache::CacheKind::Static
;
559 } else if (RuntimeOption::EvalPCRECacheType
== "lru") {
560 kind
= PCRECache::CacheKind::Lru
;
561 } else if (RuntimeOption::EvalPCRECacheType
== "scalable") {
562 kind
= PCRECache::CacheKind::Scalable
;
564 Logger::Warning("Eval.PCRECacheType should be either static, "
566 kind
= PCRECache::CacheKind::Scalable
;
568 s_pcreCache
.reinit(kind
);
574 void pcre_dump_cache(const std::string
& filename
) {
575 s_pcreCache
.dump(filename
);
578 static pcre_jit_stack
* alloc_jit_stack(void* /*data*/) {
579 return tl_pcre_globals
->jit_stack
;
584 template<bool useSmartFree
= false>
585 struct FreeHelperImpl
{
586 explicit FreeHelperImpl(void* p
) : p(p
) {}
588 useSmartFree
? req::free(p
) : free(p
);
591 FreeHelperImpl(const FreeHelperImpl
&) = delete;
592 FreeHelperImpl
& operator=(const FreeHelperImpl
&) = delete;
598 typedef FreeHelperImpl
<true> SmartFreeHelper
;
601 static void init_local_extra(pcre_extra
* local
, pcre_extra
* shared
) {
603 memcpy(local
, shared
, sizeof(pcre_extra
));
605 memset(local
, 0, sizeof(pcre_extra
));
606 local
->flags
= PCRE_EXTRA_MATCH_LIMIT
| PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
608 local
->match_limit
= tl_pcre_globals
->preg_backtrace_limit
;
609 local
->match_limit_recursion
= tl_pcre_globals
->preg_recursion_limit
;
612 static const char* const*
613 get_subpat_names(const pcre_cache_entry
* pce
) {
614 char **subpat_names
= pce
->subpat_names
.load(std::memory_order_relaxed
);
620 * Build a mapping from subpattern numbers to their names. We will always
621 * allocate the table, even though there may be no named subpatterns. This
622 * avoids somewhat more complicated logic in the inner loops.
625 init_local_extra(&extra
, pce
->extra
);
629 subpat_names
= (char **)calloc(pce
->num_subpats
, sizeof(char *));
630 int rc
= pcre_fullinfo(pce
->re
, &extra
, PCRE_INFO_NAMECOUNT
, &name_count
);
632 raise_warning("Internal pcre_fullinfo() error %d", rc
);
635 if (name_count
> 0) {
636 int name_size
, ni
= 0;
637 unsigned short name_idx
;
641 rc1
= pcre_fullinfo(pce
->re
, &extra
, PCRE_INFO_NAMETABLE
, &name_table
);
642 rc2
= pcre_fullinfo(pce
->re
, &extra
, PCRE_INFO_NAMEENTRYSIZE
, &name_size
);
643 rc
= rc2
? rc2
: rc1
;
645 raise_warning("Internal pcre_fullinfo() error %d", rc
);
648 while (ni
++ < name_count
) {
649 name_idx
= 0xff * (unsigned char)name_table
[0] +
650 (unsigned char)name_table
[1];
651 subpat_names
[name_idx
] = name_table
+ 2;
652 if (is_numeric_string(subpat_names
[name_idx
],
653 strlen(subpat_names
[name_idx
]),
654 nullptr, nullptr, 0) != KindOfNull
) {
655 raise_warning("Numeric named subpatterns are not allowed");
658 name_table
+= name_size
;
661 // Store subpat_names into the cache entry
662 char **expected
= nullptr;
663 if (!pce
->subpat_names
.compare_exchange_strong(expected
, subpat_names
)) {
664 // Another thread stored subpat_names already. The array created by the
665 // other thread is now in expected, return it instead and delete the one
// Query the compiled pattern in `pce` for its capture count
// (PCRE_INFO_CAPTURECOUNT) and warn if pcre_fullinfo() reports an error.
// NOTE(review): the destination argument of the query, the error test and
// the return statements are not visible in this view. The caller treats a
// false return as failure.
673 static bool get_pcre_fullinfo(pcre_cache_entry
* pce
) {
// Use a private pcre_extra seeded from the entry's shared one.
675 init_local_extra(&extra
, pce
->extra
);
677 /* Calculate the size of the offsets array*/
678 int rc
= pcre_fullinfo(pce
->re
, &extra
, PCRE_INFO_CAPTURECOUNT
,
681 raise_warning("Internal pcre_fullinfo() error %d", rc
);
689 pcre_get_compiled_regex_cache(PCRECache::Accessor
& accessor
,
690 const StringData
* regex
) {
691 PCRECache::TempKeyCache tkc
;
693 /* Try to lookup the cached regex entry, and if successful, just pass
694 back the compiled pattern, otherwise go on and compile it. */
695 if (s_pcreCache
.find(accessor
, regex
, tkc
)) {
699 /* Parse through the leading whitespace, and display a warning if we
700 get to the end without encountering a delimiter. */
701 const char *p
= regex
->data();
702 while (isspace((int)*(unsigned char *)p
)) p
++;
704 raise_warning("Empty regular expression");
708 /* Get the delimiter and display a warning if it is alphanumeric
710 char delimiter
= *p
++;
711 if (isalnum((int)*(unsigned char *)&delimiter
) || delimiter
== '\\') {
712 raise_warning("Delimiter must not be alphanumeric or backslash");
716 char start_delimiter
= delimiter
;
717 const char *pp
= strchr("([{< )]}> )]}>", delimiter
);
721 char end_delimiter
= delimiter
;
723 if (start_delimiter
== end_delimiter
) {
724 /* We need to iterate through the pattern, searching for the ending
725 * delimiter, but skipping the backslashed delimiters. If the ending
726 * delimiter is not found, display a warning. */
729 if (*pp
== '\\' && pp
[1] != 0) pp
++;
730 else if (*pp
== delimiter
)
735 raise_warning("No ending delimiter '%c' found: [%s]", delimiter
,
740 /* We iterate through the pattern, searching for the matching ending
741 * delimiter. For each matching starting delimiter, we increment nesting
742 * level, and decrement it for each matching ending delimiter. If we
743 * reach the end of the pattern without matching, display a warning.
745 int brackets
= 1; // brackets nesting level
748 if (*pp
== '\\' && pp
[1] != 0) pp
++;
749 else if (*pp
== end_delimiter
&& --brackets
<= 0)
751 else if (*pp
== start_delimiter
)
756 raise_warning("No ending matching delimiter '%c' found: [%s]",
757 end_delimiter
, regex
->data());
762 /* Make a copy of the actual pattern. */
763 String
spattern(p
, pp
-p
, CopyString
);
764 const char *pattern
= spattern
.data();
766 /* Move on to the options */
769 /* Parse through the options, setting appropriate flags. Display
770 a warning if we encounter an unknown modifier. */
773 bool do_study
= false;
776 /* Perl compatible options */
777 case 'i': coptions
|= PCRE_CASELESS
; break;
778 case 'm': coptions
|= PCRE_MULTILINE
; break;
779 case 's': coptions
|= PCRE_DOTALL
; break;
780 case 'x': coptions
|= PCRE_EXTENDED
; break;
782 /* PCRE specific options */
783 case 'A': coptions
|= PCRE_ANCHORED
; break;
784 case 'D': coptions
|= PCRE_DOLLAR_ENDONLY
; break;
785 case 'S': do_study
= true; break;
786 case 'U': coptions
|= PCRE_UNGREEDY
; break;
787 case 'X': coptions
|= PCRE_EXTRA
; break;
788 case 'u': coptions
|= PCRE_UTF8
;
789 /* In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII
790 characters, even in UTF-8 mode. However, this can be changed by setting
791 the PCRE_UCP option. */
793 coptions
|= PCRE_UCP
;
797 /* Custom preg options */
798 case 'e': poptions
|= PREG_REPLACE_EVAL
; break;
805 raise_warning("Unknown modifier '%c': [%s]", pp
[-1], regex
->data());
810 /* We've reached a null byte, now check if we're actually at the end of the
811 string. If not this is a bad expression, and a potential security hole. */
812 if (regex
->size() != (pp
- regex
->data())) {
813 raise_error("Error: Null byte found in pattern");
816 /* Compile pattern and display a warning if compilation failed. */
819 pcre
*re
= pcre_compile(pattern
, coptions
, &error
, &erroffset
, 0);
821 raise_warning("Compilation failed: %s at offset %d", error
, erroffset
);
825 // Careful: from here 're' needs to be freed if something throws.
827 // TODO(t14969501): enable literal_data everywhere and skip the
828 // pcre_compile above.
829 auto const literal_data
= pcre_literal_data(pattern
, coptions
);
831 /* If study option was specified, study the pattern and
832 store the result in extra for passing to pcre_exec. */
833 pcre_extra
*extra
= nullptr;
834 if (!literal_data
.isLiteral()) {
835 if (do_study
|| PCRE_STUDY_JIT_COMPILE
) {
836 int soptions
= PCRE_STUDY_JIT_COMPILE
;
837 extra
= pcre_study(re
, soptions
, &error
);
839 extra
->flags
|= PCRE_EXTRA_MATCH_LIMIT
|
840 PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
841 pcre_assign_jit_stack(extra
, alloc_jit_stack
, nullptr);
843 if (error
!= nullptr) {
845 raise_warning("Error while studying pattern");
851 if ((!RuntimeOption::EvalJitNoGdb
||
852 RuntimeOption::EvalJitUseVtuneAPI
||
853 RuntimeOption::EvalPerfPidMap
) &&
855 extra
->executable_jit
!= nullptr) {
857 pcre_fullinfo(re
, extra
, PCRE_INFO_JITSIZE
, &size
);
859 TCA start
= *(TCA
*)(extra
->executable_jit
);
860 TCA end
= start
+ size
;
861 std::string name
= folly::sformat("HHVM::pcre_jit::{}", pattern
);
863 if (!RuntimeOption::EvalJitNoGdb
&& jit::mcgen::initialized()) {
864 Debug::DebugInfo::Get()->recordStub(Debug::TCRange(start
, end
, false),
867 if (RuntimeOption::EvalJitUseVtuneAPI
) {
868 HPHP::jit::reportHelperToVtune(name
.c_str(), start
, end
);
870 if (RuntimeOption::EvalPerfPidMap
&& jit::mcgen::initialized()) {
871 Debug::DebugInfo::Get()->recordPerfMap(
872 Debug::TCRange(start
, end
, false),
873 SrcKey
{}, nullptr, false, false,
874 HPHP::JSON::Escape(name
.c_str())
881 /* Store the compiled pattern and extra info in the cache. */
882 pcre_cache_entry
* new_entry
= new pcre_cache_entry();
884 new_entry
->extra
= extra
;
885 if (literal_data
.isLiteral()) {
886 new_entry
->literal_data
=
887 std::make_unique
<pcre_literal_data
>(std::move(literal_data
));
890 assert((poptions
& ~0x1) == 0);
891 new_entry
->preg_options
= poptions
;
893 assert((coptions
& 0x80000000) == 0);
894 new_entry
->compile_options
= coptions
;
896 /* Get pcre full info */
897 if (!get_pcre_fullinfo(new_entry
)) {
902 s_pcreCache
.insert(accessor
, regex
, tkc
, new_entry
);
906 static int* create_offset_array(const pcre_cache_entry
* pce
,
908 /* Allocate memory for the offsets array */
909 size_offsets
= pce
->num_subpats
* 3;
910 return (int *)req::malloc_noptrs(size_offsets
* sizeof(int));
// Add one captured substring together with its offset to `result`.
// A two-element packed array [str, offset] is built; it is stored under
// `name` when a name is given, and is always appended as well.
// NOTE(review): the remaining parameter lines are not visible in this view;
// callers pass (result, substring, start offset, name-or-nullptr).
913 static inline void add_offset_pair(Array
& result
,
917 auto match_pair
= make_packed_array(str
, offset
);
// Named groups are reachable by name as well as by position.
918 if (name
) result
.set(String(name
), match_pair
);
919 result
.append(match_pair
);
922 static inline bool pcre_need_log_error(int pcre_code
) {
923 return RuntimeOption::EnablePregErrorLog
&&
924 (pcre_code
== PCRE_ERROR_MATCHLIMIT
||
925 pcre_code
== PCRE_ERROR_RECURSIONLIMIT
);
928 static void pcre_log_error(const char* func
, int line
, int pcre_code
,
929 const char* pattern
, int pattern_size
,
930 const char* subject
, int subject_size
,
931 const char* repl
, int repl_size
,
932 int arg1
= 0, int arg2
= 0,
933 int arg3
= 0, int arg4
= 0) {
934 if (!RuntimeOption::EnableHipHopSyntax
) {
937 const char* escapedPattern
;
938 const char* escapedSubject
;
939 const char* escapedRepl
;
940 std::string
p(pattern
, pattern_size
);
941 std::string
s(subject
, subject_size
);
942 std::string
r(repl
, repl_size
);
943 escapedPattern
= Logger::EscapeString(p
);
944 escapedSubject
= Logger::EscapeString(s
);
945 escapedRepl
= Logger::EscapeString(r
);
946 const char* errString
=
947 (pcre_code
== PCRE_ERROR_MATCHLIMIT
) ? "PCRE_ERROR_MATCHLIMIT" :
948 (pcre_code
== PCRE_ERROR_RECURSIONLIMIT
) ? "PCRE_ERROR_RECURSIONLIMIT" :
950 raise_warning_unsampled(
951 "REGEXERR: %s/%d: err=%d(%s), pattern='%s', subject='%s', repl='%s', "
952 "limits=(%" PRId64
", %" PRId64
"), extra=(%d, %d, %d, %d)",
953 func
, line
, pcre_code
, errString
,
954 escapedPattern
, escapedSubject
, escapedRepl
,
955 tl_pcre_globals
->preg_backtrace_limit
,
956 tl_pcre_globals
->preg_recursion_limit
,
957 arg1
, arg2
, arg3
, arg4
);
958 free((void *)escapedPattern
);
959 free((void *)escapedSubject
);
960 free((void *)escapedRepl
);
963 static void pcre_handle_exec_error(int pcre_code
) {
966 case PCRE_ERROR_MATCHLIMIT
:
967 preg_code
= PHP_PCRE_BACKTRACK_LIMIT_ERROR
;
969 case PCRE_ERROR_RECURSIONLIMIT
:
970 preg_code
= PHP_PCRE_RECURSION_LIMIT_ERROR
;
972 case PCRE_ERROR_BADUTF8
:
973 preg_code
= PHP_PCRE_BAD_UTF8_ERROR
;
975 case PCRE_ERROR_BADUTF8_OFFSET
:
976 preg_code
= PHP_PCRE_BAD_UTF8_OFFSET_ERROR
;
979 preg_code
= PHP_PCRE_INTERNAL_ERROR
;
982 tl_last_error_code
= preg_code
;
985 ///////////////////////////////////////////////////////////////////////////////
987 Variant
preg_grep(const String
& pattern
, const Array
& input
, int flags
/* = 0 */) {
988 PCRECache::Accessor accessor
;
989 if (!pcre_get_compiled_regex_cache(accessor
, pattern
.get())) {
992 const pcre_cache_entry
* pce
= accessor
.get();
994 int size_offsets
= 0;
995 int* offsets
= create_offset_array(pce
, size_offsets
);
996 if (offsets
== nullptr) {
999 SmartFreeHelper
freer(offsets
);
1001 /* Initialize return array */
1002 Array ret
= Array::Create();
1003 tl_last_error_code
= PHP_PCRE_NO_ERROR
;
1005 /* Go through the input array */
1006 bool invert
= (flags
& PREG_GREP_INVERT
);
1008 init_local_extra(&extra
, pce
->extra
);
1010 for (ArrayIter
iter(input
); iter
; ++iter
) {
1011 String entry
= iter
.second().toString();
1013 /* Perform the match */
1014 int count
= pcre_exec(pce
->re
, &extra
, entry
.data(), entry
.size(),
1015 0, 0, offsets
, size_offsets
);
1017 /* Check for too many substrings condition. */
1019 raise_warning("Matched, but too many substrings");
1020 count
= size_offsets
/ 3;
1021 } else if (count
< 0 && count
!= PCRE_ERROR_NOMATCH
) {
1022 if (pcre_need_log_error(count
)) {
1023 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1024 pattern
.data(), pattern
.size(),
1025 entry
.data(), entry
.size(),
1029 pcre_handle_exec_error(count
);
1033 /* If the entry fits our requirements */
1034 if ((count
> 0 && !invert
) ||
1035 (count
== PCRE_ERROR_NOMATCH
&& invert
)) {
1037 /* Add to return array */
1038 ret
.set(iter
.first(), entry
);
1045 ///////////////////////////////////////////////////////////////////////////////
1047 static Variant
preg_match_impl(const StringData
* pattern
,
1048 const StringData
* subject
,
1049 Variant
* subpats
, int flags
, int start_offset
,
1051 PCRECache::Accessor accessor
;
1052 if (!pcre_get_compiled_regex_cache(accessor
, pattern
)) {
1055 const pcre_cache_entry
* pce
= accessor
.get();
1058 init_local_extra(&extra
, pce
->extra
);
1060 *subpats
= Array::Create();
1062 int exec_options
= 0;
1064 int subpats_order
= global
? PREG_PATTERN_ORDER
: 0;
1065 bool offset_capture
= false;
1067 offset_capture
= flags
& PREG_OFFSET_CAPTURE
;
1070 * subpats_order is pre-set to pattern mode so we change it only if
1074 subpats_order
= flags
& 0xff;
1076 if ((global
&& (subpats_order
< PREG_PATTERN_ORDER
||
1077 subpats_order
> PREG_SET_ORDER
)) ||
1078 (!global
&& subpats_order
!= 0)) {
1079 raise_warning("Invalid flags specified");
1084 /* Negative offset counts from the end of the string. */
1085 if (start_offset
< 0) {
1086 start_offset
= subject
->size() + start_offset
;
1087 if (start_offset
< 0) {
1092 int size_offsets
= 0;
1093 int* offsets
= create_offset_array(pce
, size_offsets
);
1094 SmartFreeHelper
offsetsFreer(offsets
);
1095 int num_subpats
= size_offsets
/ 3;
1096 if (offsets
== nullptr) {
1100 const char* const* subpat_names
= get_subpat_names(pce
);
1101 if (subpat_names
== nullptr) {
1105 /* Allocate match sets array and initialize the values. */
1106 Array match_sets
; /* An array of sets of matches for each
1107 subpattern after a global match */
1108 if (global
&& subpats_order
== PREG_PATTERN_ORDER
) {
1109 for (int i
= 0; i
< num_subpats
; i
++) {
1110 match_sets
.set(i
, Array::Create());
1115 tl_last_error_code
= PHP_PCRE_NO_ERROR
;
1117 int g_notempty
= 0; // If the match should not be empty
1118 const char** stringlist
; // Holds list of subpatterns
1124 * Optimization: If the pattern defines a literal substring,
1125 * compare the strings directly (i.e. memcmp) instead of performing
1126 * the full regular expression evaluation.
1127 * Take the slow path if there are any special compile options.
1129 if (pce
->literal_data
&& !global
) {
1130 assertx(pce
->literal_data
->isLiteral());
1131 /* TODO(t13140878): compare literal against multiple substrings
1132 * in the preg_match_all (global == true) case. */
1133 count
= pce
->literal_data
->matches(subject
, start_offset
, offsets
) ? 1
1134 : PCRE_ERROR_NOMATCH
;
1136 /* Execute the regular expression. */
1137 count
= pcre_exec(pce
->re
, &extra
, subject
->data(), subject
->size(),
1139 exec_options
| g_notempty
,
1140 offsets
, size_offsets
);
1142 /* The string was already proved to be valid UTF-8 */
1143 exec_options
|= PCRE_NO_UTF8_CHECK
;
1145 /* Check for too many substrings condition. */
1147 raise_warning("Matched, but too many substrings");
1148 count
= size_offsets
/ 3;
1151 /* If something has matched */
1156 // Try to get the list of substrings and display a warning if failed.
1157 if (offsets
[1] < offsets
[0] ||
1158 pcre_get_substring_list(subject
->data(), offsets
, count
,
1160 raise_warning("Get subpatterns list failed");
1164 if (global
) { /* global pattern matching */
1165 if (subpats_order
== PREG_PATTERN_ORDER
) {
1166 /* For each subpattern, insert it into the appropriate array. */
1167 for (i
= 0; i
< count
; i
++) {
1168 if (offset_capture
) {
1169 auto& lval
= match_sets
.lvalAt(i
);
1171 add_offset_pair(lval
.toArrRef(),
1172 String(stringlist
[i
],
1173 offsets
[(i
<<1)+1] - offsets
[i
<<1],
1175 offsets
[i
<<1], nullptr);
1177 auto& lval
= match_sets
.lvalAt(i
);
1178 forceToArray(lval
).append(
1179 String(stringlist
[i
], offsets
[(i
<<1)+1] - offsets
[i
<<1],
1185 * If the number of captured subpatterns on this run is
1186 * less than the total possible number, pad the result
1187 * arrays with empty strings.
1189 if (count
< num_subpats
) {
1190 for (; i
< num_subpats
; i
++) {
1191 auto& lval
= match_sets
.lvalAt(i
);
1192 forceToArray(lval
).append("");
1196 Array result_set
= Array::Create();
1198 /* Add all the subpatterns to it */
1199 for (i
= 0; i
< count
; i
++) {
1200 if (offset_capture
) {
1201 add_offset_pair(result_set
,
1202 String(stringlist
[i
],
1203 offsets
[(i
<<1)+1] - offsets
[i
<<1],
1205 offsets
[i
<<1], subpat_names
[i
]);
1207 String
value(stringlist
[i
], offsets
[(i
<<1)+1] - offsets
[i
<<1],
1209 if (subpat_names
[i
]) {
1210 result_set
.set(String(subpat_names
[i
]), value
);
1212 result_set
.append(value
);
1215 /* And add it to the output array */
1216 forceToArray(*subpats
).append(std::move(result_set
));
1218 } else { /* single pattern matching */
1219 /* For each subpattern, insert it into the subpatterns array. */
1220 for (i
= 0; i
< count
; i
++) {
1221 if (offset_capture
) {
1222 add_offset_pair(forceToArray(*subpats
),
1223 String(stringlist
[i
],
1224 offsets
[(i
<<1)+1] - offsets
[i
<<1],
1226 offsets
[i
<<1], subpat_names
[i
]);
1228 String
value(stringlist
[i
], offsets
[(i
<<1)+1] - offsets
[i
<<1],
1230 if (subpat_names
[i
]) {
1231 forceToArray(*subpats
).set(String(subpat_names
[i
]), value
);
1233 forceToArray(*subpats
).append(value
);
1237 pcre_free((void *) stringlist
);
1239 } else if (count
== PCRE_ERROR_NOMATCH
) {
1240 /* If we previously set PCRE_NOTEMPTY after a null match,
1241 this is not necessarily the end. We need to advance
1242 the start offset, and continue. Fudge the offset values
1243 to achieve this, unless we're already at the end of the string. */
1244 if (g_notempty
&& start_offset
< subject
->size()) {
1245 offsets
[0] = start_offset
;
1246 offsets
[1] = start_offset
+ 1;
1250 if (pcre_need_log_error(count
)) {
1251 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1252 pattern
->data(), pattern
->size(),
1253 subject
->data(), subject
->size(),
1255 flags
, start_offset
, g_notempty
, global
);
1257 pcre_handle_exec_error(count
);
1261 /* If we have matched an empty string, mimic what Perl's /g options does.
1262 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1263 the match again at the same point. If this fails (picked up above) we
1264 advance to the next character. */
1265 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1267 /* Advance to the position right after the last full match */
1268 start_offset
= offsets
[1];
1271 /* Add the match sets to the output array and clean up */
1272 if (subpats
&& global
&& subpats_order
== PREG_PATTERN_ORDER
) {
1273 for (i
= 0; i
< num_subpats
; i
++) {
1274 if (subpat_names
[i
]) {
1275 forceToArray(*subpats
).set(String(subpat_names
[i
]), match_sets
[i
]);
1277 forceToArray(*subpats
).append(match_sets
[i
]);
// Convenience overload of preg_match that accepts the pattern and the subject
// as String objects. It forwards to the preg_match overload that is called
// with pattern.get() and subject.get(); matches, flags and offset are passed
// through unchanged and the result of that call is returned as-is.
// The defaults recorded in the signature comments are matches = nullptr,
// flags = 0 and offset = 0.
1283 Variant
preg_match(const String
& pattern
, const String
& subject
,
1284 Variant
* matches
/* = nullptr */, int flags
/* = 0 */,
1285 int offset
/* = 0 */) {
1286 return preg_match(pattern
.get(), subject
.get(), matches
, flags
, offset
);
// preg_match overload working on raw StringData pointers. It delegates to
// preg_match_impl with pattern, subject, matches, flags and offset unchanged
// and the literal false as the final argument, returning that result.
// NOTE(review): the trailing boolean presumably selects global (match-all)
// mode, because preg_match_all passes true in the same position - confirm
// against the signature of preg_match_impl.
1289 Variant
preg_match(const StringData
* pattern
, const StringData
* subject
,
1290 Variant
* matches
/* = nullptr */, int flags
/* = 0 */,
1291 int offset
/* = 0 */) {
1292 return preg_match_impl(pattern
, subject
, matches
, flags
, offset
, false);
// Convenience overload of preg_match_all that accepts the pattern and the
// subject as String objects. It forwards to the preg_match_all overload that
// is called with pattern.get() and subject.get(); matches, flags and offset
// are passed through unchanged and the result of that call is returned as-is.
// The defaults recorded in the signature comments are matches = nullptr,
// flags = 0 and offset = 0.
1295 Variant
preg_match_all(const String
& pattern
, const String
& subject
,
1296 Variant
* matches
/* = nullptr */,
1297 int flags
/* = 0 */, int offset
/* = 0 */) {
1298 return preg_match_all(pattern
.get(), subject
.get(), matches
, flags
, offset
);
// preg_match_all overload working on raw StringData pointers. It delegates to
// preg_match_impl with pattern, subject, matches, flags and offset unchanged
// and the literal true as the final argument, returning that result.
// NOTE(review): the trailing boolean presumably selects global (match-all)
// mode, because preg_match passes false in the same position - confirm
// against the signature of preg_match_impl.
1301 Variant
preg_match_all(const StringData
* pattern
, const StringData
* subject
,
1302 Variant
* matches
/* = nullptr */,
1303 int flags
/* = 0 */, int offset
/* = 0 */) {
1304 return preg_match_impl(pattern
, subject
, matches
, flags
, offset
, true);
1307 ///////////////////////////////////////////////////////////////////////////////
// Builds the argument for a user replacement callback and invokes it.
//
// For every index i below count the captured text is cut out of subject using
// the pair offsets[2*i] (start) and offsets[2*i + 1] (end). The piece is
// stored in the subpats array under the sub-pattern name when subpat_names[i]
// is non-null, and it is appended to subpats by position.
// NOTE(review): the lines between the named set and the append are not part of
// this listing, so whether the positional append is unconditional cannot be
// confirmed from here.
//
// subpats is then placed in slot 0 of args, the callable in function is run
// through vm_call_user_func, and its result converted with toString() is
// returned. The declarations of count and args are not visible in this
// listing.
1309 static String
preg_do_repl_func(const Variant
& function
, const String
& subject
,
1310 int* offsets
, const char* const* subpat_names
,
1312 Array subpats
= Array::Create();
1313 for (int i
= 0; i
< count
; i
++) {
1314 auto off1
= offsets
[i
<<1];
1315 auto off2
= offsets
[(i
<<1)+1];
1316 auto sub
= subject
.substr(off1
, off2
- off1
);
1318 if (subpat_names
[i
]) {
1319 subpats
.set(String(subpat_names
[i
]), sub
);
1321 subpats
.append(sub
);
1325 args
.set(0, subpats
);
1326 return vm_call_user_func(function
, args
).toString();
// Parses a back-reference token in a replacement string.
//
// str points at the cursor into the replacement text and backref receives the
// parsed group number. The visible checks are:
//  - a dollar sign directly followed by an opening brace (braced form);
//  - a first decimal digit, which initialises the value written to backref;
//  - an optional second decimal digit, folded in as backref * 10 + digit, so
//    at most two digits are consumed by the visible code;
//  - a test for a character that is either the terminator or not a closing
//    brace.
// NOTE(review): the statements that advance walk, write back through str and
// produce the boolean result are not part of this listing; confirm them
// against the full file.
1329 static bool preg_get_backref(const char** str
, int* backref
) {
1331 const char* walk
= *str
;
1337 if (*walk
== '$' && walk
[1] == '{') {
1343 if (*walk
>= '0' && *walk
<= '9') {
1344 *backref
= *walk
- '0';
1350 if (*walk
&& *walk
>= '0' && *walk
<= '9') {
1351 *backref
= *backref
* 10 + *walk
- '0';
1356 if (*walk
== 0 || *walk
!= '}') {
// Applies one pattern to one subject string and returns the rewritten text
// (result.detach() on the visible return path).
//
// Visible steps:
//  - the compiled pattern is fetched with pcre_get_compiled_regex_cache;
//  - the PREG_REPLACE_EVAL bit of pce->preg_options is read into eval; the
//    string literals that follow are the diagnostics for eval under
//    RuntimeOption::EvalAuthoritativeMode, for eval combined with a
//    replacement callback, and for the deprecation of the modifier;
//  - the offsets vector (create_offset_array) and the sub-pattern name table
//    (get_subpat_names) are obtained and each is compared against nullptr;
//  - replace_var is converted with toString() and its data pointer, length and
//    end pointer are cached in replace, replace_len and replace_end;
//  - pcre_exec is run with exec_options or-ed with g_notempty, and
//    PCRE_NO_UTF8_CHECK is or-ed into exec_options afterwards;
//  - on a match (count > 0, offsets[1] >= offsets[0], limit equal to -1 or
//    positive) the subject text before the match is appended to result,
//    followed by the callback result from preg_do_repl_func, or the value of
//    the evaluated code, or the replacement text with back-references
//    expanded through preg_get_backref;
//  - on PCRE_ERROR_NOMATCH or limit == 0 the code either copies one subject
//    character and steps past it (g_notempty set and not at the end), or
//    appends the rest of the subject;
//  - other result codes reach pcre_log_error (guarded by pcre_need_log_error)
//    and pcre_handle_exec_error;
//  - after each round g_notempty is recomputed from offsets[0] and offsets[1]
//    and start_offset moves to offsets[1].
//
// replace_count is tested for null when a match is found; the statement that
// updates it is not part of this listing.
//
// NOTE(review): this listing lacks a number of original lines (the embedded
// line numbers skip), among them the loop header, several declarations and
// several branch bodies. Confirm control flow against the full file before
// changing anything here.
// NOTE(review): the eval path compiles and invokes code assembled from the
// replacement text (compileEvalString and invokeFunc below). Back-referenced
// text is passed through addslashes in the visible code, but running generated
// code on caller-supplied input remains a risk; the deprecation message in
// this function already points callers to preg_replace_callback.
1366 static Variant
php_pcre_replace(const String
& pattern
, const String
& subject
,
1367 const Variant
& replace_var
, bool callable
,
1368 int limit
, int* replace_count
) {
1369 PCRECache::Accessor accessor
;
1370 if (!pcre_get_compiled_regex_cache(accessor
, pattern
.get())) {
1373 const pcre_cache_entry
* pce
= accessor
.get();
// eval is set when the compiled pattern carries the PREG_REPLACE_EVAL option.
1374 bool eval
= pce
->preg_options
& PREG_REPLACE_EVAL
;
1376 if (RuntimeOption::EvalAuthoritativeMode
) {
1378 "You can't use eval in RepoAuthoritative mode. It breaks all sorts of "
1379 "assumptions we use for speed. Switch to using preg_replace_callback()."
1384 "Modifier /e cannot be used with replacement callback."
1389 "preg_replace(): The /e modifier is deprecated, use "
1390 "preg_replace_callback instead"
1395 int* offsets
= create_offset_array(pce
, size_offsets
);
1396 SmartFreeHelper
offsetsFreer(offsets
);
1397 if (offsets
== nullptr) {
1401 const char* const* subpat_names
= get_subpat_names(pce
);
1402 if (subpat_names
== nullptr) {
1406 const char* replace
= nullptr;
1407 const char* replace_end
= nullptr;
1408 int replace_len
= 0;
1412 replace_val
= replace_var
.toString();
1413 replace
= replace_val
.data();
1414 replace_len
= replace_val
.size();
1415 replace_end
= replace
+ replace_len
;
// Output buffer, constructed with twice the subject length (presumably an
// initial capacity - confirm against StringBuffer).
1418 StringBuffer
result(2 * subject
.size());
1423 const char* match
= nullptr;
1424 int start_offset
= 0;
1425 tl_last_error_code
= PHP_PCRE_NO_ERROR
;
1427 init_local_extra(&extra
, pce
->extra
);
1429 const char* walk
; // Used to walk the replacement string
1430 char walk_last
; // Last walked character
1431 int match_len
; // Length of the current match
1432 int backref
; // Backreference number
1433 int g_notempty
= 0; // If the match should not be empty
1434 int exec_options
= 0; // Options passed to pcre_exec
1436 /* Execute the regular expression. */
1437 int count
= pcre_exec(pce
->re
, &extra
, subject
.data(), subject
.size(),
1439 exec_options
| g_notempty
,
1440 offsets
, size_offsets
);
1442 /* The string was already proved to be valid UTF-8 */
1443 exec_options
|= PCRE_NO_UTF8_CHECK
;
1445 /* Check for too many substrings condition. */
1447 raise_warning("Matched, but too many substrings");
1448 count
= size_offsets
/ 3;
// piece marks the subject position where the current scan started; the text
// between piece and match is copied to result further down.
1451 const char* piece
= subject
.data() + start_offset
;
1452 if (count
> 0 && offsets
[1] >= offsets
[0] &&
1453 (limit
== -1 || limit
> 0)) {
1454 if (replace_count
) {
1457 /* Set the match location in subject */
1458 match
= subject
.data() + offsets
[0];
1460 /* If evaluating, do it and add the return string's length */
1463 /* Use custom function to get replacement string and its length. */
1464 eval_result
= preg_do_repl_func(replace_var
, subject
, offsets
,
1465 subpat_names
, count
);
1466 } else { /* do regular substitution */
1469 while (walk
< replace_end
) {
1470 if ('\\' == *walk
|| '$' == *walk
) {
1471 if (walk_last
== '\\') {
1476 if (preg_get_backref(&walk
, &backref
)) {
1477 if (backref
< count
) {
1478 match_len
= offsets
[(backref
<<1)+1] - offsets
[backref
<<1];
1480 String esc_match
= HHVM_FN(addslashes
)(
1482 subject
.data() + offsets
[backref
<<1],
1487 match_len
= esc_match
.length();
1494 walk_last
= walk
[-1];
1498 /* copy the part of the string before the match */
1499 result
.append(piece
, match
-piece
);
1501 /* copy replacement and backrefs */
1502 int result_len
= result
.size();
1504 /* If evaluating or using custom function, copy result to the buffer
1507 result
.append(eval_result
.data(), eval_result
.size());
1508 result_len
+= eval_result
.size();
1509 } else { /* do regular backreference copying */
1513 int lastStart
= result
.size();
1514 while (walk
< replace_end
) {
1515 bool handleQuote
= eval
&& '"' == *walk
&& walk_last
!= '\\';
1516 if (handleQuote
&& lastStart
!= result
.size()) {
1517 String
str(result
.data() + lastStart
, result
.size() - lastStart
,
1520 lastStart
= result
.size();
1521 handleQuote
= false;
1523 if ('\\' == *walk
|| '$' == *walk
) {
1524 if (walk_last
== '\\') {
1525 result
.set(result
.size() - 1, *walk
++);
1529 if (preg_get_backref(&walk
, &backref
)) {
1530 if (backref
< count
) {
1531 match_len
= offsets
[(backref
<<1)+1] - offsets
[backref
<<1];
1533 String esc_match
= HHVM_FN(addslashes
)(
1535 subject
.data() + offsets
[backref
<<1],
1540 match_len
= esc_match
.length();
1541 result
.append(esc_match
.data(), match_len
);
1544 subject
.data() + offsets
[backref
<<1],
1552 result
.append(*walk
++);
1553 walk_last
= walk
[-1];
1554 if (handleQuote
&& lastStart
!= result
.size()) {
1555 lastStart
= result
.size();
1558 auto full_len
= result
.size();
1559 auto data
= result
.data() + result_len
;
1562 auto const ar
= GetCallerFrame();
1563 // reserve space for "<?php return " + code + ";"
1564 String
prefixedCode(full_len
- result_len
+ 14, ReserveString
);
1566 (ar
->unit()->isHHFile() ? "<?hh return " : "<?php return ");
1567 prefixedCode
+= folly::StringPiece
{data
, full_len
- result_len
};
1568 prefixedCode
+= ";";
1569 auto const unit
= g_context
->compileEvalString(prefixedCode
.get());
1570 auto const ctx
= ar
->func()->cls();
1571 auto const func
= unit
->getMain(ctx
);
1575 if (ar
->hasThis()) {
1576 thiz
= ar
->getThis();
1577 cls
= thiz
->getVMClass();
1580 cls
= ar
->getClass();
1586 auto v
= Variant::attach(
1587 g_context
->invokeFunc(func
, init_null_variant
,
1588 thiz
, cls
, nullptr, nullptr,
1589 ExecutionContext::InvokePseudoMain
)
1591 eval_result
= v
.toString();
1593 result
.resize(result_len
);
1594 result
.append(eval_result
.data(), eval_result
.size());
1602 } else if (count
== PCRE_ERROR_NOMATCH
|| limit
== 0) {
1603 /* If we previously set PCRE_NOTEMPTY after a null match,
1604 this is not necessarily the end. We need to advance
1605 the start offset, and continue. Fudge the offset values
1606 to achieve this, unless we're already at the end of the string. */
1607 if (g_notempty
!= 0 && start_offset
< subject
.size()) {
1608 offsets
[0] = start_offset
;
1609 offsets
[1] = start_offset
+ 1;
1610 result
.append(piece
, 1);
1612 /* stick that last bit of string on our output */
1613 result
.append(piece
, subject
.size() - start_offset
);
1617 if (pcre_need_log_error(count
)) {
1622 if (replace_var
.isObject()) {
1623 stemp
= replace_var
.asCObjRef()->getClassName().asString()
1626 stemp
= replace_var
.toString();
1629 size
= stemp
.size();
1631 s
= replace_val
.data();
1632 size
= replace_val
.size();
1634 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1635 pattern
.data(), pattern
.size(),
1636 subject
.data(), subject
.size(),
1638 callable
, limit
, start_offset
, g_notempty
);
1640 pcre_handle_exec_error(count
);
1644 /* If we have matched an empty string, mimic what Perl's /g option does.
1645 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1646 the match again at the same point. If this fails (picked up above) we
1647 advance to the next character. */
1648 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1650 /* Advance to the next piece. */
1651 start_offset
= offsets
[1];
1654 return result
.detach();
// Runs one or more patterns over a single subject string by calling
// php_pcre_replace, threading the running text through the local subject.
//
// Three visible cases:
//  - regex is not an array: a single php_pcre_replace call with
//    regex.toString() and the replace value as given;
//  - regex is an array and either callable is set or replace is not an array:
//    every regex entry is applied in turn with the same replace value;
//  - regex and replace are both arrays: the regex entries are walked with
//    iterRegex while iterReplace supplies replace_value for each of them.
//
// After each call a boolean result is asserted to be false, a non-string
// result is detected with isString(), and otherwise subject is replaced by
// ret.asStrRef() and tested with isNull().
// NOTE(review): the statements executed in those branches, the advance of
// iterReplace and the final return are not part of this listing.
//
// limit, callable and replace_count are passed to php_pcre_replace unchanged.
1660 static Variant
php_replace_in_subject(const Variant
& regex
, const Variant
& replace
,
1661 String subject
, int limit
, bool callable
,
1662 int* replace_count
) {
1663 if (!regex
.isArray()) {
1664 Variant ret
= php_pcre_replace(regex
.toString(), subject
, replace
,
1665 callable
, limit
, replace_count
);
1667 if (ret
.isBoolean()) {
1668 assert(!ret
.toBoolean());
1675 if (callable
|| !replace
.isArray()) {
1676 Array arr
= regex
.toArray();
1677 for (ArrayIter
iterRegex(arr
); iterRegex
; ++iterRegex
) {
1678 String regex_entry
= iterRegex
.second().toString();
1679 Variant ret
= php_pcre_replace(regex_entry
, subject
, replace
,
1680 callable
, limit
, replace_count
);
1681 if (ret
.isBoolean()) {
1682 assert(!ret
.toBoolean());
1685 if (!ret
.isString()) {
1688 subject
= ret
.asStrRef();
1689 if (subject
.isNull()) {
1696 Array arrReplace
= replace
.toArray();
1697 Array arrRegex
= regex
.toArray();
1698 ArrayIter
iterReplace(arrReplace
);
1699 for (ArrayIter
iterRegex(arrRegex
); iterRegex
; ++iterRegex
) {
1700 String regex_entry
= iterRegex
.second().toString();
1701 Variant replace_value
;
1703 replace_value
= iterReplace
.second();
1707 Variant ret
= php_pcre_replace(regex_entry
, subject
, replace_value
,
1708 callable
, limit
, replace_count
);
1710 if (ret
.isBoolean()) {
1711 assert(!ret
.toBoolean());
1714 if (!ret
.isString()) {
1717 subject
= ret
.asStrRef();
1718 if (subject
.isNull()) {
// Shared implementation behind preg_replace, preg_replace_callback and
// preg_filter.
//
// is_callable and is_filter must not both be set (asserted). A warning is
// raised when replacement is an array while pattern is not; the first part of
// that condition and the statement following the warning are not part of this
// listing.
//
// Non-container subject: a single php_replace_in_subject call. When the
// result is a string, *count (if count is non-null) receives replace_count,
// the filter case with zero replacements is tested separately, and the string
// is returned through ret.asStrRef().
//
// Container subject: every entry is converted with toString() and processed
// on its own. A result is stored in return_value under the entry key when it
// is a non-null string and, for is_filter, only when replace_count grew
// during that entry. Afterwards *count (if count is non-null) receives the
// accumulated replace_count and return_value is returned.
1725 Variant
preg_replace_impl(const Variant
& pattern
, const Variant
& replacement
,
1726 const Variant
& subject
, int limit
, Variant
* count
,
1727 bool is_callable
, bool is_filter
) {
1728 assert(!(is_callable
&& is_filter
));
1730 replacement
.isArray() && !pattern
.isArray()) {
1731 raise_warning("Parameter mismatch, pattern is a string while "
1732 "replacement is an array");
1736 int replace_count
= 0;
1737 if (!isContainer(subject
)) {
1738 Variant ret
= php_replace_in_subject(pattern
, replacement
,
1740 limit
, is_callable
, &replace_count
);
1742 if (ret
.isString()) {
1743 if (count
) *count
= replace_count
;
1744 if (is_filter
&& replace_count
== 0) {
1747 return ret
.asStrRef();
1754 Array return_value
= Array::Create();
1755 Array arrSubject
= subject
.toArray();
1756 for (ArrayIter
iter(arrSubject
); iter
; ++iter
) {
// Remember the running total so the filter check below can tell whether this
// entry produced any replacement.
1757 auto old_replace_count
= replace_count
;
1758 String subject_entry
= iter
.second().toString();
1759 Variant ret
= php_replace_in_subject(pattern
, replacement
, subject_entry
,
1760 limit
, is_callable
, &replace_count
);
1762 if (ret
.isString() && !ret
.isNull() &&
1763 (!is_filter
|| replace_count
> old_replace_count
)) {
1764 return_value
.set(iter
.first(), ret
.asStrRef());
1767 if (count
) *count
= replace_count
;
1768 return return_value
;
// Runs preg_replace_impl with is_callable = false and is_filter = false.
// The replaced value is stored in result and the number reported through the
// count out-argument is returned as a 32-bit integer (count.toInt32()).
// The default noted in the signature comment is limit = -1.
// The declaration of the local count is not part of this listing.
1771 int preg_replace(Variant
& result
,
1772 const Variant
& pattern
,
1773 const Variant
& replacement
,
1774 const Variant
& subject
,
1775 int limit
/* = -1 */) {
1777 result
= preg_replace_impl(pattern
, replacement
, subject
,
1778 limit
, &count
, false, false);
1779 return count
.toInt32();
// Runs preg_replace_impl with callback in the replacement position,
// is_callable = true and is_filter = false.
// The replaced value is stored in result and the number reported through the
// count out-argument is returned as a 32-bit integer (count.toInt32()).
// The default noted in the signature comment is limit = -1.
// The declaration of the local count is not part of this listing.
1782 int preg_replace_callback(Variant
& result
,
1783 const Variant
& pattern
,
1784 const Variant
& callback
,
1785 const Variant
& subject
,
1786 int limit
/* = -1 */) {
1788 result
= preg_replace_impl(pattern
, callback
, subject
,
1789 limit
, &count
, true, false);
1790 return count
.toInt32();
// Runs preg_replace_impl with is_callable = false and is_filter = true.
// The resulting value is stored in result and the number reported through the
// count out-argument is returned as a 32-bit integer (count.toInt32()).
// The default noted in the signature comment is limit = -1.
// The declaration of the local count is not part of this listing.
1793 int preg_filter(Variant
& result
,
1794 const Variant
& pattern
,
1795 const Variant
& replacement
,
1796 const Variant
& subject
,
1797 int limit
/* = -1 */) {
1799 result
= preg_replace_impl(pattern
, replacement
, subject
,
1800 limit
, &count
, false, true);
1801 return count
.toInt32();
1804 ///////////////////////////////////////////////////////////////////////////////
// Splits subject around matches of pattern and returns the pieces as an
// Array (return_value).
//
// flags is decoded into three switches: PREG_SPLIT_NO_EMPTY (no_empty),
// PREG_SPLIT_DELIM_CAPTURE (delim_capture) and PREG_SPLIT_OFFSET_CAPTURE
// (offset_capture). The defaults noted in the signature comments are
// limit = -1 and flags = 0.
//
// Visible steps:
//  - the compiled pattern is fetched with pcre_get_compiled_regex_cache and
//    an offsets vector is obtained from create_offset_array;
//  - while limit is -1 or greater than 1, pcre_exec is run from start_offset;
//    utf8_check is switched to PCRE_NO_UTF8_CHECK after the first call;
//  - on a match, the text between last_match and the match start is added to
//    return_value (as a plain string, or through add_offset_pair when
//    offset_capture is set) unless no_empty is set and that text is empty;
//    last_match and next_offset then move to offsets[1];
//  - with delim_capture, every captured group from index 1 up is added as
//    well, skipping empty groups when no_empty is set;
//  - on PCRE_ERROR_NOMATCH after an empty match (g_notempty set, not at the
//    end of the subject) the position is advanced: for patterns compiled
//    with PCRE_UTF8 a helper pattern is matched at start_offset to find the
//    step, otherwise offsets are set to step one byte;
//  - other result codes reach pcre_log_error (guarded by pcre_need_log_error)
//    and pcre_handle_exec_error;
//  - after the loop the remaining text from last_match to the end of the
//    subject is added, unless no_empty is set and nothing remains.
//
// NOTE(review): this listing lacks a number of original lines (the embedded
// line numbers skip), among them several declarations, the decrement of limit
// and the loop exits. Confirm control flow against the full file before
// changing anything here.
1806 Variant
preg_split(const String
& pattern
, const String
& subject
,
1807 int limit
/* = -1 */, int flags
/* = 0 */) {
1808 PCRECache::Accessor accessor
;
1809 if (!pcre_get_compiled_regex_cache(accessor
, pattern
.get())) {
1812 const pcre_cache_entry
* pce
= accessor
.get();
1814 int no_empty
= flags
& PREG_SPLIT_NO_EMPTY
;
1815 bool delim_capture
= flags
& PREG_SPLIT_DELIM_CAPTURE
;
1816 bool offset_capture
= flags
& PREG_SPLIT_OFFSET_CAPTURE
;
1822 int size_offsets
= 0;
1823 int* offsets
= create_offset_array(pce
, size_offsets
);
1824 SmartFreeHelper
offsetsFreer(offsets
);
1825 if (offsets
== nullptr) {
1829 /* Start at the beginning of the string */
1830 int start_offset
= 0;
1831 int next_offset
= 0;
1832 const char* last_match
= subject
.data();
1833 tl_last_error_code
= PHP_PCRE_NO_ERROR
;
1835 init_local_extra(&extra
, pce
->extra
);
1837 // Get next piece if no limit or limit not yet reached and something matched
1838 Array return_value
= Array::Create();
1839 int g_notempty
= 0; /* If the match should not be empty */
1841 PCRECache::Accessor bump_accessor
;
1842 const pcre_cache_entry
* bump_pce
= nullptr; /* instance for empty matches */
1843 while ((limit
== -1 || limit
> 1)) {
1844 int count
= pcre_exec(pce
->re
, &extra
, subject
.data(), subject
.size(),
1845 start_offset
, g_notempty
| utf8_check
,
1846 offsets
, size_offsets
);
1848 /* Subsequent calls to pcre_exec don't need to bother with the
1849 * utf8 validity check: if the subject isn't valid, the first
1850 * call to pcre_exec will have failed, and as long as we only
1851 * set start_offset to known character boundaries we won't
1852 * supply an invalid offset. */
1853 utf8_check
= PCRE_NO_UTF8_CHECK
;
1855 /* Check for too many substrings condition. */
1857 raise_warning("Matched, but too many substrings");
1858 count
= size_offsets
/ 3;
1861 /* If something matched */
1862 if (count
> 0 && offsets
[1] >= offsets
[0]) {
1863 if (!no_empty
|| subject
.data() + offsets
[0] != last_match
) {
1864 if (offset_capture
) {
1865 /* Add (match, offset) pair to the return value */
1866 add_offset_pair(return_value
,
1868 subject
.data() + offsets
[0] - last_match
,
1870 next_offset
, nullptr);
1872 /* Add the piece to the return value */
1873 return_value
.append(String(last_match
,
1874 subject
.data() + offsets
[0] - last_match
,
1878 /* One less left to do */
1883 last_match
= subject
.data() + offsets
[1];
1884 next_offset
= offsets
[1];
1886 if (delim_capture
) {
1888 for (i
= 1; i
< count
; i
++) {
1889 match_len
= offsets
[(i
<<1)+1] - offsets
[i
<<1];
1890 /* If we have matched a delimiter */
1891 if (!no_empty
|| match_len
> 0) {
1892 if (offset_capture
) {
1893 add_offset_pair(return_value
,
1894 String(subject
.data() + offsets
[i
<<1],
1895 match_len
, CopyString
),
1896 offsets
[i
<<1], nullptr);
1898 return_value
.append(subject
.substr(offsets
[i
<<1], match_len
));
1903 } else if (count
== PCRE_ERROR_NOMATCH
) {
1904 /* If we previously set PCRE_NOTEMPTY after a null match,
1905 this is not necessarily the end. We need to advance
1906 the start offset, and continue. Fudge the offset values
1907 to achieve this, unless we're already at the end of the string. */
1908 if (g_notempty
!= 0 && start_offset
< subject
.size()) {
1909 if (pce
->compile_options
& PCRE_UTF8
) {
1910 if (bump_pce
== nullptr) {
1911 if (!pcre_get_compiled_regex_cache(bump_accessor
,
1912 String("/./us").get())) {
1915 bump_pce
= bump_accessor
.get();
1917 pcre_extra bump_extra
;
1918 init_local_extra(&bump_extra
, bump_pce
->extra
);
1919 count
= pcre_exec(bump_pce
->re
, &bump_extra
, subject
.data(),
1920 subject
.size(), start_offset
,
1921 utf8_check
, offsets
, size_offsets
);
1923 raise_warning("Unknown error");
1924 offsets
[0] = start_offset
;
1925 offsets
[1] = start_offset
+ 1;
1926 if (pcre_need_log_error(count
)) {
1927 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1928 pattern
.data(), pattern
.size(),
1929 subject
.data(), subject
.size(),
1931 limit
, flags
, start_offset
);
1935 offsets
[0] = start_offset
;
1936 offsets
[1] = start_offset
+ 1;
1941 if (pcre_need_log_error(count
)) {
1942 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1943 pattern
.data(), pattern
.size(),
1944 subject
.data(), subject
.size(),
1946 limit
, flags
, start_offset
, g_notempty
);
1948 pcre_handle_exec_error(count
);
1952 /* If we have matched an empty string, mimic what Perl's /g option does.
1953 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1954 the match again at the same point. If this fails (picked up above) we
1955 advance to the next character. */
1956 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1958 /* Advance to the position right after the last full match */
1959 start_offset
= offsets
[1];
1962 start_offset
= last_match
- subject
.data(); /* offset might have
1964 * but without further
1965 * successful matches */
1966 if (!no_empty
|| start_offset
< subject
.size()) {
1967 if (offset_capture
) {
1968 /* Add the last (match, offset) pair to the return value */
1969 add_offset_pair(return_value
,
1970 subject
.substr(start_offset
),
1971 start_offset
, nullptr);
1973 /* Add the last piece to the return value */
1975 (String(last_match
, subject
.data() + subject
.size() - last_match
,
1980 return return_value
;
1983 ///////////////////////////////////////////////////////////////////////////////
// Returns a copy of str in which characters with a special meaning in a
// regular expression are quoted.
//
// Visible steps:
//  - an empty str is detected up front (in_str == in_str_end); the value
//    returned in that case is not part of this listing;
//  - when delimiter is non-empty its first character becomes delim_char;
//    quote_delim is tested together with delim_char further down, but the
//    statement that sets it is not part of this listing;
//  - the output String is reserved with 4 * str.size() + 1 bytes so that it
//    cannot overflow even if every input character is quoted;
//  - the input is walked from in_str to in_str_end; the case labels list the
//    characters that are treated as special;
//  - the result length is fixed with setSize(q - out_str) and the String is
//    returned.
// The default noted in the signature comment is delimiter = null_string.
// NOTE(review): the switch statement, the bodies of its cases and the writes
// through q are not part of this listing; confirm the exact escaping there.
1985 String
preg_quote(const String
& str
,
1986 const String
& delimiter
/* = null_string */) {
1987 const char* in_str
= str
.data();
1988 const char* in_str_end
= in_str
+ str
.size();
1990 /* Nothing to do if we got an empty string */
1991 if (in_str
== in_str_end
) {
1995 char delim_char
= 0; /* Delimiter character to be quoted */
1996 bool quote_delim
= false; /* Whether to quote additional delim char */
1997 if (!delimiter
.empty()) {
1998 delim_char
= delimiter
.charAt(0);
2002 /* Allocate enough memory so that even if each character
2003 is quoted, we won't run out of room */
2004 String
ret(4 * str
.size() + 1, ReserveString
);
2005 char* out_str
= ret
.mutableData();
2007 /* Go through the string and quote necessary characters */
2010 for (p
= in_str
, q
= out_str
; p
!= in_str_end
; p
++) {
2013 case '.': case '\\': case '+': case '*': case '?':
2014 case '[': case '^': case ']': case '$': case '(':
2015 case ')': case '{': case '}': case '=': case '!':
2016 case '>': case '<': case '|': case ':': case '-':
2029 if (quote_delim
&& c
== delim_char
)
2037 return ret
.setSize(q
- out_str
);
// Returns the current value of tl_last_error_code, the variable that the
// matching, replacing and splitting routines in this file reset to
// PHP_PCRE_NO_ERROR before they run pcre_exec.
2040 int preg_last_error() {
2041 return tl_last_error_code
;
// Returns s_pcreCache.size(), i.e. whatever size the pattern cache object
// reports for itself.
2044 size_t preg_pcre_cache_size() {
2045 return s_pcreCache
.size();
2048 ///////////////////////////////////////////////////////////////////////////////
// Formats the error code err reported for the compiled expression re and
// emits it through raise_warning.
//
// Visible steps:
//  - regerror is asked (with REG_ITOA or-ed into err) for the size of a first
//    text, which is then fetched into buf;
//  - regerror is asked for the size of the plain message for err (len);
//  - message is allocated with buf_len + len + 2 bytes;
//  - buf followed by a colon and a space is formatted into message, and the
//    plain message is written behind it at message + buf_len;
//  - the combined text is passed to raise_warning.
// Allocation failures make the function return without reporting anything.
// NOTE(review): the declarations of len and buf_len, the conditions around
// these statements and any release of buf and message are not part of this
// listing.
2051 static void php_reg_eprint(int err
, regex_t
* re
) {
2052 char *buf
= nullptr, *message
= nullptr;
2057 /* get the length of the message */
2058 buf_len
= regerror(REG_ITOA
| err
, re
, nullptr, 0);
2060 buf
= (char *)req::malloc_noptrs(buf_len
);
2061 if (!buf
) return; /* fail silently */
2062 /* finally, get the error message */
2063 regerror(REG_ITOA
| err
, re
, buf
, buf_len
);
2068 len
= regerror(err
, re
, nullptr, 0);
2070 message
= (char *)req::malloc_noptrs(buf_len
+ len
+ 2);
// NOTE(review): this early return is reached after buf may already have been
// allocated above, and no release of buf is visible on this path - confirm.
2072 return; /* fail silently */
// NOTE(review): snprintf is given buf_len as its size limit. If buf_len is the
// size regerror reported for buf including the terminator (as POSIX regerror
// specifies), the two-character separator cannot fit and the output is cut
// right after the contents of buf; the text written at message + buf_len
// below would then sit behind that terminator and never be printed. A limit
// of buf_len + 2 looks intended - confirm.
2075 snprintf(message
, buf_len
, "%s: ", buf
);
2076 buf_len
+= 1; /* so pointer math below works */
2078 /* drop the message into place */
2079 regerror(err
, re
, message
+ buf_len
, len
);
2080 raise_warning("%s", message
);
// Splits str around matches of the POSIX regular expression spliton and
// returns the pieces as an Array (return_value).
//
// Visible steps:
//  - spliton is compiled with regcomp using REG_EXTENDED, plus REG_ICASE when
//    icase is set; a compile error is reported through php_reg_eprint;
//  - while count is -1 or greater than 1 and regexec finds a match in the
//    text starting at strp:
//      * a non-empty match at offset 0 appends an empty string and skips the
//        matched text;
//      * an empty match at offset 0 raises the warning about an invalid
//        regular expression;
//      * otherwise the text before the match is appended as a copy and strp
//        moves to the end of the match;
//  - a regexec result other than REG_NOMATCH is reported through
//    php_reg_eprint;
//  - finally the text from strp to endp is appended as the last element.
// NOTE(review): the remainder of the parameter list (icase is read below but
// its declaration is not shown), the declarations of re and subs, the
// decrement of count and the early exits are not part of this listing.
2086 Variant
php_split(const String
& spliton
, const String
& str
, int count
,
2088 const char* strp
= str
.data();
2089 const char* endp
= strp
+ str
.size();
2092 int copts
= icase
? REG_ICASE
: 0;
2093 int err
= regcomp(&re
, spliton
.data(), REG_EXTENDED
| copts
);
2095 php_reg_eprint(err
, &re
);
2099 Array return_value
= Array::Create();
2102 /* churn through str, generating array entries as we go */
2103 while ((count
== -1 || count
> 1) &&
2104 !(err
= regexec(&re
, strp
, 1, subs
, 0))) {
2105 if (subs
[0].rm_so
== 0 && subs
[0].rm_eo
) {
2106 /* match is at start of string, return empty string */
2107 return_value
.append("");
2108 /* skip ahead the length of the regex match */
2109 strp
+= subs
[0].rm_eo
;
2110 } else if (subs
[0].rm_so
== 0 && subs
[0].rm_eo
== 0) {
2111 /* No more matches */
2113 raise_warning("Invalid Regular Expression to split()");
2116 /* On a real match */
2118 /* make a copy of the substring */
2119 int size
= subs
[0].rm_so
;
2121 /* add it to the array */
2122 return_value
.append(String(strp
, size
, CopyString
));
2124 /* point at our new starting point */
2125 strp
= strp
+ subs
[0].rm_eo
;
2128 /* if we're only looking for a certain number of points,
2129 stop looking once we hit it */
2135 /* see if we encountered an error */
2136 if (err
&& err
!= REG_NOMATCH
) {
2137 php_reg_eprint(err
, &re
);
2142 /* otherwise we just have one last element to add to the array */
2143 int size
= endp
- strp
;
2144 return_value
.append(String(strp
, size
, CopyString
));
2147 return return_value
;
2150 ///////////////////////////////////////////////////////////////////////////////