2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
16 #include "hphp/runtime/base/string_util.h"
17 #include "hphp/runtime/base/util/request_local.h"
18 #include "hphp/util/lock.h"
19 #include "hphp/util/logger.h"
21 #include <onigposix.h>
22 #include "hphp/runtime/base/runtime_option.h"
23 #include "hphp/runtime/base/builtin_functions.h"
24 #include "hphp/runtime/base/zend/zend_functions.h"
25 #include "hphp/runtime/base/array/array_iterator.h"
26 #include "hphp/runtime/base/ini_setting.h"
27 #include "hphp/runtime/base/thread_init_fini.h"
28 #include "tbb/concurrent_hash_map.h"
// Flag bits / modes accepted by the preg_* functions defined in this file.
// Result ordering modes for global matching; preg_match_impl compares its
// subpats_order (flags & 0xff) against these two values.
30 #define PREG_PATTERN_ORDER 1
31 #define PREG_SET_ORDER 2
// When set in preg_match_impl's flags, matches are stored through
// add_offset_pair() as (string, offset) pairs.
32 #define PREG_OFFSET_CAPTURE (1<<8)
// Flags read by preg_split.
34 #define PREG_SPLIT_NO_EMPTY (1<<0)
35 #define PREG_SPLIT_DELIM_CAPTURE (1<<1)
36 #define PREG_SPLIT_OFFSET_CAPTURE (1<<2)
// Stored in pcre_cache_entry::preg_options when the pattern carries the
// 'e' modifier.
38 #define PREG_REPLACE_EVAL (1<<0)
// Read by preg_grep to select the entries that do NOT match.
40 #define PREG_GREP_INVERT (1<<0)
// NOTE(review): no use of this constant is visible in this part of the file.
42 #define PCRE_CACHE_SIZE 4096
// PHP-level preg error codes. These are the values written to the
// thread-local t_last_error_code: PHP_PCRE_NO_ERROR is stored before a
// matching loop starts, and pcre_handle_exec_error() maps pcre_exec()
// failure codes onto the remaining values.
// NOTE(review): the enclosing enum header/footer is not visible here.
45 PHP_PCRE_NO_ERROR
= 0,
46 PHP_PCRE_INTERNAL_ERROR
,
47 PHP_PCRE_BACKTRACK_LIMIT_ERROR
,
48 PHP_PCRE_RECURSION_LIMIT_ERROR
,
49 PHP_PCRE_BAD_UTF8_ERROR
,
50 PHP_PCRE_BAD_UTF8_OFFSET_ERROR
54 ///////////////////////////////////////////////////////////////////////////////
55 // regex cache and helpers
// One cached, compiled regular expression. Elsewhere in this file the
// entry's `re`, `extra`, `preg_options` and `compile_options` members are
// read and written; only `extra` is declared in the lines visible here.
// The copy constructor and copy assignment are declared right after the
// class head (i.e. with the class's default private access) and no
// definition is visible -- presumably to forbid copying; confirm.
57 class pcre_cache_entry
{
58 pcre_cache_entry(const pcre_cache_entry
&);
59 pcre_cache_entry
& operator=(const pcre_cache_entry
&);
// The study data is released with plain free(), see the original remark.
// NOTE(review): the enclosing function (presumably the destructor) is not
// visible here.
64 if (extra
) free(extra
); // we don't have pcre_free_study yet
69 pcre_extra
*extra
; // Holds results of studying
// Process-wide regex cache: a tbb::concurrent_hash_map from the regex
// string (const StringData*, hashed/compared via StringDataHashCompare) to
// its compiled pcre_cache_entry. It is accessed through
// lookup_cached_pcre() and insert_cached_pcre().
74 typedef tbb::concurrent_hash_map
<const StringData
*,const pcre_cache_entry
*,
75 StringDataHashCompare
> PCREStringMap
;
77 static PCREStringMap s_pcreCacheMap
;
// Looks `regex` up in s_pcreCacheMap using a read-only (const_accessor)
// handle. The caller, pcre_get_compiled_regex_cache(), treats a non-null
// result as a cache hit.
// NOTE(review): the return statements are not visible in this extract.
79 static const pcre_cache_entry
* lookup_cached_pcre(CStrRef regex
) {
80 PCREStringMap::const_accessor acc
;
81 if (s_pcreCacheMap
.find(acc
, regex
.get())) {
// Inserts `ent` into s_pcreCacheMap. The key is the static string obtained
// from StringData::GetStaticString(regex.get()), not the caller's
// StringData itself. concurrent_hash_map::insert() returns true only when
// a new key was created.
// NOTE(review): the bodies of both outcomes (new key / key already present)
// and the returned pointer are not visible here; confirm who owns `ent`
// when another thread inserted the same regex first.
87 static const pcre_cache_entry
*
88 insert_cached_pcre(CStrRef regex
, const pcre_cache_entry
* ent
) {
89 PCREStringMap::accessor acc
;
90 if (s_pcreCacheMap
.insert(acc
, StringData::GetStaticString(regex
.get()))) {
99 * When a cached compiled pcre doesn't have pcre_extra, we use this
102 * FIXME: It's unclear why this needs to be thread-local data instead
103 * of just existing on the stack during the calls to preg_ functions.
// Per-thread scratch pcre_extra. set_extra_limits() fills in its flags when
// the cached entry has no pcre_extra of its own.
105 static __thread pcre_extra t_extra_data
;
107 // The last pcre error code is available for the whole thread.
// Set to PHP_PCRE_NO_ERROR before matching starts and updated by
// pcre_handle_exec_error().
108 static __thread
int t_last_error_code
;
// Binds the two PCRE limit ini settings to fields of g_context:
//   pcre.backtrack_limit (default "1000000") -> m_preg_backtrace_limit
//   pcre.recursion_limit (default "100000")  -> m_preg_recursion_limit
// Both use ini_on_update_long as the update callback. The InitFiniNode
// object declared after the function registers it with
// InitFiniNode::ThreadInit.
112 static void preg_init_thread_locals() {
113 IniSetting::Bind("pcre.backtrack_limit", "1000000", ini_on_update_long
,
114 &g_context
->m_preg_backtrace_limit
);
115 IniSetting::Bind("pcre.recursion_limit", "100000", ini_on_update_long
,
116 &g_context
->m_preg_recursion_limit
);
118 InitFiniNode
init(preg_init_thread_locals
, InitFiniNode::ThreadInit
);
// Non-copyable holder for a raw allocation. It releases the stored pointer
// with smart_free() when useSmartFree is true and with free() otherwise.
// NOTE(review): the function containing the release expression is not
// visible here -- presumably the destructor, given how the helper is
// declared next to each allocation in this file.
// SmartFreeHelper is the smart_free() flavour; it is used for the buffers
// obtained from smart_malloc() (offset arrays, subpattern name table).
120 template<bool useSmartFree
= false>
121 struct FreeHelperImpl
: private boost::noncopyable
{
122 explicit FreeHelperImpl(void* p
) : p(p
) {}
124 useSmartFree
? smart_free(p
) : free(p
);
131 typedef FreeHelperImpl
<true> SmartFreeHelper
;
// Returns the cache entry for `regex`, compiling and caching it on a miss.
// Visible stages:
//   1. cache lookup via lookup_cached_pcre();
//   2. skip leading whitespace, warn on an empty expression;
//   3. validate the delimiter (no alphanumerics, no backslash);
//   4. locate the closing delimiter, either the same character or the
//      matching bracket with nesting tracked in `brackets`;
//   5. copy the pattern text between the delimiters;
//   6. translate modifier letters into PCRE compile options (coptions),
//      the study request (do_study) and preg options (poptions);
//   7. reject patterns containing an embedded null byte;
//   8. pcre_compile(), optionally pcre_study();
//   9. build a pcre_cache_entry and hand it to insert_cached_pcre().
// Callers in this file treat a nullptr result as failure.
// NOTE(review): the early-return statements on the warning paths are not
// visible in this extract.
134 static const pcre_cache_entry
* pcre_get_compiled_regex_cache(CStrRef regex
) {
135 /* Try to lookup the cached regex entry, and if successful, just pass
136 back the compiled pattern, otherwise go on and compile it. */
137 if (const pcre_cache_entry
* pce
= lookup_cached_pcre(regex
)) {
141 /* Parse through the leading whitespace, and display a warning if we
142 get to the end without encountering a delimiter. */
143 const char *p
= regex
.data();
144 while (isspace((int)*(unsigned char *)p
)) p
++;
146 raise_warning("Empty regular expression");
150 /* Get the delimiter and display a warning if it is alphanumeric
152 char delimiter
= *p
++;
153 if (isalnum((int)*(unsigned char *)&delimiter
) || delimiter
== '\\') {
154 raise_warning("Delimiter must not be alphanumeric or backslash");
158 char start_delimiter
= delimiter
;
159 const char *pp
= strchr("([{< )]}> )]}>", delimiter
);
163 char end_delimiter
= delimiter
;
165 if (start_delimiter
== end_delimiter
) {
166 /* We need to iterate through the pattern, searching for the ending
167 * delimiter, but skipping the backslashed delimiters. If the ending
168 * delimiter is not found, display a warning. */
171 if (*pp
== '\\' && pp
[1] != 0) pp
++;
172 else if (*pp
== delimiter
)
177 raise_warning("No ending delimiter '%c' found: [%s]", delimiter
,
182 /* We iterate through the pattern, searching for the matching ending
183 * delimiter. For each matching starting delimiter, we increment nesting
184 * level, and decrement it for each matching ending delimiter. If we
185 * reach the end of the pattern without matching, display a warning.
187 int brackets
= 1; // brackets nesting level
190 if (*pp
== '\\' && pp
[1] != 0) pp
++;
191 else if (*pp
== end_delimiter
&& --brackets
<= 0)
193 else if (*pp
== start_delimiter
)
198 raise_warning("No ending matching delimiter '%c' found: [%s]",
199 end_delimiter
, regex
.data());
204 /* Make a copy of the actual pattern. */
// [p, pp) is the text between the delimiters; CopyString makes spattern
// own its bytes, so `pattern` stays valid while spattern is alive.
205 String
spattern(p
, pp
-p
, CopyString
);
206 const char *pattern
= spattern
.data();
208 /* Move on to the options */
211 /* Parse through the options, setting appropriate flags. Display
212 a warning if we encounter an unknown modifier. */
215 int do_study
= false;
218 /* Perl compatible options */
219 case 'i': coptions
|= PCRE_CASELESS
; break;
220 case 'm': coptions
|= PCRE_MULTILINE
; break;
221 case 's': coptions
|= PCRE_DOTALL
; break;
222 case 'x': coptions
|= PCRE_EXTENDED
; break;
224 /* PCRE specific options */
225 case 'A': coptions
|= PCRE_ANCHORED
; break;
226 case 'D': coptions
|= PCRE_DOLLAR_ENDONLY
; break;
227 case 'S': do_study
= true; break;
228 case 'U': coptions
|= PCRE_UNGREEDY
; break;
229 case 'X': coptions
|= PCRE_EXTRA
; break;
230 case 'u': coptions
|= PCRE_UTF8
; break;
232 /* Custom preg options */
233 case 'e': poptions
|= PREG_REPLACE_EVAL
; break;
240 raise_warning("Unknown modifier '%c': [%s]", pp
[-1], regex
.data());
245 /* We've reached a null byte, now check if we're actually at the end of the
246 string. If not this is a bad expression, and a potential security hole. */
247 if (regex
.length() != (pp
- regex
.data())) {
248 raise_error("Error: Null byte found in pattern");
251 /* Compile pattern and display a warning if compilation failed. */
254 pcre
*re
= pcre_compile(pattern
, coptions
, &error
, &erroffset
, 0);
256 raise_warning("Compilation failed: %s at offset %d", error
, erroffset
);
259 // Careful: from here 're' needs to be freed if something throws.
261 /* If study option was specified, study the pattern and
262 store the result in extra for passing to pcre_exec. */
263 pcre_extra
*extra
= nullptr;
266 extra
= pcre_study(re
, soptions
, &error
);
// Enable the match/recursion limit fields; set_extra_limits() fills in the
// actual limit values before each use.
268 extra
->flags
|= PCRE_EXTRA_MATCH_LIMIT
|
269 PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
271 if (error
!= nullptr) {
273 raise_warning("Error while studying pattern");
281 /* Store the compiled pattern and extra info in the cache. */
// NOTE(review): the entry is allocated with a raw `new`; what happens to it
// if insert_cached_pcre() finds the key already present is not visible here.
282 pcre_cache_entry
*new_entry
= new pcre_cache_entry();
284 new_entry
->extra
= extra
;
285 new_entry
->preg_options
= poptions
;
286 new_entry
->compile_options
= coptions
;
287 return insert_cached_pcre(regex
, new_entry
);
// Makes sure `extra` carries the current match limits before pcre_exec().
// `extra` is taken by reference. When it is null, the thread-local
// t_extra_data gets its flags set to
// PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION.
// NOTE(review): the statement that points `extra` at t_extra_data is not
// visible here; the dereferences at the end rely on it.
// Finally match_limit and match_limit_recursion are copied from
// g_context->m_preg_backtrace_limit / m_preg_recursion_limit.
290 static void set_extra_limits(pcre_extra
*& extra
) {
291 if (extra
== nullptr) {
292 pcre_extra
& extra_data
= t_extra_data
;
293 extra_data
.flags
= PCRE_EXTRA_MATCH_LIMIT
|
294 PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
297 extra
->match_limit
= g_context
->m_preg_backtrace_limit
;
298 extra
->match_limit_recursion
= g_context
->m_preg_recursion_limit
;
// Allocates the offsets vector passed to pcre_exec() for `pce`.
// It queries the capture count with pcre_fullinfo(PCRE_INFO_CAPTURECOUNT),
// stores num_subpats * 3 in size_offsets and returns a smart_malloc()ed
// int array of that many elements. Callers in this file wrap the result in
// a SmartFreeHelper and treat nullptr as failure.
// NOTE(review): the lines between the pcre_fullinfo() call and the size
// computation (error return, any adjustment of num_subpats) are not
// visible here.
301 static int *create_offset_array(const pcre_cache_entry
*pce
,
303 pcre_extra
*extra
= pce
->extra
;
304 set_extra_limits(extra
);
306 /* Calculate the size of the offsets array, and allocate memory for it. */
307 int num_subpats
; // Number of captured subpatterns
308 int rc
= pcre_fullinfo(pce
->re
, extra
, PCRE_INFO_CAPTURECOUNT
, &num_subpats
);
310 raise_warning("Internal pcre_fullinfo() error %d", rc
);
314 size_offsets
= num_subpats
* 3;
315 return (int *)smart_malloc(size_offsets
* sizeof(int));
// Convenience wrapper around pcre_get_compiled_regex_cache().
// Returns the compiled pcre for `regex`, or nullptr when no cache entry
// could be obtained. *extra receives the entry's pcre_extra (nullptr on
// failure) and *preg_options its preg options (0 on failure).
// NOTE(review): the guards around the two out-parameter writes are not
// visible here.
318 static pcre
* pcre_get_compiled_regex(CStrRef regex
, pcre_extra
**extra
,
320 const pcre_cache_entry
* pce
= pcre_get_compiled_regex_cache(regex
);
322 *extra
= pce
? pce
->extra
: nullptr;
325 *preg_options
= pce
? pce
->preg_options
: 0;
327 return pce
? pce
->re
: nullptr;
// Builds a two-element pair [str, offset] and stores it in `result`:
// under key `name` in one branch, appended in the other. Callers pass
// nullptr as the name when the match has no subpattern name.
// NOTE(review): the condition selecting set() vs append() is not visible
// here -- presumably a null check on `name`.
330 static inline void add_offset_pair(Variant
&result
, CStrRef str
, int offset
,
333 match_pair
.append(str
);
334 match_pair
.append(offset
);
337 result
.set(name
, match_pair
);
339 result
.append(match_pair
);
// True when the failure should be written to the log: the runtime option
// EnablePregErrorLog is on and pcre_code is PCRE_ERROR_MATCHLIMIT or
// PCRE_ERROR_RECURSIONLIMIT.
342 static inline bool pcre_need_log_error(int pcre_code
) {
343 return RuntimeOption::EnablePregErrorLog
&&
344 (pcre_code
== PCRE_ERROR_MATCHLIMIT
||
345 pcre_code
== PCRE_ERROR_RECURSIONLIMIT
);
// Emits a "REGEXERR" log record for a failed pcre_exec() call.
//   func, line     - call site (callers pass __FUNCTION__ / __LINE__)
//   pcre_code      - the pcre_exec() error code
//   pattern/subject/repl (+ sizes) - raw buffers, copied into std::string
//                    by explicit length and escaped with
//                    Logger::EscapeString()
//   arg1..arg4     - caller-specific integers printed as "extra=(...)"
// The record also contains the current g_context backtrack and recursion
// limits. The three escaped buffers are released with free() at the end.
// NOTE(review): the logging call that consumes the format string is not
// visible in this extract.
348 static void pcre_log_error(const char *func
, int line
, int pcre_code
,
349 const char *pattern
, int pattern_size
,
350 const char *subject
, int subject_size
,
351 const char *repl
, int repl_size
,
352 int arg1
= 0, int arg2
= 0,
353 int arg3
= 0, int arg4
= 0) {
354 const char *escapedPattern
;
355 const char *escapedSubject
;
356 const char *escapedRepl
;
357 string
p(pattern
, pattern_size
);
358 string
s(subject
, subject_size
);
359 string
r(repl
, repl_size
);
360 escapedPattern
= Logger::EscapeString(p
);
361 escapedSubject
= Logger::EscapeString(s
);
362 escapedRepl
= Logger::EscapeString(r
);
363 const char *errString
=
364 (pcre_code
== PCRE_ERROR_MATCHLIMIT
) ? "PCRE_ERROR_MATCHLIMIT" :
365 (pcre_code
== PCRE_ERROR_RECURSIONLIMIT
) ? "PCRE_ERROR_RECURSIONLIMIT" :
368 "REGEXERR: %s/%d: err=%d(%s), pattern='%s', subject='%s', repl='%s', "
369 "limits=(%ld, %ld), extra=(%d, %d, %d, %d)",
370 func
, line
, pcre_code
, errString
,
371 escapedPattern
, escapedSubject
, escapedRepl
,
372 g_context
->m_preg_backtrace_limit
, g_context
->m_preg_recursion_limit
,
373 arg1
, arg2
, arg3
, arg4
);
374 free((void *)escapedPattern
);
375 free((void *)escapedSubject
);
376 free((void *)escapedRepl
);
// Translates a pcre_exec() error code into the PHP-level code and stores
// it in the thread-local t_last_error_code:
//   PCRE_ERROR_MATCHLIMIT      -> PHP_PCRE_BACKTRACK_LIMIT_ERROR
//   PCRE_ERROR_RECURSIONLIMIT  -> PHP_PCRE_RECURSION_LIMIT_ERROR
//   PCRE_ERROR_BADUTF8         -> PHP_PCRE_BAD_UTF8_ERROR
//   PCRE_ERROR_BADUTF8_OFFSET  -> PHP_PCRE_BAD_UTF8_OFFSET_ERROR
//   (remaining branch)         -> PHP_PCRE_INTERNAL_ERROR
379 static void pcre_handle_exec_error(int pcre_code
) {
382 case PCRE_ERROR_MATCHLIMIT
:
383 preg_code
= PHP_PCRE_BACKTRACK_LIMIT_ERROR
;
385 case PCRE_ERROR_RECURSIONLIMIT
:
386 preg_code
= PHP_PCRE_RECURSION_LIMIT_ERROR
;
388 case PCRE_ERROR_BADUTF8
:
389 preg_code
= PHP_PCRE_BAD_UTF8_ERROR
;
391 case PCRE_ERROR_BADUTF8_OFFSET
:
392 preg_code
= PHP_PCRE_BAD_UTF8_OFFSET_ERROR
;
395 preg_code
= PHP_PCRE_INTERNAL_ERROR
;
398 t_last_error_code
= preg_code
;
401 ///////////////////////////////////////////////////////////////////////////////
// Filters `input` by `pattern`.
// Every element is converted to a string and run through pcre_exec(). An
// element is copied into the result under its original key when it matches
// (count > 0) and PREG_GREP_INVERT is not set, or when it does not match
// (PCRE_ERROR_NOMATCH) and PREG_GREP_INVERT is set. Any other negative
// pcre_exec() result is optionally logged and recorded through
// pcre_handle_exec_error().
// NOTE(review): the values returned on the failure paths and at the end
// are not visible in this extract.
403 Variant
preg_grep(CStrRef pattern
, CArrRef input
, int flags
/* = 0 */) {
404 const pcre_cache_entry
* pce
= pcre_get_compiled_regex_cache(pattern
);
405 if (pce
== nullptr) {
409 int size_offsets
= 0;
410 int *offsets
= create_offset_array(pce
, size_offsets
);
411 if (offsets
== nullptr) {
414 SmartFreeHelper
freer(offsets
);
416 /* Initialize return array */
417 Array ret
= Array::Create();
418 t_last_error_code
= PHP_PCRE_NO_ERROR
;
420 /* Go through the input array */
421 bool invert
= (flags
& PREG_GREP_INVERT
);
422 pcre_extra
*extra
= pce
->extra
;
423 set_extra_limits(extra
);
425 for (ArrayIter
iter(input
); iter
; ++iter
) {
426 String entry
= iter
.second().toString();
428 /* Perform the match */
429 int count
= pcre_exec(pce
->re
, extra
, entry
.data(), entry
.size(),
430 0, 0, offsets
, size_offsets
);
432 /* Check for too many substrings condition. */
434 raise_warning("Matched, but too many substrings");
435 count
= size_offsets
/ 3;
436 } else if (count
< 0 && count
!= PCRE_ERROR_NOMATCH
) {
437 if (pcre_need_log_error(count
)) {
438 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
439 pattern
.data(), pattern
.size(),
440 entry
.data(), entry
.size(),
444 pcre_handle_exec_error(count
);
448 /* If the entry fits our requirements */
449 if ((count
> 0 && !invert
) ||
450 (count
== PCRE_ERROR_NOMATCH
&& invert
)) {
452 /* Add to return array */
453 ret
.set(iter
.first(), entry
);
460 ///////////////////////////////////////////////////////////////////////////////
// Shared implementation behind preg_match() and preg_match_all(); the
// public wrappers pass false / true as the last argument (`global`).
//   pattern, subject - regex and the string to search
//   subpats          - optional output array (nullptr when the caller does
//                      not want the matches)
//   flags            - PREG_OFFSET_CAPTURE bit plus, in the low byte, the
//                      ordering mode (PREG_PATTERN_ORDER / PREG_SET_ORDER)
//   start_offset     - where to start; a negative value counts from the
//                      end of the subject
// Visible stages: fetch the compiled regex, validate flags, normalise the
// start offset, allocate the offsets vector and the subpattern-name table,
// then run pcre_exec() repeatedly, filling *subpats according to the
// ordering mode, and finally (pattern order) move match_sets into *subpats.
// NOTE(review): loop headers, return statements and several conditions are
// missing from this extract.
462 static Variant
preg_match_impl(CStrRef pattern
, CStrRef subject
,
463 Variant
*subpats
, int flags
, int start_offset
,
465 const pcre_cache_entry
* pce
= pcre_get_compiled_regex_cache(pattern
);
466 if (pce
== nullptr) {
470 pcre_extra
*extra
= pce
->extra
;
471 set_extra_limits(extra
);
473 *subpats
= Array::Create();
476 int subpats_order
= global
? PREG_PATTERN_ORDER
: 0;
477 bool offset_capture
= false;
479 offset_capture
= flags
& PREG_OFFSET_CAPTURE
;
482 * subpats_order is pre-set to pattern mode so we change it only if
486 subpats_order
= flags
& 0xff;
488 if ((global
&& (subpats_order
< PREG_PATTERN_ORDER
||
489 subpats_order
> PREG_SET_ORDER
)) ||
490 (!global
&& subpats_order
!= 0)) {
491 raise_warning("Invalid flags specified");
496 /* Negative offset counts from the end of the string. */
497 if (start_offset
< 0) {
498 start_offset
= subject
.size() + start_offset
;
499 if (start_offset
< 0) {
504 int size_offsets
= 0;
505 int *offsets
= create_offset_array(pce
, size_offsets
);
506 SmartFreeHelper
offsetsFreer(offsets
);
507 int num_subpats
= size_offsets
/ 3;
508 if (offsets
== nullptr) {
513 * Build a mapping from subpattern numbers to their names. We will always
514 * allocate the table, even though there may be no named subpatterns. This
515 * avoids somewhat more complicated logic in the inner loops.
517 char **subpat_names
= (char **)smart_malloc(num_subpats
* sizeof(char *));
518 SmartFreeHelper
subpatFreer(subpat_names
);
519 memset(subpat_names
, 0, sizeof(char *) * num_subpats
);
521 int name_cnt
= 0, name_size
, ni
= 0;
523 unsigned short name_idx
;
525 int rc
= pcre_fullinfo(pce
->re
, extra
, PCRE_INFO_NAMECOUNT
, &name_cnt
);
527 raise_warning("Internal pcre_fullinfo() error %d", rc
);
532 rc1
= pcre_fullinfo(pce
->re
, extra
, PCRE_INFO_NAMETABLE
, &name_table
);
533 rc2
= pcre_fullinfo(pce
->re
, extra
, PCRE_INFO_NAMEENTRYSIZE
, &name_size
);
534 rc
= rc2
? rc2
: rc1
;
536 raise_warning("Internal pcre_fullinfo() error %d", rc
);
540 while (ni
++ < name_cnt
) {
// Each name-table entry starts with the group number in two bytes, most
// significant first, followed by the zero-terminated name.
// NOTE(review): the multiplier 0xff (255) looks like it should be 0x100
// (256); with 0xff the index is wrong for group numbers above 255 --
// confirm against the PCRE name-table format.
541 name_idx
= 0xff * (unsigned char)name_table
[0] +
542 (unsigned char)name_table
[1];
543 subpat_names
[name_idx
] = name_table
+ 2;
544 if (is_numeric_string(subpat_names
[name_idx
],
545 strlen(subpat_names
[name_idx
]),
546 nullptr, nullptr, 0) != KindOfNull
) {
547 raise_warning("Numeric named subpatterns are not allowed");
550 name_table
+= name_size
;
555 /* Allocate match sets array and initialize the values. */
556 Array match_sets
; /* An array of sets of matches for each
557 subpattern after a global match */
558 if (global
&& subpats_order
== PREG_PATTERN_ORDER
) {
559 for (int i
= 0; i
< num_subpats
; i
++) {
560 match_sets
.set(i
, Array::Create());
565 t_last_error_code
= PHP_PCRE_NO_ERROR
;
567 Variant result_set
; // Holds a set of subpatterns after a global match
568 int g_notempty
= 0; // If the match should not be empty
569 const char **stringlist
; // Holds list of subpatterns
572 /* Execute the regular expression. */
573 int count
= pcre_exec(pce
->re
, extra
, subject
.data(), subject
.size(),
574 start_offset
, g_notempty
, offsets
, size_offsets
);
576 /* Check for too many substrings condition. */
578 raise_warning("Matched, but too many substrings");
579 count
= size_offsets
/ 3;
582 /* If something has matched */
// NOTE(review): this `continue` bypasses the g_notempty / start_offset
// updates further down; confirm that the enclosing loop (header not
// visible here) cannot spin on the same offset when global matching is
// requested without a subpats array.
586 if (!subpats
) continue;
588 // Try to get the list of substrings and display a warning if failed.
589 if (pcre_get_substring_list(subject
.data(), offsets
, count
,
591 raise_warning("Get subpatterns list failed");
595 if (global
) { /* global pattern matching */
596 if (subpats_order
== PREG_PATTERN_ORDER
) {
597 /* For each subpattern, insert it into the appropriate array. */
598 for (i
= 0; i
< count
; i
++) {
599 if (offset_capture
) {
600 add_offset_pair(match_sets
.lvalAt(i
),
601 String(stringlist
[i
],
602 offsets
[(i
<<1)+1] - offsets
[i
<<1],
604 offsets
[i
<<1], nullptr);
606 match_sets
.lvalAt(i
).append
607 (String(stringlist
[i
],
608 offsets
[(i
<<1)+1] - offsets
[i
<<1], CopyString
));
612 * If the number of captured subpatterns on this run is
613 * less than the total possible number, pad the result
614 * arrays with empty strings.
616 if (count
< num_subpats
) {
617 for (; i
< num_subpats
; i
++) {
618 match_sets
.lvalAt(i
).append("");
622 result_set
= Array::Create();
624 /* Add all the subpatterns to it */
625 for (i
= 0; i
< count
; i
++) {
626 if (offset_capture
) {
627 add_offset_pair(result_set
,
628 String(stringlist
[i
],
629 offsets
[(i
<<1)+1] - offsets
[i
<<1],
631 offsets
[i
<<1], subpat_names
[i
]);
633 String
value(stringlist
[i
], offsets
[(i
<<1)+1] - offsets
[i
<<1],
635 if (subpat_names
[i
]) {
636 result_set
.set(subpat_names
[i
], value
);
638 result_set
.append(value
);
641 /* And add it to the output array */
642 subpats
->append(result_set
);
644 } else { /* single pattern matching */
645 /* For each subpattern, insert it into the subpatterns array. */
646 for (i
= 0; i
< count
; i
++) {
647 if (offset_capture
) {
648 add_offset_pair(*subpats
,
649 String(stringlist
[i
],
650 offsets
[(i
<<1)+1] - offsets
[i
<<1],
652 offsets
[i
<<1], subpat_names
[i
]);
654 String
value(stringlist
[i
], offsets
[(i
<<1)+1] - offsets
[i
<<1],
656 if (subpat_names
[i
]) {
657 subpats
->set(subpat_names
[i
], value
);
659 subpats
->append(value
);
664 pcre_free((void *) stringlist
);
665 } else if (count
== PCRE_ERROR_NOMATCH
) {
666 /* If we previously set PCRE_NOTEMPTY after a null match,
667 this is not necessarily the end. We need to advance
668 the start offset, and continue. Fudge the offset values
669 to achieve this, unless we're already at the end of the string. */
670 if (g_notempty
&& start_offset
< subject
.size()) {
671 offsets
[0] = start_offset
;
672 offsets
[1] = start_offset
+ 1;
676 if (pcre_need_log_error(count
)) {
677 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
678 pattern
.data(), pattern
.size(),
679 subject
.data(), subject
.size(),
681 flags
, start_offset
, g_notempty
, global
);
683 pcre_handle_exec_error(count
);
687 /* If we have matched an empty string, mimic what Perl's /g options does.
688 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
689 the match again at the same point. If this fails (picked up above) we
690 advance to the next character. */
691 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
693 /* Advance to the position right after the last full match */
694 start_offset
= offsets
[1];
697 /* Add the match sets to the output array and clean up */
698 if (subpats
&& global
&& subpats_order
== PREG_PATTERN_ORDER
) {
699 for (i
= 0; i
< num_subpats
; i
++) {
700 if (subpat_names
[i
]) {
701 subpats
->set(subpat_names
[i
], match_sets
[i
]);
703 subpats
->append(match_sets
[i
]);
// Single (non-global) match that also fills `matches`; forwards to
// preg_match_impl() with global == false.
710 Variant
preg_match(CStrRef pattern
, CStrRef subject
,
711 Variant
&matches
, int flags
/* = 0 */,
712 int offset
/* = 0 */) {
713 return preg_match_impl(pattern
, subject
, &matches
, flags
, offset
, false);
// Single (non-global) match without an output array; forwards to
// preg_match_impl() with a null subpats pointer and global == false.
715 Variant
preg_match(CStrRef pattern
, CStrRef subject
, int flags
/* = 0 */,
716 int offset
/* = 0 */) {
717 return preg_match_impl(pattern
, subject
, nullptr, flags
, offset
, false);
// Global match that also fills `matches`; forwards to preg_match_impl()
// with global == true.
720 Variant
preg_match_all(CStrRef pattern
, CStrRef subject
, Variant
&matches
,
721 int flags
/* = 0 */, int offset
/* = 0 */) {
722 return preg_match_impl(pattern
, subject
, &matches
, flags
, offset
, true);
// Global match without an output array; forwards to preg_match_impl()
// with a null subpats pointer and global == true.
724 Variant
preg_match_all(CStrRef pattern
, CStrRef subject
,
725 int flags
/* = 0 */, int offset
/* = 0 */) {
726 return preg_match_impl(pattern
, subject
, nullptr, flags
, offset
, true);
729 ///////////////////////////////////////////////////////////////////////////////
// Invokes the user callback for one match.
// Builds an array holding the `count` captured substrings of `subject`
// (taken from the pcre_exec() offsets vector: offsets[2*i] is the start
// and offsets[2*i+1] the end of capture i), passes it as argument 0 to
// `function` through vm_call_user_func() and returns the call's result
// converted to String.
731 static String
preg_do_repl_func(CVarRef function
, CStrRef subject
,
732 int *offsets
, int count
) {
733 Array subpats
= Array::Create();
734 for (int i
= 0; i
< count
; i
++) {
735 subpats
.append(subject
.substr(offsets
[i
<<1],
736 offsets
[(i
<<1)+1] - offsets
[i
<<1]));
740 args
.set(0, subpats
);
741 return vm_call_user_func(function
, args
);
// Parses a back-reference at *str in a replacement string.
// Visible behaviour: recognises an optional "${" form, reads one decimal
// digit into *backref and, when a second digit follows, extends it to a
// two-digit number; in the brace form the next character is then checked
// against '}'. php_pcre_replace() uses the boolean result to decide
// whether a reference was consumed.
// NOTE(review): the pointer advances, the write-back to *str and the
// return statements are not visible in this extract.
744 static bool preg_get_backref(const char **str
, int *backref
) {
746 const char *walk
= *str
;
752 if (*walk
== '$' && walk
[1] == '{') {
758 if (*walk
>= '0' && *walk
<= '9') {
759 *backref
= *walk
- '0';
765 if (*walk
&& *walk
>= '0' && *walk
<= '9') {
766 *backref
= *backref
* 10 + *walk
- '0';
771 if (*walk
== 0 || *walk
!= '}') {
// Performs the replacement of `pattern` in a single `subject` string.
//   replace_var   - replacement string, or the callback when `callable`
//   limit         - maximum number of replacements; -1 means no limit
//   replace_count - replacement counter owned by the caller
// Visible stages:
//   * fetch the compiled regex and allocate the offsets vector;
//   * for patterns compiled with the /e modifier, require the replacement
//     to look like f("<replacement string>") and split it into the
//     function name (eval_fn) and the quoted argument text;
//   * allocate the result buffer with malloc (2 * subject size + 1) and
//     grow it with realloc as needed;
//   * for every match, first measure the replacement (new_len), then copy
//     the text before the match and the replacement with back-references
//     expanded, or the callback's result;
//   * on PCRE_ERROR_NOMATCH (or limit == 0) copy the tail of the subject
//     and terminate the buffer; other errors go through
//     pcre_handle_exec_error().
// The finished buffer is handed to the returned String with AttachString.
// NOTE(review): many lines (loop header, several conditions, early
// returns, cleanup on error) are missing from this extract.
781 static String
php_pcre_replace(CStrRef pattern
, CStrRef subject
,
782 CVarRef replace_var
, bool callable
,
783 int limit
, int *replace_count
) {
784 const pcre_cache_entry
* pce
= pcre_get_compiled_regex_cache(pattern
);
785 if (pce
== nullptr) {
789 if (pce
->preg_options
& PREG_REPLACE_EVAL
) {
791 throw NotSupportedException("preg_replace",
792 "Modifier /e cannot be used with replacement "
798 int *offsets
= create_offset_array(pce
, size_offsets
);
799 SmartFreeHelper
offsetsFreer(offsets
);
800 if (offsets
== nullptr) {
804 const char *replace
= nullptr;
805 const char *replace_end
= nullptr;
811 replace_val
= replace_var
.toString();
814 int pidx
= replace_val
.find('(');
815 const char *rd
= replace_val
.data();
816 int rs
= replace_val
.size();
818 if (!(rs
>= 5 && pidx
>= 0 && rd
[pidx
+1] == '"' &&
819 ((rd
[rs
-2] == '"' && rd
[rs
-1] == ')') ||
820 (rd
[rs
-3] == '"' && rd
[rs
-2] == ')' && rd
[rs
-1] == ';')))) {
821 throw NotSupportedException("preg_replace",
822 "Modifier /e must be used with the form "
823 "f(\"<replacement string>\") or "
824 "f(\"<replacement string>\");");
826 eval_fn
= replace_val
.substr(0, pidx
);
827 replace_val
= replace_val
.substr(pidx
+1, rs
- (pidx
+1) - 1);
829 replace
= replace_val
.data();
830 replace_len
= replace_val
.size();
831 replace_end
= replace
+ replace_len
;
834 int alloc_len
= 2 * subject
.size() + 1;
835 char *result
= (char *)malloc(alloc_len
);
840 const char *match
= nullptr;
841 int start_offset
= 0;
842 t_last_error_code
= PHP_PCRE_NO_ERROR
;
843 pcre_extra
*extra
= pce
->extra
;
844 set_extra_limits(extra
);
847 int new_len
; // Length of needed storage
848 const char *walk
; // Used to walk the replacement string
849 char walk_last
; // Last walked character
850 char *walkbuf
; // Location of current replacement in the result
851 int match_len
; // Length of the current match
852 int backref
; // Backreference number
853 int g_notempty
= 0; // If the match should not be empty
855 /* Execute the regular expression. */
856 int count
= pcre_exec(pce
->re
, extra
, subject
.data(), subject
.size(),
857 start_offset
, g_notempty
, offsets
, size_offsets
);
859 /* Check for too many substrings condition. */
861 raise_warning("Matched, but too many substrings");
862 count
= size_offsets
/ 3;
865 const char *piece
= subject
.data() + start_offset
;
866 if (count
> 0 && (limit
== -1 || limit
> 0)) {
870 /* Set the match location in subject */
871 match
= subject
.data() + offsets
[0];
872 new_len
= result_len
+ offsets
[0] - start_offset
; //part before the match
874 /* If evaluating, do it and add the return string's length */
877 /* Use custom function to get replacement string and its length. */
878 eval_result
= preg_do_repl_func(replace_var
, subject
, offsets
, count
);
879 new_len
+= eval_result
.size();
880 } else { /* do regular substitution */
883 while (walk
< replace_end
) {
884 if ('\\' == *walk
|| '$' == *walk
) {
885 if (walk_last
== '\\') {
890 if (preg_get_backref(&walk
, &backref
)) {
891 if (backref
< count
) {
892 new_len
+= offsets
[(backref
<<1)+1] - offsets
[backref
<<1];
899 walk_last
= walk
[-1];
// NOTE(review): the realloc() result is assigned straight back to `result`
// and no null check is visible; on failure the old buffer would leak and
// the copies below would write through a null pointer.
903 if (new_len
+ 1 > alloc_len
) {
904 alloc_len
= 1 + alloc_len
+ 2 * new_len
;
905 result
= (char *)realloc(result
, alloc_len
);
907 /* copy the part of the string before the match */
908 memcpy(&result
[result_len
], piece
, match
-piece
);
909 result_len
+= match
-piece
;
911 /* copy replacement and backrefs */
912 walkbuf
= result
+ result_len
;
914 /* If evaluating or using custom function, copy result to the buffer
917 memcpy(walkbuf
, eval_result
.data(), eval_result
.size());
918 result_len
+= eval_result
.size();
919 } else { /* do regular backreference copying */
923 const char* lastStart
= nullptr;
924 while (walk
< replace_end
) {
925 bool handleQuote
= eval
&& '"' == *walk
&& walk_last
!= '\\';
926 if (handleQuote
&& lastStart
!= nullptr) {
927 String
str(lastStart
, walkbuf
- lastStart
, CopyString
);
932 if ('\\' == *walk
|| '$' == *walk
) {
933 if (walk_last
== '\\') {
934 *(walkbuf
-1) = *walk
++;
938 if (preg_get_backref(&walk
, &backref
)) {
939 if (backref
< count
) {
940 match_len
= offsets
[(backref
<<1)+1] - offsets
[backref
<<1];
941 memcpy(walkbuf
, subject
.data() + offsets
[backref
<<1],
943 walkbuf
+= match_len
;
948 *walkbuf
++ = *walk
++;
949 walk_last
= walk
[-1];
950 if (handleQuote
&& lastStart
== nullptr) {
956 eval_result
= vm_call_user_func(eval_fn
, params
);
957 memcpy(result
+ result_len
, eval_result
.data(), eval_result
.size());
958 result_len
+= eval_result
.size();
960 /* increment the result length by how much we've added to the string */
961 result_len
+= walkbuf
- (result
+ result_len
);
969 } else if (count
== PCRE_ERROR_NOMATCH
|| limit
== 0) {
970 /* If we previously set PCRE_NOTEMPTY after a null match,
971 this is not necessarily the end. We need to advance
972 the start offset, and continue. Fudge the offset values
973 to achieve this, unless we're already at the end of the string. */
974 if (g_notempty
!= 0 && start_offset
< subject
.size()) {
975 offsets
[0] = start_offset
;
976 offsets
[1] = start_offset
+ 1;
977 memcpy(&result
[result_len
], piece
, 1);
980 new_len
= result_len
+ subject
.size() - start_offset
;
// NOTE(review): same unchecked realloc() pattern as in the match branch.
981 if (new_len
+ 1 > alloc_len
) {
982 alloc_len
= new_len
+ 1; /* now we know exactly how long it is */
983 result
= (char *)realloc(result
, alloc_len
);
985 /* stick that last bit of string on our output */
986 memcpy(&result
[result_len
], piece
, subject
.size() - start_offset
);
987 result_len
+= subject
.size() - start_offset
;
988 result
[result_len
] = '\0';
992 if (pcre_need_log_error(count
)) {
997 if (replace_var
.isObject()) {
999 replace_var
.objectForCall()->o_getClassName() + "::__invoke";
1001 stemp
= replace_var
.toString();
1004 size
= stemp
.size();
1006 s
= replace_val
.data();
1007 size
= replace_val
.size();
1009 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1010 pattern
.data(), pattern
.size(),
1011 subject
.data(), subject
.size(),
1013 callable
, limit
, start_offset
, g_notempty
);
1015 pcre_handle_exec_error(count
);
1021 /* If we have matched an empty string, mimic what Perl's /g options does.
1022 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1023 the match again at the same point. If this fails (picked up above) we
1024 advance to the next character. */
1025 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1027 /* Advance to the next piece. */
1028 start_offset
= offsets
[1];
// AttachString hands the malloc'ed buffer over to the returned String.
1032 return String(result
, result_len
, AttachString
);
// Applies one or several regexes to a single subject string.
//   * `regex` is not an array: one php_pcre_replace() call.
//   * `regex` is an array and the replacement is a callback or not an
//     array: every regex is applied in turn with the same replacement,
//     each call working on the previous call's output.
//   * both are arrays: regexes are walked in order and paired with the
//     values read from a parallel iterator over the replacement array.
// After each call in the array cases the result is checked with isNull().
// NOTE(review): what happens on a null result, how the replacement
// iterator is advanced or exhausted, and the final return are not visible
// in this extract.
1041 static String
php_replace_in_subject(CVarRef regex
, CVarRef replace
,
1042 String subject
, int limit
, bool callable
,
1043 int *replace_count
) {
1044 if (!regex
.is(KindOfArray
)) {
1045 return php_pcre_replace(regex
.toString(), subject
, replace
,
1046 callable
, limit
, replace_count
);
1049 if (callable
|| !replace
.is(KindOfArray
)) {
1050 Array arr
= regex
.toArray();
1051 for (ArrayIter
iterRegex(arr
); iterRegex
; ++iterRegex
) {
1052 String regex_entry
= iterRegex
.second().toString();
1053 subject
= php_pcre_replace(regex_entry
, subject
, replace
,
1054 callable
, limit
, replace_count
);
1055 if (subject
.isNull()) {
1062 Array arrReplace
= replace
.toArray();
1063 Array arrRegex
= regex
.toArray();
1064 ArrayIter
iterReplace(arrReplace
);
1065 for (ArrayIter
iterRegex(arrRegex
); iterRegex
; ++iterRegex
) {
1066 String regex_entry
= iterRegex
.second().toString();
1067 Variant replace_value
;
1069 replace_value
= iterReplace
.second();
1073 subject
= php_pcre_replace(regex_entry
, subject
, replace_value
,
1074 callable
, limit
, replace_count
);
1075 if (subject
.isNull()) {
// Common driver for preg_replace() and preg_replace_callback(); the
// wrappers pass false / true as the last argument (is_callable).
//   * Warns ("Parameter mismatch ...") when the replacement is an array
//     while the pattern is not.
//   * Scalar subject: one php_replace_in_subject() call.
//   * Array subject: each element is converted to a string and processed;
//     non-null results are stored under the element's original key.
// In both cases `count` receives the accumulated replace_count.
// NOTE(review): the first half of the mismatch condition and the return
// statements of the scalar path are not visible in this extract.
1082 Variant
preg_replace_impl(CVarRef pattern
, CVarRef replacement
,
1083 CVarRef subject
, int limit
, Variant
&count
,
1086 replacement
.is(KindOfArray
) && !pattern
.is(KindOfArray
)) {
1087 raise_warning("Parameter mismatch, pattern is a string while "
1088 "replacement is an array");
1092 int replace_count
= 0;
1093 if (!subject
.is(KindOfArray
)) {
1094 String ret
= php_replace_in_subject(pattern
, replacement
,
1096 limit
, is_callable
, &replace_count
);
1097 count
= replace_count
;
1101 Array return_value
= Array::Create();
1102 Array arrSubject
= subject
.toArray();
1103 for (ArrayIter
iter(arrSubject
); iter
; ++iter
) {
1104 String subject_entry
= iter
.second().toString();
1105 String result
= php_replace_in_subject(pattern
, replacement
, subject_entry
,
1106 limit
, is_callable
, &replace_count
);
1107 if (!result
.isNull()) {
1108 return_value
.set(iter
.first(), result
);
1111 count
= replace_count
;
1112 return return_value
;
// String-replacement entry point: stores the replaced value in `result`
// and returns the number of replacements as a 32-bit int. Delegates to
// preg_replace_impl() with is_callable == false.
1115 int preg_replace(Variant
&result
, CVarRef pattern
, CVarRef replacement
,
1116 CVarRef subject
, int limit
/* = -1 */) {
1118 result
= preg_replace_impl(pattern
, replacement
, subject
, limit
, count
, false);
1119 return count
.toInt32();
// Callback-replacement entry point: stores the replaced value in `result`
// and returns the number of replacements as a 32-bit int. Delegates to
// preg_replace_impl() with is_callable == true.
1122 int preg_replace_callback(Variant
&result
, CVarRef pattern
, CVarRef callback
,
1123 CVarRef subject
, int limit
/* = -1 */) {
1125 result
= preg_replace_impl(pattern
, callback
, subject
, limit
, count
, true);
1126 return count
.toInt32();
1129 ///////////////////////////////////////////////////////////////////////////////
// preg_split: cuts `subject` into pieces around matches of the regex
// `pattern` and returns them in `return_value` (started from Array::Create()).
//   pattern - converted with toString() and looked up through
//             pcre_get_compiled_regex_cache().
//   subject - converted with toString() into `ssubject`.
//   limit   - the matching loop runs only while limit is -1 or greater
//             than 1 (inline comment gives the default as -1).
//   flags   - bitmask tested against PREG_SPLIT_NO_EMPTY,
//             PREG_SPLIT_DELIM_CAPTURE and PREG_SPLIT_OFFSET_CAPTURE
//             (inline comment gives the default as 0).
// Side effect: t_last_error_code is reset to PHP_PCRE_NO_ERROR before
// matching starts.
// NOTE(review): several statements used below (declarations of utf8_check,
// i, match_len and dummy, the bodies of the early-exit checks, and some
// branch headers) are not visible in this listing; verify against the full
// source before changing the logic.
1131 Variant
preg_split(CVarRef pattern
, CVarRef subject
, int limit
/* = -1 */,
1132 int flags
/* = 0 */) {
1133 const pcre_cache_entry
* pce
= pcre_get_compiled_regex_cache(
1134 pattern
.toString());
// No cache entry was returned for the pattern (handling is not visible here).
1135 if (pce
== nullptr) {
// Pull the individual option bits out of `flags`.
1139 int no_empty
= flags
& PREG_SPLIT_NO_EMPTY
;
1140 bool delim_capture
= flags
& PREG_SPLIT_DELIM_CAPTURE
;
1141 bool offset_capture
= flags
& PREG_SPLIT_OFFSET_CAPTURE
;
// Offset array handed to pcre_exec() below; the pointer is also given to
// offsetsFreer (presumably so it is released on scope exit — confirm
// SmartFreeHelper semantics).
1147 int size_offsets
= 0;
1148 int *offsets
= create_offset_array(pce
, size_offsets
);
1149 SmartFreeHelper
offsetsFreer(offsets
);
1150 if (offsets
== nullptr) {
1154 String ssubject
= subject
.toString();
1156 /* Start at the beginning of the string */
1157 int start_offset
= 0;
1158 int next_offset
= 0;
// last_match points just past the previous match; it starts at the
// beginning of the subject.
1159 const char *last_match
= ssubject
.data();
1160 t_last_error_code
= PHP_PCRE_NO_ERROR
;
1161 pcre_extra
*extra
= pce
->extra
;
1163 // Get next piece if no limit or limit not yet reached and something matched
1164 Variant return_value
= Array::Create();
1165 int g_notempty
= 0; /* If the match should not be empty */
1167 pcre
*re_bump
= nullptr; /* Regex instance for empty matches */
1168 pcre_extra
*extra_bump
= nullptr; /* Almost dummy */
// Main loop: keep matching while the limit is unlimited (-1) or more than
// one piece may still be produced.
1169 while ((limit
== -1 || limit
> 1)) {
1170 int count
= pcre_exec(pce
->re
, extra
, ssubject
.data(), ssubject
.size(),
1171 start_offset
, g_notempty
| utf8_check
,
1172 offsets
, size_offsets
);
1174 /* Check for too many substrings condition. */
1176 raise_warning("Matched, but too many substrings");
// Fall back to the number of offset triples the array can hold.
1177 count
= size_offsets
/ 3;
1180 /* If something matched */
1182 /* Subsequent calls to pcre_exec don't need to bother with the
1183 * utf8 validity check: if the subject isn't valid, the first
1184 * call to pcre_exec will have failed, and as long as we only
1185 * set start_offset to known character boundaries we won't
1186 * supply an invalid offset. */
1187 utf8_check
= PCRE_NO_UTF8_CHECK
;
// Emit the text between the previous match end and this match start,
// unless NO_EMPTY is set and that text is empty.
1189 if (!no_empty
|| ssubject
.data() + offsets
[0] != last_match
) {
1190 if (offset_capture
) {
1191 /* Add (match, offset) pair to the return value */
1192 add_offset_pair(return_value
,
1194 ssubject
.data() + offsets
[0] - last_match
,
1196 next_offset
, nullptr);
1198 /* Add the piece to the return value */
1199 return_value
.append(String(last_match
,
1200 ssubject
.data() + offsets
[0] - last_match
,
1204 /* One less left to do */
// Remember where this match ended: offsets[1] becomes the start of the
// next piece, both as a pointer and as an offset.
1209 last_match
= ssubject
.data() + offsets
[1];
1210 next_offset
= offsets
[1];
// With DELIM_CAPTURE, also emit each capture group i (offset pair
// offsets[2*i], offsets[2*i+1]); empty groups are skipped under NO_EMPTY.
1212 if (delim_capture
) {
1214 for (i
= 1; i
< count
; i
++) {
1215 match_len
= offsets
[(i
<<1)+1] - offsets
[i
<<1];
1216 /* If we have matched a delimiter */
1217 if (!no_empty
|| match_len
> 0) {
1218 if (offset_capture
) {
1219 add_offset_pair(return_value
,
1220 String(ssubject
.data() + offsets
[i
<<1],
1221 match_len
, CopyString
),
1222 offsets
[i
<<1], nullptr);
1224 return_value
.append(ssubject
.substr(offsets
[i
<<1], match_len
));
1229 } else if (count
== PCRE_ERROR_NOMATCH
) {
1230 /* If we previously set PCRE_NOTEMPTY after a null match,
1231 this is not necessarily the end. We need to advance
1232 the start offset, and continue. Fudge the offset values
1233 to achieve this, unless we're already at the end of the string. */
1234 if (g_notempty
!= 0 && start_offset
< ssubject
.size()) {
1235 if (pce
->compile_options
& PCRE_UTF8
) {
// UTF-8 pattern: lazily fetch the helper regex "/./us" and run it at
// start_offset (presumably so the step covers one whole character rather
// than one byte — confirm).
1236 if (re_bump
== nullptr) {
1238 if ((re_bump
= pcre_get_compiled_regex("/./us", &extra_bump
,
1239 &dummy
)) == nullptr) {
1243 count
= pcre_exec(re_bump
, extra_bump
, ssubject
.data(),
1244 ssubject
.size(), start_offset
,
1245 0, offsets
, size_offsets
);
1247 raise_warning("Unknown error");
// Fake a one-byte match at start_offset so the bookkeeping below still
// moves forward.
1248 offsets
[0] = start_offset
;
1249 offsets
[1] = start_offset
+ 1;
1250 if (pcre_need_log_error(count
)) {
1251 String spattern
= pattern
.toString();
1252 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1253 spattern
.data(), spattern
.size(),
1254 ssubject
.data(), ssubject
.size(),
1256 limit
, flags
, start_offset
);
// Same one-byte fudge as above (the branch header for this copy is not
// visible in this listing).
1260 offsets
[0] = start_offset
;
1261 offsets
[1] = start_offset
+ 1;
// Log the pcre_exec() result when pcre_need_log_error() asks for it, then
// hand it to pcre_handle_exec_error().
1266 if (pcre_need_log_error(count
)) {
1267 String spattern
= pattern
.toString();
1268 pcre_log_error(__FUNCTION__
, __LINE__
, count
,
1269 spattern
.data(), spattern
.size(),
1270 ssubject
.data(), ssubject
.size(),
1272 limit
, flags
, start_offset
, g_notempty
);
1274 pcre_handle_exec_error(count
);
1278 /* If we have matched an empty string, mimic what Perl's /g options does.
1279 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1280 the match again at the same point. If this fails (picked up above) we
1281 advance to the next character. */
1282 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1284 /* Advance to the position right after the last full match */
1285 start_offset
= offsets
[1];
// Tail handling: recompute start_offset from last_match and emit the rest
// of the subject, unless NO_EMPTY is set and nothing remains.
1288 start_offset
= last_match
- ssubject
.data(); /* the offset might have been incremented, but without further successful matches */
1289 if (!no_empty
|| start_offset
< ssubject
.size()) {
1290 if (offset_capture
) {
1291 /* Add the last (match, offset) pair to the return value */
1292 add_offset_pair(return_value
,
1293 ssubject
.substr(start_offset
),
1294 start_offset
, nullptr);
1296 /* Add the last piece to the return value */
1298 (String(last_match
, ssubject
.data() + ssubject
.size() - last_match
,
1303 return return_value
;
1306 ///////////////////////////////////////////////////////////////////////////////
// preg_quote: walks `str` byte by byte and builds a quoted copy in a
// malloc'ed buffer, which is returned wrapped in a String.
//   str       - text to quote; the empty string is special-cased (see the
//               check below; its return statement is not visible here).
//   delimiter - when non-empty, its first character becomes delim_char, an
//               additional character to quote (inline comment gives the
//               default as null_string).
// The characters listed in the case labels below are the ones singled out
// for quoting.
// NOTE(review): the statements that write the quoting into the buffer, the
// assignment that turns quote_delim on, and any NULL check on the malloc
// result are not visible in this listing — confirm against the full source.
1308 String
preg_quote(CStrRef str
, CStrRef delimiter
/* = null_string */) {
1309 const char *in_str
= str
.data();
1310 const char *in_str_end
= in_str
+ str
.size();
1312 /* Nothing to do if we got an empty string */
1313 if (in_str
== in_str_end
) {
1317 char delim_char
= 0; /* Delimiter character to be quoted */
1318 bool quote_delim
= false; /* Whether to quote additional delim char */
1319 if (!delimiter
.empty()) {
1320 delim_char
= delimiter
.charAt(0);
1324 /* Allocate enough memory so that even if each character
1325 is quoted, we won't run out of room */
// Budget: 4 output bytes per input byte, plus one extra byte.
1326 char *out_str
= (char *)malloc(4 * str
.size() + 1);
1328 /* Go through the string and quote necessary characters */
// p reads from the input, q is the write cursor into out_str.
1331 for (p
= in_str
, q
= out_str
; p
!= in_str_end
; p
++) {
1334 case '.': case '\\': case '+': case '*': case '?':
1335 case '[': case '^': case ']': case '$': case '(':
1336 case ')': case '{': case '}': case '=': case '!':
1337 case '>': case '<': case '|': case ':':
// The caller-supplied delimiter is treated specially only when quote_delim
// is set.
1350 if (quote_delim
&& c
== delim_char
)
// The result length is the distance the write cursor travelled; AttachString
// presumably hands ownership of the malloc'ed buffer to the String — confirm.
1358 return String(out_str
, q
- out_str
, AttachString
);
// preg_last_error: returns the current value of t_last_error_code.
1361 int preg_last_error() {
1362 return t_last_error_code
;
// preg_pcre_cache_size: returns s_pcreCacheMap.size() cast to size_t
// (presumably the number of entries in the compiled-regex cache — confirm).
1365 size_t preg_pcre_cache_size() {
1366 return (size_t)s_pcreCacheMap
.size();
1369 ///////////////////////////////////////////////////////////////////////////////
// php_reg_eprint: raises a warning describing the POSIX regex error `err`
// for the compiled expression `re`.
// The text is assembled from two regerror() passes: one with REG_ITOA or'ed
// into the error code (stored in `buf`), and one with the plain code
// (written into `message` after a "<buf>: " prefix).
// NOTE(review): the declarations of `len` and `buf_len` and the conditionals
// that guard each step are not visible in this listing.
1372 static void php_reg_eprint(int err
, regex_t
*re
) {
1373 char *buf
= nullptr, *message
= nullptr;
1378 /* get the length of the message */
1379 buf_len
= regerror(REG_ITOA
| err
, re
, nullptr, 0);
1381 buf
= (char *)smart_malloc(buf_len
);
1382 if (!buf
) return; /* fail silently */
1383 /* finally, get the error message */
1384 regerror(REG_ITOA
| err
, re
, buf
, buf_len
);
// Size needed for the plain error text.
1389 len
= regerror(err
, re
, nullptr, 0);
// Room for both texts plus two extra bytes.
1391 message
= (char *)smart_malloc(buf_len
+ len
+ 2);
// NOTE(review): this early return sits before the smart_free(message) at the
// end of the function; if `buf` was allocated above it may not be released
// on this path — confirm.
1393 return; /* fail silently */
// NOTE(review): snprintf is capped at buf_len bytes, yet the text below is
// written at message + buf_len + 1. If buf_len counts buf's terminating NUL
// (as regerror's return value does), the ": " separator is cut off and a NUL
// ends up in front of the text — confirm whether that is intended.
1396 snprintf(message
, buf_len
, "%s: ", buf
);
1397 buf_len
+= 1; /* so pointer math below works */
1399 /* drop the message into place */
1400 regerror(err
, re
, message
+ buf_len
, len
);
1401 raise_warning("%s", message
);
1404 smart_free(message
);
// php_split: splits `str` around matches of the POSIX regex `spliton` and
// returns the pieces in `return_value` (started from Array::Create()).
//   spliton - pattern, compiled with regcomp() using REG_EXTENDED.
//   str     - text to split.
//   count   - the matching loop runs only while count is -1 or greater
//             than 1.
//   icase   - when true, REG_ICASE is added to the compile options.
// Compile and match errors are reported through php_reg_eprint().
// NOTE(review): the declarations of `re` and `subs`, the error-path returns,
// the decrement of `count` and any regfree() calls are not visible in this
// listing — confirm against the full source.
1407 Variant
php_split(CStrRef spliton
, CStrRef str
, int count
, bool icase
) {
// strp is the read cursor; endp is one past the end of the input.
1408 const char *strp
= str
.data();
1409 const char *endp
= strp
+ str
.size();
1412 int copts
= icase
? REG_ICASE
: 0;
1413 int err
= regcomp(&re
, spliton
.data(), REG_EXTENDED
| copts
);
1415 php_reg_eprint(err
, &re
);
1419 Array return_value
= Array::Create();
1422 /* churn through str, generating array entries as we go */
// regexec() returns 0 on a match, so the loop ends on the first non-zero
// result (which is kept in err for the check after the loop).
1423 while ((count
== -1 || count
> 1) &&
1424 !(err
= regexec(&re
, strp
, 1, subs
, 0))) {
1425 if (subs
[0].rm_so
== 0 && subs
[0].rm_eo
) {
1426 /* match is at start of string, return empty string */
1427 return_value
.append("");
1428 /* skip ahead the length of the regex match */
1429 strp
+= subs
[0].rm_eo
;
// Empty match at offset 0 (rm_so == rm_eo == 0): a warning is raised below.
1430 } else if (subs
[0].rm_so
== 0 && subs
[0].rm_eo
== 0) {
1431 /* No more matches */
1433 raise_warning("Invalid Regular Expression to split()");
1436 /* On a real match */
1438 /* make a copy of the substring */
// The piece is everything from strp up to where the match starts.
1439 int size
= subs
[0].rm_so
;
1441 /* add it to the array */
1442 return_value
.append(String(strp
, size
, CopyString
));
1444 /* point at our new starting point */
1445 strp
= strp
+ subs
[0].rm_eo
;
1448 /* if we're only looking for a certain number of points,
1449 stop looking once we hit it */
1455 /* see if we encountered an error */
// REG_NOMATCH is the normal way for the loop to end and is not reported.
1456 if (err
&& err
!= REG_NOMATCH
) {
1457 php_reg_eprint(err
, &re
);
1462 /* otherwise we just have one last element to add to the array */
1463 int size
= endp
- strp
;
1464 return_value
.append(String(strp
, size
, CopyString
));
1467 return return_value
;
1470 ///////////////////////////////////////////////////////////////////////////////