2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 1997-2013 The PHP Group |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 | Author: Andrei Zmievski <andrei@php.net> |
16 +----------------------------------------------------------------------+
23 #include "php_globals.h"
25 #include "ext/standard/info.h"
26 #include "ext/standard/php_smart_str.h"
28 #if HAVE_PCRE || HAVE_BUNDLED_PCRE
30 #include "ext/standard/php_string.h"
/* Match-result ordering and capture flags (checked in php_pcre_match_impl()). */
#define PREG_PATTERN_ORDER			1
#define PREG_SET_ORDER				2
#define PREG_OFFSET_CAPTURE			(1<<8)

/* PREG_SPLIT_* flags; by name these belong to the split routine, which is
   not visible in this part of the file. */
#define PREG_SPLIT_NO_EMPTY			(1<<0)
#define PREG_SPLIT_DELIM_CAPTURE	(1<<1)
#define PREG_SPLIT_OFFSET_CAPTURE	(1<<2)

/* Set in the cache entry's preg_options when the 'e' modifier is present. */
#define PREG_REPLACE_EVAL			(1<<0)

#define PREG_GREP_INVERT			(1<<0)

/* Number of cached compiled patterns at which the cache is partially cleaned. */
#define PCRE_CACHE_SIZE 4096
/* Error codes stored in PCRE_G(error_code) and registered for userland as
   the PREG_*_ERROR constants in PHP_MINIT_FUNCTION(pcre). */
enum {
	PHP_PCRE_NO_ERROR = 0,
	PHP_PCRE_INTERNAL_ERROR,
	PHP_PCRE_BACKTRACK_LIMIT_ERROR,
	PHP_PCRE_RECURSION_LIMIT_ERROR,
	PHP_PCRE_BAD_UTF8_ERROR,
	PHP_PCRE_BAD_UTF8_OFFSET_ERROR
};
56 ZEND_DECLARE_MODULE_GLOBALS(pcre
)
59 static void pcre_handle_exec_error(int pcre_code TSRMLS_DC
) /* {{{ */
64 case PCRE_ERROR_MATCHLIMIT
:
65 preg_code
= PHP_PCRE_BACKTRACK_LIMIT_ERROR
;
68 case PCRE_ERROR_RECURSIONLIMIT
:
69 preg_code
= PHP_PCRE_RECURSION_LIMIT_ERROR
;
72 case PCRE_ERROR_BADUTF8
:
73 preg_code
= PHP_PCRE_BAD_UTF8_ERROR
;
76 case PCRE_ERROR_BADUTF8_OFFSET
:
77 preg_code
= PHP_PCRE_BAD_UTF8_OFFSET_ERROR
;
81 preg_code
= PHP_PCRE_INTERNAL_ERROR
;
85 PCRE_G(error_code
) = preg_code
;
89 static void php_free_pcre_cache(void *data
) /* {{{ */
91 pcre_cache_entry
*pce
= (pcre_cache_entry
*) data
;
94 if (pce
->extra
) pefree(pce
->extra
, 1);
96 if ((void*)pce
->tables
) pefree((void*)pce
->tables
, 1);
97 pefree(pce
->locale
, 1);
102 static PHP_GINIT_FUNCTION(pcre
) /* {{{ */
104 zend_hash_init(&pcre_globals
->pcre_cache
, 0, NULL
, php_free_pcre_cache
, 1);
105 pcre_globals
->backtrack_limit
= 0;
106 pcre_globals
->recursion_limit
= 0;
107 pcre_globals
->error_code
= PHP_PCRE_NO_ERROR
;
111 static PHP_GSHUTDOWN_FUNCTION(pcre
) /* {{{ */
113 zend_hash_destroy(&pcre_globals
->pcre_cache
);
118 STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL
, OnUpdateLong
, backtrack_limit
, zend_pcre_globals
, pcre_globals
)
119 STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000", PHP_INI_ALL
, OnUpdateLong
, recursion_limit
, zend_pcre_globals
, pcre_globals
)
/* {{{ PHP_MINFO_FUNCTION(pcre)
 * Prints the module info table (support flag and PCRE library version)
 * followed by the module's INI entries. */
static PHP_MINFO_FUNCTION(pcre)
{
	php_info_print_table_start();
	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" );
	php_info_print_table_row(2, "PCRE Library Version", pcre_version() );
	php_info_print_table_end();

	DISPLAY_INI_ENTRIES();
}
/* }}} */
135 /* {{{ PHP_MINIT_FUNCTION(pcre) */
136 static PHP_MINIT_FUNCTION(pcre
)
138 REGISTER_INI_ENTRIES();
140 REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER
, CONST_CS
| CONST_PERSISTENT
);
141 REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER
, CONST_CS
| CONST_PERSISTENT
);
142 REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE
, CONST_CS
| CONST_PERSISTENT
);
143 REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY
, CONST_CS
| CONST_PERSISTENT
);
144 REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE
, CONST_CS
| CONST_PERSISTENT
);
145 REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE
, CONST_CS
| CONST_PERSISTENT
);
146 REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT
, CONST_CS
| CONST_PERSISTENT
);
148 REGISTER_LONG_CONSTANT("PREG_NO_ERROR", PHP_PCRE_NO_ERROR
, CONST_CS
| CONST_PERSISTENT
);
149 REGISTER_LONG_CONSTANT("PREG_INTERNAL_ERROR", PHP_PCRE_INTERNAL_ERROR
, CONST_CS
| CONST_PERSISTENT
);
150 REGISTER_LONG_CONSTANT("PREG_BACKTRACK_LIMIT_ERROR", PHP_PCRE_BACKTRACK_LIMIT_ERROR
, CONST_CS
| CONST_PERSISTENT
);
151 REGISTER_LONG_CONSTANT("PREG_RECURSION_LIMIT_ERROR", PHP_PCRE_RECURSION_LIMIT_ERROR
, CONST_CS
| CONST_PERSISTENT
);
152 REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_ERROR", PHP_PCRE_BAD_UTF8_ERROR
, CONST_CS
| CONST_PERSISTENT
);
153 REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_OFFSET_ERROR", PHP_PCRE_BAD_UTF8_OFFSET_ERROR
, CONST_CS
| CONST_PERSISTENT
);
154 REGISTER_STRING_CONSTANT("PCRE_VERSION", (char *)pcre_version(), CONST_CS
| CONST_PERSISTENT
);
160 /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
161 static PHP_MSHUTDOWN_FUNCTION(pcre
)
163 UNREGISTER_INI_ENTRIES();
169 /* {{{ static pcre_clean_cache */
170 static int pcre_clean_cache(void *data
, void *arg TSRMLS_DC
)
172 int *num_clean
= (int *)arg
;
174 if (*num_clean
> 0) {
183 /* {{{ static make_subpats_table */
184 static char **make_subpats_table(int num_subpats
, pcre_cache_entry
*pce TSRMLS_DC
)
186 pcre_extra
*extra
= pce
->extra
;
187 int name_cnt
= 0, name_size
, ni
= 0;
190 unsigned short name_idx
;
191 char **subpat_names
= (char **)ecalloc(num_subpats
, sizeof(char *));
193 rc
= pcre_fullinfo(pce
->re
, extra
, PCRE_INFO_NAMECOUNT
, &name_cnt
);
195 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Internal pcre_fullinfo() error %d", rc
);
202 rc1
= pcre_fullinfo(pce
->re
, extra
, PCRE_INFO_NAMETABLE
, &name_table
);
203 rc2
= pcre_fullinfo(pce
->re
, extra
, PCRE_INFO_NAMEENTRYSIZE
, &name_size
);
204 rc
= rc2
? rc2
: rc1
;
206 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Internal pcre_fullinfo() error %d", rc
);
211 while (ni
++ < name_cnt
) {
212 name_idx
= 0xff * (unsigned char)name_table
[0] + (unsigned char)name_table
[1];
213 subpat_names
[name_idx
] = name_table
+ 2;
214 if (is_numeric_string(subpat_names
[name_idx
], strlen(subpat_names
[name_idx
]), NULL
, NULL
, 0) > 0) {
215 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Numeric named subpatterns are not allowed");
219 name_table
+= name_size
;
227 /* {{{ pcre_get_compiled_regex_cache
229 PHPAPI pcre_cache_entry
* pcre_get_compiled_regex_cache(char *regex
, int regex_len TSRMLS_DC
)
238 char start_delimiter
;
245 unsigned const char *tables
= NULL
;
249 pcre_cache_entry
*pce
;
250 pcre_cache_entry new_entry
;
254 # if defined(PHP_WIN32) && defined(ZTS)
255 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE
);
257 locale
= setlocale(LC_CTYPE
, NULL
);
260 /* Try to lookup the cached regex entry, and if successful, just pass
261 back the compiled pattern, otherwise go on and compile it. */
262 if (zend_hash_find(&PCRE_G(pcre_cache
), regex
, regex_len
+1, (void **)&pce
) == SUCCESS
) {
264 * We use a quick pcre_fullinfo() check to see whether cache is corrupted, and if it
265 * is, we flush it and compile the pattern from scratch.
267 if (pcre_fullinfo(pce
->re
, NULL
, PCRE_INFO_CAPTURECOUNT
, &count
) == PCRE_ERROR_BADMAGIC
) {
268 zend_hash_clean(&PCRE_G(pcre_cache
));
271 if (!strcmp(pce
->locale
, locale
)) {
282 /* Parse through the leading whitespace, and display a warning if we
283 get to the end without encountering a delimiter. */
284 while (isspace((int)*(unsigned char *)p
)) p
++;
286 php_error_docref(NULL TSRMLS_CC
, E_WARNING
,
287 p
< regex
+ regex_len
? "Null byte in regex" : "Empty regular expression");
291 /* Get the delimiter and display a warning if it is alphanumeric
294 if (isalnum((int)*(unsigned char *)&delimiter
) || delimiter
== '\\') {
295 php_error_docref(NULL TSRMLS_CC
,E_WARNING
, "Delimiter must not be alphanumeric or backslash");
299 start_delimiter
= delimiter
;
300 if ((pp
= strchr("([{< )]}> )]}>", delimiter
)))
302 end_delimiter
= delimiter
;
306 if (start_delimiter
== end_delimiter
) {
307 /* We need to iterate through the pattern, searching for the ending delimiter,
308 but skipping the backslashed delimiters. If the ending delimiter is not
309 found, display a warning. */
311 if (*pp
== '\\' && pp
[1] != 0) pp
++;
312 else if (*pp
== delimiter
)
317 /* We iterate through the pattern, searching for the matching ending
318 * delimiter. For each matching starting delimiter, we increment nesting
319 * level, and decrement it for each matching ending delimiter. If we
320 * reach the end of the pattern without matching, display a warning.
322 int brackets
= 1; /* brackets nesting level */
324 if (*pp
== '\\' && pp
[1] != 0) pp
++;
325 else if (*pp
== end_delimiter
&& --brackets
<= 0)
327 else if (*pp
== start_delimiter
)
334 if (pp
< regex
+ regex_len
) {
335 php_error_docref(NULL TSRMLS_CC
,E_WARNING
, "Null byte in regex");
336 } else if (start_delimiter
== end_delimiter
) {
337 php_error_docref(NULL TSRMLS_CC
,E_WARNING
, "No ending delimiter '%c' found", delimiter
);
339 php_error_docref(NULL TSRMLS_CC
,E_WARNING
, "No ending matching delimiter '%c' found", delimiter
);
344 /* Make a copy of the actual pattern. */
345 pattern
= estrndup(p
, pp
-p
);
347 /* Move on to the options */
350 /* Parse through the options, setting appropriate flags. Display
351 a warning if we encounter an unknown modifier. */
352 while (pp
< regex
+ regex_len
) {
354 /* Perl compatible options */
355 case 'i': coptions
|= PCRE_CASELESS
; break;
356 case 'm': coptions
|= PCRE_MULTILINE
; break;
357 case 's': coptions
|= PCRE_DOTALL
; break;
358 case 'x': coptions
|= PCRE_EXTENDED
; break;
360 /* PCRE specific options */
361 case 'A': coptions
|= PCRE_ANCHORED
; break;
362 case 'D': coptions
|= PCRE_DOLLAR_ENDONLY
;break;
363 case 'S': do_study
= 1; break;
364 case 'U': coptions
|= PCRE_UNGREEDY
; break;
365 case 'X': coptions
|= PCRE_EXTRA
; break;
366 case 'u': coptions
|= PCRE_UTF8
;
367 /* In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII
368 characters, even in UTF-8 mode. However, this can be changed by setting
369 the PCRE_UCP option. */
371 coptions
|= PCRE_UCP
;
375 /* Custom preg options */
376 case 'e': poptions
|= PREG_REPLACE_EVAL
; break;
384 php_error_docref(NULL TSRMLS_CC
,E_WARNING
, "Unknown modifier '%c'", pp
[-1]);
386 php_error_docref(NULL TSRMLS_CC
,E_WARNING
, "Null byte in regex");
394 if (strcmp(locale
, "C"))
395 tables
= pcre_maketables();
398 /* Compile pattern and display a warning if compilation failed. */
399 re
= pcre_compile(pattern
,
406 php_error_docref(NULL TSRMLS_CC
,E_WARNING
, "Compilation failed: %s at offset %d", error
, erroffset
);
409 pefree((void*)tables
, 1);
414 /* If study option was specified, study the pattern and
415 store the result in extra for passing to pcre_exec. */
417 extra
= pcre_study(re
, soptions
, &error
);
419 extra
->flags
|= PCRE_EXTRA_MATCH_LIMIT
| PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
422 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Error while studying pattern");
431 * If we reached cache limit, clean out the items from the head of the list;
432 * these are supposedly the oldest ones (but not necessarily the least used
435 if (zend_hash_num_elements(&PCRE_G(pcre_cache
)) == PCRE_CACHE_SIZE
) {
436 int num_clean
= PCRE_CACHE_SIZE
/ 8;
437 zend_hash_apply_with_argument(&PCRE_G(pcre_cache
), pcre_clean_cache
, &num_clean TSRMLS_CC
);
440 /* Store the compiled pattern and extra info in the cache. */
442 new_entry
.extra
= extra
;
443 new_entry
.preg_options
= poptions
;
444 new_entry
.compile_options
= coptions
;
446 new_entry
.locale
= pestrdup(locale
, 1);
447 new_entry
.tables
= tables
;
	/*
	 * Interned strings are not duplicated when stored in HashTable,
	 * but all the interned strings created during HTTP request are removed
	 * at end of request. However PCRE_G(pcre_cache) must be consistent
	 * on the next request as well. So we disable usage of interned strings
	 * as hash keys especially for this table.
	 */
458 if (IS_INTERNED(regex
)) {
459 regex
= tmp
= estrndup(regex
, regex_len
);
462 zend_hash_update(&PCRE_G(pcre_cache
), regex
, regex_len
+1, (void *)&new_entry
,
463 sizeof(pcre_cache_entry
), (void**)&pce
);
473 /* {{{ pcre_get_compiled_regex
475 PHPAPI pcre
* pcre_get_compiled_regex(char *regex
, pcre_extra
**extra
, int *preg_options TSRMLS_DC
)
477 pcre_cache_entry
* pce
= pcre_get_compiled_regex_cache(regex
, strlen(regex
) TSRMLS_CC
);
480 *extra
= pce
? pce
->extra
: NULL
;
483 *preg_options
= pce
? pce
->preg_options
: 0;
486 return pce
? pce
->re
: NULL
;
490 /* {{{ pcre_get_compiled_regex_ex
492 PHPAPI pcre
* pcre_get_compiled_regex_ex(char *regex
, pcre_extra
**extra
, int *preg_options
, int *compile_options TSRMLS_DC
)
494 pcre_cache_entry
* pce
= pcre_get_compiled_regex_cache(regex
, strlen(regex
) TSRMLS_CC
);
497 *extra
= pce
? pce
->extra
: NULL
;
500 *preg_options
= pce
? pce
->preg_options
: 0;
502 if (compile_options
) {
503 *compile_options
= pce
? pce
->compile_options
: 0;
506 return pce
? pce
->re
: NULL
;
510 /* {{{ add_offset_pair */
511 static inline void add_offset_pair(zval
*result
, char *str
, int len
, int offset
, char *name
)
515 ALLOC_ZVAL(match_pair
);
516 array_init(match_pair
);
517 INIT_PZVAL(match_pair
);
519 /* Add (match, offset) to the return value */
520 add_next_index_stringl(match_pair
, str
, len
, 1);
521 add_next_index_long(match_pair
, offset
);
524 zval_add_ref(&match_pair
);
525 zend_hash_update(Z_ARRVAL_P(result
), name
, strlen(name
)+1, &match_pair
, sizeof(zval
*), NULL
);
527 zend_hash_next_index_insert(Z_ARRVAL_P(result
), &match_pair
, sizeof(zval
*), NULL
);
531 static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS
, int global
) /* {{{ */
534 char *regex
; /* Regular expression */
535 char *subject
; /* String to match against */
538 pcre_cache_entry
*pce
; /* Compiled regular expression */
539 zval
*subpats
= NULL
; /* Array for subpatterns */
540 long flags
= 0; /* Match control flags */
541 long start_offset
= 0; /* Where the new search starts */
543 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC
, "ss|zll", ®ex
, ®ex_len
,
544 &subject
, &subject_len
, &subpats
, &flags
, &start_offset
) == FAILURE
) {
548 /* Compile regex or get it from cache. */
549 if ((pce
= pcre_get_compiled_regex_cache(regex
, regex_len TSRMLS_CC
)) == NULL
) {
553 php_pcre_match_impl(pce
, subject
, subject_len
, return_value
, subpats
,
554 global
, ZEND_NUM_ARGS() >= 4, flags
, start_offset TSRMLS_CC
);
558 /* {{{ php_pcre_match_impl() */
559 PHPAPI
void php_pcre_match_impl(pcre_cache_entry
*pce
, char *subject
, int subject_len
, zval
*return_value
,
560 zval
*subpats
, int global
, int use_flags
, long flags
, long start_offset TSRMLS_DC
)
562 zval
*result_set
, /* Holds a set of subpatterns after
564 **match_sets
= NULL
; /* An array of sets of matches for each
565 subpattern after a global match */
566 pcre_extra
*extra
= pce
->extra
;/* Holds results of studying */
567 pcre_extra extra_data
; /* Used locally for exec options */
568 int exoptions
= 0; /* Execution options */
569 int count
= 0; /* Count of matched subpatterns */
570 int *offsets
; /* Array of subpattern offsets */
571 int num_subpats
; /* Number of captured subpatterns */
572 int size_offsets
; /* Size of the offsets array */
573 int matched
; /* Has anything matched */
574 int g_notempty
= 0; /* If the match should not be empty */
575 const char **stringlist
; /* Holds list of subpatterns */
576 char **subpat_names
; /* Array for named subpatterns */
578 int subpats_order
; /* Order of subpattern matches */
579 int offset_capture
; /* Capture match offsets: yes/no */
581 /* Overwrite the passed-in value for subpatterns with an empty array. */
582 if (subpats
!= NULL
) {
587 subpats_order
= global
? PREG_PATTERN_ORDER
: 0;
590 offset_capture
= flags
& PREG_OFFSET_CAPTURE
;
593 * subpats_order is pre-set to pattern mode so we change it only if
597 subpats_order
= flags
& 0xff;
599 if ((global
&& (subpats_order
< PREG_PATTERN_ORDER
|| subpats_order
> PREG_SET_ORDER
)) ||
600 (!global
&& subpats_order
!= 0)) {
601 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Invalid flags specified");
608 /* Negative offset counts from the end of the string. */
609 if (start_offset
< 0) {
610 start_offset
= subject_len
+ start_offset
;
611 if (start_offset
< 0) {
617 extra_data
.flags
= PCRE_EXTRA_MATCH_LIMIT
| PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
620 extra
->match_limit
= PCRE_G(backtrack_limit
);
621 extra
->match_limit_recursion
= PCRE_G(recursion_limit
);
623 /* Calculate the size of the offsets array, and allocate memory for it. */
624 rc
= pcre_fullinfo(pce
->re
, extra
, PCRE_INFO_CAPTURECOUNT
, &num_subpats
);
626 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Internal pcre_fullinfo() error %d", rc
);
630 size_offsets
= num_subpats
* 3;
633 * Build a mapping from subpattern numbers to their names. We will always
634 * allocate the table, even though there may be no named subpatterns. This
635 * avoids somewhat more complicated logic in the inner loops.
637 subpat_names
= make_subpats_table(num_subpats
, pce TSRMLS_CC
);
642 offsets
= (int *)safe_emalloc(size_offsets
, sizeof(int), 0);
644 /* Allocate match sets array and initialize the values. */
645 if (global
&& subpats
&& subpats_order
== PREG_PATTERN_ORDER
) {
646 match_sets
= (zval
**)safe_emalloc(num_subpats
, sizeof(zval
*), 0);
647 for (i
=0; i
<num_subpats
; i
++) {
648 ALLOC_ZVAL(match_sets
[i
]);
649 array_init(match_sets
[i
]);
650 INIT_PZVAL(match_sets
[i
]);
655 PCRE_G(error_code
) = PHP_PCRE_NO_ERROR
;
658 /* Execute the regular expression. */
659 count
= pcre_exec(pce
->re
, extra
, subject
, subject_len
, start_offset
,
660 exoptions
|g_notempty
, offsets
, size_offsets
);
662 /* the string was already proved to be valid UTF-8 */
663 exoptions
|= PCRE_NO_UTF8_CHECK
;
665 /* Check for too many substrings condition. */
667 php_error_docref(NULL TSRMLS_CC
, E_NOTICE
, "Matched, but too many substrings");
668 count
= size_offsets
/3;
671 /* If something has matched */
675 /* If subpatterns array has been passed, fill it in with values. */
676 if (subpats
!= NULL
) {
677 /* Try to get the list of substrings and display a warning if failed. */
678 if (pcre_get_substring_list(subject
, offsets
, count
, &stringlist
) < 0) {
681 if (match_sets
) efree(match_sets
);
682 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Get subpatterns list failed");
686 if (global
) { /* global pattern matching */
687 if (subpats
&& subpats_order
== PREG_PATTERN_ORDER
) {
688 /* For each subpattern, insert it into the appropriate array. */
689 for (i
= 0; i
< count
; i
++) {
690 if (offset_capture
) {
691 add_offset_pair(match_sets
[i
], (char *)stringlist
[i
],
692 offsets
[(i
<<1)+1] - offsets
[i
<<1], offsets
[i
<<1], NULL
);
694 add_next_index_stringl(match_sets
[i
], (char *)stringlist
[i
],
695 offsets
[(i
<<1)+1] - offsets
[i
<<1], 1);
699 * If the number of captured subpatterns on this run is
700 * less than the total possible number, pad the result
701 * arrays with empty strings.
703 if (count
< num_subpats
) {
704 for (; i
< num_subpats
; i
++) {
705 add_next_index_string(match_sets
[i
], "", 1);
709 /* Allocate the result set array */
710 ALLOC_ZVAL(result_set
);
711 array_init(result_set
);
712 INIT_PZVAL(result_set
);
714 /* Add all the subpatterns to it */
715 for (i
= 0; i
< count
; i
++) {
716 if (offset_capture
) {
717 add_offset_pair(result_set
, (char *)stringlist
[i
],
718 offsets
[(i
<<1)+1] - offsets
[i
<<1], offsets
[i
<<1], subpat_names
[i
]);
720 if (subpat_names
[i
]) {
721 add_assoc_stringl(result_set
, subpat_names
[i
], (char *)stringlist
[i
],
722 offsets
[(i
<<1)+1] - offsets
[i
<<1], 1);
724 add_next_index_stringl(result_set
, (char *)stringlist
[i
],
725 offsets
[(i
<<1)+1] - offsets
[i
<<1], 1);
728 /* And add it to the output array */
729 zend_hash_next_index_insert(Z_ARRVAL_P(subpats
), &result_set
, sizeof(zval
*), NULL
);
731 } else { /* single pattern matching */
732 /* For each subpattern, insert it into the subpatterns array. */
733 for (i
= 0; i
< count
; i
++) {
734 if (offset_capture
) {
735 add_offset_pair(subpats
, (char *)stringlist
[i
],
736 offsets
[(i
<<1)+1] - offsets
[i
<<1],
737 offsets
[i
<<1], subpat_names
[i
]);
739 if (subpat_names
[i
]) {
740 add_assoc_stringl(subpats
, subpat_names
[i
], (char *)stringlist
[i
],
741 offsets
[(i
<<1)+1] - offsets
[i
<<1], 1);
743 add_next_index_stringl(subpats
, (char *)stringlist
[i
],
744 offsets
[(i
<<1)+1] - offsets
[i
<<1], 1);
749 pcre_free((void *) stringlist
);
751 } else if (count
== PCRE_ERROR_NOMATCH
) {
752 /* If we previously set PCRE_NOTEMPTY after a null match,
753 this is not necessarily the end. We need to advance
754 the start offset, and continue. Fudge the offset values
755 to achieve this, unless we're already at the end of the string. */
756 if (g_notempty
!= 0 && start_offset
< subject_len
) {
757 offsets
[0] = start_offset
;
758 offsets
[1] = start_offset
+ 1;
762 pcre_handle_exec_error(count TSRMLS_CC
);
		/* If we have matched an empty string, mimic what Perl's /g option does.
		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
		   the match again at the same point. If this fails (picked up above) we
		   advance to the next character. */
770 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
772 /* Advance to the position right after the last full match */
773 start_offset
= offsets
[1];
776 /* Add the match sets to the output array and clean up */
777 if (global
&& subpats
&& subpats_order
== PREG_PATTERN_ORDER
) {
778 for (i
= 0; i
< num_subpats
; i
++) {
779 if (subpat_names
[i
]) {
780 zend_hash_update(Z_ARRVAL_P(subpats
), subpat_names
[i
],
781 strlen(subpat_names
[i
])+1, &match_sets
[i
], sizeof(zval
*), NULL
);
782 Z_ADDREF_P(match_sets
[i
]);
784 zend_hash_next_index_insert(Z_ARRVAL_P(subpats
), &match_sets
[i
], sizeof(zval
*), NULL
);
792 /* Did we encounter an error? */
793 if (PCRE_G(error_code
) == PHP_PCRE_NO_ERROR
) {
794 RETVAL_LONG(matched
);
801 /* {{{ proto int preg_match(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
802 Perform a Perl-style regular expression match */
803 static PHP_FUNCTION(preg_match
)
805 php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU
, 0);
809 /* {{{ proto int preg_match_all(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
810 Perform a Perl-style global regular expression match */
811 static PHP_FUNCTION(preg_match_all
)
813 php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU
, 1);
/* {{{ preg_get_backref
 * Parses a back-reference of the form \N, $N or ${N} (N = one or two
 * decimal digits) starting at **str.
 *
 * str:     in/out cursor; on success it is advanced past the reference,
 *          on failure it is left untouched.
 * backref: receives the parsed group number on success.
 *
 * Returns 1 when a back-reference was parsed, 0 otherwise. The caller is
 * expected to have checked that **str is '\\' or '$'.
 */
static int preg_get_backref(char **str, int *backref)
{
	char in_brace = 0;
	char *walk = *str;

	/* Nothing follows the introducer character. */
	if (walk[1] == 0)
		return 0;

	if (*walk == '$' && walk[1] == '{') {
		in_brace = 1;
		walk++;
	}
	walk++;

	/* First digit is mandatory. */
	if (*walk >= '0' && *walk <= '9') {
		*backref = *walk - '0';
		walk++;
	} else
		return 0;

	/* Optional second digit. */
	if (*walk >= '0' && *walk <= '9') {
		*backref = *backref * 10 + *walk - '0';
		walk++;
	}

	/* The ${N} form must be closed. */
	if (in_brace) {
		if (*walk != '}')
			return 0;
		else
			walk++;
	}

	*str = walk;
	return 1;
}
/* }}} */
856 /* {{{ preg_do_repl_func
858 static int preg_do_repl_func(zval
*function
, char *subject
, int *offsets
, char **subpat_names
, int count
, char **result TSRMLS_DC
)
860 zval
*retval_ptr
; /* Function return value */
861 zval
**args
[1]; /* Argument to pass to function */
862 zval
*subpats
; /* Captured subpatterns */
863 int result_len
; /* Return value length */
866 MAKE_STD_ZVAL(subpats
);
868 for (i
= 0; i
< count
; i
++) {
869 if (subpat_names
[i
]) {
870 add_assoc_stringl(subpats
, subpat_names
[i
], &subject
[offsets
[i
<<1]] , offsets
[(i
<<1)+1] - offsets
[i
<<1], 1);
872 add_next_index_stringl(subpats
, &subject
[offsets
[i
<<1]], offsets
[(i
<<1)+1] - offsets
[i
<<1], 1);
876 if (call_user_function_ex(EG(function_table
), NULL
, function
, &retval_ptr
, 1, args
, 0, NULL TSRMLS_CC
) == SUCCESS
&& retval_ptr
) {
877 convert_to_string_ex(&retval_ptr
);
878 *result
= estrndup(Z_STRVAL_P(retval_ptr
), Z_STRLEN_P(retval_ptr
));
879 result_len
= Z_STRLEN_P(retval_ptr
);
880 zval_ptr_dtor(&retval_ptr
);
882 if (!EG(exception
)) {
883 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Unable to call custom replacement function");
885 result_len
= offsets
[1] - offsets
[0];
886 *result
= estrndup(&subject
[offsets
[0]], result_len
);
889 zval_ptr_dtor(&subpats
);
897 static int preg_do_eval(char *eval_str
, int eval_str_len
, char *subject
,
898 int *offsets
, int count
, char **result TSRMLS_DC
)
900 zval retval
; /* Return value from evaluation */
901 char *eval_str_end
, /* End of eval string */
902 *match
, /* Current match for a backref */
903 *esc_match
, /* Quote-escaped match */
904 *walk
, /* Used to walk the code string */
905 *segment
, /* Start of segment to append while walking */
906 walk_last
; /* Last walked character */
907 int match_len
; /* Length of the match */
908 int esc_match_len
; /* Length of the quote-escaped match */
909 int result_len
; /* Length of the result of the evaluation */
910 int backref
; /* Current backref */
911 char *compiled_string_description
;
912 smart_str code
= {0};
914 eval_str_end
= eval_str
+ eval_str_len
;
915 walk
= segment
= eval_str
;
918 while (walk
< eval_str_end
) {
919 /* If found a backreference.. */
920 if ('\\' == *walk
|| '$' == *walk
) {
921 smart_str_appendl(&code
, segment
, walk
- segment
);
922 if (walk_last
== '\\') {
923 code
.c
[code
.len
-1] = *walk
++;
929 if (preg_get_backref(&walk
, &backref
)) {
930 if (backref
< count
) {
931 /* Find the corresponding string match and substitute it
932 in instead of the backref */
933 match
= subject
+ offsets
[backref
<<1];
934 match_len
= offsets
[(backref
<<1)+1] - offsets
[backref
<<1];
936 esc_match
= php_addslashes(match
, match_len
, &esc_match_len
, 0 TSRMLS_CC
);
945 smart_str_appendl(&code
, esc_match
, esc_match_len
);
949 /* Clean up and reassign */
956 walk_last
= walk
[-1];
958 smart_str_appendl(&code
, segment
, walk
- segment
);
961 compiled_string_description
= zend_make_compiled_string_description("regexp code" TSRMLS_CC
);
963 if (zend_eval_stringl(code
.c
, code
.len
, &retval
, compiled_string_description TSRMLS_CC
) == FAILURE
) {
964 efree(compiled_string_description
);
965 php_error_docref(NULL TSRMLS_CC
,E_ERROR
, "Failed evaluating code: %s%s", PHP_EOL
, code
.c
);
966 /* zend_error() does not return in this case */
968 efree(compiled_string_description
);
969 convert_to_string(&retval
);
971 /* Save the return value and its length */
972 *result
= estrndup(Z_STRVAL(retval
), Z_STRLEN(retval
));
973 result_len
= Z_STRLEN(retval
);
977 smart_str_free(&code
);
983 /* {{{ php_pcre_replace
985 PHPAPI
char *php_pcre_replace(char *regex
, int regex_len
,
986 char *subject
, int subject_len
,
987 zval
*replace_val
, int is_callable_replace
,
988 int *result_len
, int limit
, int *replace_count TSRMLS_DC
)
990 pcre_cache_entry
*pce
; /* Compiled regular expression */
992 /* Compile regex or get it from cache. */
993 if ((pce
= pcre_get_compiled_regex_cache(regex
, regex_len TSRMLS_CC
)) == NULL
) {
997 return php_pcre_replace_impl(pce
, subject
, subject_len
, replace_val
,
998 is_callable_replace
, result_len
, limit
, replace_count TSRMLS_CC
);
1002 /* {{{ php_pcre_replace_impl() */
1003 PHPAPI
char *php_pcre_replace_impl(pcre_cache_entry
*pce
, char *subject
, int subject_len
, zval
*replace_val
,
1004 int is_callable_replace
, int *result_len
, int limit
, int *replace_count TSRMLS_DC
)
1006 pcre_extra
*extra
= pce
->extra
;/* Holds results of studying */
1007 pcre_extra extra_data
; /* Used locally for exec options */
1008 int exoptions
= 0; /* Execution options */
1009 int count
= 0; /* Count of matched subpatterns */
1010 int *offsets
; /* Array of subpattern offsets */
1011 char **subpat_names
; /* Array for named subpatterns */
1012 int num_subpats
; /* Number of captured subpatterns */
1013 int size_offsets
; /* Size of the offsets array */
1014 int new_len
; /* Length of needed storage */
1015 int alloc_len
; /* Actual allocated length */
1016 int eval_result_len
=0; /* Length of the eval'ed or
1017 function-returned string */
1018 int match_len
; /* Length of the current match */
1019 int backref
; /* Backreference number */
1020 int eval
; /* If the replacement string should be eval'ed */
1021 int start_offset
; /* Where the new search starts */
1022 int g_notempty
=0; /* If the match should not be empty */
1023 int replace_len
=0; /* Length of replacement string */
1024 char *result
, /* Result of replacement */
1025 *replace
=NULL
, /* Replacement string */
1026 *new_buf
, /* Temporary buffer for re-allocation */
1027 *walkbuf
, /* Location of current replacement in the result */
1028 *walk
, /* Used to walk the replacement string */
1029 *match
, /* The current match */
1030 *piece
, /* The current piece of subject */
1031 *replace_end
=NULL
, /* End of replacement string */
1032 *eval_result
, /* Result of eval or custom function */
1033 walk_last
; /* Last walked character */
1036 if (extra
== NULL
) {
1037 extra_data
.flags
= PCRE_EXTRA_MATCH_LIMIT
| PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
1038 extra
= &extra_data
;
1040 extra
->match_limit
= PCRE_G(backtrack_limit
);
1041 extra
->match_limit_recursion
= PCRE_G(recursion_limit
);
1043 eval
= pce
->preg_options
& PREG_REPLACE_EVAL
;
1044 if (is_callable_replace
) {
1046 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Modifier /e cannot be used with replacement callback");
1050 replace
= Z_STRVAL_P(replace_val
);
1051 replace_len
= Z_STRLEN_P(replace_val
);
1052 replace_end
= replace
+ replace_len
;
1055 /* Calculate the size of the offsets array, and allocate memory for it. */
1056 rc
= pcre_fullinfo(pce
->re
, extra
, PCRE_INFO_CAPTURECOUNT
, &num_subpats
);
1058 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Internal pcre_fullinfo() error %d", rc
);
1062 size_offsets
= num_subpats
* 3;
1065 * Build a mapping from subpattern numbers to their names. We will always
1066 * allocate the table, even though there may be no named subpatterns. This
1067 * avoids somewhat more complicated logic in the inner loops.
1069 subpat_names
= make_subpats_table(num_subpats
, pce TSRMLS_CC
);
1070 if (!subpat_names
) {
1074 offsets
= (int *)safe_emalloc(size_offsets
, sizeof(int), 0);
1076 alloc_len
= 2 * subject_len
+ 1;
1077 result
= safe_emalloc(alloc_len
, sizeof(char), 0);
1083 PCRE_G(error_code
) = PHP_PCRE_NO_ERROR
;
1086 /* Execute the regular expression. */
1087 count
= pcre_exec(pce
->re
, extra
, subject
, subject_len
, start_offset
,
1088 exoptions
|g_notempty
, offsets
, size_offsets
);
1090 /* the string was already proved to be valid UTF-8 */
1091 exoptions
|= PCRE_NO_UTF8_CHECK
;
1093 /* Check for too many substrings condition. */
1095 php_error_docref(NULL TSRMLS_CC
,E_NOTICE
, "Matched, but too many substrings");
1096 count
= size_offsets
/3;
1099 piece
= subject
+ start_offset
;
1101 if (count
> 0 && (limit
== -1 || limit
> 0)) {
1102 if (replace_count
) {
1105 /* Set the match location in subject */
1106 match
= subject
+ offsets
[0];
1108 new_len
= *result_len
+ offsets
[0] - start_offset
; /* part before the match */
1110 /* If evaluating, do it and add the return string's length */
1112 eval_result_len
= preg_do_eval(replace
, replace_len
, subject
,
1113 offsets
, count
, &eval_result TSRMLS_CC
);
1114 new_len
+= eval_result_len
;
1115 } else if (is_callable_replace
) {
1116 /* Use custom function to get replacement string and its length. */
1117 eval_result_len
= preg_do_repl_func(replace_val
, subject
, offsets
, subpat_names
, count
, &eval_result TSRMLS_CC
);
1118 new_len
+= eval_result_len
;
1119 } else { /* do regular substitution */
1122 while (walk
< replace_end
) {
1123 if ('\\' == *walk
|| '$' == *walk
) {
1124 if (walk_last
== '\\') {
1129 if (preg_get_backref(&walk
, &backref
)) {
1130 if (backref
< count
)
1131 new_len
+= offsets
[(backref
<<1)+1] - offsets
[backref
<<1];
1137 walk_last
= walk
[-1];
1141 if (new_len
+ 1 > alloc_len
) {
1142 alloc_len
= 1 + alloc_len
+ 2 * new_len
;
1143 new_buf
= emalloc(alloc_len
);
1144 memcpy(new_buf
, result
, *result_len
);
1148 /* copy the part of the string before the match */
1149 memcpy(&result
[*result_len
], piece
, match
-piece
);
1150 *result_len
+= match
-piece
;
1152 /* copy replacement and backrefs */
1153 walkbuf
= result
+ *result_len
;
1155 /* If evaluating or using custom function, copy result to the buffer
1157 if (eval
|| is_callable_replace
) {
1158 memcpy(walkbuf
, eval_result
, eval_result_len
);
1159 *result_len
+= eval_result_len
;
1160 STR_FREE(eval_result
);
1161 } else { /* do regular backreference copying */
1164 while (walk
< replace_end
) {
1165 if ('\\' == *walk
|| '$' == *walk
) {
1166 if (walk_last
== '\\') {
1167 *(walkbuf
-1) = *walk
++;
1171 if (preg_get_backref(&walk
, &backref
)) {
1172 if (backref
< count
) {
1173 match_len
= offsets
[(backref
<<1)+1] - offsets
[backref
<<1];
1174 memcpy(walkbuf
, subject
+ offsets
[backref
<<1], match_len
);
1175 walkbuf
+= match_len
;
1180 *walkbuf
++ = *walk
++;
1181 walk_last
= walk
[-1];
1184 /* increment the result length by how much we've added to the string */
1185 *result_len
+= walkbuf
- (result
+ *result_len
);
1191 } else if (count
== PCRE_ERROR_NOMATCH
|| limit
== 0) {
1192 /* If we previously set PCRE_NOTEMPTY after a null match,
1193 this is not necessarily the end. We need to advance
1194 the start offset, and continue. Fudge the offset values
1195 to achieve this, unless we're already at the end of the string. */
1196 if (g_notempty
!= 0 && start_offset
< subject_len
) {
1197 offsets
[0] = start_offset
;
1198 offsets
[1] = start_offset
+ 1;
1199 memcpy(&result
[*result_len
], piece
, 1);
1202 new_len
= *result_len
+ subject_len
- start_offset
;
1203 if (new_len
+ 1 > alloc_len
) {
1204 alloc_len
= new_len
+ 1; /* now we know exactly how long it is */
1205 new_buf
= safe_emalloc(alloc_len
, sizeof(char), 0);
1206 memcpy(new_buf
, result
, *result_len
);
1210 /* stick that last bit of string on our output */
1211 memcpy(&result
[*result_len
], piece
, subject_len
- start_offset
);
1212 *result_len
+= subject_len
- start_offset
;
1213 result
[*result_len
] = '\0';
1217 pcre_handle_exec_error(count TSRMLS_CC
);
1223 /* If we have matched an empty string, mimic what Perl's /g options does.
1224 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1225 the match again at the same point. If this fails (picked up above) we
1226 advance to the next character. */
1227 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1229 /* Advance to the next piece. */
1230 start_offset
= offsets
[1];
1234 efree(subpat_names
);
1240 /* {{{ php_replace_in_subject
1242 static char *php_replace_in_subject(zval
*regex
, zval
*replace
, zval
**subject
, int *result_len
, int limit
, int is_callable_replace
, int *replace_count TSRMLS_DC
)
1245 **replace_entry
= NULL
,
1248 char *subject_value
,
1252 /* Make sure we're dealing with strings. */
1253 convert_to_string_ex(subject
);
1254 /* FIXME: This might need to be changed to STR_EMPTY_ALLOC(). Check if this zval could be dtor()'ed somehow */
1255 ZVAL_STRINGL(&empty_replace
, "", 0, 0);
1257 /* If regex is an array */
1258 if (Z_TYPE_P(regex
) == IS_ARRAY
) {
1259 /* Duplicate subject string for repeated replacement */
1260 subject_value
= estrndup(Z_STRVAL_PP(subject
), Z_STRLEN_PP(subject
));
1261 subject_len
= Z_STRLEN_PP(subject
);
1262 *result_len
= subject_len
;
1264 zend_hash_internal_pointer_reset(Z_ARRVAL_P(regex
));
1266 replace_value
= replace
;
1267 if (Z_TYPE_P(replace
) == IS_ARRAY
&& !is_callable_replace
)
1268 zend_hash_internal_pointer_reset(Z_ARRVAL_P(replace
));
1270 /* For each entry in the regex array, get the entry */
1271 while (zend_hash_get_current_data(Z_ARRVAL_P(regex
), (void **)®ex_entry
) == SUCCESS
) {
1272 /* Make sure we're dealing with strings. */
1273 convert_to_string_ex(regex_entry
);
1275 /* If replace is an array and not a callable construct */
1276 if (Z_TYPE_P(replace
) == IS_ARRAY
&& !is_callable_replace
) {
1277 /* Get current entry */
1278 if (zend_hash_get_current_data(Z_ARRVAL_P(replace
), (void **)&replace_entry
) == SUCCESS
) {
1279 if (!is_callable_replace
) {
1280 convert_to_string_ex(replace_entry
);
1282 replace_value
= *replace_entry
;
1283 zend_hash_move_forward(Z_ARRVAL_P(replace
));
1285 /* We've run out of replacement strings, so use an empty one */
1286 replace_value
= &empty_replace
;
1290 /* Do the actual replacement and put the result back into subject_value
1291 for further replacements. */
1292 if ((result
= php_pcre_replace(Z_STRVAL_PP(regex_entry
),
1293 Z_STRLEN_PP(regex_entry
),
1297 is_callable_replace
,
1300 replace_count TSRMLS_CC
)) != NULL
) {
1301 efree(subject_value
);
1302 subject_value
= result
;
1303 subject_len
= *result_len
;
1305 efree(subject_value
);
1309 zend_hash_move_forward(Z_ARRVAL_P(regex
));
1312 return subject_value
;
1314 result
= php_pcre_replace(Z_STRVAL_P(regex
),
1316 Z_STRVAL_PP(subject
),
1317 Z_STRLEN_PP(subject
),
1319 is_callable_replace
,
1322 replace_count TSRMLS_CC
);
1328 /* {{{ preg_replace_impl
1330 static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS
, int is_callable_replace
, int is_filter
)
1343 char *callback_name
;
1344 int replace_count
=0, old_replace_count
;
1346 /* Get function parameters and do error-checking. */
1347 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC
, "ZZZ|lZ", ®ex
, &replace
, &subject
, &limit
, &zcount
) == FAILURE
) {
1351 if (!is_callable_replace
&& Z_TYPE_PP(replace
) == IS_ARRAY
&& Z_TYPE_PP(regex
) != IS_ARRAY
) {
1352 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Parameter mismatch, pattern is a string while replacement is an array");
1356 SEPARATE_ZVAL(replace
);
1357 if (Z_TYPE_PP(replace
) != IS_ARRAY
&& (Z_TYPE_PP(replace
) != IS_OBJECT
|| !is_callable_replace
)) {
1358 convert_to_string_ex(replace
);
1360 if (is_callable_replace
) {
1361 if (!zend_is_callable(*replace
, 0, &callback_name TSRMLS_CC
)) {
1362 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Requires argument 2, '%s', to be a valid callback", callback_name
);
1363 efree(callback_name
);
1364 MAKE_COPY_ZVAL(subject
, return_value
);
1367 efree(callback_name
);
1370 SEPARATE_ZVAL(regex
);
1371 SEPARATE_ZVAL(subject
);
1373 if (ZEND_NUM_ARGS() > 3) {
1377 if (Z_TYPE_PP(regex
) != IS_ARRAY
)
1378 convert_to_string_ex(regex
);
1380 /* if subject is an array */
1381 if (Z_TYPE_PP(subject
) == IS_ARRAY
) {
1382 array_init(return_value
);
1383 zend_hash_internal_pointer_reset(Z_ARRVAL_PP(subject
));
1385 /* For each subject entry, convert it to string, then perform replacement
1386 and add the result to the return_value array. */
1387 while (zend_hash_get_current_data(Z_ARRVAL_PP(subject
), (void **)&subject_entry
) == SUCCESS
) {
1388 SEPARATE_ZVAL(subject_entry
);
1389 old_replace_count
= replace_count
;
1390 if ((result
= php_replace_in_subject(*regex
, *replace
, subject_entry
, &result_len
, limit_val
, is_callable_replace
, &replace_count TSRMLS_CC
)) != NULL
) {
1391 if (!is_filter
|| replace_count
> old_replace_count
) {
1392 /* Add to return array */
1393 switch(zend_hash_get_current_key(Z_ARRVAL_PP(subject
), &string_key
, &num_key
, 0))
1395 case HASH_KEY_IS_STRING
:
1396 add_assoc_stringl(return_value
, string_key
, result
, result_len
, 0);
1399 case HASH_KEY_IS_LONG
:
1400 add_index_stringl(return_value
, num_key
, result
, result_len
, 0);
1408 zend_hash_move_forward(Z_ARRVAL_PP(subject
));
1410 } else { /* if subject is not an array */
1411 old_replace_count
= replace_count
;
1412 if ((result
= php_replace_in_subject(*regex
, *replace
, subject
, &result_len
, limit_val
, is_callable_replace
, &replace_count TSRMLS_CC
)) != NULL
) {
1413 if (!is_filter
|| replace_count
> old_replace_count
) {
1414 RETVAL_STRINGL(result
, result_len
, 0);
1420 if (ZEND_NUM_ARGS() > 4) {
1422 ZVAL_LONG(*zcount
, replace_count
);
1428 /* {{{ proto mixed preg_replace(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1429 Perform Perl-style regular expression replacement. */
1430 static PHP_FUNCTION(preg_replace
)
1432 preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU
, 0, 0);
1436 /* {{{ proto mixed preg_replace_callback(mixed regex, mixed callback, mixed subject [, int limit [, int &count]])
1437 Perform Perl-style regular expression replacement using replacement callback. */
1438 static PHP_FUNCTION(preg_replace_callback
)
1440 preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU
, 1, 0);
1444 /* {{{ proto mixed preg_filter(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1445 Perform Perl-style regular expression replacement and only return matches. */
1446 static PHP_FUNCTION(preg_filter
)
1448 preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU
, 0, 1);
1452 /* {{{ proto array preg_split(string pattern, string subject [, int limit [, int flags]])
1453 Split string into an array using a perl-style regular expression as a delimiter */
1454 static PHP_FUNCTION(preg_split
)
1456 char *regex
; /* Regular expression */
1457 char *subject
; /* String to match against */
1460 long limit_val
= -1;/* Integer value of limit */
1461 long flags
= 0; /* Match control flags */
1462 pcre_cache_entry
*pce
; /* Compiled regular expression */
1464 /* Get function parameters and do error checking */
1465 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC
, "ss|ll", ®ex
, ®ex_len
,
1466 &subject
, &subject_len
, &limit_val
, &flags
) == FAILURE
) {
1470 /* Compile regex or get it from cache. */
1471 if ((pce
= pcre_get_compiled_regex_cache(regex
, regex_len TSRMLS_CC
)) == NULL
) {
1475 php_pcre_split_impl(pce
, subject
, subject_len
, return_value
, limit_val
, flags TSRMLS_CC
);
1479 /* {{{ php_pcre_split
1481 PHPAPI
void php_pcre_split_impl(pcre_cache_entry
*pce
, char *subject
, int subject_len
, zval
*return_value
,
1482 long limit_val
, long flags TSRMLS_DC
)
1484 pcre_extra
*extra
= NULL
; /* Holds results of studying */
1485 pcre
*re_bump
= NULL
; /* Regex instance for empty matches */
1486 pcre_extra
*extra_bump
= NULL
; /* Almost dummy */
1487 pcre_extra extra_data
; /* Used locally for exec options */
1488 int *offsets
; /* Array of subpattern offsets */
1489 int size_offsets
; /* Size of the offsets array */
1490 int exoptions
= 0; /* Execution options */
1491 int count
= 0; /* Count of matched subpatterns */
1492 int start_offset
; /* Where the new search starts */
1493 int next_offset
; /* End of the last delimiter match + 1 */
1494 int g_notempty
= 0; /* If the match should not be empty */
1495 char *last_match
; /* Location of last match */
1497 int no_empty
; /* If NO_EMPTY flag is set */
1498 int delim_capture
; /* If delimiters should be captured */
1499 int offset_capture
; /* If offsets should be captured */
1501 no_empty
= flags
& PREG_SPLIT_NO_EMPTY
;
1502 delim_capture
= flags
& PREG_SPLIT_DELIM_CAPTURE
;
1503 offset_capture
= flags
& PREG_SPLIT_OFFSET_CAPTURE
;
1505 if (limit_val
== 0) {
1509 if (extra
== NULL
) {
1510 extra_data
.flags
= PCRE_EXTRA_MATCH_LIMIT
| PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
1511 extra
= &extra_data
;
1513 extra
->match_limit
= PCRE_G(backtrack_limit
);
1514 extra
->match_limit_recursion
= PCRE_G(recursion_limit
);
1516 /* Initialize return value */
1517 array_init(return_value
);
1519 /* Calculate the size of the offsets array, and allocate memory for it. */
1520 rc
= pcre_fullinfo(pce
->re
, extra
, PCRE_INFO_CAPTURECOUNT
, &size_offsets
);
1522 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Internal pcre_fullinfo() error %d", rc
);
1525 size_offsets
= (size_offsets
+ 1) * 3;
1526 offsets
= (int *)safe_emalloc(size_offsets
, sizeof(int), 0);
1528 /* Start at the beginning of the string */
1531 last_match
= subject
;
1532 PCRE_G(error_code
) = PHP_PCRE_NO_ERROR
;
1534 /* Get next piece if no limit or limit not yet reached and something matched*/
1535 while ((limit_val
== -1 || limit_val
> 1)) {
1536 count
= pcre_exec(pce
->re
, extra
, subject
,
1537 subject_len
, start_offset
,
1538 exoptions
|g_notempty
, offsets
, size_offsets
);
1540 /* the string was already proved to be valid UTF-8 */
1541 exoptions
|= PCRE_NO_UTF8_CHECK
;
1543 /* Check for too many substrings condition. */
1545 php_error_docref(NULL TSRMLS_CC
,E_NOTICE
, "Matched, but too many substrings");
1546 count
= size_offsets
/3;
1549 /* If something matched */
1551 if (!no_empty
|| &subject
[offsets
[0]] != last_match
) {
1553 if (offset_capture
) {
1554 /* Add (match, offset) pair to the return value */
1555 add_offset_pair(return_value
, last_match
, &subject
[offsets
[0]]-last_match
, next_offset
, NULL
);
1557 /* Add the piece to the return value */
1558 add_next_index_stringl(return_value
, last_match
,
1559 &subject
[offsets
[0]]-last_match
, 1);
1562 /* One less left to do */
1563 if (limit_val
!= -1)
1567 last_match
= &subject
[offsets
[1]];
1568 next_offset
= offsets
[1];
1570 if (delim_capture
) {
1572 for (i
= 1; i
< count
; i
++) {
1573 match_len
= offsets
[(i
<<1)+1] - offsets
[i
<<1];
1574 /* If we have matched a delimiter */
1575 if (!no_empty
|| match_len
> 0) {
1576 if (offset_capture
) {
1577 add_offset_pair(return_value
, &subject
[offsets
[i
<<1]], match_len
, offsets
[i
<<1], NULL
);
1579 add_next_index_stringl(return_value
,
1580 &subject
[offsets
[i
<<1]],
1586 } else if (count
== PCRE_ERROR_NOMATCH
) {
1587 /* If we previously set PCRE_NOTEMPTY after a null match,
1588 this is not necessarily the end. We need to advance
1589 the start offset, and continue. Fudge the offset values
1590 to achieve this, unless we're already at the end of the string. */
1591 if (g_notempty
!= 0 && start_offset
< subject_len
) {
1592 if (pce
->compile_options
& PCRE_UTF8
) {
1593 if (re_bump
== NULL
) {
1596 if ((re_bump
= pcre_get_compiled_regex("/./us", &extra_bump
, &dummy TSRMLS_CC
)) == NULL
) {
1600 count
= pcre_exec(re_bump
, extra_bump
, subject
,
1601 subject_len
, start_offset
,
1602 exoptions
, offsets
, size_offsets
);
1604 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Unknown error");
1608 offsets
[0] = start_offset
;
1609 offsets
[1] = start_offset
+ 1;
1614 pcre_handle_exec_error(count TSRMLS_CC
);
1618 /* If we have matched an empty string, mimic what Perl's /g options does.
1619 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1620 the match again at the same point. If this fails (picked up above) we
1621 advance to the next character. */
1622 g_notempty
= (offsets
[1] == offsets
[0])? PCRE_NOTEMPTY
| PCRE_ANCHORED
: 0;
1624 /* Advance to the position right after the last full match */
1625 start_offset
= offsets
[1];
1629 start_offset
= last_match
- subject
; /* the offset might have been incremented, but without further successful matches */
1631 if (!no_empty
|| start_offset
< subject_len
)
1633 if (offset_capture
) {
1634 /* Add the last (match, offset) pair to the return value */
1635 add_offset_pair(return_value
, &subject
[start_offset
], subject_len
- start_offset
, start_offset
, NULL
);
1637 /* Add the last piece to the return value */
1638 add_next_index_stringl(return_value
, last_match
, subject
+ subject_len
- last_match
, 1);
1648 /* {{{ proto string preg_quote(string str [, string delim_char])
1649 Quote regular expression characters plus an optional character */
1650 static PHP_FUNCTION(preg_quote
)
1653 char *in_str
; /* Input string argument */
1654 char *in_str_end
; /* End of the input string */
1656 char *delim
= NULL
; /* Additional delimiter argument */
1657 char *out_str
, /* Output string with quoted characters */
1658 *p
, /* Iterator for input string */
1659 *q
, /* Iterator for output string */
1660 delim_char
=0, /* Delimiter character to be quoted */
1661 c
; /* Current character */
1662 zend_bool quote_delim
= 0; /* Whether to quote additional delim char */
1664 /* Get the arguments and check for errors */
1665 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC
, "s|s", &in_str
, &in_str_len
,
1666 &delim
, &delim_len
) == FAILURE
) {
1670 in_str_end
= in_str
+ in_str_len
;
1672 /* Nothing to do if we got an empty string */
1673 if (in_str
== in_str_end
) {
1674 RETURN_EMPTY_STRING();
1677 if (delim
&& *delim
) {
1678 delim_char
= delim
[0];
1682 /* Allocate enough memory so that even if each character
1683 is quoted, we won't run out of room */
1684 out_str
= safe_emalloc(4, in_str_len
, 1);
1686 /* Go through the string and quote necessary characters */
1687 for(p
= in_str
, q
= out_str
; p
!= in_str_end
; p
++) {
1722 if (quote_delim
&& c
== delim_char
)
1730 /* Reallocate string and return it */
1731 RETVAL_STRINGL(erealloc(out_str
, q
- out_str
+ 1), q
- out_str
, 0);
1735 /* {{{ proto array preg_grep(string regex, array input [, int flags])
1736 Searches array and returns entries which match regex */
1737 static PHP_FUNCTION(preg_grep
)
1739 char *regex
; /* Regular expression */
1741 zval
*input
; /* Input array */
1742 long flags
= 0; /* Match control flags */
1743 pcre_cache_entry
*pce
; /* Compiled regular expression */
1745 /* Get arguments and do error checking */
1746 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC
, "sa|l", ®ex
, ®ex_len
,
1747 &input
, &flags
) == FAILURE
) {
1751 /* Compile regex or get it from cache. */
1752 if ((pce
= pcre_get_compiled_regex_cache(regex
, regex_len TSRMLS_CC
)) == NULL
) {
1756 php_pcre_grep_impl(pce
, input
, return_value
, flags TSRMLS_CC
);
1760 PHPAPI
void php_pcre_grep_impl(pcre_cache_entry
*pce
, zval
*input
, zval
*return_value
, long flags TSRMLS_DC
) /* {{{ */
1762 zval
**entry
; /* An entry in the input array */
1763 pcre_extra
*extra
= pce
->extra
;/* Holds results of studying */
1764 pcre_extra extra_data
; /* Used locally for exec options */
1765 int *offsets
; /* Array of subpattern offsets */
1766 int size_offsets
; /* Size of the offsets array */
1767 int count
= 0; /* Count of matched subpatterns */
1770 zend_bool invert
; /* Whether to return non-matching
1774 invert
= flags
& PREG_GREP_INVERT
? 1 : 0;
1776 if (extra
== NULL
) {
1777 extra_data
.flags
= PCRE_EXTRA_MATCH_LIMIT
| PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
1778 extra
= &extra_data
;
1780 extra
->match_limit
= PCRE_G(backtrack_limit
);
1781 extra
->match_limit_recursion
= PCRE_G(recursion_limit
);
1783 /* Calculate the size of the offsets array, and allocate memory for it. */
1784 rc
= pcre_fullinfo(pce
->re
, extra
, PCRE_INFO_CAPTURECOUNT
, &size_offsets
);
1786 php_error_docref(NULL TSRMLS_CC
, E_WARNING
, "Internal pcre_fullinfo() error %d", rc
);
1789 size_offsets
= (size_offsets
+ 1) * 3;
1790 offsets
= (int *)safe_emalloc(size_offsets
, sizeof(int), 0);
1792 /* Initialize return array */
1793 array_init(return_value
);
1795 PCRE_G(error_code
) = PHP_PCRE_NO_ERROR
;
1797 /* Go through the input array */
1798 zend_hash_internal_pointer_reset(Z_ARRVAL_P(input
));
1799 while (zend_hash_get_current_data(Z_ARRVAL_P(input
), (void **)&entry
) == SUCCESS
) {
1800 zval subject
= **entry
;
1802 if (Z_TYPE_PP(entry
) != IS_STRING
) {
1803 zval_copy_ctor(&subject
);
1804 convert_to_string(&subject
);
1807 /* Perform the match */
1808 count
= pcre_exec(pce
->re
, extra
, Z_STRVAL(subject
),
1809 Z_STRLEN(subject
), 0,
1810 0, offsets
, size_offsets
);
1812 /* Check for too many substrings condition. */
1814 php_error_docref(NULL TSRMLS_CC
, E_NOTICE
, "Matched, but too many substrings");
1815 count
= size_offsets
/3;
1816 } else if (count
< 0 && count
!= PCRE_ERROR_NOMATCH
) {
1817 pcre_handle_exec_error(count TSRMLS_CC
);
1821 /* If the entry fits our requirements */
1822 if ((count
> 0 && !invert
) || (count
== PCRE_ERROR_NOMATCH
&& invert
)) {
1826 /* Add to return array */
1827 switch (zend_hash_get_current_key(Z_ARRVAL_P(input
), &string_key
, &num_key
, 0))
1829 case HASH_KEY_IS_STRING
:
1830 zend_hash_update(Z_ARRVAL_P(return_value
), string_key
,
1831 strlen(string_key
)+1, entry
, sizeof(zval
*), NULL
);
1834 case HASH_KEY_IS_LONG
:
1835 zend_hash_index_update(Z_ARRVAL_P(return_value
), num_key
, entry
,
1836 sizeof(zval
*), NULL
);
1841 if (Z_TYPE_PP(entry
) != IS_STRING
) {
1842 zval_dtor(&subject
);
1845 zend_hash_move_forward(Z_ARRVAL_P(input
));
1847 zend_hash_internal_pointer_reset(Z_ARRVAL_P(input
));
1853 /* {{{ proto int preg_last_error()
1854 Returns the error code of the last regexp execution. */
1855 static PHP_FUNCTION(preg_last_error
)
1857 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC
, "") == FAILURE
) {
1861 RETURN_LONG(PCRE_G(error_code
));
1865 /* {{{ module definition structures */
1868 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match
, 0, 0, 2)
1869 ZEND_ARG_INFO(0, pattern
)
1870 ZEND_ARG_INFO(0, subject
)
1871 ZEND_ARG_INFO(1, subpatterns
) /* array */
1872 ZEND_ARG_INFO(0, flags
)
1873 ZEND_ARG_INFO(0, offset
)
1876 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match_all
, 0, 0, 2)
1877 ZEND_ARG_INFO(0, pattern
)
1878 ZEND_ARG_INFO(0, subject
)
1879 ZEND_ARG_INFO(1, subpatterns
) /* array */
1880 ZEND_ARG_INFO(0, flags
)
1881 ZEND_ARG_INFO(0, offset
)
1884 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace
, 0, 0, 3)
1885 ZEND_ARG_INFO(0, regex
)
1886 ZEND_ARG_INFO(0, replace
)
1887 ZEND_ARG_INFO(0, subject
)
1888 ZEND_ARG_INFO(0, limit
)
1889 ZEND_ARG_INFO(1, count
)
1892 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace_callback
, 0, 0, 3)
1893 ZEND_ARG_INFO(0, regex
)
1894 ZEND_ARG_INFO(0, callback
)
1895 ZEND_ARG_INFO(0, subject
)
1896 ZEND_ARG_INFO(0, limit
)
1897 ZEND_ARG_INFO(1, count
)
1900 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_split
, 0, 0, 2)
1901 ZEND_ARG_INFO(0, pattern
)
1902 ZEND_ARG_INFO(0, subject
)
1903 ZEND_ARG_INFO(0, limit
)
1904 ZEND_ARG_INFO(0, flags
)
1907 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_quote
, 0, 0, 1)
1908 ZEND_ARG_INFO(0, str
)
1909 ZEND_ARG_INFO(0, delim_char
)
1912 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_grep
, 0, 0, 2)
1913 ZEND_ARG_INFO(0, regex
)
1914 ZEND_ARG_INFO(0, input
) /* array */
1915 ZEND_ARG_INFO(0, flags
)
1918 ZEND_BEGIN_ARG_INFO(arginfo_preg_last_error
, 0)
1922 static const zend_function_entry pcre_functions
[] = {
1923 PHP_FE(preg_match
, arginfo_preg_match
)
1924 PHP_FE(preg_match_all
, arginfo_preg_match_all
)
1925 PHP_FE(preg_replace
, arginfo_preg_replace
)
1926 PHP_FE(preg_replace_callback
, arginfo_preg_replace_callback
)
1927 PHP_FE(preg_filter
, arginfo_preg_replace
)
1928 PHP_FE(preg_split
, arginfo_preg_split
)
1929 PHP_FE(preg_quote
, arginfo_preg_quote
)
1930 PHP_FE(preg_grep
, arginfo_preg_grep
)
1931 PHP_FE(preg_last_error
, arginfo_preg_last_error
)
1935 zend_module_entry pcre_module_entry
= {
1936 STANDARD_MODULE_HEADER
,
1940 PHP_MSHUTDOWN(pcre
),
1945 PHP_MODULE_GLOBALS(pcre
),
1947 PHP_GSHUTDOWN(pcre
),
1949 STANDARD_MODULE_PROPERTIES_EX
1952 #ifdef COMPILE_DL_PCRE
1953 ZEND_GET_MODULE(pcre
)
1958 #endif /* HAVE_PCRE || HAVE_BUNDLED_PCRE */
/*
 * Local variables:
 * tab-width: 4
 * c-basic-offset: 4
 * End:
 * vim600: sw=4 ts=4 fdm=marker
 * vim<600: sw=4 ts=4
 */