make #includes consistent
[hiphop-php.git] / hphp / runtime / base / preg.cpp
blob89e91ee427160dc08c8b212a8c8f80ed776a819f
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
16 #include "hphp/runtime/base/string_util.h"
17 #include "hphp/runtime/base/util/request_local.h"
18 #include "hphp/util/lock.h"
19 #include "hphp/util/logger.h"
20 #include <pcre.h>
21 #include <onigposix.h>
22 #include "hphp/runtime/base/runtime_option.h"
23 #include "hphp/runtime/base/builtin_functions.h"
24 #include "hphp/runtime/base/zend/zend_functions.h"
25 #include "hphp/runtime/base/array/array_iterator.h"
26 #include "hphp/runtime/base/ini_setting.h"
27 #include "hphp/runtime/base/thread_init_fini.h"
28 #include "tbb/concurrent_hash_map.h"
// Result-ordering modes for global matching; preg_match_impl reads the low
// byte of its flags argument as one of these two values.
30 #define PREG_PATTERN_ORDER 1
31 #define PREG_SET_ORDER 2
// Flag bit tested by preg_match_impl: store each capture as a
// (text, offset) pair instead of plain text.
32 #define PREG_OFFSET_CAPTURE (1<<8)
// Flag bits for splitting. NOTE(review): presumably consumed by preg_split,
// whose body is not visible in this part of the file.
34 #define PREG_SPLIT_NO_EMPTY (1<<0)
35 #define PREG_SPLIT_DELIM_CAPTURE (1<<1)
36 #define PREG_SPLIT_OFFSET_CAPTURE (1<<2)
// Internal option bit stored in pcre_cache_entry::preg_options when the
// pattern carries the 'e' modifier.
38 #define PREG_REPLACE_EVAL (1<<0)
// Flag bit for preg_grep: return the entries that do not match.
40 #define PREG_GREP_INVERT (1<<0)
// NOTE(review): not referenced anywhere in the visible part of this file;
// confirm whether the cache is actually bounded by this value.
42 #define PCRE_CACHE_SIZE 4096
// Error codes stored in t_last_error_code. pcre_handle_exec_error translates
// negative pcre_exec() results into these values.
44 enum {
45 PHP_PCRE_NO_ERROR = 0,
46 PHP_PCRE_INTERNAL_ERROR, // any pcre_exec error without a dedicated code
47 PHP_PCRE_BACKTRACK_LIMIT_ERROR, // from PCRE_ERROR_MATCHLIMIT
48 PHP_PCRE_RECURSION_LIMIT_ERROR, // from PCRE_ERROR_RECURSIONLIMIT
49 PHP_PCRE_BAD_UTF8_ERROR, // from PCRE_ERROR_BADUTF8
50 PHP_PCRE_BAD_UTF8_OFFSET_ERROR // from PCRE_ERROR_BADUTF8_OFFSET
53 namespace HPHP {
54 ///////////////////////////////////////////////////////////////////////////////
55 // regex cache and helpers
57 class pcre_cache_entry {
58 pcre_cache_entry(const pcre_cache_entry&);
59 pcre_cache_entry& operator=(const pcre_cache_entry&);
61 public:
62 pcre_cache_entry() {}
63 ~pcre_cache_entry() {
64 if (extra) free(extra); // we don't have pcre_free_study yet
65 pcre_free(re);
68 pcre *re;
69 pcre_extra *extra; // Holds results of studying
70 int preg_options;
71 int compile_options;
// Map type for the compiled-regex cache: keyed by the regex source string
// (as a StringData pointer, compared through StringDataHashCompare), valued
// by the immutable cache entry compiled for it.
74 typedef tbb::concurrent_hash_map<const StringData*,const pcre_cache_entry*,
75 StringDataHashCompare> PCREStringMap;
// The cache instance shared by the helpers in this file. Entries are added by
// insert_cached_pcre; nothing in the visible part of this file removes them.
77 static PCREStringMap s_pcreCacheMap;
79 static const pcre_cache_entry* lookup_cached_pcre(CStrRef regex) {
80 PCREStringMap::const_accessor acc;
81 if (s_pcreCacheMap.find(acc, regex.get())) {
82 return acc->second;
84 return 0;
87 static const pcre_cache_entry*
88 insert_cached_pcre(CStrRef regex, const pcre_cache_entry* ent) {
89 PCREStringMap::accessor acc;
90 if (s_pcreCacheMap.insert(acc, StringData::GetStaticString(regex.get()))) {
91 acc->second = ent;
92 return ent;
94 delete ent;
95 return acc->second;
/*
 * When a cached compiled pcre doesn't have pcre_extra, we use this
 * one.
 *
 * FIXME: It's unclear why this needs to be thread-local data instead
 * of just existing on the stack during the calls to preg_ functions.
 */
// Per-thread fallback pcre_extra: set_extra_limits points a null `extra` at
// this object and fills in its flags and limits.
105 static __thread pcre_extra t_extra_data;
107 // The last pcre error code is available for the whole thread.
// Reset to PHP_PCRE_NO_ERROR when a preg operation starts and overwritten by
// pcre_handle_exec_error when pcre_exec() fails.
108 static __thread int t_last_error_code;
110 namespace {
// Binds the "pcre.backtrack_limit" and "pcre.recursion_limit" ini settings
// (defaults "1000000" and "100000") to the two limit fields on g_context that
// set_extra_limits copies into pcre_extra before matching.
112 static void preg_init_thread_locals() {
113 IniSetting::Bind("pcre.backtrack_limit", "1000000", ini_on_update_long,
114 &g_context->m_preg_backtrace_limit);
115 IniSetting::Bind("pcre.recursion_limit", "100000", ini_on_update_long,
116 &g_context->m_preg_recursion_limit);
// Registers the function above as a ThreadInit node.
// NOTE(review): presumably this makes it run once per thread at start-up;
// InitFiniNode is defined elsewhere, confirm there.
118 InitFiniNode init(preg_init_thread_locals, InitFiniNode::ThreadInit);
120 template<bool useSmartFree = false>
121 struct FreeHelperImpl : private boost::noncopyable {
122 explicit FreeHelperImpl(void* p) : p(p) {}
123 ~FreeHelperImpl() {
124 useSmartFree ? smart_free(p) : free(p);
127 private:
128 void* p;
131 typedef FreeHelperImpl<true> SmartFreeHelper;
// Returns the cache entry for a delimited regex such as "/foo/i", compiling
// and caching it on first use.
//
// The string is split into delimiter, pattern body and trailing modifiers.
// Modifiers are turned into PCRE compile options (coptions), preg-level
// options (poptions) and an optional study request. On any parse or compile
// problem a warning is raised and a null pointer is returned; a null byte
// inside the pattern raises an error instead.
134 static const pcre_cache_entry* pcre_get_compiled_regex_cache(CStrRef regex) {
135 /* Try to lookup the cached regex entry, and if successful, just pass
136 back the compiled pattern, otherwise go on and compile it. */
137 if (const pcre_cache_entry* pce = lookup_cached_pcre(regex)) {
138 return pce;
141 /* Parse through the leading whitespace, and display a warning if we
142 get to the end without encountering a delimiter. */
143 const char *p = regex.data();
144 while (isspace((int)*(unsigned char *)p)) p++;
145 if (*p == 0) {
146 raise_warning("Empty regular expression");
147 return nullptr;
150 /* Get the delimiter and display a warning if it is alphanumeric
151 or a backslash. */
152 char delimiter = *p++;
153 if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
154 raise_warning("Delimiter must not be alphanumeric or backslash");
155 return nullptr;
// In the lookup table below every bracket character is followed, five
// positions later, by the closing bracket that ends a pattern opened with it:
// an opening bracket maps to its closer and a closing bracket maps to itself.
// Any delimiter not in the table is its own end delimiter.
158 char start_delimiter = delimiter;
159 const char *pp = strchr("([{< )]}> )]}>", delimiter);
160 if (pp) {
161 delimiter = pp[5];
163 char end_delimiter = delimiter;
165 if (start_delimiter == end_delimiter) {
166 /* We need to iterate through the pattern, searching for the ending
167 * delimiter, but skipping the backslashed delimiters. If the ending
168 * delimiter is not found, display a warning. */
169 pp = p;
170 while (*pp != 0) {
171 if (*pp == '\\' && pp[1] != 0) pp++;
172 else if (*pp == delimiter)
173 break;
174 pp++;
176 if (*pp == 0) {
177 raise_warning("No ending delimiter '%c' found: [%s]", delimiter,
178 regex.data());
179 return nullptr;
181 } else {
182 /* We iterate through the pattern, searching for the matching ending
183 * delimiter. For each matching starting delimiter, we increment nesting
184 * level, and decrement it for each matching ending delimiter. If we
185 * reach the end of the pattern without matching, display a warning.
187 int brackets = 1; // brackets nesting level
188 pp = p;
189 while (*pp != 0) {
190 if (*pp == '\\' && pp[1] != 0) pp++;
191 else if (*pp == end_delimiter && --brackets <= 0)
192 break;
193 else if (*pp == start_delimiter)
194 brackets++;
195 pp++;
197 if (*pp == 0) {
198 raise_warning("No ending matching delimiter '%c' found: [%s]",
199 end_delimiter, regex.data());
200 return nullptr;
204 /* Make a copy of the actual pattern. */
// At this point p is the first pattern character and pp is the closing
// delimiter, so [p, pp) is the pattern body.
205 String spattern(p, pp-p, CopyString);
206 const char *pattern = spattern.data();
208 /* Move on to the options */
209 pp++;
211 /* Parse through the options, setting appropriate flags. Display
212 a warning if we encounter an unknown modifier. */
213 int coptions = 0;
214 int poptions = 0;
215 int do_study = false;
216 while (*pp != 0) {
217 switch (*pp++) {
218 /* Perl compatible options */
219 case 'i': coptions |= PCRE_CASELESS; break;
220 case 'm': coptions |= PCRE_MULTILINE; break;
221 case 's': coptions |= PCRE_DOTALL; break;
222 case 'x': coptions |= PCRE_EXTENDED; break;
224 /* PCRE specific options */
225 case 'A': coptions |= PCRE_ANCHORED; break;
226 case 'D': coptions |= PCRE_DOLLAR_ENDONLY; break;
227 case 'S': do_study = true; break;
228 case 'U': coptions |= PCRE_UNGREEDY; break;
229 case 'X': coptions |= PCRE_EXTRA; break;
230 case 'u': coptions |= PCRE_UTF8; break;
232 /* Custom preg options */
233 case 'e': poptions |= PREG_REPLACE_EVAL; break;
// Spaces and newlines between modifiers are ignored.
235 case ' ':
236 case '\n':
237 break;
239 default:
240 raise_warning("Unknown modifier '%c': [%s]", pp[-1], regex.data());
241 return nullptr;
245 /* We've reached a null byte, now check if we're actually at the end of the
246 string. If not this is a bad expression, and a potential security hole. */
247 if (regex.length() != (pp - regex.data())) {
248 raise_error("Error: Null byte found in pattern");
251 /* Compile pattern and display a warning if compilation failed. */
252 const char *error;
253 int erroffset;
254 pcre *re = pcre_compile(pattern, coptions, &error, &erroffset, 0);
255 if (re == nullptr) {
256 raise_warning("Compilation failed: %s at offset %d", error, erroffset);
257 return nullptr;
259 // Careful: from here 're' needs to be freed if something throws.
261 /* If study option was specified, study the pattern and
262 store the result in extra for passing to pcre_exec. */
263 pcre_extra *extra = nullptr;
264 if (do_study) {
265 int soptions = 0;
266 extra = pcre_study(re, soptions, &error);
267 if (extra) {
268 extra->flags |= PCRE_EXTRA_MATCH_LIMIT |
269 PCRE_EXTRA_MATCH_LIMIT_RECURSION;
271 if (error != nullptr) {
272 try {
273 raise_warning("Error while studying pattern");
274 } catch (...) {
275 pcre_free(re);
276 throw;
281 /* Store the compiled pattern and extra info in the cache. */
// Ownership of re and extra passes to the new entry. insert_cached_pcre may
// delete that entry and return one already in the cache, so only the returned
// pointer is used.
282 pcre_cache_entry *new_entry = new pcre_cache_entry();
283 new_entry->re = re;
284 new_entry->extra = extra;
285 new_entry->preg_options = poptions;
286 new_entry->compile_options = coptions;
287 return insert_cached_pcre(regex, new_entry);
290 static void set_extra_limits(pcre_extra*& extra) {
291 if (extra == nullptr) {
292 pcre_extra& extra_data = t_extra_data;
293 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT |
294 PCRE_EXTRA_MATCH_LIMIT_RECURSION;
295 extra = &extra_data;
297 extra->match_limit = g_context->m_preg_backtrace_limit;
298 extra->match_limit_recursion = g_context->m_preg_recursion_limit;
301 static int *create_offset_array(const pcre_cache_entry *pce,
302 int &size_offsets) {
303 pcre_extra *extra = pce->extra;
304 set_extra_limits(extra);
306 /* Calculate the size of the offsets array, and allocate memory for it. */
307 int num_subpats; // Number of captured subpatterns
308 int rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
309 if (rc < 0) {
310 raise_warning("Internal pcre_fullinfo() error %d", rc);
311 return nullptr;
313 num_subpats++;
314 size_offsets = num_subpats * 3;
315 return (int *)smart_malloc(size_offsets * sizeof(int));
318 static pcre* pcre_get_compiled_regex(CStrRef regex, pcre_extra **extra,
319 int *preg_options) {
320 const pcre_cache_entry* pce = pcre_get_compiled_regex_cache(regex);
321 if (extra) {
322 *extra = pce ? pce->extra : nullptr;
324 if (preg_options) {
325 *preg_options = pce ? pce->preg_options : 0;
327 return pce ? pce->re : nullptr;
330 static inline void add_offset_pair(Variant &result, CStrRef str, int offset,
331 const char *name) {
332 Array match_pair;
333 match_pair.append(str);
334 match_pair.append(offset);
336 if (name) {
337 result.set(name, match_pair);
339 result.append(match_pair);
342 static inline bool pcre_need_log_error(int pcre_code) {
343 return RuntimeOption::EnablePregErrorLog &&
344 (pcre_code == PCRE_ERROR_MATCHLIMIT ||
345 pcre_code == PCRE_ERROR_RECURSIONLIMIT);
348 static void pcre_log_error(const char *func, int line, int pcre_code,
349 const char *pattern, int pattern_size,
350 const char *subject, int subject_size,
351 const char *repl, int repl_size,
352 int arg1 = 0, int arg2 = 0,
353 int arg3 = 0, int arg4 = 0) {
354 const char *escapedPattern;
355 const char *escapedSubject;
356 const char *escapedRepl;
357 string p(pattern, pattern_size);
358 string s(subject, subject_size);
359 string r(repl, repl_size);
360 escapedPattern = Logger::EscapeString(p);
361 escapedSubject = Logger::EscapeString(s);
362 escapedRepl = Logger::EscapeString(r);
363 const char *errString =
364 (pcre_code == PCRE_ERROR_MATCHLIMIT) ? "PCRE_ERROR_MATCHLIMIT" :
365 (pcre_code == PCRE_ERROR_RECURSIONLIMIT) ? "PCRE_ERROR_RECURSIONLIMIT" :
366 "UNKNOWN";
367 raise_debugging(
368 "REGEXERR: %s/%d: err=%d(%s), pattern='%s', subject='%s', repl='%s', "
369 "limits=(%ld, %ld), extra=(%d, %d, %d, %d)",
370 func, line, pcre_code, errString,
371 escapedPattern, escapedSubject, escapedRepl,
372 g_context->m_preg_backtrace_limit, g_context->m_preg_recursion_limit,
373 arg1, arg2, arg3, arg4);
374 free((void *)escapedPattern);
375 free((void *)escapedSubject);
376 free((void *)escapedRepl);
379 static void pcre_handle_exec_error(int pcre_code) {
380 int preg_code = 0;
381 switch (pcre_code) {
382 case PCRE_ERROR_MATCHLIMIT:
383 preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
384 break;
385 case PCRE_ERROR_RECURSIONLIMIT:
386 preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
387 break;
388 case PCRE_ERROR_BADUTF8:
389 preg_code = PHP_PCRE_BAD_UTF8_ERROR;
390 break;
391 case PCRE_ERROR_BADUTF8_OFFSET:
392 preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
393 break;
394 default:
395 preg_code = PHP_PCRE_INTERNAL_ERROR;
396 break;
398 t_last_error_code = preg_code;
401 ///////////////////////////////////////////////////////////////////////////////
403 Variant preg_grep(CStrRef pattern, CArrRef input, int flags /* = 0 */) {
404 const pcre_cache_entry* pce = pcre_get_compiled_regex_cache(pattern);
405 if (pce == nullptr) {
406 return false;
409 int size_offsets = 0;
410 int *offsets = create_offset_array(pce, size_offsets);
411 if (offsets == nullptr) {
412 return false;
414 SmartFreeHelper freer(offsets);
416 /* Initialize return array */
417 Array ret = Array::Create();
418 t_last_error_code = PHP_PCRE_NO_ERROR;
420 /* Go through the input array */
421 bool invert = (flags & PREG_GREP_INVERT);
422 pcre_extra *extra = pce->extra;
423 set_extra_limits(extra);
425 for (ArrayIter iter(input); iter; ++iter) {
426 String entry = iter.second().toString();
428 /* Perform the match */
429 int count = pcre_exec(pce->re, extra, entry.data(), entry.size(),
430 0, 0, offsets, size_offsets);
432 /* Check for too many substrings condition. */
433 if (count == 0) {
434 raise_warning("Matched, but too many substrings");
435 count = size_offsets / 3;
436 } else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
437 if (pcre_need_log_error(count)) {
438 pcre_log_error(__FUNCTION__, __LINE__, count,
439 pattern.data(), pattern.size(),
440 entry.data(), entry.size(),
441 "", 0,
442 flags);
444 pcre_handle_exec_error(count);
445 break;
448 /* If the entry fits our requirements */
449 if ((count > 0 && !invert) ||
450 (count == PCRE_ERROR_NOMATCH && invert)) {
452 /* Add to return array */
453 ret.set(iter.first(), entry);
457 return ret;
460 ///////////////////////////////////////////////////////////////////////////////
462 static Variant preg_match_impl(CStrRef pattern, CStrRef subject,
463 Variant *subpats, int flags, int start_offset,
464 bool global) {
465 const pcre_cache_entry* pce = pcre_get_compiled_regex_cache(pattern);
466 if (pce == nullptr) {
467 return false;
470 pcre_extra *extra = pce->extra;
471 set_extra_limits(extra);
472 if (subpats) {
473 *subpats = Array::Create();
476 int subpats_order = global ? PREG_PATTERN_ORDER : 0;
477 bool offset_capture = false;
478 if (flags) {
479 offset_capture = flags & PREG_OFFSET_CAPTURE;
482 * subpats_order is pre-set to pattern mode so we change it only if
483 * necessary.
485 if (flags & 0xff) {
486 subpats_order = flags & 0xff;
488 if ((global && (subpats_order < PREG_PATTERN_ORDER ||
489 subpats_order > PREG_SET_ORDER)) ||
490 (!global && subpats_order != 0)) {
491 raise_warning("Invalid flags specified");
492 return false;
496 /* Negative offset counts from the end of the string. */
497 if (start_offset < 0) {
498 start_offset = subject.size() + start_offset;
499 if (start_offset < 0) {
500 start_offset = 0;
504 int size_offsets = 0;
505 int *offsets = create_offset_array(pce, size_offsets);
506 SmartFreeHelper offsetsFreer(offsets);
507 int num_subpats = size_offsets / 3;
508 if (offsets == nullptr) {
509 return false;
513 * Build a mapping from subpattern numbers to their names. We will always
514 * allocate the table, even though there may be no named subpatterns. This
515 * avoids somewhat more complicated logic in the inner loops.
517 char **subpat_names = (char **)smart_malloc(num_subpats * sizeof(char *));
518 SmartFreeHelper subpatFreer(subpat_names);
519 memset(subpat_names, 0, sizeof(char *) * num_subpats);
521 int name_cnt = 0, name_size, ni = 0;
522 char *name_table;
523 unsigned short name_idx;
525 int rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMECOUNT, &name_cnt);
526 if (rc < 0) {
527 raise_warning("Internal pcre_fullinfo() error %d", rc);
528 return false;
530 if (name_cnt > 0) {
531 int rc1, rc2;
532 rc1 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMETABLE, &name_table);
533 rc2 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
534 rc = rc2 ? rc2 : rc1;
535 if (rc < 0) {
536 raise_warning("Internal pcre_fullinfo() error %d", rc);
537 return false;
540 while (ni++ < name_cnt) {
541 name_idx = 0xff * (unsigned char)name_table[0] +
542 (unsigned char)name_table[1];
543 subpat_names[name_idx] = name_table + 2;
544 if (is_numeric_string(subpat_names[name_idx],
545 strlen(subpat_names[name_idx]),
546 nullptr, nullptr, 0) != KindOfNull) {
547 raise_warning("Numeric named subpatterns are not allowed");
548 return false;
550 name_table += name_size;
555 /* Allocate match sets array and initialize the values. */
556 Array match_sets; /* An array of sets of matches for each
557 subpattern after a global match */
558 if (global && subpats_order == PREG_PATTERN_ORDER) {
559 for (int i = 0; i < num_subpats; i++) {
560 match_sets.set(i, Array::Create());
564 int matched = 0;
565 t_last_error_code = PHP_PCRE_NO_ERROR;
567 Variant result_set; // Holds a set of subpatterns after a global match
568 int g_notempty = 0; // If the match should not be empty
569 const char **stringlist; // Holds list of subpatterns
570 int i;
571 do {
572 /* Execute the regular expression. */
573 int count = pcre_exec(pce->re, extra, subject.data(), subject.size(),
574 start_offset, g_notempty, offsets, size_offsets);
576 /* Check for too many substrings condition. */
577 if (count == 0) {
578 raise_warning("Matched, but too many substrings");
579 count = size_offsets / 3;
582 /* If something has matched */
583 if (count > 0) {
584 matched++;
586 if (!subpats) continue;
588 // Try to get the list of substrings and display a warning if failed.
589 if (pcre_get_substring_list(subject.data(), offsets, count,
590 &stringlist) < 0) {
591 raise_warning("Get subpatterns list failed");
592 return false;
595 if (global) { /* global pattern matching */
596 if (subpats_order == PREG_PATTERN_ORDER) {
597 /* For each subpattern, insert it into the appropriate array. */
598 for (i = 0; i < count; i++) {
599 if (offset_capture) {
600 add_offset_pair(match_sets.lvalAt(i),
601 String(stringlist[i],
602 offsets[(i<<1)+1] - offsets[i<<1],
603 CopyString),
604 offsets[i<<1], nullptr);
605 } else {
606 match_sets.lvalAt(i).append
607 (String(stringlist[i],
608 offsets[(i<<1)+1] - offsets[i<<1], CopyString));
612 * If the number of captured subpatterns on this run is
613 * less than the total possible number, pad the result
614 * arrays with empty strings.
616 if (count < num_subpats) {
617 for (; i < num_subpats; i++) {
618 match_sets.lvalAt(i).append("");
621 } else {
622 result_set = Array::Create();
624 /* Add all the subpatterns to it */
625 for (i = 0; i < count; i++) {
626 if (offset_capture) {
627 add_offset_pair(result_set,
628 String(stringlist[i],
629 offsets[(i<<1)+1] - offsets[i<<1],
630 CopyString),
631 offsets[i<<1], subpat_names[i]);
632 } else {
633 String value(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
634 CopyString);
635 if (subpat_names[i]) {
636 result_set.set(subpat_names[i], value);
638 result_set.append(value);
641 /* And add it to the output array */
642 subpats->append(result_set);
644 } else { /* single pattern matching */
645 /* For each subpattern, insert it into the subpatterns array. */
646 for (i = 0; i < count; i++) {
647 if (offset_capture) {
648 add_offset_pair(*subpats,
649 String(stringlist[i],
650 offsets[(i<<1)+1] - offsets[i<<1],
651 CopyString),
652 offsets[i<<1], subpat_names[i]);
653 } else {
654 String value(stringlist[i], offsets[(i<<1)+1] - offsets[i<<1],
655 CopyString);
656 if (subpat_names[i]) {
657 subpats->set(subpat_names[i], value);
659 subpats->append(value);
664 pcre_free((void *) stringlist);
665 } else if (count == PCRE_ERROR_NOMATCH) {
666 /* If we previously set PCRE_NOTEMPTY after a null match,
667 this is not necessarily the end. We need to advance
668 the start offset, and continue. Fudge the offset values
669 to achieve this, unless we're already at the end of the string. */
670 if (g_notempty && start_offset < subject.size()) {
671 offsets[0] = start_offset;
672 offsets[1] = start_offset + 1;
673 } else
674 break;
675 } else {
676 if (pcre_need_log_error(count)) {
677 pcre_log_error(__FUNCTION__, __LINE__, count,
678 pattern.data(), pattern.size(),
679 subject.data(), subject.size(),
680 "", 0,
681 flags, start_offset, g_notempty, global);
683 pcre_handle_exec_error(count);
684 break;
687 /* If we have matched an empty string, mimic what Perl's /g options does.
688 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
689 the match again at the same point. If this fails (picked up above) we
690 advance to the next character. */
691 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
693 /* Advance to the position right after the last full match */
694 start_offset = offsets[1];
695 } while (global);
697 /* Add the match sets to the output array and clean up */
698 if (subpats && global && subpats_order == PREG_PATTERN_ORDER) {
699 for (i = 0; i < num_subpats; i++) {
700 if (subpat_names[i]) {
701 subpats->set(subpat_names[i], match_sets[i]);
703 subpats->append(match_sets[i]);
707 return matched;
// preg_match with capture output: runs preg_match_impl in non-global mode
// (at most one match) and lets it fill `matches`.
710 Variant preg_match(CStrRef pattern, CStrRef subject,
711 Variant &matches, int flags /* = 0 */,
712 int offset /* = 0 */) {
713 return preg_match_impl(pattern, subject, &matches, flags, offset, false);
// Overload without capture output: same non-global match, but a null capture
// pointer is passed so only the match count is produced.
715 Variant preg_match(CStrRef pattern, CStrRef subject, int flags /* = 0 */,
716 int offset /* = 0 */) {
717 return preg_match_impl(pattern, subject, nullptr, flags, offset, false);
// preg_match_all with capture output: runs preg_match_impl in global mode
// (every match) and lets it fill `matches`.
720 Variant preg_match_all(CStrRef pattern, CStrRef subject, Variant &matches,
721 int flags /* = 0 */, int offset /* = 0 */) {
722 return preg_match_impl(pattern, subject, &matches, flags, offset, true);
// Overload without capture output: global match with a null capture pointer,
// so only the number of matches is produced.
724 Variant preg_match_all(CStrRef pattern, CStrRef subject,
725 int flags /* = 0 */, int offset /* = 0 */) {
726 return preg_match_impl(pattern, subject, nullptr, flags, offset, true);
729 ///////////////////////////////////////////////////////////////////////////////
731 static String preg_do_repl_func(CVarRef function, CStrRef subject,
732 int *offsets, int count) {
733 Array subpats = Array::Create();
734 for (int i = 0; i < count; i++) {
735 subpats.append(subject.substr(offsets[i<<1],
736 offsets[(i<<1)+1] - offsets[i<<1]));
739 Array args;
740 args.set(0, subpats);
741 return vm_call_user_func(function, args);
// Parses a back-reference at *str: an introducer character (the caller has
// already checked it is '\' or '$') followed by one or two decimal digits,
// or the braced form "${N}" / "${NN}".
// On success stores the group number in *backref, advances *str past the
// reference and returns true. On failure returns false and leaves *str
// untouched (*backref may already have been written).
static bool preg_get_backref(const char **str, int *backref) {
  const char *cursor = *str;

  // A lone introducer at the end of the string is not a reference.
  if (cursor[1] == '\0') {
    return false;
  }

  const bool braced = (cursor[0] == '$' && cursor[1] == '{');
  cursor += braced ? 2 : 1;

  // First digit is mandatory.
  if (!(*cursor >= '0' && *cursor <= '9')) {
    return false;
  }
  *backref = *cursor - '0';
  cursor++;

  // Optional second digit.
  if (*cursor >= '0' && *cursor <= '9') {
    *backref = *backref * 10 + *cursor - '0';
    cursor++;
  }

  if (braced) {
    if (*cursor != '}') {
      return false;
    }
    cursor++;
  }

  *str = cursor;
  return true;
}
781 static String php_pcre_replace(CStrRef pattern, CStrRef subject,
782 CVarRef replace_var, bool callable,
783 int limit, int *replace_count) {
784 const pcre_cache_entry* pce = pcre_get_compiled_regex_cache(pattern);
785 if (pce == nullptr) {
786 return false;
788 bool eval = false;
789 if (pce->preg_options & PREG_REPLACE_EVAL) {
790 if (callable)
791 throw NotSupportedException("preg_replace",
792 "Modifier /e cannot be used with replacement "
793 "callback.");
794 eval = true;
797 int size_offsets;
798 int *offsets = create_offset_array(pce, size_offsets);
799 SmartFreeHelper offsetsFreer(offsets);
800 if (offsets == nullptr) {
801 return false;
804 const char *replace = nullptr;
805 const char *replace_end = nullptr;
806 int replace_len = 0;
807 String replace_val;
808 String eval_fn;
810 if (!callable) {
811 replace_val = replace_var.toString();
812 if (eval) {
813 // Extract eval fn
814 int pidx = replace_val.find('(');
815 const char *rd = replace_val.data();
816 int rs = replace_val.size();
818 if (!(rs >= 5 && pidx >= 0 && rd[pidx+1] == '"' &&
819 ((rd[rs-2] == '"' && rd[rs-1] == ')') ||
820 (rd[rs-3] == '"' && rd[rs-2] == ')' && rd[rs-1] == ';')))) {
821 throw NotSupportedException("preg_replace",
822 "Modifier /e must be used with the form "
823 "f(\"<replacement string>\") or "
824 "f(\"<replacement string>\");");
826 eval_fn = replace_val.substr(0, pidx);
827 replace_val = replace_val.substr(pidx+1, rs - (pidx+1) - 1);
829 replace = replace_val.data();
830 replace_len = replace_val.size();
831 replace_end = replace + replace_len;
834 int alloc_len = 2 * subject.size() + 1;
835 char *result = (char *)malloc(alloc_len);
837 try {
839 /* Initialize */
840 const char *match = nullptr;
841 int start_offset = 0;
842 t_last_error_code = PHP_PCRE_NO_ERROR;
843 pcre_extra *extra = pce->extra;
844 set_extra_limits(extra);
846 int result_len = 0;
847 int new_len; // Length of needed storage
848 const char *walk; // Used to walk the replacement string
849 char walk_last; // Last walked character
850 char *walkbuf; // Location of current replacement in the result
851 int match_len; // Length of the current match
852 int backref; // Backreference number
853 int g_notempty = 0; // If the match should not be empty
854 while (1) {
855 /* Execute the regular expression. */
856 int count = pcre_exec(pce->re, extra, subject.data(), subject.size(),
857 start_offset, g_notempty, offsets, size_offsets);
859 /* Check for too many substrings condition. */
860 if (count == 0) {
861 raise_warning("Matched, but too many substrings");
862 count = size_offsets / 3;
865 const char *piece = subject.data() + start_offset;
866 if (count > 0 && (limit == -1 || limit > 0)) {
867 if (replace_count) {
868 ++*replace_count;
870 /* Set the match location in subject */
871 match = subject.data() + offsets[0];
872 new_len = result_len + offsets[0] - start_offset; //part before the match
874 /* If evaluating, do it and add the return string's length */
875 String eval_result;
876 if (callable) {
877 /* Use custom function to get replacement string and its length. */
878 eval_result = preg_do_repl_func(replace_var, subject, offsets, count);
879 new_len += eval_result.size();
880 } else { /* do regular substitution */
881 walk = replace;
882 walk_last = 0;
883 while (walk < replace_end) {
884 if ('\\' == *walk || '$' == *walk) {
885 if (walk_last == '\\') {
886 walk++;
887 walk_last = 0;
888 continue;
890 if (preg_get_backref(&walk, &backref)) {
891 if (backref < count) {
892 new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
894 continue;
897 new_len++;
898 walk++;
899 walk_last = walk[-1];
903 if (new_len + 1 > alloc_len) {
904 alloc_len = 1 + alloc_len + 2 * new_len;
905 result = (char *)realloc(result, alloc_len);
907 /* copy the part of the string before the match */
908 memcpy(&result[result_len], piece, match-piece);
909 result_len += match-piece;
911 /* copy replacement and backrefs */
912 walkbuf = result + result_len;
914 /* If evaluating or using custom function, copy result to the buffer
915 * and clean up. */
916 if (callable) {
917 memcpy(walkbuf, eval_result.data(), eval_result.size());
918 result_len += eval_result.size();
919 } else { /* do regular backreference copying */
920 walk = replace;
921 walk_last = 0;
922 Array params;
923 const char* lastStart = nullptr;
924 while (walk < replace_end) {
925 bool handleQuote = eval && '"' == *walk && walk_last != '\\';
926 if (handleQuote && lastStart != nullptr) {
927 String str(lastStart, walkbuf - lastStart, CopyString);
928 params.append(str);
929 lastStart = nullptr;
930 handleQuote = false;
932 if ('\\' == *walk || '$' == *walk) {
933 if (walk_last == '\\') {
934 *(walkbuf-1) = *walk++;
935 walk_last = 0;
936 continue;
938 if (preg_get_backref(&walk, &backref)) {
939 if (backref < count) {
940 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
941 memcpy(walkbuf, subject.data() + offsets[backref<<1],
942 match_len);
943 walkbuf += match_len;
945 continue;
948 *walkbuf++ = *walk++;
949 walk_last = walk[-1];
950 if (handleQuote && lastStart == nullptr) {
951 lastStart = walkbuf;
954 *walkbuf = '\0';
955 if (eval) {
956 eval_result = vm_call_user_func(eval_fn, params);
957 memcpy(result + result_len, eval_result.data(), eval_result.size());
958 result_len += eval_result.size();
959 } else {
960 /* increment the result length by how much we've added to the string */
961 result_len += walkbuf - (result + result_len);
965 if (limit != -1) {
966 limit--;
969 } else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
970 /* If we previously set PCRE_NOTEMPTY after a null match,
971 this is not necessarily the end. We need to advance
972 the start offset, and continue. Fudge the offset values
973 to achieve this, unless we're already at the end of the string. */
974 if (g_notempty != 0 && start_offset < subject.size()) {
975 offsets[0] = start_offset;
976 offsets[1] = start_offset + 1;
977 memcpy(&result[result_len], piece, 1);
978 (result_len)++;
979 } else {
980 new_len = result_len + subject.size() - start_offset;
981 if (new_len + 1 > alloc_len) {
982 alloc_len = new_len + 1; /* now we know exactly how long it is */
983 result = (char *)realloc(result, alloc_len);
985 /* stick that last bit of string on our output */
986 memcpy(&result[result_len], piece, subject.size() - start_offset);
987 result_len += subject.size() - start_offset;
988 result[result_len] = '\0';
989 break;
991 } else {
992 if (pcre_need_log_error(count)) {
993 const char *s;
994 int size;
995 String stemp;
996 if (callable) {
997 if (replace_var.isObject()) {
998 stemp =
999 replace_var.objectForCall()->o_getClassName() + "::__invoke";
1000 } else {
1001 stemp = replace_var.toString();
1003 s = stemp.data();
1004 size = stemp.size();
1005 } else {
1006 s = replace_val.data();
1007 size = replace_val.size();
1009 pcre_log_error(__FUNCTION__, __LINE__, count,
1010 pattern.data(), pattern.size(),
1011 subject.data(), subject.size(),
1012 s, size,
1013 callable, limit, start_offset, g_notempty);
1015 pcre_handle_exec_error(count);
1016 free(result);
1017 result = nullptr;
1018 break;
1021 /* If we have matched an empty string, mimic what Perl's /g options does.
1022 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1023 the match again at the same point. If this fails (picked up above) we
1024 advance to the next character. */
1025 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1027 /* Advance to the next piece. */
1028 start_offset = offsets[1];
1031 if (result) {
1032 return String(result, result_len, AttachString);
1034 return String();
1035 } catch (...) {
1036 free(result);
1037 throw;
1041 static String php_replace_in_subject(CVarRef regex, CVarRef replace,
1042 String subject, int limit, bool callable,
1043 int *replace_count) {
1044 if (!regex.is(KindOfArray)) {
1045 return php_pcre_replace(regex.toString(), subject, replace,
1046 callable, limit, replace_count);
1049 if (callable || !replace.is(KindOfArray)) {
1050 Array arr = regex.toArray();
1051 for (ArrayIter iterRegex(arr); iterRegex; ++iterRegex) {
1052 String regex_entry = iterRegex.second().toString();
1053 subject = php_pcre_replace(regex_entry, subject, replace,
1054 callable, limit, replace_count);
1055 if (subject.isNull()) {
1056 return subject;
1059 return subject;
1062 Array arrReplace = replace.toArray();
1063 Array arrRegex = regex.toArray();
1064 ArrayIter iterReplace(arrReplace);
1065 for (ArrayIter iterRegex(arrRegex); iterRegex; ++iterRegex) {
1066 String regex_entry = iterRegex.second().toString();
1067 Variant replace_value;
1068 if (iterReplace) {
1069 replace_value = iterReplace.second();
1070 ++iterReplace;
1073 subject = php_pcre_replace(regex_entry, subject, replace_value,
1074 callable, limit, replace_count);
1075 if (subject.isNull()) {
1076 return subject;
1079 return subject;
1082 Variant preg_replace_impl(CVarRef pattern, CVarRef replacement,
1083 CVarRef subject, int limit, Variant &count,
1084 bool is_callable) {
1085 if (!is_callable &&
1086 replacement.is(KindOfArray) && !pattern.is(KindOfArray)) {
1087 raise_warning("Parameter mismatch, pattern is a string while "
1088 "replacement is an array");
1089 return false;
1092 int replace_count = 0;
1093 if (!subject.is(KindOfArray)) {
1094 String ret = php_replace_in_subject(pattern, replacement,
1095 subject.toString(),
1096 limit, is_callable, &replace_count);
1097 count = replace_count;
1098 return ret;
1101 Array return_value = Array::Create();
1102 Array arrSubject = subject.toArray();
1103 for (ArrayIter iter(arrSubject); iter; ++iter) {
1104 String subject_entry = iter.second().toString();
1105 String result = php_replace_in_subject(pattern, replacement, subject_entry,
1106 limit, is_callable, &replace_count);
1107 if (!result.isNull()) {
1108 return_value.set(iter.first(), result);
1111 count = replace_count;
1112 return return_value;
1115 int preg_replace(Variant &result, CVarRef pattern, CVarRef replacement,
1116 CVarRef subject, int limit /* = -1 */) {
1117 Variant count;
1118 result = preg_replace_impl(pattern, replacement, subject, limit, count, false);
1119 return count.toInt32();
1122 int preg_replace_callback(Variant &result, CVarRef pattern, CVarRef callback,
1123 CVarRef subject, int limit /* = -1 */) {
1124 Variant count;
1125 result = preg_replace_impl(pattern, callback, subject, limit, count, true);
1126 return count.toInt32();
1129 ///////////////////////////////////////////////////////////////////////////////
1131 Variant preg_split(CVarRef pattern, CVarRef subject, int limit /* = -1 */,
1132 int flags /* = 0 */) {
1133 const pcre_cache_entry* pce = pcre_get_compiled_regex_cache(
1134 pattern.toString());
1135 if (pce == nullptr) {
1136 return false;
1139 int no_empty = flags & PREG_SPLIT_NO_EMPTY;
1140 bool delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1141 bool offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1143 if (limit == 0) {
1144 limit = -1;
1147 int size_offsets = 0;
1148 int *offsets = create_offset_array(pce, size_offsets);
1149 SmartFreeHelper offsetsFreer(offsets);
1150 if (offsets == nullptr) {
1151 return false;
1154 String ssubject = subject.toString();
1156 /* Start at the beginning of the string */
1157 int start_offset = 0;
1158 int next_offset = 0;
1159 const char *last_match = ssubject.data();
1160 t_last_error_code = PHP_PCRE_NO_ERROR;
1161 pcre_extra *extra = pce->extra;
1163 // Get next piece if no limit or limit not yet reached and something matched
1164 Variant return_value = Array::Create();
1165 int g_notempty = 0; /* If the match should not be empty */
1166 int utf8_check = 0;
1167 pcre *re_bump = nullptr; /* Regex instance for empty matches */
1168 pcre_extra *extra_bump = nullptr; /* Almost dummy */
1169 while ((limit == -1 || limit > 1)) {
1170 int count = pcre_exec(pce->re, extra, ssubject.data(), ssubject.size(),
1171 start_offset, g_notempty | utf8_check,
1172 offsets, size_offsets);
1174 /* Check for too many substrings condition. */
1175 if (count == 0) {
1176 raise_warning("Matched, but too many substrings");
1177 count = size_offsets / 3;
1180 /* If something matched */
1181 if (count > 0) {
1182 /* Subsequent calls to pcre_exec don't need to bother with the
1183 * utf8 validity check: if the subject isn't valid, the first
1184 * call to pcre_exec will have failed, and as long as we only
1185 * set start_offset to known character boundaries we won't
1186 * supply an invalid offset. */
1187 utf8_check = PCRE_NO_UTF8_CHECK;
1189 if (!no_empty || ssubject.data() + offsets[0] != last_match) {
1190 if (offset_capture) {
1191 /* Add (match, offset) pair to the return value */
1192 add_offset_pair(return_value,
1193 String(last_match,
1194 ssubject.data() + offsets[0] - last_match,
1195 CopyString),
1196 next_offset, nullptr);
1197 } else {
1198 /* Add the piece to the return value */
1199 return_value.append(String(last_match,
1200 ssubject.data() + offsets[0] - last_match,
1201 CopyString));
1204 /* One less left to do */
1205 if (limit != -1)
1206 limit--;
1209 last_match = ssubject.data() + offsets[1];
1210 next_offset = offsets[1];
1212 if (delim_capture) {
1213 int i, match_len;
1214 for (i = 1; i < count; i++) {
1215 match_len = offsets[(i<<1)+1] - offsets[i<<1];
1216 /* If we have matched a delimiter */
1217 if (!no_empty || match_len > 0) {
1218 if (offset_capture) {
1219 add_offset_pair(return_value,
1220 String(ssubject.data() + offsets[i<<1],
1221 match_len, CopyString),
1222 offsets[i<<1], nullptr);
1223 } else {
1224 return_value.append(ssubject.substr(offsets[i<<1], match_len));
1229 } else if (count == PCRE_ERROR_NOMATCH) {
1230 /* If we previously set PCRE_NOTEMPTY after a null match,
1231 this is not necessarily the end. We need to advance
1232 the start offset, and continue. Fudge the offset values
1233 to achieve this, unless we're already at the end of the string. */
1234 if (g_notempty != 0 && start_offset < ssubject.size()) {
1235 if (pce->compile_options & PCRE_UTF8) {
1236 if (re_bump == nullptr) {
1237 int dummy;
1238 if ((re_bump = pcre_get_compiled_regex("/./us", &extra_bump,
1239 &dummy)) == nullptr) {
1240 return false;
1243 count = pcre_exec(re_bump, extra_bump, ssubject.data(),
1244 ssubject.size(), start_offset,
1245 0, offsets, size_offsets);
1246 if (count < 1) {
1247 raise_warning("Unknown error");
1248 offsets[0] = start_offset;
1249 offsets[1] = start_offset + 1;
1250 if (pcre_need_log_error(count)) {
1251 String spattern = pattern.toString();
1252 pcre_log_error(__FUNCTION__, __LINE__, count,
1253 spattern.data(), spattern.size(),
1254 ssubject.data(), ssubject.size(),
1255 "", 0,
1256 limit, flags, start_offset);
1259 } else {
1260 offsets[0] = start_offset;
1261 offsets[1] = start_offset + 1;
1263 } else
1264 break;
1265 } else {
1266 if (pcre_need_log_error(count)) {
1267 String spattern = pattern.toString();
1268 pcre_log_error(__FUNCTION__, __LINE__, count,
1269 spattern.data(), spattern.size(),
1270 ssubject.data(), ssubject.size(),
1271 "", 0,
1272 limit, flags, start_offset, g_notempty);
1274 pcre_handle_exec_error(count);
1275 break;
1278 /* If we have matched an empty string, mimic what Perl's /g options does.
1279 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1280 the match again at the same point. If this fails (picked up above) we
1281 advance to the next character. */
1282 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1284 /* Advance to the position right after the last full match */
1285 start_offset = offsets[1];
1288 start_offset = last_match - ssubject.data(); /* the offset might have been incremented, but without further successful matches */
1289 if (!no_empty || start_offset < ssubject.size()) {
1290 if (offset_capture) {
1291 /* Add the last (match, offset) pair to the return value */
1292 add_offset_pair(return_value,
1293 ssubject.substr(start_offset),
1294 start_offset, nullptr);
1295 } else {
1296 /* Add the last piece to the return value */
1297 return_value.append
1298 (String(last_match, ssubject.data() + ssubject.size() - last_match,
1299 CopyString));
1303 return return_value;
1306 ///////////////////////////////////////////////////////////////////////////////
1308 String preg_quote(CStrRef str, CStrRef delimiter /* = null_string */) {
1309 const char *in_str = str.data();
1310 const char *in_str_end = in_str + str.size();
1312 /* Nothing to do if we got an empty string */
1313 if (in_str == in_str_end) {
1314 return str;
1317 char delim_char = 0; /* Delimiter character to be quoted */
1318 bool quote_delim = false; /* Whether to quote additional delim char */
1319 if (!delimiter.empty()) {
1320 delim_char = delimiter.charAt(0);
1321 quote_delim = true;
1324 /* Allocate enough memory so that even if each character
1325 is quoted, we won't run out of room */
1326 char *out_str = (char *)malloc(4 * str.size() + 1);
1328 /* Go through the string and quote necessary characters */
1329 const char *p;
1330 char *q;
1331 for (p = in_str, q = out_str; p != in_str_end; p++) {
1332 char c = *p;
1333 switch (c) {
1334 case '.': case '\\': case '+': case '*': case '?':
1335 case '[': case '^': case ']': case '$': case '(':
1336 case ')': case '{': case '}': case '=': case '!':
1337 case '>': case '<': case '|': case ':':
1338 *q++ = '\\';
1339 *q++ = c;
1340 break;
1342 case '\0':
1343 *q++ = '\\';
1344 *q++ = '0';
1345 *q++ = '0';
1346 *q++ = '0';
1347 break;
1349 default:
1350 if (quote_delim && c == delim_char)
1351 *q++ = '\\';
1352 *q++ = c;
1353 break;
1356 *q = '\0';
1358 return String(out_str, q - out_str, AttachString);
1361 int preg_last_error() {
1362 return t_last_error_code;
1365 size_t preg_pcre_cache_size() {
1366 return (size_t)s_pcreCacheMap.size();
1369 ///////////////////////////////////////////////////////////////////////////////
1370 // regexec
1372 static void php_reg_eprint(int err, regex_t *re) {
1373 char *buf = nullptr, *message = nullptr;
1374 size_t len;
1375 size_t buf_len;
1377 #ifdef REG_ITOA
1378 /* get the length of the message */
1379 buf_len = regerror(REG_ITOA | err, re, nullptr, 0);
1380 if (buf_len) {
1381 buf = (char *)smart_malloc(buf_len);
1382 if (!buf) return; /* fail silently */
1383 /* finally, get the error message */
1384 regerror(REG_ITOA | err, re, buf, buf_len);
1386 #else
1387 buf_len = 0;
1388 #endif
1389 len = regerror(err, re, nullptr, 0);
1390 if (len) {
1391 message = (char *)smart_malloc(buf_len + len + 2);
1392 if (!message) {
1393 return; /* fail silently */
1395 if (buf_len) {
1396 snprintf(message, buf_len, "%s: ", buf);
1397 buf_len += 1; /* so pointer math below works */
1399 /* drop the message into place */
1400 regerror(err, re, message + buf_len, len);
1401 raise_warning("%s", message);
1403 smart_free(buf);
1404 smart_free(message);
1407 Variant php_split(CStrRef spliton, CStrRef str, int count, bool icase) {
1408 const char *strp = str.data();
1409 const char *endp = strp + str.size();
1411 regex_t re;
1412 int copts = icase ? REG_ICASE : 0;
1413 int err = regcomp(&re, spliton.data(), REG_EXTENDED | copts);
1414 if (err) {
1415 php_reg_eprint(err, &re);
1416 return false;
1419 Array return_value = Array::Create();
1420 regmatch_t subs[1];
1422 /* churn through str, generating array entries as we go */
1423 while ((count == -1 || count > 1) &&
1424 !(err = regexec(&re, strp, 1, subs, 0))) {
1425 if (subs[0].rm_so == 0 && subs[0].rm_eo) {
1426 /* match is at start of string, return empty string */
1427 return_value.append("");
1428 /* skip ahead the length of the regex match */
1429 strp += subs[0].rm_eo;
1430 } else if (subs[0].rm_so == 0 && subs[0].rm_eo == 0) {
1431 /* No more matches */
1432 regfree(&re);
1433 raise_warning("Invalid Regular Expression to split()");
1434 return false;
1435 } else {
1436 /* On a real match */
1438 /* make a copy of the substring */
1439 int size = subs[0].rm_so;
1441 /* add it to the array */
1442 return_value.append(String(strp, size, CopyString));
1444 /* point at our new starting point */
1445 strp = strp + subs[0].rm_eo;
1448 /* if we're only looking for a certain number of points,
1449 stop looking once we hit it */
1450 if (count != -1) {
1451 count--;
1455 /* see if we encountered an error */
1456 if (err && err != REG_NOMATCH) {
1457 php_reg_eprint(err, &re);
1458 regfree(&re);
1459 return false;
1462 /* otherwise we just have one last element to add to the array */
1463 int size = endp - strp;
1464 return_value.append(String(strp, size, CopyString));
1466 regfree(&re);
1467 return return_value;
1470 ///////////////////////////////////////////////////////////////////////////////