[Bug #848556] Remove \d* from second alternative to avoid exponential case when repea...
[pytest.git] / Modules / _sre.c
blob128bf9b6b67722f3eef496b126fbd03f2a261448
1 /*
2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
37 #ifndef SRE_RECURSIVE
39 static char copyright[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
42 #define PY_SSIZE_T_CLEAN
44 #include "Python.h"
45 #include "structmember.h" /* offsetof */
47 #include "sre.h"
49 #include <ctype.h>
51 /* name of this module, minus the leading underscore */
52 #if !defined(SRE_MODULE)
53 #define SRE_MODULE "sre"
54 #endif
56 #define SRE_PY_MODULE "re"
58 /* defining this one enables tracing */
59 #undef VERBOSE
61 #if PY_VERSION_HEX >= 0x01060000
62 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
63 /* defining this enables unicode support (default under 1.6a1 and later) */
64 #define HAVE_UNICODE
65 #endif
66 #endif
68 /* -------------------------------------------------------------------- */
69 /* optional features */
71 /* enables fast searching */
72 #define USE_FAST_SEARCH
74 /* enables aggressive inlining (always on for Visual C) */
75 #undef USE_INLINE
77 /* enables copy/deepcopy handling (work in progress) */
78 #undef USE_BUILTIN_COPY
80 #if PY_VERSION_HEX < 0x01060000
81 #define PyObject_DEL(op) PyMem_DEL((op))
82 #endif
84 /* -------------------------------------------------------------------- */
86 #if defined(_MSC_VER)
87 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89 /* fastest possible local call under MSVC */
90 #define LOCAL(type) static __inline type __fastcall
91 #elif defined(USE_INLINE)
92 #define LOCAL(type) static inline type
93 #else
94 #define LOCAL(type) static type
95 #endif
97 /* error codes */
98 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99 #define SRE_ERROR_STATE -2 /* illegal state */
100 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101 #define SRE_ERROR_MEMORY -9 /* out of memory */
103 #if defined(VERBOSE)
104 #define TRACE(v) printf v
105 #else
106 #define TRACE(v)
107 #endif
109 /* -------------------------------------------------------------------- */
110 /* search engine state */
112 /* default character predicates (run sre_chars.py to regenerate tables) */
114 #define SRE_DIGIT_MASK 1
115 #define SRE_SPACE_MASK 2
116 #define SRE_LINEBREAK_MASK 4
117 #define SRE_ALNUM_MASK 8
118 #define SRE_WORD_MASK 16
120 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
122 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
123 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
125 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
126 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
127 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
128 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
131 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
132 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
133 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
134 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
135 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
136 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
137 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
138 120, 121, 122, 123, 124, 125, 126, 127 };
140 #define SRE_IS_DIGIT(ch)\
141 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
142 #define SRE_IS_SPACE(ch)\
143 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
144 #define SRE_IS_LINEBREAK(ch)\
145 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
146 #define SRE_IS_ALNUM(ch)\
147 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
148 #define SRE_IS_WORD(ch)\
149 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
151 static unsigned int sre_lower(unsigned int ch)
153 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
156 /* locale-specific character predicates */
157 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
158 * warnings when c's type supports only numbers < N+1 */
159 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
160 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
161 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
162 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
163 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165 static unsigned int sre_lower_locale(unsigned int ch)
167 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
170 /* unicode-specific character predicates */
172 #if defined(HAVE_UNICODE)
174 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
175 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
176 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
177 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
178 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
180 static unsigned int sre_lower_unicode(unsigned int ch)
182 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
185 #endif
187 LOCAL(int)
188 sre_category(SRE_CODE category, unsigned int ch)
190 switch (category) {
192 case SRE_CATEGORY_DIGIT:
193 return SRE_IS_DIGIT(ch);
194 case SRE_CATEGORY_NOT_DIGIT:
195 return !SRE_IS_DIGIT(ch);
196 case SRE_CATEGORY_SPACE:
197 return SRE_IS_SPACE(ch);
198 case SRE_CATEGORY_NOT_SPACE:
199 return !SRE_IS_SPACE(ch);
200 case SRE_CATEGORY_WORD:
201 return SRE_IS_WORD(ch);
202 case SRE_CATEGORY_NOT_WORD:
203 return !SRE_IS_WORD(ch);
204 case SRE_CATEGORY_LINEBREAK:
205 return SRE_IS_LINEBREAK(ch);
206 case SRE_CATEGORY_NOT_LINEBREAK:
207 return !SRE_IS_LINEBREAK(ch);
209 case SRE_CATEGORY_LOC_WORD:
210 return SRE_LOC_IS_WORD(ch);
211 case SRE_CATEGORY_LOC_NOT_WORD:
212 return !SRE_LOC_IS_WORD(ch);
214 #if defined(HAVE_UNICODE)
215 case SRE_CATEGORY_UNI_DIGIT:
216 return SRE_UNI_IS_DIGIT(ch);
217 case SRE_CATEGORY_UNI_NOT_DIGIT:
218 return !SRE_UNI_IS_DIGIT(ch);
219 case SRE_CATEGORY_UNI_SPACE:
220 return SRE_UNI_IS_SPACE(ch);
221 case SRE_CATEGORY_UNI_NOT_SPACE:
222 return !SRE_UNI_IS_SPACE(ch);
223 case SRE_CATEGORY_UNI_WORD:
224 return SRE_UNI_IS_WORD(ch);
225 case SRE_CATEGORY_UNI_NOT_WORD:
226 return !SRE_UNI_IS_WORD(ch);
227 case SRE_CATEGORY_UNI_LINEBREAK:
228 return SRE_UNI_IS_LINEBREAK(ch);
229 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
230 return !SRE_UNI_IS_LINEBREAK(ch);
231 #else
232 case SRE_CATEGORY_UNI_DIGIT:
233 return SRE_IS_DIGIT(ch);
234 case SRE_CATEGORY_UNI_NOT_DIGIT:
235 return !SRE_IS_DIGIT(ch);
236 case SRE_CATEGORY_UNI_SPACE:
237 return SRE_IS_SPACE(ch);
238 case SRE_CATEGORY_UNI_NOT_SPACE:
239 return !SRE_IS_SPACE(ch);
240 case SRE_CATEGORY_UNI_WORD:
241 return SRE_LOC_IS_WORD(ch);
242 case SRE_CATEGORY_UNI_NOT_WORD:
243 return !SRE_LOC_IS_WORD(ch);
244 case SRE_CATEGORY_UNI_LINEBREAK:
245 return SRE_IS_LINEBREAK(ch);
246 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
247 return !SRE_IS_LINEBREAK(ch);
248 #endif
250 return 0;
253 /* helpers */
255 static void
256 data_stack_dealloc(SRE_STATE* state)
258 if (state->data_stack) {
259 PyMem_FREE(state->data_stack);
260 state->data_stack = NULL;
262 state->data_stack_size = state->data_stack_base = 0;
265 static int
266 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
268 Py_ssize_t minsize, cursize;
269 minsize = state->data_stack_base+size;
270 cursize = state->data_stack_size;
271 if (cursize < minsize) {
272 void* stack;
273 cursize = minsize+minsize/4+1024;
274 TRACE(("allocate/grow stack %d\n", cursize));
275 stack = PyMem_REALLOC(state->data_stack, cursize);
276 if (!stack) {
277 data_stack_dealloc(state);
278 return SRE_ERROR_MEMORY;
280 state->data_stack = (char *)stack;
281 state->data_stack_size = cursize;
283 return 0;
286 /* generate 8-bit version */
288 #define SRE_CHAR unsigned char
289 #define SRE_AT sre_at
290 #define SRE_COUNT sre_count
291 #define SRE_CHARSET sre_charset
292 #define SRE_INFO sre_info
293 #define SRE_MATCH sre_match
294 #define SRE_MATCH_CONTEXT sre_match_context
295 #define SRE_SEARCH sre_search
296 #define SRE_LITERAL_TEMPLATE sre_literal_template
298 #if defined(HAVE_UNICODE)
300 #define SRE_RECURSIVE
301 #include "_sre.c"
302 #undef SRE_RECURSIVE
304 #undef SRE_LITERAL_TEMPLATE
305 #undef SRE_SEARCH
306 #undef SRE_MATCH
307 #undef SRE_MATCH_CONTEXT
308 #undef SRE_INFO
309 #undef SRE_CHARSET
310 #undef SRE_COUNT
311 #undef SRE_AT
312 #undef SRE_CHAR
314 /* generate 16-bit unicode version */
316 #define SRE_CHAR Py_UNICODE
317 #define SRE_AT sre_uat
318 #define SRE_COUNT sre_ucount
319 #define SRE_CHARSET sre_ucharset
320 #define SRE_INFO sre_uinfo
321 #define SRE_MATCH sre_umatch
322 #define SRE_MATCH_CONTEXT sre_umatch_context
323 #define SRE_SEARCH sre_usearch
324 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
325 #endif
327 #endif /* SRE_RECURSIVE */
329 /* -------------------------------------------------------------------- */
330 /* String matching engine */
332 /* the following section is compiled twice, with different character
333 settings */
335 LOCAL(int)
336 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338 /* check if pointer is at given position */
340 Py_ssize_t thisp, thatp;
342 switch (at) {
344 case SRE_AT_BEGINNING:
345 case SRE_AT_BEGINNING_STRING:
346 return ((void*) ptr == state->beginning);
348 case SRE_AT_BEGINNING_LINE:
349 return ((void*) ptr == state->beginning ||
350 SRE_IS_LINEBREAK((int) ptr[-1]));
352 case SRE_AT_END:
353 return (((void*) (ptr+1) == state->end &&
354 SRE_IS_LINEBREAK((int) ptr[0])) ||
355 ((void*) ptr == state->end));
357 case SRE_AT_END_LINE:
358 return ((void*) ptr == state->end ||
359 SRE_IS_LINEBREAK((int) ptr[0]));
361 case SRE_AT_END_STRING:
362 return ((void*) ptr == state->end);
364 case SRE_AT_BOUNDARY:
365 if (state->beginning == state->end)
366 return 0;
367 thatp = ((void*) ptr > state->beginning) ?
368 SRE_IS_WORD((int) ptr[-1]) : 0;
369 thisp = ((void*) ptr < state->end) ?
370 SRE_IS_WORD((int) ptr[0]) : 0;
371 return thisp != thatp;
373 case SRE_AT_NON_BOUNDARY:
374 if (state->beginning == state->end)
375 return 0;
376 thatp = ((void*) ptr > state->beginning) ?
377 SRE_IS_WORD((int) ptr[-1]) : 0;
378 thisp = ((void*) ptr < state->end) ?
379 SRE_IS_WORD((int) ptr[0]) : 0;
380 return thisp == thatp;
382 case SRE_AT_LOC_BOUNDARY:
383 if (state->beginning == state->end)
384 return 0;
385 thatp = ((void*) ptr > state->beginning) ?
386 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
387 thisp = ((void*) ptr < state->end) ?
388 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
389 return thisp != thatp;
391 case SRE_AT_LOC_NON_BOUNDARY:
392 if (state->beginning == state->end)
393 return 0;
394 thatp = ((void*) ptr > state->beginning) ?
395 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
396 thisp = ((void*) ptr < state->end) ?
397 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
398 return thisp == thatp;
400 #if defined(HAVE_UNICODE)
401 case SRE_AT_UNI_BOUNDARY:
402 if (state->beginning == state->end)
403 return 0;
404 thatp = ((void*) ptr > state->beginning) ?
405 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
406 thisp = ((void*) ptr < state->end) ?
407 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
408 return thisp != thatp;
410 case SRE_AT_UNI_NON_BOUNDARY:
411 if (state->beginning == state->end)
412 return 0;
413 thatp = ((void*) ptr > state->beginning) ?
414 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
415 thisp = ((void*) ptr < state->end) ?
416 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
417 return thisp == thatp;
418 #endif
422 return 0;
425 LOCAL(int)
426 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
428 /* check if character is a member of the given set */
430 int ok = 1;
432 for (;;) {
433 switch (*set++) {
435 case SRE_OP_FAILURE:
436 return !ok;
438 case SRE_OP_LITERAL:
439 /* <LITERAL> <code> */
440 if (ch == set[0])
441 return ok;
442 set++;
443 break;
445 case SRE_OP_CATEGORY:
446 /* <CATEGORY> <code> */
447 if (sre_category(set[0], (int) ch))
448 return ok;
449 set += 1;
450 break;
452 case SRE_OP_CHARSET:
453 if (sizeof(SRE_CODE) == 2) {
454 /* <CHARSET> <bitmap> (16 bits per code word) */
455 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
456 return ok;
457 set += 16;
459 else {
460 /* <CHARSET> <bitmap> (32 bits per code word) */
461 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
462 return ok;
463 set += 8;
465 break;
467 case SRE_OP_RANGE:
468 /* <RANGE> <lower> <upper> */
469 if (set[0] <= ch && ch <= set[1])
470 return ok;
471 set += 2;
472 break;
474 case SRE_OP_NEGATE:
475 ok = !ok;
476 break;
478 case SRE_OP_BIGCHARSET:
479 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481 Py_ssize_t count, block;
482 count = *(set++);
484 if (sizeof(SRE_CODE) == 2) {
485 block = ((unsigned char*)set)[ch >> 8];
486 set += 128;
487 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
488 return ok;
489 set += count*16;
491 else {
492 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
493 * warnings when c's type supports only numbers < N+1 */
494 if (!(ch & ~65535))
495 block = ((unsigned char*)set)[ch >> 8];
496 else
497 block = -1;
498 set += 64;
499 if (block >=0 &&
500 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
501 return ok;
502 set += count*8;
504 break;
507 default:
508 /* internal error -- there's not much we can do about it
509 here, so let's just pretend it didn't match... */
510 return 0;
515 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
517 LOCAL(Py_ssize_t)
518 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
520 SRE_CODE chr;
521 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
522 SRE_CHAR* end = (SRE_CHAR *)state->end;
523 Py_ssize_t i;
525 /* adjust end */
526 if (maxcount < end - ptr && maxcount != 65535)
527 end = ptr + maxcount;
529 switch (pattern[0]) {
531 case SRE_OP_IN:
532 /* repeated set */
533 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
534 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
535 ptr++;
536 break;
538 case SRE_OP_ANY:
539 /* repeated dot wildcard. */
540 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
541 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
542 ptr++;
543 break;
545 case SRE_OP_ANY_ALL:
546 /* repeated dot wildcard. skip to the end of the target
547 string, and backtrack from there */
548 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
549 ptr = end;
550 break;
552 case SRE_OP_LITERAL:
553 /* repeated literal */
554 chr = pattern[1];
555 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
556 while (ptr < end && (SRE_CODE) *ptr == chr)
557 ptr++;
558 break;
560 case SRE_OP_LITERAL_IGNORE:
561 /* repeated literal */
562 chr = pattern[1];
563 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
564 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
565 ptr++;
566 break;
568 case SRE_OP_NOT_LITERAL:
569 /* repeated non-literal */
570 chr = pattern[1];
571 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
572 while (ptr < end && (SRE_CODE) *ptr != chr)
573 ptr++;
574 break;
576 case SRE_OP_NOT_LITERAL_IGNORE:
577 /* repeated non-literal */
578 chr = pattern[1];
579 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
580 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
581 ptr++;
582 break;
584 default:
585 /* repeated single character pattern */
586 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
587 while ((SRE_CHAR*) state->ptr < end) {
588 i = SRE_MATCH(state, pattern);
589 if (i < 0)
590 return i;
591 if (!i)
592 break;
594 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
595 (SRE_CHAR*) state->ptr - ptr));
596 return (SRE_CHAR*) state->ptr - ptr;
599 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
600 return ptr - (SRE_CHAR*) state->ptr;
603 #if 0 /* not used in this release */
604 LOCAL(int)
605 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
607 /* check if an SRE_OP_INFO block matches at the current position.
608 returns the number of SRE_CODE objects to skip if successful, 0
609 if no match */
611 SRE_CHAR* end = state->end;
612 SRE_CHAR* ptr = state->ptr;
613 Py_ssize_t i;
615 /* check minimal length */
616 if (pattern[3] && (end - ptr) < pattern[3])
617 return 0;
619 /* check known prefix */
620 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
621 /* <length> <skip> <prefix data> <overlap data> */
622 for (i = 0; i < pattern[5]; i++)
623 if ((SRE_CODE) ptr[i] != pattern[7 + i])
624 return 0;
625 return pattern[0] + 2 * pattern[6];
627 return pattern[0];
629 #endif
631 /* The macros below should be used to protect recursive SRE_MATCH()
632 * calls that *failed* and do *not* return immediately (IOW, those
633 * that will backtrack). Explaining:
635 * - Recursive SRE_MATCH() returned true: that's usually a success
636 * (besides atypical cases like ASSERT_NOT), therefore there's no
637 * reason to restore lastmark;
639 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
640 * is returning to the caller: If the current SRE_MATCH() is the
641 * top function of the recursion, returning false will be a matching
642 * failure, and it doesn't matter where lastmark is pointing to.
643 * If it's *not* the top function, it will be a recursive SRE_MATCH()
644 * failure by itself, and the calling SRE_MATCH() will have to deal
645 * with the failure by the same rules explained here (it will restore
646 * lastmark by itself if necessary);
648 * - Recursive SRE_MATCH() returned false, and will continue the
649 * outside 'for' loop: must be protected when breaking, since the next
650 * OP could potentially depend on lastmark;
652 * - Recursive SRE_MATCH() returned false, and will be called again
653 * inside a local for/while loop: must be protected between each
654 * loop iteration, since the recursive SRE_MATCH() could do anything,
655 * and could potentially depend on lastmark.
657 * For more information, check the discussion at SF patch #712900.
659 #define LASTMARK_SAVE() \
660 do { \
661 ctx->lastmark = state->lastmark; \
662 ctx->lastindex = state->lastindex; \
663 } while (0)
664 #define LASTMARK_RESTORE() \
665 do { \
666 state->lastmark = ctx->lastmark; \
667 state->lastindex = ctx->lastindex; \
668 } while (0)
670 #define RETURN_ERROR(i) do { return i; } while(0)
671 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
672 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
674 #define RETURN_ON_ERROR(i) \
675 do { if (i < 0) RETURN_ERROR(i); } while (0)
676 #define RETURN_ON_SUCCESS(i) \
677 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
678 #define RETURN_ON_FAILURE(i) \
679 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
681 #define SFY(x) #x
683 #define DATA_STACK_ALLOC(state, type, ptr) \
684 do { \
685 alloc_pos = state->data_stack_base; \
686 TRACE(("allocating %s in %d (%d)\n", \
687 SFY(type), alloc_pos, sizeof(type))); \
688 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
689 int j = data_stack_grow(state, sizeof(type)); \
690 if (j < 0) return j; \
691 if (ctx_pos != -1) \
692 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
694 ptr = (type*)(state->data_stack+alloc_pos); \
695 state->data_stack_base += sizeof(type); \
696 } while (0)
698 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
699 do { \
700 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
701 ptr = (type*)(state->data_stack+pos); \
702 } while (0)
704 #define DATA_STACK_PUSH(state, data, size) \
705 do { \
706 TRACE(("copy data in %p to %d (%d)\n", \
707 data, state->data_stack_base, size)); \
708 if (state->data_stack_size < state->data_stack_base+size) { \
709 int j = data_stack_grow(state, size); \
710 if (j < 0) return j; \
711 if (ctx_pos != -1) \
712 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
714 memcpy(state->data_stack+state->data_stack_base, data, size); \
715 state->data_stack_base += size; \
716 } while (0)
718 #define DATA_STACK_POP(state, data, size, discard) \
719 do { \
720 TRACE(("copy data to %p from %d (%d)\n", \
721 data, state->data_stack_base-size, size)); \
722 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
723 if (discard) \
724 state->data_stack_base -= size; \
725 } while (0)
727 #define DATA_STACK_POP_DISCARD(state, size) \
728 do { \
729 TRACE(("discard data from %d (%d)\n", \
730 state->data_stack_base-size, size)); \
731 state->data_stack_base -= size; \
732 } while(0)
734 #define DATA_PUSH(x) \
735 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
736 #define DATA_POP(x) \
737 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
738 #define DATA_POP_DISCARD(x) \
739 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
740 #define DATA_ALLOC(t,p) \
741 DATA_STACK_ALLOC(state, t, p)
742 #define DATA_LOOKUP_AT(t,p,pos) \
743 DATA_STACK_LOOKUP_AT(state,t,p,pos)
745 #define MARK_PUSH(lastmark) \
746 do if (lastmark > 0) { \
747 i = lastmark; /* ctx->lastmark may change if reallocated */ \
748 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
749 } while (0)
750 #define MARK_POP(lastmark) \
751 do if (lastmark > 0) { \
752 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
753 } while (0)
754 #define MARK_POP_KEEP(lastmark) \
755 do if (lastmark > 0) { \
756 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
757 } while (0)
758 #define MARK_POP_DISCARD(lastmark) \
759 do if (lastmark > 0) { \
760 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
761 } while (0)
763 #define JUMP_NONE 0
764 #define JUMP_MAX_UNTIL_1 1
765 #define JUMP_MAX_UNTIL_2 2
766 #define JUMP_MAX_UNTIL_3 3
767 #define JUMP_MIN_UNTIL_1 4
768 #define JUMP_MIN_UNTIL_2 5
769 #define JUMP_MIN_UNTIL_3 6
770 #define JUMP_REPEAT 7
771 #define JUMP_REPEAT_ONE_1 8
772 #define JUMP_REPEAT_ONE_2 9
773 #define JUMP_MIN_REPEAT_ONE 10
774 #define JUMP_BRANCH 11
775 #define JUMP_ASSERT 12
776 #define JUMP_ASSERT_NOT 13
778 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
779 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
780 nextctx->last_ctx_pos = ctx_pos; \
781 nextctx->jump = jumpvalue; \
782 nextctx->pattern = nextpattern; \
783 ctx_pos = alloc_pos; \
784 ctx = nextctx; \
785 goto entrance; \
786 jumplabel: \
787 while (0) /* gcc doesn't like labels at end of scopes */ \
789 typedef struct {
790 Py_ssize_t last_ctx_pos;
791 Py_ssize_t jump;
792 SRE_CHAR* ptr;
793 SRE_CODE* pattern;
794 Py_ssize_t count;
795 Py_ssize_t lastmark;
796 Py_ssize_t lastindex;
797 union {
798 SRE_CODE chr;
799 SRE_REPEAT* rep;
800 } u;
801 } SRE_MATCH_CONTEXT;
803 /* check if string matches the given pattern. returns <0 for
804 error, 0 for failure, and 1 for success */
805 LOCAL(Py_ssize_t)
806 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
808 SRE_CHAR* end = (SRE_CHAR *)state->end;
809 Py_ssize_t alloc_pos, ctx_pos = -1;
810 Py_ssize_t i, ret = 0;
811 Py_ssize_t jump;
813 SRE_MATCH_CONTEXT* ctx;
814 SRE_MATCH_CONTEXT* nextctx;
816 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
818 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
819 ctx->last_ctx_pos = -1;
820 ctx->jump = JUMP_NONE;
821 ctx->pattern = pattern;
822 ctx_pos = alloc_pos;
824 entrance:
826 ctx->ptr = (SRE_CHAR *)state->ptr;
828 if (ctx->pattern[0] == SRE_OP_INFO) {
829 /* optimization info block */
830 /* <INFO> <1=skip> <2=flags> <3=min> ... */
831 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
832 TRACE(("reject (got %d chars, need %d)\n",
833 (end - ctx->ptr), ctx->pattern[3]));
834 RETURN_FAILURE;
836 ctx->pattern += ctx->pattern[1] + 1;
839 for (;;) {
841 switch (*ctx->pattern++) {
843 case SRE_OP_MARK:
844 /* set mark */
845 /* <MARK> <gid> */
846 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
847 ctx->ptr, ctx->pattern[0]));
848 i = ctx->pattern[0];
849 if (i & 1)
850 state->lastindex = i/2 + 1;
851 if (i > state->lastmark) {
852 /* state->lastmark is the highest valid index in the
853 state->mark array. If it is increased by more than 1,
854 the intervening marks must be set to NULL to signal
855 that these marks have not been encountered. */
856 Py_ssize_t j = state->lastmark + 1;
857 while (j < i)
858 state->mark[j++] = NULL;
859 state->lastmark = i;
861 state->mark[i] = ctx->ptr;
862 ctx->pattern++;
863 break;
865 case SRE_OP_LITERAL:
866 /* match literal string */
867 /* <LITERAL> <code> */
868 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
869 ctx->ptr, *ctx->pattern));
870 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
871 RETURN_FAILURE;
872 ctx->pattern++;
873 ctx->ptr++;
874 break;
876 case SRE_OP_NOT_LITERAL:
877 /* match anything that is not literal character */
878 /* <NOT_LITERAL> <code> */
879 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
880 ctx->ptr, *ctx->pattern));
881 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
882 RETURN_FAILURE;
883 ctx->pattern++;
884 ctx->ptr++;
885 break;
887 case SRE_OP_SUCCESS:
888 /* end of pattern */
889 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
890 state->ptr = ctx->ptr;
891 RETURN_SUCCESS;
893 case SRE_OP_AT:
894 /* match at given position */
895 /* <AT> <code> */
896 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
897 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
898 RETURN_FAILURE;
899 ctx->pattern++;
900 break;
902 case SRE_OP_CATEGORY:
903 /* match at given category */
904 /* <CATEGORY> <code> */
905 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
906 ctx->ptr, *ctx->pattern));
907 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
908 RETURN_FAILURE;
909 ctx->pattern++;
910 ctx->ptr++;
911 break;
913 case SRE_OP_ANY:
914 /* match anything (except a newline) */
915 /* <ANY> */
916 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
917 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
918 RETURN_FAILURE;
919 ctx->ptr++;
920 break;
922 case SRE_OP_ANY_ALL:
923 /* match anything */
924 /* <ANY_ALL> */
925 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
926 if (ctx->ptr >= end)
927 RETURN_FAILURE;
928 ctx->ptr++;
929 break;
931 case SRE_OP_IN:
932 /* match set member (or non_member) */
933 /* <IN> <skip> <set> */
934 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
935 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
936 RETURN_FAILURE;
937 ctx->pattern += ctx->pattern[0];
938 ctx->ptr++;
939 break;
941 case SRE_OP_LITERAL_IGNORE:
942 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
943 ctx->pattern, ctx->ptr, ctx->pattern[0]));
944 if (ctx->ptr >= end ||
945 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
946 RETURN_FAILURE;
947 ctx->pattern++;
948 ctx->ptr++;
949 break;
951 case SRE_OP_NOT_LITERAL_IGNORE:
952 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
953 ctx->pattern, ctx->ptr, *ctx->pattern));
954 if (ctx->ptr >= end ||
955 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
956 RETURN_FAILURE;
957 ctx->pattern++;
958 ctx->ptr++;
959 break;
961 case SRE_OP_IN_IGNORE:
962 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
963 if (ctx->ptr >= end
964 || !SRE_CHARSET(ctx->pattern+1,
965 (SRE_CODE)state->lower(*ctx->ptr)))
966 RETURN_FAILURE;
967 ctx->pattern += ctx->pattern[0];
968 ctx->ptr++;
969 break;
971 case SRE_OP_JUMP:
972 case SRE_OP_INFO:
973 /* jump forward */
974 /* <JUMP> <offset> */
975 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
976 ctx->ptr, ctx->pattern[0]));
977 ctx->pattern += ctx->pattern[0];
978 break;
980 case SRE_OP_BRANCH:
981 /* alternation */
982 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
983 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
984 LASTMARK_SAVE();
985 ctx->u.rep = state->repeat;
986 if (ctx->u.rep)
987 MARK_PUSH(ctx->lastmark);
988 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
989 if (ctx->pattern[1] == SRE_OP_LITERAL &&
990 (ctx->ptr >= end ||
991 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
992 continue;
993 if (ctx->pattern[1] == SRE_OP_IN &&
994 (ctx->ptr >= end ||
995 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
996 continue;
997 state->ptr = ctx->ptr;
998 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
999 if (ret) {
1000 if (ctx->u.rep)
1001 MARK_POP_DISCARD(ctx->lastmark);
1002 RETURN_ON_ERROR(ret);
1003 RETURN_SUCCESS;
1005 if (ctx->u.rep)
1006 MARK_POP_KEEP(ctx->lastmark);
1007 LASTMARK_RESTORE();
1009 if (ctx->u.rep)
1010 MARK_POP_DISCARD(ctx->lastmark);
1011 RETURN_FAILURE;
1013 case SRE_OP_REPEAT_ONE:
1014 /* match repeated sequence (maximizing regexp) */
1016 /* this operator only works if the repeated item is
1017 exactly one character wide, and we're not already
1018 collecting backtracking points. for other cases,
1019 use the MAX_REPEAT operator */
1021 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1023 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1024 ctx->pattern[1], ctx->pattern[2]));
1026 if (ctx->ptr + ctx->pattern[1] > end)
1027 RETURN_FAILURE; /* cannot match */
1029 state->ptr = ctx->ptr;
1031 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1032 RETURN_ON_ERROR(ret);
1033 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1034 ctx->count = ret;
1035 ctx->ptr += ctx->count;
1037 /* when we arrive here, count contains the number of
1038 matches, and ctx->ptr points to the tail of the target
1039 string. check if the rest of the pattern matches,
1040 and backtrack if not. */
1042 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1043 RETURN_FAILURE;
1045 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1046 /* tail is empty. we're finished */
1047 state->ptr = ctx->ptr;
1048 RETURN_SUCCESS;
1051 LASTMARK_SAVE();
1053 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1054 /* tail starts with a literal. skip positions where
1055 the rest of the pattern cannot possibly match */
1056 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1057 for (;;) {
1058 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1059 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1060 ctx->ptr--;
1061 ctx->count--;
1063 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1064 break;
1065 state->ptr = ctx->ptr;
1066 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1067 ctx->pattern+ctx->pattern[0]);
1068 if (ret) {
1069 RETURN_ON_ERROR(ret);
1070 RETURN_SUCCESS;
1073 LASTMARK_RESTORE();
1075 ctx->ptr--;
1076 ctx->count--;
1079 } else {
1080 /* general case */
1081 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1082 state->ptr = ctx->ptr;
1083 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1084 ctx->pattern+ctx->pattern[0]);
1085 if (ret) {
1086 RETURN_ON_ERROR(ret);
1087 RETURN_SUCCESS;
1089 ctx->ptr--;
1090 ctx->count--;
1091 LASTMARK_RESTORE();
1094 RETURN_FAILURE;
1096 case SRE_OP_MIN_REPEAT_ONE:
1097 /* match repeated sequence (minimizing regexp) */
1099 /* this operator only works if the repeated item is
1100 exactly one character wide, and we're not already
1101 collecting backtracking points. for other cases,
1102 use the MIN_REPEAT operator */
1104 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1106 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1107 ctx->pattern[1], ctx->pattern[2]));
1109 if (ctx->ptr + ctx->pattern[1] > end)
1110 RETURN_FAILURE; /* cannot match */
1112 state->ptr = ctx->ptr;
1114 if (ctx->pattern[1] == 0)
1115 ctx->count = 0;
1116 else {
1117 /* count using pattern min as the maximum */
1118 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1119 RETURN_ON_ERROR(ret);
1120 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1121 if (ret < (Py_ssize_t) ctx->pattern[1])
1122 /* didn't match minimum number of times */
1123 RETURN_FAILURE;
1124 /* advance past minimum matches of repeat */
1125 ctx->count = ret;
1126 ctx->ptr += ctx->count;
1129 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1130 /* tail is empty. we're finished */
1131 state->ptr = ctx->ptr;
1132 RETURN_SUCCESS;
1134 } else {
1135 /* general case */
1136 LASTMARK_SAVE();
1137 while ((Py_ssize_t)ctx->pattern[2] == 65535
1138 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1139 state->ptr = ctx->ptr;
1140 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1141 ctx->pattern+ctx->pattern[0]);
1142 if (ret) {
1143 RETURN_ON_ERROR(ret);
1144 RETURN_SUCCESS;
1146 state->ptr = ctx->ptr;
1147 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1148 RETURN_ON_ERROR(ret);
1149 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1150 if (ret == 0)
1151 break;
1152 assert(ret == 1);
1153 ctx->ptr++;
1154 ctx->count++;
1155 LASTMARK_RESTORE();
1158 RETURN_FAILURE;
1160 case SRE_OP_REPEAT:
1161 /* create repeat context. all the hard work is done
1162 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1163 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1164 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1165 ctx->pattern[1], ctx->pattern[2]));
1167 /* install new repeat context */
1168 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1169 ctx->u.rep->count = -1;
1170 ctx->u.rep->pattern = ctx->pattern;
1171 ctx->u.rep->prev = state->repeat;
1172 ctx->u.rep->last_ptr = NULL;
1173 state->repeat = ctx->u.rep;
1175 state->ptr = ctx->ptr;
1176 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1177 state->repeat = ctx->u.rep->prev;
1178 PyObject_FREE(ctx->u.rep);
1180 if (ret) {
1181 RETURN_ON_ERROR(ret);
1182 RETURN_SUCCESS;
1184 RETURN_FAILURE;
1186 case SRE_OP_MAX_UNTIL:
1187 /* maximizing repeat */
1188 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1190 /* FIXME: we probably need to deal with zero-width
1191 matches in here... */
1193 ctx->u.rep = state->repeat;
1194 if (!ctx->u.rep)
1195 RETURN_ERROR(SRE_ERROR_STATE);
1197 state->ptr = ctx->ptr;
1199 ctx->count = ctx->u.rep->count+1;
1201 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1202 ctx->ptr, ctx->count));
1204 if (ctx->count < ctx->u.rep->pattern[1]) {
1205 /* not enough matches */
1206 ctx->u.rep->count = ctx->count;
1207 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1208 ctx->u.rep->pattern+3);
1209 if (ret) {
1210 RETURN_ON_ERROR(ret);
1211 RETURN_SUCCESS;
1213 ctx->u.rep->count = ctx->count-1;
1214 state->ptr = ctx->ptr;
1215 RETURN_FAILURE;
1218 if ((ctx->count < ctx->u.rep->pattern[2] ||
1219 ctx->u.rep->pattern[2] == 65535) &&
1220 state->ptr != ctx->u.rep->last_ptr) {
1221 /* we may have enough matches, but if we can
1222 match another item, do so */
1223 ctx->u.rep->count = ctx->count;
1224 LASTMARK_SAVE();
1225 MARK_PUSH(ctx->lastmark);
1226 /* zero-width match protection */
1227 DATA_PUSH(&ctx->u.rep->last_ptr);
1228 ctx->u.rep->last_ptr = state->ptr;
1229 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1230 ctx->u.rep->pattern+3);
1231 DATA_POP(&ctx->u.rep->last_ptr);
1232 if (ret) {
1233 MARK_POP_DISCARD(ctx->lastmark);
1234 RETURN_ON_ERROR(ret);
1235 RETURN_SUCCESS;
1237 MARK_POP(ctx->lastmark);
1238 LASTMARK_RESTORE();
1239 ctx->u.rep->count = ctx->count-1;
1240 state->ptr = ctx->ptr;
1243 /* cannot match more repeated items here. make sure the
1244 tail matches */
1245 state->repeat = ctx->u.rep->prev;
1246 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1247 RETURN_ON_SUCCESS(ret);
1248 state->repeat = ctx->u.rep;
1249 state->ptr = ctx->ptr;
1250 RETURN_FAILURE;
1252 case SRE_OP_MIN_UNTIL:
1253 /* minimizing repeat */
1254 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1256 ctx->u.rep = state->repeat;
1257 if (!ctx->u.rep)
1258 RETURN_ERROR(SRE_ERROR_STATE);
1260 state->ptr = ctx->ptr;
1262 ctx->count = ctx->u.rep->count+1;
1264 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1265 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1267 if (ctx->count < ctx->u.rep->pattern[1]) {
1268 /* not enough matches */
1269 ctx->u.rep->count = ctx->count;
1270 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1271 ctx->u.rep->pattern+3);
1272 if (ret) {
1273 RETURN_ON_ERROR(ret);
1274 RETURN_SUCCESS;
1276 ctx->u.rep->count = ctx->count-1;
1277 state->ptr = ctx->ptr;
1278 RETURN_FAILURE;
1281 LASTMARK_SAVE();
1283 /* see if the tail matches */
1284 state->repeat = ctx->u.rep->prev;
1285 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1286 if (ret) {
1287 RETURN_ON_ERROR(ret);
1288 RETURN_SUCCESS;
1291 state->repeat = ctx->u.rep;
1292 state->ptr = ctx->ptr;
1294 LASTMARK_RESTORE();
1296 if (ctx->count >= ctx->u.rep->pattern[2]
1297 && ctx->u.rep->pattern[2] != 65535)
1298 RETURN_FAILURE;
1300 ctx->u.rep->count = ctx->count;
1301 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1302 ctx->u.rep->pattern+3);
1303 if (ret) {
1304 RETURN_ON_ERROR(ret);
1305 RETURN_SUCCESS;
1307 ctx->u.rep->count = ctx->count-1;
1308 state->ptr = ctx->ptr;
1309 RETURN_FAILURE;
1311 case SRE_OP_GROUPREF:
1312 /* match backreference */
1313 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1314 ctx->ptr, ctx->pattern[0]));
1315 i = ctx->pattern[0];
1317 Py_ssize_t groupref = i+i;
1318 if (groupref >= state->lastmark) {
1319 RETURN_FAILURE;
1320 } else {
1321 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1322 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1323 if (!p || !e || e < p)
1324 RETURN_FAILURE;
1325 while (p < e) {
1326 if (ctx->ptr >= end || *ctx->ptr != *p)
1327 RETURN_FAILURE;
1328 p++; ctx->ptr++;
1332 ctx->pattern++;
1333 break;
1335 case SRE_OP_GROUPREF_IGNORE:
1336 /* match backreference */
1337 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1338 ctx->ptr, ctx->pattern[0]));
1339 i = ctx->pattern[0];
1341 Py_ssize_t groupref = i+i;
1342 if (groupref >= state->lastmark) {
1343 RETURN_FAILURE;
1344 } else {
1345 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1346 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1347 if (!p || !e || e < p)
1348 RETURN_FAILURE;
1349 while (p < e) {
1350 if (ctx->ptr >= end ||
1351 state->lower(*ctx->ptr) != state->lower(*p))
1352 RETURN_FAILURE;
1353 p++; ctx->ptr++;
1357 ctx->pattern++;
1358 break;
1360 case SRE_OP_GROUPREF_EXISTS:
1361 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1362 ctx->ptr, ctx->pattern[0]));
1363 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1364 i = ctx->pattern[0];
1366 Py_ssize_t groupref = i+i;
1367 if (groupref >= state->lastmark) {
1368 ctx->pattern += ctx->pattern[1];
1369 break;
1370 } else {
1371 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1372 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1373 if (!p || !e || e < p) {
1374 ctx->pattern += ctx->pattern[1];
1375 break;
1379 ctx->pattern += 2;
1380 break;
1382 case SRE_OP_ASSERT:
1383 /* assert subpattern */
1384 /* <ASSERT> <skip> <back> <pattern> */
1385 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1386 ctx->ptr, ctx->pattern[1]));
1387 state->ptr = ctx->ptr - ctx->pattern[1];
1388 if (state->ptr < state->beginning)
1389 RETURN_FAILURE;
1390 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1391 RETURN_ON_FAILURE(ret);
1392 ctx->pattern += ctx->pattern[0];
1393 break;
1395 case SRE_OP_ASSERT_NOT:
1396 /* assert not subpattern */
1397 /* <ASSERT_NOT> <skip> <back> <pattern> */
1398 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1399 ctx->ptr, ctx->pattern[1]));
1400 state->ptr = ctx->ptr - ctx->pattern[1];
1401 if (state->ptr >= state->beginning) {
1402 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1403 if (ret) {
1404 RETURN_ON_ERROR(ret);
1405 RETURN_FAILURE;
1408 ctx->pattern += ctx->pattern[0];
1409 break;
1411 case SRE_OP_FAILURE:
1412 /* immediate failure */
1413 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1414 RETURN_FAILURE;
1416 default:
1417 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1418 ctx->pattern[-1]));
1419 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1423 exit:
1424 ctx_pos = ctx->last_ctx_pos;
1425 jump = ctx->jump;
1426 DATA_POP_DISCARD(ctx);
1427 if (ctx_pos == -1)
1428 return ret;
1429 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1431 switch (jump) {
1432 case JUMP_MAX_UNTIL_2:
1433 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1434 goto jump_max_until_2;
1435 case JUMP_MAX_UNTIL_3:
1436 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1437 goto jump_max_until_3;
1438 case JUMP_MIN_UNTIL_2:
1439 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1440 goto jump_min_until_2;
1441 case JUMP_MIN_UNTIL_3:
1442 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1443 goto jump_min_until_3;
1444 case JUMP_BRANCH:
1445 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1446 goto jump_branch;
1447 case JUMP_MAX_UNTIL_1:
1448 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1449 goto jump_max_until_1;
1450 case JUMP_MIN_UNTIL_1:
1451 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1452 goto jump_min_until_1;
1453 case JUMP_REPEAT:
1454 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1455 goto jump_repeat;
1456 case JUMP_REPEAT_ONE_1:
1457 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1458 goto jump_repeat_one_1;
1459 case JUMP_REPEAT_ONE_2:
1460 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1461 goto jump_repeat_one_2;
1462 case JUMP_MIN_REPEAT_ONE:
1463 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1464 goto jump_min_repeat_one;
1465 case JUMP_ASSERT:
1466 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1467 goto jump_assert;
1468 case JUMP_ASSERT_NOT:
1469 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1470 goto jump_assert_not;
1471 case JUMP_NONE:
1472 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1473 break;
1476 return ret; /* should never get here */
1479 LOCAL(Py_ssize_t)
1480 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1482 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1483 SRE_CHAR* end = (SRE_CHAR *)state->end;
1484 Py_ssize_t status = 0;
1485 Py_ssize_t prefix_len = 0;
1486 Py_ssize_t prefix_skip = 0;
1487 SRE_CODE* prefix = NULL;
1488 SRE_CODE* charset = NULL;
1489 SRE_CODE* overlap = NULL;
1490 int flags = 0;
1492 if (pattern[0] == SRE_OP_INFO) {
1493 /* optimization info block */
1494 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1496 flags = pattern[2];
1498 if (pattern[3] > 1) {
1499 /* adjust end point (but make sure we leave at least one
1500 character in there, so literal search will work) */
1501 end -= pattern[3]-1;
1502 if (end <= ptr)
1503 end = ptr+1;
1506 if (flags & SRE_INFO_PREFIX) {
1507 /* pattern starts with a known prefix */
1508 /* <length> <skip> <prefix data> <overlap data> */
1509 prefix_len = pattern[5];
1510 prefix_skip = pattern[6];
1511 prefix = pattern + 7;
1512 overlap = prefix + prefix_len - 1;
1513 } else if (flags & SRE_INFO_CHARSET)
1514 /* pattern starts with a character from a known set */
1515 /* <charset> */
1516 charset = pattern + 5;
1518 pattern += 1 + pattern[1];
1521 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1522 TRACE(("charset = %p\n", charset));
1524 #if defined(USE_FAST_SEARCH)
1525 if (prefix_len > 1) {
1526 /* pattern starts with a known prefix. use the overlap
1527 table to skip forward as fast as we possibly can */
1528 Py_ssize_t i = 0;
1529 end = (SRE_CHAR *)state->end;
1530 while (ptr < end) {
1531 for (;;) {
1532 if ((SRE_CODE) ptr[0] != prefix[i]) {
1533 if (!i)
1534 break;
1535 else
1536 i = overlap[i];
1537 } else {
1538 if (++i == prefix_len) {
1539 /* found a potential match */
1540 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1541 state->start = ptr + 1 - prefix_len;
1542 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1543 if (flags & SRE_INFO_LITERAL)
1544 return 1; /* we got all of it */
1545 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1546 if (status != 0)
1547 return status;
1548 /* close but no cigar -- try again */
1549 i = overlap[i];
1551 break;
1554 ptr++;
1556 return 0;
1558 #endif
1560 if (pattern[0] == SRE_OP_LITERAL) {
1561 /* pattern starts with a literal character. this is used
1562 for short prefixes, and if fast search is disabled */
1563 SRE_CODE chr = pattern[1];
1564 end = (SRE_CHAR *)state->end;
1565 for (;;) {
1566 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1567 ptr++;
1568 if (ptr >= end)
1569 return 0;
1570 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1571 state->start = ptr;
1572 state->ptr = ++ptr;
1573 if (flags & SRE_INFO_LITERAL)
1574 return 1; /* we got all of it */
1575 status = SRE_MATCH(state, pattern + 2);
1576 if (status != 0)
1577 break;
1579 } else if (charset) {
1580 /* pattern starts with a character from a known set */
1581 end = (SRE_CHAR *)state->end;
1582 for (;;) {
1583 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1584 ptr++;
1585 if (ptr >= end)
1586 return 0;
1587 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1588 state->start = ptr;
1589 state->ptr = ptr;
1590 status = SRE_MATCH(state, pattern);
1591 if (status != 0)
1592 break;
1593 ptr++;
1595 } else
1596 /* general case */
1597 while (ptr <= end) {
1598 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1599 state->start = state->ptr = ptr++;
1600 status = SRE_MATCH(state, pattern);
1601 if (status != 0)
1602 break;
1605 return status;
1608 LOCAL(int)
1609 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1611 /* check if given string is a literal template (i.e. no escapes) */
1612 while (len-- > 0)
1613 if (*ptr++ == '\\')
1614 return 0;
1615 return 1;
1618 #if !defined(SRE_RECURSIVE)
1620 /* -------------------------------------------------------------------- */
1621 /* factories and destructors */
1623 /* see sre.h for object declarations */
1624 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1625 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1627 static PyObject *
1628 sre_codesize(PyObject* self, PyObject *unused)
1630 return Py_BuildValue("l", sizeof(SRE_CODE));
1633 static PyObject *
1634 sre_getlower(PyObject* self, PyObject* args)
1636 int character, flags;
1637 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1638 return NULL;
1639 if (flags & SRE_FLAG_LOCALE)
1640 return Py_BuildValue("i", sre_lower_locale(character));
1641 if (flags & SRE_FLAG_UNICODE)
1642 #if defined(HAVE_UNICODE)
1643 return Py_BuildValue("i", sre_lower_unicode(character));
1644 #else
1645 return Py_BuildValue("i", sre_lower_locale(character));
1646 #endif
1647 return Py_BuildValue("i", sre_lower(character));
1650 LOCAL(void)
1651 state_reset(SRE_STATE* state)
1653 /* FIXME: dynamic! */
1654 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1656 state->lastmark = -1;
1657 state->lastindex = -1;
1659 state->repeat = NULL;
1661 data_stack_dealloc(state);
1664 static void*
1665 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1667 /* given a python object, return a data pointer, a length (in
1668 characters), and a character size. return NULL if the object
1669 is not a string (or not compatible) */
1671 PyBufferProcs *buffer;
1672 Py_ssize_t size, bytes;
1673 int charsize;
1674 void* ptr;
1676 #if defined(HAVE_UNICODE)
1677 if (PyUnicode_Check(string)) {
1678 /* unicode strings doesn't always support the buffer interface */
1679 ptr = (void*) PyUnicode_AS_DATA(string);
1680 bytes = PyUnicode_GET_DATA_SIZE(string);
1681 size = PyUnicode_GET_SIZE(string);
1682 charsize = sizeof(Py_UNICODE);
1684 } else {
1685 #endif
1687 /* get pointer to string buffer */
1688 buffer = string->ob_type->tp_as_buffer;
1689 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1690 buffer->bf_getsegcount(string, NULL) != 1) {
1691 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1692 return NULL;
1695 /* determine buffer size */
1696 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1697 if (bytes < 0) {
1698 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1699 return NULL;
1702 /* determine character size */
1703 #if PY_VERSION_HEX >= 0x01060000
1704 size = PyObject_Size(string);
1705 #else
1706 size = PyObject_Length(string);
1707 #endif
1709 if (PyString_Check(string) || bytes == size)
1710 charsize = 1;
1711 #if defined(HAVE_UNICODE)
1712 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1713 charsize = sizeof(Py_UNICODE);
1714 #endif
1715 else {
1716 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1717 return NULL;
1720 #if defined(HAVE_UNICODE)
1722 #endif
1724 *p_length = size;
1725 *p_charsize = charsize;
1727 return ptr;
1730 LOCAL(PyObject*)
1731 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1732 Py_ssize_t start, Py_ssize_t end)
1734 /* prepare state object */
1736 Py_ssize_t length;
1737 int charsize;
1738 void* ptr;
1740 memset(state, 0, sizeof(SRE_STATE));
1742 state->lastmark = -1;
1743 state->lastindex = -1;
1745 ptr = getstring(string, &length, &charsize);
1746 if (!ptr)
1747 return NULL;
1749 /* adjust boundaries */
1750 if (start < 0)
1751 start = 0;
1752 else if (start > length)
1753 start = length;
1755 if (end < 0)
1756 end = 0;
1757 else if (end > length)
1758 end = length;
1760 state->charsize = charsize;
1762 state->beginning = ptr;
1764 state->start = (void*) ((char*) ptr + start * state->charsize);
1765 state->end = (void*) ((char*) ptr + end * state->charsize);
1767 Py_INCREF(string);
1768 state->string = string;
1769 state->pos = start;
1770 state->endpos = end;
1772 if (pattern->flags & SRE_FLAG_LOCALE)
1773 state->lower = sre_lower_locale;
1774 else if (pattern->flags & SRE_FLAG_UNICODE)
1775 #if defined(HAVE_UNICODE)
1776 state->lower = sre_lower_unicode;
1777 #else
1778 state->lower = sre_lower_locale;
1779 #endif
1780 else
1781 state->lower = sre_lower;
1783 return string;
1786 LOCAL(void)
1787 state_fini(SRE_STATE* state)
1789 Py_XDECREF(state->string);
1790 data_stack_dealloc(state);
1793 /* calculate offset from start of string */
1794 #define STATE_OFFSET(state, member)\
1795 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1797 LOCAL(PyObject*)
1798 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1800 Py_ssize_t i, j;
1802 index = (index - 1) * 2;
1804 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1805 if (empty)
1806 /* want empty string */
1807 i = j = 0;
1808 else {
1809 Py_INCREF(Py_None);
1810 return Py_None;
1812 } else {
1813 i = STATE_OFFSET(state, state->mark[index]);
1814 j = STATE_OFFSET(state, state->mark[index+1]);
1817 return PySequence_GetSlice(string, i, j);
1820 static void
1821 pattern_error(int status)
1823 switch (status) {
1824 case SRE_ERROR_RECURSION_LIMIT:
1825 PyErr_SetString(
1826 PyExc_RuntimeError,
1827 "maximum recursion limit exceeded"
1829 break;
1830 case SRE_ERROR_MEMORY:
1831 PyErr_NoMemory();
1832 break;
1833 default:
1834 /* other error codes indicate compiler/engine bugs */
1835 PyErr_SetString(
1836 PyExc_RuntimeError,
1837 "internal error in regular expression engine"
1842 static void
1843 pattern_dealloc(PatternObject* self)
1845 if (self->weakreflist != NULL)
1846 PyObject_ClearWeakRefs((PyObject *) self);
1847 Py_XDECREF(self->pattern);
1848 Py_XDECREF(self->groupindex);
1849 Py_XDECREF(self->indexgroup);
1850 PyObject_DEL(self);
1853 static PyObject*
1854 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1856 SRE_STATE state;
1857 int status;
1859 PyObject* string;
1860 Py_ssize_t start = 0;
1861 Py_ssize_t end = PY_SSIZE_T_MAX;
1862 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1863 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1864 &string, &start, &end))
1865 return NULL;
1867 string = state_init(&state, self, string, start, end);
1868 if (!string)
1869 return NULL;
1871 state.ptr = state.start;
1873 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1875 if (state.charsize == 1) {
1876 status = sre_match(&state, PatternObject_GetCode(self));
1877 } else {
1878 #if defined(HAVE_UNICODE)
1879 status = sre_umatch(&state, PatternObject_GetCode(self));
1880 #endif
1883 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1885 state_fini(&state);
1887 return pattern_new_match(self, &state, status);
1890 static PyObject*
1891 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1893 SRE_STATE state;
1894 int status;
1896 PyObject* string;
1897 Py_ssize_t start = 0;
1898 Py_ssize_t end = PY_SSIZE_T_MAX;
1899 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1900 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1901 &string, &start, &end))
1902 return NULL;
1904 string = state_init(&state, self, string, start, end);
1905 if (!string)
1906 return NULL;
1908 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1910 if (state.charsize == 1) {
1911 status = sre_search(&state, PatternObject_GetCode(self));
1912 } else {
1913 #if defined(HAVE_UNICODE)
1914 status = sre_usearch(&state, PatternObject_GetCode(self));
1915 #endif
1918 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1920 state_fini(&state);
1922 return pattern_new_match(self, &state, status);
1925 static PyObject*
1926 call(char* module, char* function, PyObject* args)
1928 PyObject* name;
1929 PyObject* mod;
1930 PyObject* func;
1931 PyObject* result;
1933 if (!args)
1934 return NULL;
1935 name = PyString_FromString(module);
1936 if (!name)
1937 return NULL;
1938 mod = PyImport_Import(name);
1939 Py_DECREF(name);
1940 if (!mod)
1941 return NULL;
1942 func = PyObject_GetAttrString(mod, function);
1943 Py_DECREF(mod);
1944 if (!func)
1945 return NULL;
1946 result = PyObject_CallObject(func, args);
1947 Py_DECREF(func);
1948 Py_DECREF(args);
1949 return result;
1952 #ifdef USE_BUILTIN_COPY
1953 static int
1954 deepcopy(PyObject** object, PyObject* memo)
1956 PyObject* copy;
1958 copy = call(
1959 "copy", "deepcopy",
1960 PyTuple_Pack(2, *object, memo)
1962 if (!copy)
1963 return 0;
1965 Py_DECREF(*object);
1966 *object = copy;
1968 return 1; /* success */
1970 #endif
1972 static PyObject*
1973 join_list(PyObject* list, PyObject* pattern)
1975 /* join list elements */
1977 PyObject* joiner;
1978 #if PY_VERSION_HEX >= 0x01060000
1979 PyObject* function;
1980 PyObject* args;
1981 #endif
1982 PyObject* result;
1984 switch (PyList_GET_SIZE(list)) {
1985 case 0:
1986 Py_DECREF(list);
1987 return PySequence_GetSlice(pattern, 0, 0);
1988 case 1:
1989 result = PyList_GET_ITEM(list, 0);
1990 Py_INCREF(result);
1991 Py_DECREF(list);
1992 return result;
1995 /* two or more elements: slice out a suitable separator from the
1996 first member, and use that to join the entire list */
1998 joiner = PySequence_GetSlice(pattern, 0, 0);
1999 if (!joiner)
2000 return NULL;
2002 #if PY_VERSION_HEX >= 0x01060000
2003 function = PyObject_GetAttrString(joiner, "join");
2004 if (!function) {
2005 Py_DECREF(joiner);
2006 return NULL;
2008 args = PyTuple_New(1);
2009 if (!args) {
2010 Py_DECREF(function);
2011 Py_DECREF(joiner);
2012 return NULL;
2014 PyTuple_SET_ITEM(args, 0, list);
2015 result = PyObject_CallObject(function, args);
2016 Py_DECREF(args); /* also removes list */
2017 Py_DECREF(function);
2018 #else
2019 result = call(
2020 "string", "join",
2021 PyTuple_Pack(2, list, joiner)
2023 #endif
2024 Py_DECREF(joiner);
2026 return result;
2029 static PyObject*
2030 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2032 SRE_STATE state;
2033 PyObject* list;
2034 int status;
2035 Py_ssize_t i, b, e;
2037 PyObject* string;
2038 Py_ssize_t start = 0;
2039 Py_ssize_t end = PY_SSIZE_T_MAX;
2040 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2041 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2042 &string, &start, &end))
2043 return NULL;
2045 string = state_init(&state, self, string, start, end);
2046 if (!string)
2047 return NULL;
2049 list = PyList_New(0);
2050 if (!list) {
2051 state_fini(&state);
2052 return NULL;
2055 while (state.start <= state.end) {
2057 PyObject* item;
2059 state_reset(&state);
2061 state.ptr = state.start;
2063 if (state.charsize == 1) {
2064 status = sre_search(&state, PatternObject_GetCode(self));
2065 } else {
2066 #if defined(HAVE_UNICODE)
2067 status = sre_usearch(&state, PatternObject_GetCode(self));
2068 #endif
2071 if (status <= 0) {
2072 if (status == 0)
2073 break;
2074 pattern_error(status);
2075 goto error;
2078 /* don't bother to build a match object */
2079 switch (self->groups) {
2080 case 0:
2081 b = STATE_OFFSET(&state, state.start);
2082 e = STATE_OFFSET(&state, state.ptr);
2083 item = PySequence_GetSlice(string, b, e);
2084 if (!item)
2085 goto error;
2086 break;
2087 case 1:
2088 item = state_getslice(&state, 1, string, 1);
2089 if (!item)
2090 goto error;
2091 break;
2092 default:
2093 item = PyTuple_New(self->groups);
2094 if (!item)
2095 goto error;
2096 for (i = 0; i < self->groups; i++) {
2097 PyObject* o = state_getslice(&state, i+1, string, 1);
2098 if (!o) {
2099 Py_DECREF(item);
2100 goto error;
2102 PyTuple_SET_ITEM(item, i, o);
2104 break;
2107 status = PyList_Append(list, item);
2108 Py_DECREF(item);
2109 if (status < 0)
2110 goto error;
2112 if (state.ptr == state.start)
2113 state.start = (void*) ((char*) state.ptr + state.charsize);
2114 else
2115 state.start = state.ptr;
2119 state_fini(&state);
2120 return list;
2122 error:
2123 Py_DECREF(list);
2124 state_fini(&state);
2125 return NULL;
2129 #if PY_VERSION_HEX >= 0x02020000
2130 static PyObject*
2131 pattern_finditer(PatternObject* pattern, PyObject* args)
2133 PyObject* scanner;
2134 PyObject* search;
2135 PyObject* iterator;
2137 scanner = pattern_scanner(pattern, args);
2138 if (!scanner)
2139 return NULL;
2141 search = PyObject_GetAttrString(scanner, "search");
2142 Py_DECREF(scanner);
2143 if (!search)
2144 return NULL;
2146 iterator = PyCallIter_New(search, Py_None);
2147 Py_DECREF(search);
2149 return iterator;
2151 #endif
2153 static PyObject*
2154 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2156 SRE_STATE state;
2157 PyObject* list;
2158 PyObject* item;
2159 int status;
2160 Py_ssize_t n;
2161 Py_ssize_t i;
2162 void* last;
2164 PyObject* string;
2165 Py_ssize_t maxsplit = 0;
2166 static char* kwlist[] = { "source", "maxsplit", NULL };
2167 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2168 &string, &maxsplit))
2169 return NULL;
2171 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2172 if (!string)
2173 return NULL;
2175 list = PyList_New(0);
2176 if (!list) {
2177 state_fini(&state);
2178 return NULL;
2181 n = 0;
2182 last = state.start;
2184 while (!maxsplit || n < maxsplit) {
2186 state_reset(&state);
2188 state.ptr = state.start;
2190 if (state.charsize == 1) {
2191 status = sre_search(&state, PatternObject_GetCode(self));
2192 } else {
2193 #if defined(HAVE_UNICODE)
2194 status = sre_usearch(&state, PatternObject_GetCode(self));
2195 #endif
2198 if (status <= 0) {
2199 if (status == 0)
2200 break;
2201 pattern_error(status);
2202 goto error;
2205 if (state.start == state.ptr) {
2206 if (last == state.end)
2207 break;
2208 /* skip one character */
2209 state.start = (void*) ((char*) state.ptr + state.charsize);
2210 continue;
2213 /* get segment before this match */
2214 item = PySequence_GetSlice(
2215 string, STATE_OFFSET(&state, last),
2216 STATE_OFFSET(&state, state.start)
2218 if (!item)
2219 goto error;
2220 status = PyList_Append(list, item);
2221 Py_DECREF(item);
2222 if (status < 0)
2223 goto error;
2225 /* add groups (if any) */
2226 for (i = 0; i < self->groups; i++) {
2227 item = state_getslice(&state, i+1, string, 0);
2228 if (!item)
2229 goto error;
2230 status = PyList_Append(list, item);
2231 Py_DECREF(item);
2232 if (status < 0)
2233 goto error;
2236 n = n + 1;
2238 last = state.start = state.ptr;
2242 /* get segment following last match (even if empty) */
2243 item = PySequence_GetSlice(
2244 string, STATE_OFFSET(&state, last), state.endpos
2246 if (!item)
2247 goto error;
2248 status = PyList_Append(list, item);
2249 Py_DECREF(item);
2250 if (status < 0)
2251 goto error;
2253 state_fini(&state);
2254 return list;
2256 error:
2257 Py_DECREF(list);
2258 state_fini(&state);
2259 return NULL;
2263 static PyObject*
2264 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2265 Py_ssize_t count, Py_ssize_t subn)
2267 SRE_STATE state;
2268 PyObject* list;
2269 PyObject* item;
2270 PyObject* filter;
2271 PyObject* args;
2272 PyObject* match;
2273 void* ptr;
2274 int status;
2275 Py_ssize_t n;
2276 Py_ssize_t i, b, e;
2277 int bint;
2278 int filter_is_callable;
2280 if (PyCallable_Check(ptemplate)) {
2281 /* sub/subn takes either a function or a template */
2282 filter = ptemplate;
2283 Py_INCREF(filter);
2284 filter_is_callable = 1;
2285 } else {
2286 /* if not callable, check if it's a literal string */
2287 int literal;
2288 ptr = getstring(ptemplate, &n, &bint);
2289 b = bint;
2290 if (ptr) {
2291 if (b == 1) {
2292 literal = sre_literal_template((unsigned char *)ptr, n);
2293 } else {
2294 #if defined(HAVE_UNICODE)
2295 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2296 #endif
2298 } else {
2299 PyErr_Clear();
2300 literal = 0;
2302 if (literal) {
2303 filter = ptemplate;
2304 Py_INCREF(filter);
2305 filter_is_callable = 0;
2306 } else {
2307 /* not a literal; hand it over to the template compiler */
2308 filter = call(
2309 SRE_PY_MODULE, "_subx",
2310 PyTuple_Pack(2, self, ptemplate)
2312 if (!filter)
2313 return NULL;
2314 filter_is_callable = PyCallable_Check(filter);
2318 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2319 if (!string) {
2320 Py_DECREF(filter);
2321 return NULL;
2324 list = PyList_New(0);
2325 if (!list) {
2326 Py_DECREF(filter);
2327 state_fini(&state);
2328 return NULL;
2331 n = i = 0;
2333 while (!count || n < count) {
2335 state_reset(&state);
2337 state.ptr = state.start;
2339 if (state.charsize == 1) {
2340 status = sre_search(&state, PatternObject_GetCode(self));
2341 } else {
2342 #if defined(HAVE_UNICODE)
2343 status = sre_usearch(&state, PatternObject_GetCode(self));
2344 #endif
2347 if (status <= 0) {
2348 if (status == 0)
2349 break;
2350 pattern_error(status);
2351 goto error;
2354 b = STATE_OFFSET(&state, state.start);
2355 e = STATE_OFFSET(&state, state.ptr);
2357 if (i < b) {
2358 /* get segment before this match */
2359 item = PySequence_GetSlice(string, i, b);
2360 if (!item)
2361 goto error;
2362 status = PyList_Append(list, item);
2363 Py_DECREF(item);
2364 if (status < 0)
2365 goto error;
2367 } else if (i == b && i == e && n > 0)
2368 /* ignore empty match on latest position */
2369 goto next;
2371 if (filter_is_callable) {
2372 /* pass match object through filter */
2373 match = pattern_new_match(self, &state, 1);
2374 if (!match)
2375 goto error;
2376 args = PyTuple_Pack(1, match);
2377 if (!args) {
2378 Py_DECREF(match);
2379 goto error;
2381 item = PyObject_CallObject(filter, args);
2382 Py_DECREF(args);
2383 Py_DECREF(match);
2384 if (!item)
2385 goto error;
2386 } else {
2387 /* filter is literal string */
2388 item = filter;
2389 Py_INCREF(item);
2392 /* add to list */
2393 if (item != Py_None) {
2394 status = PyList_Append(list, item);
2395 Py_DECREF(item);
2396 if (status < 0)
2397 goto error;
2400 i = e;
2401 n = n + 1;
2403 next:
2404 /* move on */
2405 if (state.ptr == state.start)
2406 state.start = (void*) ((char*) state.ptr + state.charsize);
2407 else
2408 state.start = state.ptr;
2412 /* get segment following last match */
2413 if (i < state.endpos) {
2414 item = PySequence_GetSlice(string, i, state.endpos);
2415 if (!item)
2416 goto error;
2417 status = PyList_Append(list, item);
2418 Py_DECREF(item);
2419 if (status < 0)
2420 goto error;
2423 state_fini(&state);
2425 Py_DECREF(filter);
2427 /* convert list to single string (also removes list) */
2428 item = join_list(list, self->pattern);
2430 if (!item)
2431 return NULL;
2433 if (subn)
2434 return Py_BuildValue("Ni", item, n);
2436 return item;
2438 error:
2439 Py_DECREF(list);
2440 state_fini(&state);
2441 Py_DECREF(filter);
2442 return NULL;
2446 static PyObject*
2447 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2449 PyObject* ptemplate;
2450 PyObject* string;
2451 Py_ssize_t count = 0;
2452 static char* kwlist[] = { "repl", "string", "count", NULL };
2453 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2454 &ptemplate, &string, &count))
2455 return NULL;
2457 return pattern_subx(self, ptemplate, string, count, 0);
2460 static PyObject*
2461 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2463 PyObject* ptemplate;
2464 PyObject* string;
2465 Py_ssize_t count = 0;
2466 static char* kwlist[] = { "repl", "string", "count", NULL };
2467 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2468 &ptemplate, &string, &count))
2469 return NULL;
2471 return pattern_subx(self, ptemplate, string, count, 1);
2474 static PyObject*
2475 pattern_copy(PatternObject* self, PyObject *unused)
2477 #ifdef USE_BUILTIN_COPY
2478 PatternObject* copy;
2479 int offset;
2481 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2482 if (!copy)
2483 return NULL;
2485 offset = offsetof(PatternObject, groups);
2487 Py_XINCREF(self->groupindex);
2488 Py_XINCREF(self->indexgroup);
2489 Py_XINCREF(self->pattern);
2491 memcpy((char*) copy + offset, (char*) self + offset,
2492 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2493 copy->weakreflist = NULL;
2495 return (PyObject*) copy;
2496 #else
2497 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2498 return NULL;
2499 #endif
2502 static PyObject*
2503 pattern_deepcopy(PatternObject* self, PyObject* memo)
2505 #ifdef USE_BUILTIN_COPY
2506 PatternObject* copy;
2508 copy = (PatternObject*) pattern_copy(self);
2509 if (!copy)
2510 return NULL;
2512 if (!deepcopy(&copy->groupindex, memo) ||
2513 !deepcopy(&copy->indexgroup, memo) ||
2514 !deepcopy(&copy->pattern, memo)) {
2515 Py_DECREF(copy);
2516 return NULL;
2519 #else
2520 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2521 return NULL;
2522 #endif
2525 PyDoc_STRVAR(pattern_match_doc,
2526 "match(string[, pos[, endpos]]) --> match object or None.\n\
2527 Matches zero or more characters at the beginning of the string");
2529 PyDoc_STRVAR(pattern_search_doc,
2530 "search(string[, pos[, endpos]]) --> match object or None.\n\
2531 Scan through string looking for a match, and return a corresponding\n\
2532 MatchObject instance. Return None if no position in the string matches.");
2534 PyDoc_STRVAR(pattern_split_doc,
2535 "split(string[, maxsplit = 0]) --> list.\n\
2536 Split string by the occurrences of pattern.");
2538 PyDoc_STRVAR(pattern_findall_doc,
2539 "findall(string[, pos[, endpos]]) --> list.\n\
2540 Return a list of all non-overlapping matches of pattern in string.");
2542 PyDoc_STRVAR(pattern_finditer_doc,
2543 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2544 Return an iterator over all non-overlapping matches for the \n\
2545 RE pattern in string. For each match, the iterator returns a\n\
2546 match object.");
2548 PyDoc_STRVAR(pattern_sub_doc,
2549 "sub(repl, string[, count = 0]) --> newstring\n\
2550 Return the string obtained by replacing the leftmost non-overlapping\n\
2551 occurrences of pattern in string by the replacement repl.");
2553 PyDoc_STRVAR(pattern_subn_doc,
2554 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2555 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2556 the leftmost non-overlapping occurrences of pattern with the\n\
2557 replacement repl.");
2559 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2561 static PyMethodDef pattern_methods[] = {
2562 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2563 pattern_match_doc},
2564 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2565 pattern_search_doc},
2566 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2567 pattern_sub_doc},
2568 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2569 pattern_subn_doc},
2570 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2571 pattern_split_doc},
2572 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2573 pattern_findall_doc},
2574 #if PY_VERSION_HEX >= 0x02020000
2575 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2576 pattern_finditer_doc},
2577 #endif
2578 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2579 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2580 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2581 {NULL, NULL}
2584 static PyObject*
2585 pattern_getattr(PatternObject* self, char* name)
2587 PyObject* res;
2589 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
2591 if (res)
2592 return res;
2594 PyErr_Clear();
2596 /* attributes */
2597 if (!strcmp(name, "pattern")) {
2598 Py_INCREF(self->pattern);
2599 return self->pattern;
2602 if (!strcmp(name, "flags"))
2603 return Py_BuildValue("i", self->flags);
2605 if (!strcmp(name, "groups"))
2606 return Py_BuildValue("i", self->groups);
2608 if (!strcmp(name, "groupindex") && self->groupindex) {
2609 Py_INCREF(self->groupindex);
2610 return self->groupindex;
2613 PyErr_SetString(PyExc_AttributeError, name);
2614 return NULL;
2617 statichere PyTypeObject Pattern_Type = {
2618 PyObject_HEAD_INIT(NULL)
2619 0, "_" SRE_MODULE ".SRE_Pattern",
2620 sizeof(PatternObject), sizeof(SRE_CODE),
2621 (destructor)pattern_dealloc, /*tp_dealloc*/
2622 0, /*tp_print*/
2623 (getattrfunc)pattern_getattr, /*tp_getattr*/
2624 0, /* tp_setattr */
2625 0, /* tp_compare */
2626 0, /* tp_repr */
2627 0, /* tp_as_number */
2628 0, /* tp_as_sequence */
2629 0, /* tp_as_mapping */
2630 0, /* tp_hash */
2631 0, /* tp_call */
2632 0, /* tp_str */
2633 0, /* tp_getattro */
2634 0, /* tp_setattro */
2635 0, /* tp_as_buffer */
2636 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
2637 pattern_doc, /* tp_doc */
2638 0, /* tp_traverse */
2639 0, /* tp_clear */
2640 0, /* tp_richcompare */
2641 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2644 static PyObject *
2645 _compile(PyObject* self_, PyObject* args)
2647 /* "compile" pattern descriptor to pattern object */
2649 PatternObject* self;
2650 Py_ssize_t i, n;
2652 PyObject* pattern;
2653 int flags = 0;
2654 PyObject* code;
2655 Py_ssize_t groups = 0;
2656 PyObject* groupindex = NULL;
2657 PyObject* indexgroup = NULL;
2658 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2659 &PyList_Type, &code, &groups,
2660 &groupindex, &indexgroup))
2661 return NULL;
2663 n = PyList_GET_SIZE(code);
2665 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2666 if (!self)
2667 return NULL;
2669 self->codesize = n;
2671 for (i = 0; i < n; i++) {
2672 PyObject *o = PyList_GET_ITEM(code, i);
2673 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2674 : PyLong_AsUnsignedLong(o);
2675 self->code[i] = (SRE_CODE) value;
2676 if ((unsigned long) self->code[i] != value) {
2677 PyErr_SetString(PyExc_OverflowError,
2678 "regular expression code size limit exceeded");
2679 break;
2683 if (PyErr_Occurred()) {
2684 PyObject_DEL(self);
2685 return NULL;
2688 Py_INCREF(pattern);
2689 self->pattern = pattern;
2691 self->flags = flags;
2693 self->groups = groups;
2695 Py_XINCREF(groupindex);
2696 self->groupindex = groupindex;
2698 Py_XINCREF(indexgroup);
2699 self->indexgroup = indexgroup;
2701 self->weakreflist = NULL;
2703 return (PyObject*) self;
2706 /* -------------------------------------------------------------------- */
2707 /* match methods */
2709 static void
2710 match_dealloc(MatchObject* self)
2712 Py_XDECREF(self->regs);
2713 Py_XDECREF(self->string);
2714 Py_DECREF(self->pattern);
2715 PyObject_DEL(self);
2718 static PyObject*
2719 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
2721 if (index < 0 || index >= self->groups) {
2722 /* raise IndexError if we were given a bad group number */
2723 PyErr_SetString(
2724 PyExc_IndexError,
2725 "no such group"
2727 return NULL;
2730 index *= 2;
2732 if (self->string == Py_None || self->mark[index] < 0) {
2733 /* return default value if the string or group is undefined */
2734 Py_INCREF(def);
2735 return def;
2738 return PySequence_GetSlice(
2739 self->string, self->mark[index], self->mark[index+1]
2743 static Py_ssize_t
2744 match_getindex(MatchObject* self, PyObject* index)
2746 Py_ssize_t i;
2748 if (PyInt_Check(index))
2749 return PyInt_AsSsize_t(index);
2751 i = -1;
2753 if (self->pattern->groupindex) {
2754 index = PyObject_GetItem(self->pattern->groupindex, index);
2755 if (index) {
2756 if (PyInt_Check(index) || PyLong_Check(index))
2757 i = PyInt_AsSsize_t(index);
2758 Py_DECREF(index);
2759 } else
2760 PyErr_Clear();
2763 return i;
2766 static PyObject*
2767 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2769 return match_getslice_by_index(self, match_getindex(self, index), def);
2772 static PyObject*
2773 match_expand(MatchObject* self, PyObject* ptemplate)
2775 /* delegate to Python code */
2776 return call(
2777 SRE_PY_MODULE, "_expand",
2778 PyTuple_Pack(3, self->pattern, self, ptemplate)
2782 static PyObject*
2783 match_group(MatchObject* self, PyObject* args)
2785 PyObject* result;
2786 Py_ssize_t i, size;
2788 size = PyTuple_GET_SIZE(args);
2790 switch (size) {
2791 case 0:
2792 result = match_getslice(self, Py_False, Py_None);
2793 break;
2794 case 1:
2795 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2796 break;
2797 default:
2798 /* fetch multiple items */
2799 result = PyTuple_New(size);
2800 if (!result)
2801 return NULL;
2802 for (i = 0; i < size; i++) {
2803 PyObject* item = match_getslice(
2804 self, PyTuple_GET_ITEM(args, i), Py_None
2806 if (!item) {
2807 Py_DECREF(result);
2808 return NULL;
2810 PyTuple_SET_ITEM(result, i, item);
2812 break;
2814 return result;
2817 static PyObject*
2818 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
2820 PyObject* result;
2821 Py_ssize_t index;
2823 PyObject* def = Py_None;
2824 static char* kwlist[] = { "default", NULL };
2825 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
2826 return NULL;
2828 result = PyTuple_New(self->groups-1);
2829 if (!result)
2830 return NULL;
2832 for (index = 1; index < self->groups; index++) {
2833 PyObject* item;
2834 item = match_getslice_by_index(self, index, def);
2835 if (!item) {
2836 Py_DECREF(result);
2837 return NULL;
2839 PyTuple_SET_ITEM(result, index-1, item);
2842 return result;
2845 static PyObject*
2846 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
2848 PyObject* result;
2849 PyObject* keys;
2850 Py_ssize_t index;
2852 PyObject* def = Py_None;
2853 static char* kwlist[] = { "default", NULL };
2854 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
2855 return NULL;
2857 result = PyDict_New();
2858 if (!result || !self->pattern->groupindex)
2859 return result;
2861 keys = PyMapping_Keys(self->pattern->groupindex);
2862 if (!keys)
2863 goto failed;
2865 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
2866 int status;
2867 PyObject* key;
2868 PyObject* value;
2869 key = PyList_GET_ITEM(keys, index);
2870 if (!key)
2871 goto failed;
2872 value = match_getslice(self, key, def);
2873 if (!value) {
2874 Py_DECREF(key);
2875 goto failed;
2877 status = PyDict_SetItem(result, key, value);
2878 Py_DECREF(value);
2879 if (status < 0)
2880 goto failed;
2883 Py_DECREF(keys);
2885 return result;
2887 failed:
2888 Py_XDECREF(keys);
2889 Py_DECREF(result);
2890 return NULL;
2893 static PyObject*
2894 match_start(MatchObject* self, PyObject* args)
2896 Py_ssize_t index;
2898 PyObject* index_ = Py_False; /* zero */
2899 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
2900 return NULL;
2902 index = match_getindex(self, index_);
2904 if (index < 0 || index >= self->groups) {
2905 PyErr_SetString(
2906 PyExc_IndexError,
2907 "no such group"
2909 return NULL;
2912 /* mark is -1 if group is undefined */
2913 return Py_BuildValue("i", self->mark[index*2]);
2916 static PyObject*
2917 match_end(MatchObject* self, PyObject* args)
2919 Py_ssize_t index;
2921 PyObject* index_ = Py_False; /* zero */
2922 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
2923 return NULL;
2925 index = match_getindex(self, index_);
2927 if (index < 0 || index >= self->groups) {
2928 PyErr_SetString(
2929 PyExc_IndexError,
2930 "no such group"
2932 return NULL;
2935 /* mark is -1 if group is undefined */
2936 return Py_BuildValue("i", self->mark[index*2+1]);
2939 LOCAL(PyObject*)
2940 _pair(Py_ssize_t i1, Py_ssize_t i2)
2942 PyObject* pair;
2943 PyObject* item;
2945 pair = PyTuple_New(2);
2946 if (!pair)
2947 return NULL;
2949 item = PyInt_FromSsize_t(i1);
2950 if (!item)
2951 goto error;
2952 PyTuple_SET_ITEM(pair, 0, item);
2954 item = PyInt_FromSsize_t(i2);
2955 if (!item)
2956 goto error;
2957 PyTuple_SET_ITEM(pair, 1, item);
2959 return pair;
2961 error:
2962 Py_DECREF(pair);
2963 return NULL;
2966 static PyObject*
2967 match_span(MatchObject* self, PyObject* args)
2969 Py_ssize_t index;
2971 PyObject* index_ = Py_False; /* zero */
2972 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
2973 return NULL;
2975 index = match_getindex(self, index_);
2977 if (index < 0 || index >= self->groups) {
2978 PyErr_SetString(
2979 PyExc_IndexError,
2980 "no such group"
2982 return NULL;
2985 /* marks are -1 if group is undefined */
2986 return _pair(self->mark[index*2], self->mark[index*2+1]);
2989 static PyObject*
2990 match_regs(MatchObject* self)
2992 PyObject* regs;
2993 PyObject* item;
2994 Py_ssize_t index;
2996 regs = PyTuple_New(self->groups);
2997 if (!regs)
2998 return NULL;
3000 for (index = 0; index < self->groups; index++) {
3001 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3002 if (!item) {
3003 Py_DECREF(regs);
3004 return NULL;
3006 PyTuple_SET_ITEM(regs, index, item);
3009 Py_INCREF(regs);
3010 self->regs = regs;
3012 return regs;
3015 static PyObject*
3016 match_copy(MatchObject* self, PyObject *unused)
3018 #ifdef USE_BUILTIN_COPY
3019 MatchObject* copy;
3020 Py_ssize_t slots, offset;
3022 slots = 2 * (self->pattern->groups+1);
3024 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3025 if (!copy)
3026 return NULL;
3028 /* this value a constant, but any compiler should be able to
3029 figure that out all by itself */
3030 offset = offsetof(MatchObject, string);
3032 Py_XINCREF(self->pattern);
3033 Py_XINCREF(self->string);
3034 Py_XINCREF(self->regs);
3036 memcpy((char*) copy + offset, (char*) self + offset,
3037 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3039 return (PyObject*) copy;
3040 #else
3041 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3042 return NULL;
3043 #endif
3046 static PyObject*
3047 match_deepcopy(MatchObject* self, PyObject* memo)
3049 #ifdef USE_BUILTIN_COPY
3050 MatchObject* copy;
3052 copy = (MatchObject*) match_copy(self);
3053 if (!copy)
3054 return NULL;
3056 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3057 !deepcopy(&copy->string, memo) ||
3058 !deepcopy(&copy->regs, memo)) {
3059 Py_DECREF(copy);
3060 return NULL;
3063 #else
3064 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3065 return NULL;
3066 #endif
3069 static PyMethodDef match_methods[] = {
3070 {"group", (PyCFunction) match_group, METH_VARARGS},
3071 {"start", (PyCFunction) match_start, METH_VARARGS},
3072 {"end", (PyCFunction) match_end, METH_VARARGS},
3073 {"span", (PyCFunction) match_span, METH_VARARGS},
3074 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3075 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3076 {"expand", (PyCFunction) match_expand, METH_O},
3077 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3078 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3079 {NULL, NULL}
3082 static PyObject*
3083 match_getattr(MatchObject* self, char* name)
3085 PyObject* res;
3087 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3088 if (res)
3089 return res;
3091 PyErr_Clear();
3093 if (!strcmp(name, "lastindex")) {
3094 if (self->lastindex >= 0)
3095 return Py_BuildValue("i", self->lastindex);
3096 Py_INCREF(Py_None);
3097 return Py_None;
3100 if (!strcmp(name, "lastgroup")) {
3101 if (self->pattern->indexgroup && self->lastindex >= 0) {
3102 PyObject* result = PySequence_GetItem(
3103 self->pattern->indexgroup, self->lastindex
3105 if (result)
3106 return result;
3107 PyErr_Clear();
3109 Py_INCREF(Py_None);
3110 return Py_None;
3113 if (!strcmp(name, "string")) {
3114 if (self->string) {
3115 Py_INCREF(self->string);
3116 return self->string;
3117 } else {
3118 Py_INCREF(Py_None);
3119 return Py_None;
3123 if (!strcmp(name, "regs")) {
3124 if (self->regs) {
3125 Py_INCREF(self->regs);
3126 return self->regs;
3127 } else
3128 return match_regs(self);
3131 if (!strcmp(name, "re")) {
3132 Py_INCREF(self->pattern);
3133 return (PyObject*) self->pattern;
3136 if (!strcmp(name, "pos"))
3137 return Py_BuildValue("i", self->pos);
3139 if (!strcmp(name, "endpos"))
3140 return Py_BuildValue("i", self->endpos);
3142 PyErr_SetString(PyExc_AttributeError, name);
3143 return NULL;
3146 /* FIXME: implement setattr("string", None) as a special case (to
3147 detach the associated string, if any */
3149 statichere PyTypeObject Match_Type = {
3150 PyObject_HEAD_INIT(NULL)
3151 0, "_" SRE_MODULE ".SRE_Match",
3152 sizeof(MatchObject), sizeof(Py_ssize_t),
3153 (destructor)match_dealloc, /*tp_dealloc*/
3154 0, /*tp_print*/
3155 (getattrfunc)match_getattr /*tp_getattr*/
3158 static PyObject*
3159 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3161 /* create match object (from state object) */
3163 MatchObject* match;
3164 Py_ssize_t i, j;
3165 char* base;
3166 int n;
3168 if (status > 0) {
3170 /* create match object (with room for extra group marks) */
3171 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3172 2*(pattern->groups+1));
3173 if (!match)
3174 return NULL;
3176 Py_INCREF(pattern);
3177 match->pattern = pattern;
3179 Py_INCREF(state->string);
3180 match->string = state->string;
3182 match->regs = NULL;
3183 match->groups = pattern->groups+1;
3185 /* fill in group slices */
3187 base = (char*) state->beginning;
3188 n = state->charsize;
3190 match->mark[0] = ((char*) state->start - base) / n;
3191 match->mark[1] = ((char*) state->ptr - base) / n;
3193 for (i = j = 0; i < pattern->groups; i++, j+=2)
3194 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3195 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3196 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3197 } else
3198 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3200 match->pos = state->pos;
3201 match->endpos = state->endpos;
3203 match->lastindex = state->lastindex;
3205 return (PyObject*) match;
3207 } else if (status == 0) {
3209 /* no match */
3210 Py_INCREF(Py_None);
3211 return Py_None;
3215 /* internal error */
3216 pattern_error(status);
3217 return NULL;
3221 /* -------------------------------------------------------------------- */
3222 /* scanner methods (experimental) */
3224 static void
3225 scanner_dealloc(ScannerObject* self)
3227 state_fini(&self->state);
3228 Py_DECREF(self->pattern);
3229 PyObject_DEL(self);
3232 static PyObject*
3233 scanner_match(ScannerObject* self, PyObject *unused)
3235 SRE_STATE* state = &self->state;
3236 PyObject* match;
3237 int status;
3239 state_reset(state);
3241 state->ptr = state->start;
3243 if (state->charsize == 1) {
3244 status = sre_match(state, PatternObject_GetCode(self->pattern));
3245 } else {
3246 #if defined(HAVE_UNICODE)
3247 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3248 #endif
3251 match = pattern_new_match((PatternObject*) self->pattern,
3252 state, status);
3254 if (status == 0 || state->ptr == state->start)
3255 state->start = (void*) ((char*) state->ptr + state->charsize);
3256 else
3257 state->start = state->ptr;
3259 return match;
3263 static PyObject*
3264 scanner_search(ScannerObject* self, PyObject *unused)
3266 SRE_STATE* state = &self->state;
3267 PyObject* match;
3268 int status;
3270 state_reset(state);
3272 state->ptr = state->start;
3274 if (state->charsize == 1) {
3275 status = sre_search(state, PatternObject_GetCode(self->pattern));
3276 } else {
3277 #if defined(HAVE_UNICODE)
3278 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3279 #endif
3282 match = pattern_new_match((PatternObject*) self->pattern,
3283 state, status);
3285 if (status == 0 || state->ptr == state->start)
3286 state->start = (void*) ((char*) state->ptr + state->charsize);
3287 else
3288 state->start = state->ptr;
3290 return match;
3293 static PyMethodDef scanner_methods[] = {
3294 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3295 {"search", (PyCFunction) scanner_search, METH_NOARGS},
3296 {NULL, NULL}
3299 static PyObject*
3300 scanner_getattr(ScannerObject* self, char* name)
3302 PyObject* res;
3304 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3305 if (res)
3306 return res;
3308 PyErr_Clear();
3310 /* attributes */
3311 if (!strcmp(name, "pattern")) {
3312 Py_INCREF(self->pattern);
3313 return self->pattern;
3316 PyErr_SetString(PyExc_AttributeError, name);
3317 return NULL;
3320 statichere PyTypeObject Scanner_Type = {
3321 PyObject_HEAD_INIT(NULL)
3322 0, "_" SRE_MODULE ".SRE_Scanner",
3323 sizeof(ScannerObject), 0,
3324 (destructor)scanner_dealloc, /*tp_dealloc*/
3325 0, /*tp_print*/
3326 (getattrfunc)scanner_getattr, /*tp_getattr*/
3329 static PyObject*
3330 pattern_scanner(PatternObject* pattern, PyObject* args)
3332 /* create search state object */
3334 ScannerObject* self;
3336 PyObject* string;
3337 Py_ssize_t start = 0;
3338 Py_ssize_t end = PY_SSIZE_T_MAX;
3339 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3340 return NULL;
3342 /* create scanner object */
3343 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3344 if (!self)
3345 return NULL;
3347 string = state_init(&self->state, pattern, string, start, end);
3348 if (!string) {
3349 PyObject_DEL(self);
3350 return NULL;
3353 Py_INCREF(pattern);
3354 self->pattern = (PyObject*) pattern;
3356 return (PyObject*) self;
3359 static PyMethodDef _functions[] = {
3360 {"compile", _compile, METH_VARARGS},
3361 {"getcodesize", sre_codesize, METH_NOARGS},
3362 {"getlower", sre_getlower, METH_VARARGS},
3363 {NULL, NULL}
3366 #if PY_VERSION_HEX < 0x02030000
3367 DL_EXPORT(void) init_sre(void)
3368 #else
3369 PyMODINIT_FUNC init_sre(void)
3370 #endif
3372 PyObject* m;
3373 PyObject* d;
3374 PyObject* x;
3376 /* Patch object types */
3377 Pattern_Type.ob_type = Match_Type.ob_type =
3378 Scanner_Type.ob_type = &PyType_Type;
3380 m = Py_InitModule("_" SRE_MODULE, _functions);
3381 if (m == NULL)
3382 return;
3383 d = PyModule_GetDict(m);
3385 x = PyInt_FromLong(SRE_MAGIC);
3386 if (x) {
3387 PyDict_SetItemString(d, "MAGIC", x);
3388 Py_DECREF(x);
3391 x = PyInt_FromLong(sizeof(SRE_CODE));
3392 if (x) {
3393 PyDict_SetItemString(d, "CODESIZE", x);
3394 Py_DECREF(x);
3397 x = PyString_FromString(copyright);
3398 if (x) {
3399 PyDict_SetItemString(d, "copyright", x);
3400 Py_DECREF(x);
3404 #endif /* !defined(SRE_RECURSIVE) */
3406 /* vim:ts=4:sw=4:et