Updated documentation for findCaller() to indicate that a 3-tuple is now returned...
[python.git] / Modules / _sre.c
blobc1eb71cf2697cd1e26573470643401d11c8bf797
1 /*
2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
37 #ifndef SRE_RECURSIVE
39 static char copyright[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
42 #define PY_SSIZE_T_CLEAN
44 #include "Python.h"
45 #include "structmember.h" /* offsetof */
47 #include "sre.h"
49 #include <ctype.h>
51 /* name of this module, minus the leading underscore */
52 #if !defined(SRE_MODULE)
53 #define SRE_MODULE "sre"
54 #endif
56 #define SRE_PY_MODULE "re"
58 /* defining this one enables tracing */
59 #undef VERBOSE
61 #if PY_VERSION_HEX >= 0x01060000
62 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
63 /* defining this enables unicode support (default under 1.6a1 and later) */
64 #define HAVE_UNICODE
65 #endif
66 #endif
68 /* -------------------------------------------------------------------- */
69 /* optional features */
71 /* enables fast searching */
72 #define USE_FAST_SEARCH
74 /* enables aggressive inlining (always on for Visual C) */
75 #undef USE_INLINE
77 /* enables copy/deepcopy handling (work in progress) */
78 #undef USE_BUILTIN_COPY
80 #if PY_VERSION_HEX < 0x01060000
81 #define PyObject_DEL(op) PyMem_DEL((op))
82 #endif
84 /* -------------------------------------------------------------------- */
86 #if defined(_MSC_VER)
87 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89 /* fastest possible local call under MSVC */
90 #define LOCAL(type) static __inline type __fastcall
91 #elif defined(USE_INLINE)
92 #define LOCAL(type) static inline type
93 #else
94 #define LOCAL(type) static type
95 #endif
97 /* error codes */
98 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99 #define SRE_ERROR_STATE -2 /* illegal state */
100 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101 #define SRE_ERROR_MEMORY -9 /* out of memory */
103 #if defined(VERBOSE)
104 #define TRACE(v) printf v
105 #else
106 #define TRACE(v)
107 #endif
109 /* -------------------------------------------------------------------- */
110 /* search engine state */
112 /* default character predicates (run sre_chars.py to regenerate tables) */
114 #define SRE_DIGIT_MASK 1
115 #define SRE_SPACE_MASK 2
116 #define SRE_LINEBREAK_MASK 4
117 #define SRE_ALNUM_MASK 8
118 #define SRE_WORD_MASK 16
120 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
122 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
123 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
125 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
126 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
127 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
128 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
131 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
132 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
133 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
134 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
135 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
136 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
137 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
138 120, 121, 122, 123, 124, 125, 126, 127 };
140 #define SRE_IS_DIGIT(ch)\
141 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
142 #define SRE_IS_SPACE(ch)\
143 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
144 #define SRE_IS_LINEBREAK(ch)\
145 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
146 #define SRE_IS_ALNUM(ch)\
147 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
148 #define SRE_IS_WORD(ch)\
149 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
151 static unsigned int sre_lower(unsigned int ch)
153 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
156 /* locale-specific character predicates */
157 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
158 * warnings when c's type supports only numbers < N+1 */
159 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
160 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
161 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
162 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
163 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165 static unsigned int sre_lower_locale(unsigned int ch)
167 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
170 /* unicode-specific character predicates */
172 #if defined(HAVE_UNICODE)
174 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
175 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
176 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
177 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
178 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
180 static unsigned int sre_lower_unicode(unsigned int ch)
182 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
185 #endif
187 LOCAL(int)
188 sre_category(SRE_CODE category, unsigned int ch)
190 switch (category) {
192 case SRE_CATEGORY_DIGIT:
193 return SRE_IS_DIGIT(ch);
194 case SRE_CATEGORY_NOT_DIGIT:
195 return !SRE_IS_DIGIT(ch);
196 case SRE_CATEGORY_SPACE:
197 return SRE_IS_SPACE(ch);
198 case SRE_CATEGORY_NOT_SPACE:
199 return !SRE_IS_SPACE(ch);
200 case SRE_CATEGORY_WORD:
201 return SRE_IS_WORD(ch);
202 case SRE_CATEGORY_NOT_WORD:
203 return !SRE_IS_WORD(ch);
204 case SRE_CATEGORY_LINEBREAK:
205 return SRE_IS_LINEBREAK(ch);
206 case SRE_CATEGORY_NOT_LINEBREAK:
207 return !SRE_IS_LINEBREAK(ch);
209 case SRE_CATEGORY_LOC_WORD:
210 return SRE_LOC_IS_WORD(ch);
211 case SRE_CATEGORY_LOC_NOT_WORD:
212 return !SRE_LOC_IS_WORD(ch);
214 #if defined(HAVE_UNICODE)
215 case SRE_CATEGORY_UNI_DIGIT:
216 return SRE_UNI_IS_DIGIT(ch);
217 case SRE_CATEGORY_UNI_NOT_DIGIT:
218 return !SRE_UNI_IS_DIGIT(ch);
219 case SRE_CATEGORY_UNI_SPACE:
220 return SRE_UNI_IS_SPACE(ch);
221 case SRE_CATEGORY_UNI_NOT_SPACE:
222 return !SRE_UNI_IS_SPACE(ch);
223 case SRE_CATEGORY_UNI_WORD:
224 return SRE_UNI_IS_WORD(ch);
225 case SRE_CATEGORY_UNI_NOT_WORD:
226 return !SRE_UNI_IS_WORD(ch);
227 case SRE_CATEGORY_UNI_LINEBREAK:
228 return SRE_UNI_IS_LINEBREAK(ch);
229 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
230 return !SRE_UNI_IS_LINEBREAK(ch);
231 #else
232 case SRE_CATEGORY_UNI_DIGIT:
233 return SRE_IS_DIGIT(ch);
234 case SRE_CATEGORY_UNI_NOT_DIGIT:
235 return !SRE_IS_DIGIT(ch);
236 case SRE_CATEGORY_UNI_SPACE:
237 return SRE_IS_SPACE(ch);
238 case SRE_CATEGORY_UNI_NOT_SPACE:
239 return !SRE_IS_SPACE(ch);
240 case SRE_CATEGORY_UNI_WORD:
241 return SRE_LOC_IS_WORD(ch);
242 case SRE_CATEGORY_UNI_NOT_WORD:
243 return !SRE_LOC_IS_WORD(ch);
244 case SRE_CATEGORY_UNI_LINEBREAK:
245 return SRE_IS_LINEBREAK(ch);
246 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
247 return !SRE_IS_LINEBREAK(ch);
248 #endif
250 return 0;
253 /* helpers */
255 static void
256 data_stack_dealloc(SRE_STATE* state)
258 if (state->data_stack) {
259 PyMem_FREE(state->data_stack);
260 state->data_stack = NULL;
262 state->data_stack_size = state->data_stack_base = 0;
265 static int
266 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
268 Py_ssize_t minsize, cursize;
269 minsize = state->data_stack_base+size;
270 cursize = state->data_stack_size;
271 if (cursize < minsize) {
272 void* stack;
273 cursize = minsize+minsize/4+1024;
274 TRACE(("allocate/grow stack %d\n", cursize));
275 stack = PyMem_REALLOC(state->data_stack, cursize);
276 if (!stack) {
277 data_stack_dealloc(state);
278 return SRE_ERROR_MEMORY;
280 state->data_stack = (char *)stack;
281 state->data_stack_size = cursize;
283 return 0;
286 /* generate 8-bit version */
288 #define SRE_CHAR unsigned char
289 #define SRE_AT sre_at
290 #define SRE_COUNT sre_count
291 #define SRE_CHARSET sre_charset
292 #define SRE_INFO sre_info
293 #define SRE_MATCH sre_match
294 #define SRE_MATCH_CONTEXT sre_match_context
295 #define SRE_SEARCH sre_search
296 #define SRE_LITERAL_TEMPLATE sre_literal_template
298 #if defined(HAVE_UNICODE)
300 #define SRE_RECURSIVE
301 #include "_sre.c"
302 #undef SRE_RECURSIVE
304 #undef SRE_LITERAL_TEMPLATE
305 #undef SRE_SEARCH
306 #undef SRE_MATCH
307 #undef SRE_MATCH_CONTEXT
308 #undef SRE_INFO
309 #undef SRE_CHARSET
310 #undef SRE_COUNT
311 #undef SRE_AT
312 #undef SRE_CHAR
314 /* generate 16-bit unicode version */
316 #define SRE_CHAR Py_UNICODE
317 #define SRE_AT sre_uat
318 #define SRE_COUNT sre_ucount
319 #define SRE_CHARSET sre_ucharset
320 #define SRE_INFO sre_uinfo
321 #define SRE_MATCH sre_umatch
322 #define SRE_MATCH_CONTEXT sre_umatch_context
323 #define SRE_SEARCH sre_usearch
324 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
325 #endif
327 #endif /* SRE_RECURSIVE */
329 /* -------------------------------------------------------------------- */
330 /* String matching engine */
332 /* the following section is compiled twice, with different character
333 settings */
335 LOCAL(int)
336 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338 /* check if pointer is at given position */
340 Py_ssize_t thisp, thatp;
342 switch (at) {
344 case SRE_AT_BEGINNING:
345 case SRE_AT_BEGINNING_STRING:
346 return ((void*) ptr == state->beginning);
348 case SRE_AT_BEGINNING_LINE:
349 return ((void*) ptr == state->beginning ||
350 SRE_IS_LINEBREAK((int) ptr[-1]));
352 case SRE_AT_END:
353 return (((void*) (ptr+1) == state->end &&
354 SRE_IS_LINEBREAK((int) ptr[0])) ||
355 ((void*) ptr == state->end));
357 case SRE_AT_END_LINE:
358 return ((void*) ptr == state->end ||
359 SRE_IS_LINEBREAK((int) ptr[0]));
361 case SRE_AT_END_STRING:
362 return ((void*) ptr == state->end);
364 case SRE_AT_BOUNDARY:
365 if (state->beginning == state->end)
366 return 0;
367 thatp = ((void*) ptr > state->beginning) ?
368 SRE_IS_WORD((int) ptr[-1]) : 0;
369 thisp = ((void*) ptr < state->end) ?
370 SRE_IS_WORD((int) ptr[0]) : 0;
371 return thisp != thatp;
373 case SRE_AT_NON_BOUNDARY:
374 if (state->beginning == state->end)
375 return 0;
376 thatp = ((void*) ptr > state->beginning) ?
377 SRE_IS_WORD((int) ptr[-1]) : 0;
378 thisp = ((void*) ptr < state->end) ?
379 SRE_IS_WORD((int) ptr[0]) : 0;
380 return thisp == thatp;
382 case SRE_AT_LOC_BOUNDARY:
383 if (state->beginning == state->end)
384 return 0;
385 thatp = ((void*) ptr > state->beginning) ?
386 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
387 thisp = ((void*) ptr < state->end) ?
388 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
389 return thisp != thatp;
391 case SRE_AT_LOC_NON_BOUNDARY:
392 if (state->beginning == state->end)
393 return 0;
394 thatp = ((void*) ptr > state->beginning) ?
395 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
396 thisp = ((void*) ptr < state->end) ?
397 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
398 return thisp == thatp;
400 #if defined(HAVE_UNICODE)
401 case SRE_AT_UNI_BOUNDARY:
402 if (state->beginning == state->end)
403 return 0;
404 thatp = ((void*) ptr > state->beginning) ?
405 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
406 thisp = ((void*) ptr < state->end) ?
407 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
408 return thisp != thatp;
410 case SRE_AT_UNI_NON_BOUNDARY:
411 if (state->beginning == state->end)
412 return 0;
413 thatp = ((void*) ptr > state->beginning) ?
414 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
415 thisp = ((void*) ptr < state->end) ?
416 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
417 return thisp == thatp;
418 #endif
422 return 0;
425 LOCAL(int)
426 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
428 /* check if character is a member of the given set */
430 int ok = 1;
432 for (;;) {
433 switch (*set++) {
435 case SRE_OP_FAILURE:
436 return !ok;
438 case SRE_OP_LITERAL:
439 /* <LITERAL> <code> */
440 if (ch == set[0])
441 return ok;
442 set++;
443 break;
445 case SRE_OP_CATEGORY:
446 /* <CATEGORY> <code> */
447 if (sre_category(set[0], (int) ch))
448 return ok;
449 set += 1;
450 break;
452 case SRE_OP_CHARSET:
453 if (sizeof(SRE_CODE) == 2) {
454 /* <CHARSET> <bitmap> (16 bits per code word) */
455 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
456 return ok;
457 set += 16;
459 else {
460 /* <CHARSET> <bitmap> (32 bits per code word) */
461 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
462 return ok;
463 set += 8;
465 break;
467 case SRE_OP_RANGE:
468 /* <RANGE> <lower> <upper> */
469 if (set[0] <= ch && ch <= set[1])
470 return ok;
471 set += 2;
472 break;
474 case SRE_OP_NEGATE:
475 ok = !ok;
476 break;
478 case SRE_OP_BIGCHARSET:
479 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481 Py_ssize_t count, block;
482 count = *(set++);
484 if (sizeof(SRE_CODE) == 2) {
485 block = ((unsigned char*)set)[ch >> 8];
486 set += 128;
487 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
488 return ok;
489 set += count*16;
491 else {
492 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
493 * warnings when c's type supports only numbers < N+1 */
494 if (!(ch & ~65535))
495 block = ((unsigned char*)set)[ch >> 8];
496 else
497 block = -1;
498 set += 64;
499 if (block >=0 &&
500 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
501 return ok;
502 set += count*8;
504 break;
507 default:
508 /* internal error -- there's not much we can do about it
509 here, so let's just pretend it didn't match... */
510 return 0;
515 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
517 LOCAL(Py_ssize_t)
518 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
520 SRE_CODE chr;
521 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
522 SRE_CHAR* end = (SRE_CHAR *)state->end;
523 Py_ssize_t i;
525 /* adjust end */
526 if (maxcount < end - ptr && maxcount != 65535)
527 end = ptr + maxcount;
529 switch (pattern[0]) {
531 case SRE_OP_IN:
532 /* repeated set */
533 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
534 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
535 ptr++;
536 break;
538 case SRE_OP_ANY:
539 /* repeated dot wildcard. */
540 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
541 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
542 ptr++;
543 break;
545 case SRE_OP_ANY_ALL:
546 /* repeated dot wildcard. skip to the end of the target
547 string, and backtrack from there */
548 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
549 ptr = end;
550 break;
552 case SRE_OP_LITERAL:
553 /* repeated literal */
554 chr = pattern[1];
555 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
556 while (ptr < end && (SRE_CODE) *ptr == chr)
557 ptr++;
558 break;
560 case SRE_OP_LITERAL_IGNORE:
561 /* repeated literal */
562 chr = pattern[1];
563 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
564 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
565 ptr++;
566 break;
568 case SRE_OP_NOT_LITERAL:
569 /* repeated non-literal */
570 chr = pattern[1];
571 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
572 while (ptr < end && (SRE_CODE) *ptr != chr)
573 ptr++;
574 break;
576 case SRE_OP_NOT_LITERAL_IGNORE:
577 /* repeated non-literal */
578 chr = pattern[1];
579 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
580 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
581 ptr++;
582 break;
584 default:
585 /* repeated single character pattern */
586 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
587 while ((SRE_CHAR*) state->ptr < end) {
588 i = SRE_MATCH(state, pattern);
589 if (i < 0)
590 return i;
591 if (!i)
592 break;
594 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
595 (SRE_CHAR*) state->ptr - ptr));
596 return (SRE_CHAR*) state->ptr - ptr;
599 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
600 return ptr - (SRE_CHAR*) state->ptr;
603 #if 0 /* not used in this release */
604 LOCAL(int)
605 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
607 /* check if an SRE_OP_INFO block matches at the current position.
608 returns the number of SRE_CODE objects to skip if successful, 0
609 if no match */
611 SRE_CHAR* end = state->end;
612 SRE_CHAR* ptr = state->ptr;
613 Py_ssize_t i;
615 /* check minimal length */
616 if (pattern[3] && (end - ptr) < pattern[3])
617 return 0;
619 /* check known prefix */
620 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
621 /* <length> <skip> <prefix data> <overlap data> */
622 for (i = 0; i < pattern[5]; i++)
623 if ((SRE_CODE) ptr[i] != pattern[7 + i])
624 return 0;
625 return pattern[0] + 2 * pattern[6];
627 return pattern[0];
629 #endif
631 /* The macros below should be used to protect recursive SRE_MATCH()
632 * calls that *failed* and do *not* return immediately (IOW, those
633 * that will backtrack). Explaining:
635 * - Recursive SRE_MATCH() returned true: that's usually a success
636 * (besides atypical cases like ASSERT_NOT), therefore there's no
637 * reason to restore lastmark;
639 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
640 * is returning to the caller: If the current SRE_MATCH() is the
641 * top function of the recursion, returning false will be a matching
642 * failure, and it doesn't matter where lastmark is pointing to.
643 * If it's *not* the top function, it will be a recursive SRE_MATCH()
644 * failure by itself, and the calling SRE_MATCH() will have to deal
645 * with the failure by the same rules explained here (it will restore
646 * lastmark by itself if necessary);
648 * - Recursive SRE_MATCH() returned false, and will continue the
649 * outside 'for' loop: must be protected when breaking, since the next
650 * OP could potentially depend on lastmark;
652 * - Recursive SRE_MATCH() returned false, and will be called again
653 * inside a local for/while loop: must be protected between each
654 * loop iteration, since the recursive SRE_MATCH() could do anything,
655 * and could potentially depend on lastmark.
657 * For more information, check the discussion at SF patch #712900.
659 #define LASTMARK_SAVE() \
660 do { \
661 ctx->lastmark = state->lastmark; \
662 ctx->lastindex = state->lastindex; \
663 } while (0)
664 #define LASTMARK_RESTORE() \
665 do { \
666 state->lastmark = ctx->lastmark; \
667 state->lastindex = ctx->lastindex; \
668 } while (0)
670 #define RETURN_ERROR(i) do { return i; } while(0)
671 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
672 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
674 #define RETURN_ON_ERROR(i) \
675 do { if (i < 0) RETURN_ERROR(i); } while (0)
676 #define RETURN_ON_SUCCESS(i) \
677 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
678 #define RETURN_ON_FAILURE(i) \
679 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
681 #define SFY(x) #x
683 #define DATA_STACK_ALLOC(state, type, ptr) \
684 do { \
685 alloc_pos = state->data_stack_base; \
686 TRACE(("allocating %s in %d (%d)\n", \
687 SFY(type), alloc_pos, sizeof(type))); \
688 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
689 int j = data_stack_grow(state, sizeof(type)); \
690 if (j < 0) return j; \
691 if (ctx_pos != -1) \
692 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
694 ptr = (type*)(state->data_stack+alloc_pos); \
695 state->data_stack_base += sizeof(type); \
696 } while (0)
698 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
699 do { \
700 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
701 ptr = (type*)(state->data_stack+pos); \
702 } while (0)
704 #define DATA_STACK_PUSH(state, data, size) \
705 do { \
706 TRACE(("copy data in %p to %d (%d)\n", \
707 data, state->data_stack_base, size)); \
708 if (state->data_stack_size < state->data_stack_base+size) { \
709 int j = data_stack_grow(state, size); \
710 if (j < 0) return j; \
711 if (ctx_pos != -1) \
712 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
714 memcpy(state->data_stack+state->data_stack_base, data, size); \
715 state->data_stack_base += size; \
716 } while (0)
718 #define DATA_STACK_POP(state, data, size, discard) \
719 do { \
720 TRACE(("copy data to %p from %d (%d)\n", \
721 data, state->data_stack_base-size, size)); \
722 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
723 if (discard) \
724 state->data_stack_base -= size; \
725 } while (0)
727 #define DATA_STACK_POP_DISCARD(state, size) \
728 do { \
729 TRACE(("discard data from %d (%d)\n", \
730 state->data_stack_base-size, size)); \
731 state->data_stack_base -= size; \
732 } while(0)
734 #define DATA_PUSH(x) \
735 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
736 #define DATA_POP(x) \
737 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
738 #define DATA_POP_DISCARD(x) \
739 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
740 #define DATA_ALLOC(t,p) \
741 DATA_STACK_ALLOC(state, t, p)
742 #define DATA_LOOKUP_AT(t,p,pos) \
743 DATA_STACK_LOOKUP_AT(state,t,p,pos)
745 #define MARK_PUSH(lastmark) \
746 do if (lastmark > 0) { \
747 i = lastmark; /* ctx->lastmark may change if reallocated */ \
748 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
749 } while (0)
750 #define MARK_POP(lastmark) \
751 do if (lastmark > 0) { \
752 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
753 } while (0)
754 #define MARK_POP_KEEP(lastmark) \
755 do if (lastmark > 0) { \
756 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
757 } while (0)
758 #define MARK_POP_DISCARD(lastmark) \
759 do if (lastmark > 0) { \
760 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
761 } while (0)
763 #define JUMP_NONE 0
764 #define JUMP_MAX_UNTIL_1 1
765 #define JUMP_MAX_UNTIL_2 2
766 #define JUMP_MAX_UNTIL_3 3
767 #define JUMP_MIN_UNTIL_1 4
768 #define JUMP_MIN_UNTIL_2 5
769 #define JUMP_MIN_UNTIL_3 6
770 #define JUMP_REPEAT 7
771 #define JUMP_REPEAT_ONE_1 8
772 #define JUMP_REPEAT_ONE_2 9
773 #define JUMP_MIN_REPEAT_ONE 10
774 #define JUMP_BRANCH 11
775 #define JUMP_ASSERT 12
776 #define JUMP_ASSERT_NOT 13
778 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
779 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
780 nextctx->last_ctx_pos = ctx_pos; \
781 nextctx->jump = jumpvalue; \
782 nextctx->pattern = nextpattern; \
783 ctx_pos = alloc_pos; \
784 ctx = nextctx; \
785 goto entrance; \
786 jumplabel: \
787 while (0) /* gcc doesn't like labels at end of scopes */ \
789 typedef struct {
790 Py_ssize_t last_ctx_pos;
791 Py_ssize_t jump;
792 SRE_CHAR* ptr;
793 SRE_CODE* pattern;
794 Py_ssize_t count;
795 Py_ssize_t lastmark;
796 Py_ssize_t lastindex;
797 union {
798 SRE_CODE chr;
799 SRE_REPEAT* rep;
800 } u;
801 } SRE_MATCH_CONTEXT;
803 /* check if string matches the given pattern. returns <0 for
804 error, 0 for failure, and 1 for success */
805 LOCAL(Py_ssize_t)
806 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
808 SRE_CHAR* end = (SRE_CHAR *)state->end;
809 Py_ssize_t alloc_pos, ctx_pos = -1;
810 Py_ssize_t i, ret = 0;
811 Py_ssize_t jump;
813 SRE_MATCH_CONTEXT* ctx;
814 SRE_MATCH_CONTEXT* nextctx;
816 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
818 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
819 ctx->last_ctx_pos = -1;
820 ctx->jump = JUMP_NONE;
821 ctx->pattern = pattern;
822 ctx_pos = alloc_pos;
824 entrance:
826 ctx->ptr = (SRE_CHAR *)state->ptr;
828 if (ctx->pattern[0] == SRE_OP_INFO) {
829 /* optimization info block */
830 /* <INFO> <1=skip> <2=flags> <3=min> ... */
831 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
832 TRACE(("reject (got %d chars, need %d)\n",
833 (end - ctx->ptr), ctx->pattern[3]));
834 RETURN_FAILURE;
836 ctx->pattern += ctx->pattern[1] + 1;
839 for (;;) {
841 switch (*ctx->pattern++) {
843 case SRE_OP_MARK:
844 /* set mark */
845 /* <MARK> <gid> */
846 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
847 ctx->ptr, ctx->pattern[0]));
848 i = ctx->pattern[0];
849 if (i & 1)
850 state->lastindex = i/2 + 1;
851 if (i > state->lastmark) {
852 /* state->lastmark is the highest valid index in the
853 state->mark array. If it is increased by more than 1,
854 the intervening marks must be set to NULL to signal
855 that these marks have not been encountered. */
856 Py_ssize_t j = state->lastmark + 1;
857 while (j < i)
858 state->mark[j++] = NULL;
859 state->lastmark = i;
861 state->mark[i] = ctx->ptr;
862 ctx->pattern++;
863 break;
865 case SRE_OP_LITERAL:
866 /* match literal string */
867 /* <LITERAL> <code> */
868 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
869 ctx->ptr, *ctx->pattern));
870 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
871 RETURN_FAILURE;
872 ctx->pattern++;
873 ctx->ptr++;
874 break;
876 case SRE_OP_NOT_LITERAL:
877 /* match anything that is not literal character */
878 /* <NOT_LITERAL> <code> */
879 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
880 ctx->ptr, *ctx->pattern));
881 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
882 RETURN_FAILURE;
883 ctx->pattern++;
884 ctx->ptr++;
885 break;
887 case SRE_OP_SUCCESS:
888 /* end of pattern */
889 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
890 state->ptr = ctx->ptr;
891 RETURN_SUCCESS;
893 case SRE_OP_AT:
894 /* match at given position */
895 /* <AT> <code> */
896 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
897 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
898 RETURN_FAILURE;
899 ctx->pattern++;
900 break;
902 case SRE_OP_CATEGORY:
903 /* match at given category */
904 /* <CATEGORY> <code> */
905 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
906 ctx->ptr, *ctx->pattern));
907 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
908 RETURN_FAILURE;
909 ctx->pattern++;
910 ctx->ptr++;
911 break;
913 case SRE_OP_ANY:
914 /* match anything (except a newline) */
915 /* <ANY> */
916 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
917 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
918 RETURN_FAILURE;
919 ctx->ptr++;
920 break;
922 case SRE_OP_ANY_ALL:
923 /* match anything */
924 /* <ANY_ALL> */
925 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
926 if (ctx->ptr >= end)
927 RETURN_FAILURE;
928 ctx->ptr++;
929 break;
931 case SRE_OP_IN:
932 /* match set member (or non_member) */
933 /* <IN> <skip> <set> */
934 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
935 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
936 RETURN_FAILURE;
937 ctx->pattern += ctx->pattern[0];
938 ctx->ptr++;
939 break;
941 case SRE_OP_LITERAL_IGNORE:
942 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
943 ctx->pattern, ctx->ptr, ctx->pattern[0]));
944 if (ctx->ptr >= end ||
945 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
946 RETURN_FAILURE;
947 ctx->pattern++;
948 ctx->ptr++;
949 break;
951 case SRE_OP_NOT_LITERAL_IGNORE:
952 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
953 ctx->pattern, ctx->ptr, *ctx->pattern));
954 if (ctx->ptr >= end ||
955 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
956 RETURN_FAILURE;
957 ctx->pattern++;
958 ctx->ptr++;
959 break;
961 case SRE_OP_IN_IGNORE:
962 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
963 if (ctx->ptr >= end
964 || !SRE_CHARSET(ctx->pattern+1,
965 (SRE_CODE)state->lower(*ctx->ptr)))
966 RETURN_FAILURE;
967 ctx->pattern += ctx->pattern[0];
968 ctx->ptr++;
969 break;
971 case SRE_OP_JUMP:
972 case SRE_OP_INFO:
973 /* jump forward */
974 /* <JUMP> <offset> */
975 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
976 ctx->ptr, ctx->pattern[0]));
977 ctx->pattern += ctx->pattern[0];
978 break;
980 case SRE_OP_BRANCH:
981 /* alternation */
982 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
983 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
984 LASTMARK_SAVE();
985 ctx->u.rep = state->repeat;
986 if (ctx->u.rep)
987 MARK_PUSH(ctx->lastmark);
988 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
989 if (ctx->pattern[1] == SRE_OP_LITERAL &&
990 (ctx->ptr >= end ||
991 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
992 continue;
993 if (ctx->pattern[1] == SRE_OP_IN &&
994 (ctx->ptr >= end ||
995 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
996 continue;
997 state->ptr = ctx->ptr;
998 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
999 if (ret) {
1000 if (ctx->u.rep)
1001 MARK_POP_DISCARD(ctx->lastmark);
1002 RETURN_ON_ERROR(ret);
1003 RETURN_SUCCESS;
1005 if (ctx->u.rep)
1006 MARK_POP_KEEP(ctx->lastmark);
1007 LASTMARK_RESTORE();
1009 if (ctx->u.rep)
1010 MARK_POP_DISCARD(ctx->lastmark);
1011 RETURN_FAILURE;
1013 case SRE_OP_REPEAT_ONE:
1014 /* match repeated sequence (maximizing regexp) */
1016 /* this operator only works if the repeated item is
1017 exactly one character wide, and we're not already
1018 collecting backtracking points. for other cases,
1019 use the MAX_REPEAT operator */
1021 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1023 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1024 ctx->pattern[1], ctx->pattern[2]));
1026 if (ctx->ptr + ctx->pattern[1] > end)
1027 RETURN_FAILURE; /* cannot match */
1029 state->ptr = ctx->ptr;
1031 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1032 RETURN_ON_ERROR(ret);
1033 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1034 ctx->count = ret;
1035 ctx->ptr += ctx->count;
1037 /* when we arrive here, count contains the number of
1038 matches, and ctx->ptr points to the tail of the target
1039 string. check if the rest of the pattern matches,
1040 and backtrack if not. */
1042 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1043 RETURN_FAILURE;
1045 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1046 /* tail is empty. we're finished */
1047 state->ptr = ctx->ptr;
1048 RETURN_SUCCESS;
1051 LASTMARK_SAVE();
1053 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1054 /* tail starts with a literal. skip positions where
1055 the rest of the pattern cannot possibly match */
1056 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1057 for (;;) {
1058 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1059 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1060 ctx->ptr--;
1061 ctx->count--;
1063 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1064 break;
1065 state->ptr = ctx->ptr;
1066 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1067 ctx->pattern+ctx->pattern[0]);
1068 if (ret) {
1069 RETURN_ON_ERROR(ret);
1070 RETURN_SUCCESS;
1073 LASTMARK_RESTORE();
1075 ctx->ptr--;
1076 ctx->count--;
1079 } else {
1080 /* general case */
1081 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1082 state->ptr = ctx->ptr;
1083 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1084 ctx->pattern+ctx->pattern[0]);
1085 if (ret) {
1086 RETURN_ON_ERROR(ret);
1087 RETURN_SUCCESS;
1089 ctx->ptr--;
1090 ctx->count--;
1091 LASTMARK_RESTORE();
1094 RETURN_FAILURE;
1096 case SRE_OP_MIN_REPEAT_ONE:
1097 /* match repeated sequence (minimizing regexp) */
1099 /* this operator only works if the repeated item is
1100 exactly one character wide, and we're not already
1101 collecting backtracking points. for other cases,
1102 use the MIN_REPEAT operator */
1104 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1106 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1107 ctx->pattern[1], ctx->pattern[2]));
1109 if (ctx->ptr + ctx->pattern[1] > end)
1110 RETURN_FAILURE; /* cannot match */
1112 state->ptr = ctx->ptr;
1114 if (ctx->pattern[1] == 0)
1115 ctx->count = 0;
1116 else {
1117 /* count using pattern min as the maximum */
1118 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1119 RETURN_ON_ERROR(ret);
1120 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1121 if (ret < (Py_ssize_t) ctx->pattern[1])
1122 /* didn't match minimum number of times */
1123 RETURN_FAILURE;
1124 /* advance past minimum matches of repeat */
1125 ctx->count = ret;
1126 ctx->ptr += ctx->count;
1129 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1130 /* tail is empty. we're finished */
1131 state->ptr = ctx->ptr;
1132 RETURN_SUCCESS;
1134 } else {
1135 /* general case */
1136 LASTMARK_SAVE();
1137 while ((Py_ssize_t)ctx->pattern[2] == 65535
1138 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1139 state->ptr = ctx->ptr;
1140 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1141 ctx->pattern+ctx->pattern[0]);
1142 if (ret) {
1143 RETURN_ON_ERROR(ret);
1144 RETURN_SUCCESS;
1146 state->ptr = ctx->ptr;
1147 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1148 RETURN_ON_ERROR(ret);
1149 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1150 if (ret == 0)
1151 break;
1152 assert(ret == 1);
1153 ctx->ptr++;
1154 ctx->count++;
1155 LASTMARK_RESTORE();
1158 RETURN_FAILURE;
1160 case SRE_OP_REPEAT:
1161 /* create repeat context. all the hard work is done
1162 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1163 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1164 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1165 ctx->pattern[1], ctx->pattern[2]));
1167 /* install new repeat context */
1168 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1169 if (!ctx->u.rep) {
1170 PyErr_NoMemory();
1171 RETURN_FAILURE;
1173 ctx->u.rep->count = -1;
1174 ctx->u.rep->pattern = ctx->pattern;
1175 ctx->u.rep->prev = state->repeat;
1176 ctx->u.rep->last_ptr = NULL;
1177 state->repeat = ctx->u.rep;
1179 state->ptr = ctx->ptr;
1180 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1181 state->repeat = ctx->u.rep->prev;
1182 PyObject_FREE(ctx->u.rep);
1184 if (ret) {
1185 RETURN_ON_ERROR(ret);
1186 RETURN_SUCCESS;
1188 RETURN_FAILURE;
1190 case SRE_OP_MAX_UNTIL:
1191 /* maximizing repeat */
1192 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1194 /* FIXME: we probably need to deal with zero-width
1195 matches in here... */
1197 ctx->u.rep = state->repeat;
1198 if (!ctx->u.rep)
1199 RETURN_ERROR(SRE_ERROR_STATE);
1201 state->ptr = ctx->ptr;
1203 ctx->count = ctx->u.rep->count+1;
1205 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1206 ctx->ptr, ctx->count));
1208 if (ctx->count < ctx->u.rep->pattern[1]) {
1209 /* not enough matches */
1210 ctx->u.rep->count = ctx->count;
1211 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1212 ctx->u.rep->pattern+3);
1213 if (ret) {
1214 RETURN_ON_ERROR(ret);
1215 RETURN_SUCCESS;
1217 ctx->u.rep->count = ctx->count-1;
1218 state->ptr = ctx->ptr;
1219 RETURN_FAILURE;
1222 if ((ctx->count < ctx->u.rep->pattern[2] ||
1223 ctx->u.rep->pattern[2] == 65535) &&
1224 state->ptr != ctx->u.rep->last_ptr) {
1225 /* we may have enough matches, but if we can
1226 match another item, do so */
1227 ctx->u.rep->count = ctx->count;
1228 LASTMARK_SAVE();
1229 MARK_PUSH(ctx->lastmark);
1230 /* zero-width match protection */
1231 DATA_PUSH(&ctx->u.rep->last_ptr);
1232 ctx->u.rep->last_ptr = state->ptr;
1233 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1234 ctx->u.rep->pattern+3);
1235 DATA_POP(&ctx->u.rep->last_ptr);
1236 if (ret) {
1237 MARK_POP_DISCARD(ctx->lastmark);
1238 RETURN_ON_ERROR(ret);
1239 RETURN_SUCCESS;
1241 MARK_POP(ctx->lastmark);
1242 LASTMARK_RESTORE();
1243 ctx->u.rep->count = ctx->count-1;
1244 state->ptr = ctx->ptr;
1247 /* cannot match more repeated items here. make sure the
1248 tail matches */
1249 state->repeat = ctx->u.rep->prev;
1250 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1251 RETURN_ON_SUCCESS(ret);
1252 state->repeat = ctx->u.rep;
1253 state->ptr = ctx->ptr;
1254 RETURN_FAILURE;
1256 case SRE_OP_MIN_UNTIL:
1257 /* minimizing repeat */
1258 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1260 ctx->u.rep = state->repeat;
1261 if (!ctx->u.rep)
1262 RETURN_ERROR(SRE_ERROR_STATE);
1264 state->ptr = ctx->ptr;
1266 ctx->count = ctx->u.rep->count+1;
1268 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1269 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1271 if (ctx->count < ctx->u.rep->pattern[1]) {
1272 /* not enough matches */
1273 ctx->u.rep->count = ctx->count;
1274 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1275 ctx->u.rep->pattern+3);
1276 if (ret) {
1277 RETURN_ON_ERROR(ret);
1278 RETURN_SUCCESS;
1280 ctx->u.rep->count = ctx->count-1;
1281 state->ptr = ctx->ptr;
1282 RETURN_FAILURE;
1285 LASTMARK_SAVE();
1287 /* see if the tail matches */
1288 state->repeat = ctx->u.rep->prev;
1289 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1290 if (ret) {
1291 RETURN_ON_ERROR(ret);
1292 RETURN_SUCCESS;
1295 state->repeat = ctx->u.rep;
1296 state->ptr = ctx->ptr;
1298 LASTMARK_RESTORE();
1300 if (ctx->count >= ctx->u.rep->pattern[2]
1301 && ctx->u.rep->pattern[2] != 65535)
1302 RETURN_FAILURE;
1304 ctx->u.rep->count = ctx->count;
1305 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1306 ctx->u.rep->pattern+3);
1307 if (ret) {
1308 RETURN_ON_ERROR(ret);
1309 RETURN_SUCCESS;
1311 ctx->u.rep->count = ctx->count-1;
1312 state->ptr = ctx->ptr;
1313 RETURN_FAILURE;
1315 case SRE_OP_GROUPREF:
1316 /* match backreference */
1317 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1318 ctx->ptr, ctx->pattern[0]));
1319 i = ctx->pattern[0];
1321 Py_ssize_t groupref = i+i;
1322 if (groupref >= state->lastmark) {
1323 RETURN_FAILURE;
1324 } else {
1325 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1326 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1327 if (!p || !e || e < p)
1328 RETURN_FAILURE;
1329 while (p < e) {
1330 if (ctx->ptr >= end || *ctx->ptr != *p)
1331 RETURN_FAILURE;
1332 p++; ctx->ptr++;
1336 ctx->pattern++;
1337 break;
1339 case SRE_OP_GROUPREF_IGNORE:
1340 /* match backreference */
1341 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1342 ctx->ptr, ctx->pattern[0]));
1343 i = ctx->pattern[0];
1345 Py_ssize_t groupref = i+i;
1346 if (groupref >= state->lastmark) {
1347 RETURN_FAILURE;
1348 } else {
1349 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1350 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1351 if (!p || !e || e < p)
1352 RETURN_FAILURE;
1353 while (p < e) {
1354 if (ctx->ptr >= end ||
1355 state->lower(*ctx->ptr) != state->lower(*p))
1356 RETURN_FAILURE;
1357 p++; ctx->ptr++;
1361 ctx->pattern++;
1362 break;
1364 case SRE_OP_GROUPREF_EXISTS:
1365 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1366 ctx->ptr, ctx->pattern[0]));
1367 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1368 i = ctx->pattern[0];
1370 Py_ssize_t groupref = i+i;
1371 if (groupref >= state->lastmark) {
1372 ctx->pattern += ctx->pattern[1];
1373 break;
1374 } else {
1375 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1376 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1377 if (!p || !e || e < p) {
1378 ctx->pattern += ctx->pattern[1];
1379 break;
1383 ctx->pattern += 2;
1384 break;
1386 case SRE_OP_ASSERT:
1387 /* assert subpattern */
1388 /* <ASSERT> <skip> <back> <pattern> */
1389 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1390 ctx->ptr, ctx->pattern[1]));
1391 state->ptr = ctx->ptr - ctx->pattern[1];
1392 if (state->ptr < state->beginning)
1393 RETURN_FAILURE;
1394 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1395 RETURN_ON_FAILURE(ret);
1396 ctx->pattern += ctx->pattern[0];
1397 break;
1399 case SRE_OP_ASSERT_NOT:
1400 /* assert not subpattern */
1401 /* <ASSERT_NOT> <skip> <back> <pattern> */
1402 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1403 ctx->ptr, ctx->pattern[1]));
1404 state->ptr = ctx->ptr - ctx->pattern[1];
1405 if (state->ptr >= state->beginning) {
1406 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1407 if (ret) {
1408 RETURN_ON_ERROR(ret);
1409 RETURN_FAILURE;
1412 ctx->pattern += ctx->pattern[0];
1413 break;
1415 case SRE_OP_FAILURE:
1416 /* immediate failure */
1417 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1418 RETURN_FAILURE;
1420 default:
1421 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1422 ctx->pattern[-1]));
1423 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1427 exit:
1428 ctx_pos = ctx->last_ctx_pos;
1429 jump = ctx->jump;
1430 DATA_POP_DISCARD(ctx);
1431 if (ctx_pos == -1)
1432 return ret;
1433 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1435 switch (jump) {
1436 case JUMP_MAX_UNTIL_2:
1437 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1438 goto jump_max_until_2;
1439 case JUMP_MAX_UNTIL_3:
1440 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1441 goto jump_max_until_3;
1442 case JUMP_MIN_UNTIL_2:
1443 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1444 goto jump_min_until_2;
1445 case JUMP_MIN_UNTIL_3:
1446 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1447 goto jump_min_until_3;
1448 case JUMP_BRANCH:
1449 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1450 goto jump_branch;
1451 case JUMP_MAX_UNTIL_1:
1452 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1453 goto jump_max_until_1;
1454 case JUMP_MIN_UNTIL_1:
1455 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1456 goto jump_min_until_1;
1457 case JUMP_REPEAT:
1458 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1459 goto jump_repeat;
1460 case JUMP_REPEAT_ONE_1:
1461 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1462 goto jump_repeat_one_1;
1463 case JUMP_REPEAT_ONE_2:
1464 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1465 goto jump_repeat_one_2;
1466 case JUMP_MIN_REPEAT_ONE:
1467 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1468 goto jump_min_repeat_one;
1469 case JUMP_ASSERT:
1470 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1471 goto jump_assert;
1472 case JUMP_ASSERT_NOT:
1473 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1474 goto jump_assert_not;
1475 case JUMP_NONE:
1476 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1477 break;
1480 return ret; /* should never get here */
1483 LOCAL(Py_ssize_t)
1484 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1486 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1487 SRE_CHAR* end = (SRE_CHAR *)state->end;
1488 Py_ssize_t status = 0;
1489 Py_ssize_t prefix_len = 0;
1490 Py_ssize_t prefix_skip = 0;
1491 SRE_CODE* prefix = NULL;
1492 SRE_CODE* charset = NULL;
1493 SRE_CODE* overlap = NULL;
1494 int flags = 0;
1496 if (pattern[0] == SRE_OP_INFO) {
1497 /* optimization info block */
1498 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1500 flags = pattern[2];
1502 if (pattern[3] > 1) {
1503 /* adjust end point (but make sure we leave at least one
1504 character in there, so literal search will work) */
1505 end -= pattern[3]-1;
1506 if (end <= ptr)
1507 end = ptr+1;
1510 if (flags & SRE_INFO_PREFIX) {
1511 /* pattern starts with a known prefix */
1512 /* <length> <skip> <prefix data> <overlap data> */
1513 prefix_len = pattern[5];
1514 prefix_skip = pattern[6];
1515 prefix = pattern + 7;
1516 overlap = prefix + prefix_len - 1;
1517 } else if (flags & SRE_INFO_CHARSET)
1518 /* pattern starts with a character from a known set */
1519 /* <charset> */
1520 charset = pattern + 5;
1522 pattern += 1 + pattern[1];
1525 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1526 TRACE(("charset = %p\n", charset));
1528 #if defined(USE_FAST_SEARCH)
1529 if (prefix_len > 1) {
1530 /* pattern starts with a known prefix. use the overlap
1531 table to skip forward as fast as we possibly can */
1532 Py_ssize_t i = 0;
1533 end = (SRE_CHAR *)state->end;
1534 while (ptr < end) {
1535 for (;;) {
1536 if ((SRE_CODE) ptr[0] != prefix[i]) {
1537 if (!i)
1538 break;
1539 else
1540 i = overlap[i];
1541 } else {
1542 if (++i == prefix_len) {
1543 /* found a potential match */
1544 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1545 state->start = ptr + 1 - prefix_len;
1546 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1547 if (flags & SRE_INFO_LITERAL)
1548 return 1; /* we got all of it */
1549 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1550 if (status != 0)
1551 return status;
1552 /* close but no cigar -- try again */
1553 i = overlap[i];
1555 break;
1558 ptr++;
1560 return 0;
1562 #endif
1564 if (pattern[0] == SRE_OP_LITERAL) {
1565 /* pattern starts with a literal character. this is used
1566 for short prefixes, and if fast search is disabled */
1567 SRE_CODE chr = pattern[1];
1568 end = (SRE_CHAR *)state->end;
1569 for (;;) {
1570 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1571 ptr++;
1572 if (ptr >= end)
1573 return 0;
1574 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1575 state->start = ptr;
1576 state->ptr = ++ptr;
1577 if (flags & SRE_INFO_LITERAL)
1578 return 1; /* we got all of it */
1579 status = SRE_MATCH(state, pattern + 2);
1580 if (status != 0)
1581 break;
1583 } else if (charset) {
1584 /* pattern starts with a character from a known set */
1585 end = (SRE_CHAR *)state->end;
1586 for (;;) {
1587 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1588 ptr++;
1589 if (ptr >= end)
1590 return 0;
1591 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1592 state->start = ptr;
1593 state->ptr = ptr;
1594 status = SRE_MATCH(state, pattern);
1595 if (status != 0)
1596 break;
1597 ptr++;
1599 } else
1600 /* general case */
1601 while (ptr <= end) {
1602 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1603 state->start = state->ptr = ptr++;
1604 status = SRE_MATCH(state, pattern);
1605 if (status != 0)
1606 break;
1609 return status;
1612 LOCAL(int)
1613 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1615 /* check if given string is a literal template (i.e. no escapes) */
1616 while (len-- > 0)
1617 if (*ptr++ == '\\')
1618 return 0;
1619 return 1;
1622 #if !defined(SRE_RECURSIVE)
1624 /* -------------------------------------------------------------------- */
1625 /* factories and destructors */
1627 /* see sre.h for object declarations */
1628 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1629 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1631 static PyObject *
1632 sre_codesize(PyObject* self, PyObject *unused)
1634 return Py_BuildValue("l", sizeof(SRE_CODE));
1637 static PyObject *
1638 sre_getlower(PyObject* self, PyObject* args)
1640 int character, flags;
1641 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1642 return NULL;
1643 if (flags & SRE_FLAG_LOCALE)
1644 return Py_BuildValue("i", sre_lower_locale(character));
1645 if (flags & SRE_FLAG_UNICODE)
1646 #if defined(HAVE_UNICODE)
1647 return Py_BuildValue("i", sre_lower_unicode(character));
1648 #else
1649 return Py_BuildValue("i", sre_lower_locale(character));
1650 #endif
1651 return Py_BuildValue("i", sre_lower(character));
1654 LOCAL(void)
1655 state_reset(SRE_STATE* state)
1657 /* FIXME: dynamic! */
1658 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1660 state->lastmark = -1;
1661 state->lastindex = -1;
1663 state->repeat = NULL;
1665 data_stack_dealloc(state);
1668 static void*
1669 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1671 /* given a python object, return a data pointer, a length (in
1672 characters), and a character size. return NULL if the object
1673 is not a string (or not compatible) */
1675 PyBufferProcs *buffer;
1676 Py_ssize_t size, bytes;
1677 int charsize;
1678 void* ptr;
1680 #if defined(HAVE_UNICODE)
1681 if (PyUnicode_Check(string)) {
1682 /* unicode strings doesn't always support the buffer interface */
1683 ptr = (void*) PyUnicode_AS_DATA(string);
1684 bytes = PyUnicode_GET_DATA_SIZE(string);
1685 size = PyUnicode_GET_SIZE(string);
1686 charsize = sizeof(Py_UNICODE);
1688 } else {
1689 #endif
1691 /* get pointer to string buffer */
1692 buffer = string->ob_type->tp_as_buffer;
1693 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1694 buffer->bf_getsegcount(string, NULL) != 1) {
1695 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1696 return NULL;
1699 /* determine buffer size */
1700 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1701 if (bytes < 0) {
1702 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1703 return NULL;
1706 /* determine character size */
1707 #if PY_VERSION_HEX >= 0x01060000
1708 size = PyObject_Size(string);
1709 #else
1710 size = PyObject_Length(string);
1711 #endif
1713 if (PyString_Check(string) || bytes == size)
1714 charsize = 1;
1715 #if defined(HAVE_UNICODE)
1716 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1717 charsize = sizeof(Py_UNICODE);
1718 #endif
1719 else {
1720 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1721 return NULL;
1724 #if defined(HAVE_UNICODE)
1726 #endif
1728 *p_length = size;
1729 *p_charsize = charsize;
1731 return ptr;
1734 LOCAL(PyObject*)
1735 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1736 Py_ssize_t start, Py_ssize_t end)
1738 /* prepare state object */
1740 Py_ssize_t length;
1741 int charsize;
1742 void* ptr;
1744 memset(state, 0, sizeof(SRE_STATE));
1746 state->lastmark = -1;
1747 state->lastindex = -1;
1749 ptr = getstring(string, &length, &charsize);
1750 if (!ptr)
1751 return NULL;
1753 /* adjust boundaries */
1754 if (start < 0)
1755 start = 0;
1756 else if (start > length)
1757 start = length;
1759 if (end < 0)
1760 end = 0;
1761 else if (end > length)
1762 end = length;
1764 state->charsize = charsize;
1766 state->beginning = ptr;
1768 state->start = (void*) ((char*) ptr + start * state->charsize);
1769 state->end = (void*) ((char*) ptr + end * state->charsize);
1771 Py_INCREF(string);
1772 state->string = string;
1773 state->pos = start;
1774 state->endpos = end;
1776 if (pattern->flags & SRE_FLAG_LOCALE)
1777 state->lower = sre_lower_locale;
1778 else if (pattern->flags & SRE_FLAG_UNICODE)
1779 #if defined(HAVE_UNICODE)
1780 state->lower = sre_lower_unicode;
1781 #else
1782 state->lower = sre_lower_locale;
1783 #endif
1784 else
1785 state->lower = sre_lower;
1787 return string;
1790 LOCAL(void)
1791 state_fini(SRE_STATE* state)
1793 Py_XDECREF(state->string);
1794 data_stack_dealloc(state);
1797 /* calculate offset from start of string */
1798 #define STATE_OFFSET(state, member)\
1799 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1801 LOCAL(PyObject*)
1802 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1804 Py_ssize_t i, j;
1806 index = (index - 1) * 2;
1808 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1809 if (empty)
1810 /* want empty string */
1811 i = j = 0;
1812 else {
1813 Py_INCREF(Py_None);
1814 return Py_None;
1816 } else {
1817 i = STATE_OFFSET(state, state->mark[index]);
1818 j = STATE_OFFSET(state, state->mark[index+1]);
1821 return PySequence_GetSlice(string, i, j);
1824 static void
1825 pattern_error(int status)
1827 switch (status) {
1828 case SRE_ERROR_RECURSION_LIMIT:
1829 PyErr_SetString(
1830 PyExc_RuntimeError,
1831 "maximum recursion limit exceeded"
1833 break;
1834 case SRE_ERROR_MEMORY:
1835 PyErr_NoMemory();
1836 break;
1837 default:
1838 /* other error codes indicate compiler/engine bugs */
1839 PyErr_SetString(
1840 PyExc_RuntimeError,
1841 "internal error in regular expression engine"
1846 static void
1847 pattern_dealloc(PatternObject* self)
1849 if (self->weakreflist != NULL)
1850 PyObject_ClearWeakRefs((PyObject *) self);
1851 Py_XDECREF(self->pattern);
1852 Py_XDECREF(self->groupindex);
1853 Py_XDECREF(self->indexgroup);
1854 PyObject_DEL(self);
1857 static PyObject*
1858 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1860 SRE_STATE state;
1861 int status;
1863 PyObject* string;
1864 Py_ssize_t start = 0;
1865 Py_ssize_t end = PY_SSIZE_T_MAX;
1866 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1867 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1868 &string, &start, &end))
1869 return NULL;
1871 string = state_init(&state, self, string, start, end);
1872 if (!string)
1873 return NULL;
1875 state.ptr = state.start;
1877 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1879 if (state.charsize == 1) {
1880 status = sre_match(&state, PatternObject_GetCode(self));
1881 } else {
1882 #if defined(HAVE_UNICODE)
1883 status = sre_umatch(&state, PatternObject_GetCode(self));
1884 #endif
1887 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1888 if (PyErr_Occurred())
1889 return NULL;
1891 state_fini(&state);
1893 return pattern_new_match(self, &state, status);
1896 static PyObject*
1897 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1899 SRE_STATE state;
1900 int status;
1902 PyObject* string;
1903 Py_ssize_t start = 0;
1904 Py_ssize_t end = PY_SSIZE_T_MAX;
1905 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1906 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1907 &string, &start, &end))
1908 return NULL;
1910 string = state_init(&state, self, string, start, end);
1911 if (!string)
1912 return NULL;
1914 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1916 if (state.charsize == 1) {
1917 status = sre_search(&state, PatternObject_GetCode(self));
1918 } else {
1919 #if defined(HAVE_UNICODE)
1920 status = sre_usearch(&state, PatternObject_GetCode(self));
1921 #endif
1924 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1926 state_fini(&state);
1928 if (PyErr_Occurred())
1929 return NULL;
1931 return pattern_new_match(self, &state, status);
1934 static PyObject*
1935 call(char* module, char* function, PyObject* args)
1937 PyObject* name;
1938 PyObject* mod;
1939 PyObject* func;
1940 PyObject* result;
1942 if (!args)
1943 return NULL;
1944 name = PyString_FromString(module);
1945 if (!name)
1946 return NULL;
1947 mod = PyImport_Import(name);
1948 Py_DECREF(name);
1949 if (!mod)
1950 return NULL;
1951 func = PyObject_GetAttrString(mod, function);
1952 Py_DECREF(mod);
1953 if (!func)
1954 return NULL;
1955 result = PyObject_CallObject(func, args);
1956 Py_DECREF(func);
1957 Py_DECREF(args);
1958 return result;
1961 #ifdef USE_BUILTIN_COPY
1962 static int
1963 deepcopy(PyObject** object, PyObject* memo)
1965 PyObject* copy;
1967 copy = call(
1968 "copy", "deepcopy",
1969 PyTuple_Pack(2, *object, memo)
1971 if (!copy)
1972 return 0;
1974 Py_DECREF(*object);
1975 *object = copy;
1977 return 1; /* success */
1979 #endif
1981 static PyObject*
1982 join_list(PyObject* list, PyObject* pattern)
1984 /* join list elements */
1986 PyObject* joiner;
1987 #if PY_VERSION_HEX >= 0x01060000
1988 PyObject* function;
1989 PyObject* args;
1990 #endif
1991 PyObject* result;
1993 switch (PyList_GET_SIZE(list)) {
1994 case 0:
1995 Py_DECREF(list);
1996 return PySequence_GetSlice(pattern, 0, 0);
1997 case 1:
1998 result = PyList_GET_ITEM(list, 0);
1999 Py_INCREF(result);
2000 Py_DECREF(list);
2001 return result;
2004 /* two or more elements: slice out a suitable separator from the
2005 first member, and use that to join the entire list */
2007 joiner = PySequence_GetSlice(pattern, 0, 0);
2008 if (!joiner)
2009 return NULL;
2011 #if PY_VERSION_HEX >= 0x01060000
2012 function = PyObject_GetAttrString(joiner, "join");
2013 if (!function) {
2014 Py_DECREF(joiner);
2015 return NULL;
2017 args = PyTuple_New(1);
2018 if (!args) {
2019 Py_DECREF(function);
2020 Py_DECREF(joiner);
2021 return NULL;
2023 PyTuple_SET_ITEM(args, 0, list);
2024 result = PyObject_CallObject(function, args);
2025 Py_DECREF(args); /* also removes list */
2026 Py_DECREF(function);
2027 #else
2028 result = call(
2029 "string", "join",
2030 PyTuple_Pack(2, list, joiner)
2032 #endif
2033 Py_DECREF(joiner);
2035 return result;
2038 static PyObject*
2039 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2041 SRE_STATE state;
2042 PyObject* list;
2043 int status;
2044 Py_ssize_t i, b, e;
2046 PyObject* string;
2047 Py_ssize_t start = 0;
2048 Py_ssize_t end = PY_SSIZE_T_MAX;
2049 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2050 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2051 &string, &start, &end))
2052 return NULL;
2054 string = state_init(&state, self, string, start, end);
2055 if (!string)
2056 return NULL;
2058 list = PyList_New(0);
2059 if (!list) {
2060 state_fini(&state);
2061 return NULL;
2064 while (state.start <= state.end) {
2066 PyObject* item;
2068 state_reset(&state);
2070 state.ptr = state.start;
2072 if (state.charsize == 1) {
2073 status = sre_search(&state, PatternObject_GetCode(self));
2074 } else {
2075 #if defined(HAVE_UNICODE)
2076 status = sre_usearch(&state, PatternObject_GetCode(self));
2077 #endif
2080 if (PyErr_Occurred())
2081 goto error;
2083 if (status <= 0) {
2084 if (status == 0)
2085 break;
2086 pattern_error(status);
2087 goto error;
2090 /* don't bother to build a match object */
2091 switch (self->groups) {
2092 case 0:
2093 b = STATE_OFFSET(&state, state.start);
2094 e = STATE_OFFSET(&state, state.ptr);
2095 item = PySequence_GetSlice(string, b, e);
2096 if (!item)
2097 goto error;
2098 break;
2099 case 1:
2100 item = state_getslice(&state, 1, string, 1);
2101 if (!item)
2102 goto error;
2103 break;
2104 default:
2105 item = PyTuple_New(self->groups);
2106 if (!item)
2107 goto error;
2108 for (i = 0; i < self->groups; i++) {
2109 PyObject* o = state_getslice(&state, i+1, string, 1);
2110 if (!o) {
2111 Py_DECREF(item);
2112 goto error;
2114 PyTuple_SET_ITEM(item, i, o);
2116 break;
2119 status = PyList_Append(list, item);
2120 Py_DECREF(item);
2121 if (status < 0)
2122 goto error;
2124 if (state.ptr == state.start)
2125 state.start = (void*) ((char*) state.ptr + state.charsize);
2126 else
2127 state.start = state.ptr;
2131 state_fini(&state);
2132 return list;
2134 error:
2135 Py_DECREF(list);
2136 state_fini(&state);
2137 return NULL;
2141 #if PY_VERSION_HEX >= 0x02020000
2142 static PyObject*
2143 pattern_finditer(PatternObject* pattern, PyObject* args)
2145 PyObject* scanner;
2146 PyObject* search;
2147 PyObject* iterator;
2149 scanner = pattern_scanner(pattern, args);
2150 if (!scanner)
2151 return NULL;
2153 search = PyObject_GetAttrString(scanner, "search");
2154 Py_DECREF(scanner);
2155 if (!search)
2156 return NULL;
2158 iterator = PyCallIter_New(search, Py_None);
2159 Py_DECREF(search);
2161 return iterator;
2163 #endif
2165 static PyObject*
2166 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2168 SRE_STATE state;
2169 PyObject* list;
2170 PyObject* item;
2171 int status;
2172 Py_ssize_t n;
2173 Py_ssize_t i;
2174 void* last;
2176 PyObject* string;
2177 Py_ssize_t maxsplit = 0;
2178 static char* kwlist[] = { "source", "maxsplit", NULL };
2179 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2180 &string, &maxsplit))
2181 return NULL;
2183 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2184 if (!string)
2185 return NULL;
2187 list = PyList_New(0);
2188 if (!list) {
2189 state_fini(&state);
2190 return NULL;
2193 n = 0;
2194 last = state.start;
2196 while (!maxsplit || n < maxsplit) {
2198 state_reset(&state);
2200 state.ptr = state.start;
2202 if (state.charsize == 1) {
2203 status = sre_search(&state, PatternObject_GetCode(self));
2204 } else {
2205 #if defined(HAVE_UNICODE)
2206 status = sre_usearch(&state, PatternObject_GetCode(self));
2207 #endif
2210 if (PyErr_Occurred())
2211 goto error;
2213 if (status <= 0) {
2214 if (status == 0)
2215 break;
2216 pattern_error(status);
2217 goto error;
2220 if (state.start == state.ptr) {
2221 if (last == state.end)
2222 break;
2223 /* skip one character */
2224 state.start = (void*) ((char*) state.ptr + state.charsize);
2225 continue;
2228 /* get segment before this match */
2229 item = PySequence_GetSlice(
2230 string, STATE_OFFSET(&state, last),
2231 STATE_OFFSET(&state, state.start)
2233 if (!item)
2234 goto error;
2235 status = PyList_Append(list, item);
2236 Py_DECREF(item);
2237 if (status < 0)
2238 goto error;
2240 /* add groups (if any) */
2241 for (i = 0; i < self->groups; i++) {
2242 item = state_getslice(&state, i+1, string, 0);
2243 if (!item)
2244 goto error;
2245 status = PyList_Append(list, item);
2246 Py_DECREF(item);
2247 if (status < 0)
2248 goto error;
2251 n = n + 1;
2253 last = state.start = state.ptr;
2257 /* get segment following last match (even if empty) */
2258 item = PySequence_GetSlice(
2259 string, STATE_OFFSET(&state, last), state.endpos
2261 if (!item)
2262 goto error;
2263 status = PyList_Append(list, item);
2264 Py_DECREF(item);
2265 if (status < 0)
2266 goto error;
2268 state_fini(&state);
2269 return list;
2271 error:
2272 Py_DECREF(list);
2273 state_fini(&state);
2274 return NULL;
2278 static PyObject*
2279 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2280 Py_ssize_t count, Py_ssize_t subn)
2282 SRE_STATE state;
2283 PyObject* list;
2284 PyObject* item;
2285 PyObject* filter;
2286 PyObject* args;
2287 PyObject* match;
2288 void* ptr;
2289 int status;
2290 Py_ssize_t n;
2291 Py_ssize_t i, b, e;
2292 int bint;
2293 int filter_is_callable;
2295 if (PyCallable_Check(ptemplate)) {
2296 /* sub/subn takes either a function or a template */
2297 filter = ptemplate;
2298 Py_INCREF(filter);
2299 filter_is_callable = 1;
2300 } else {
2301 /* if not callable, check if it's a literal string */
2302 int literal;
2303 ptr = getstring(ptemplate, &n, &bint);
2304 b = bint;
2305 if (ptr) {
2306 if (b == 1) {
2307 literal = sre_literal_template((unsigned char *)ptr, n);
2308 } else {
2309 #if defined(HAVE_UNICODE)
2310 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2311 #endif
2313 } else {
2314 PyErr_Clear();
2315 literal = 0;
2317 if (literal) {
2318 filter = ptemplate;
2319 Py_INCREF(filter);
2320 filter_is_callable = 0;
2321 } else {
2322 /* not a literal; hand it over to the template compiler */
2323 filter = call(
2324 SRE_PY_MODULE, "_subx",
2325 PyTuple_Pack(2, self, ptemplate)
2327 if (!filter)
2328 return NULL;
2329 filter_is_callable = PyCallable_Check(filter);
2333 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2334 if (!string) {
2335 Py_DECREF(filter);
2336 return NULL;
2339 list = PyList_New(0);
2340 if (!list) {
2341 Py_DECREF(filter);
2342 state_fini(&state);
2343 return NULL;
2346 n = i = 0;
2348 while (!count || n < count) {
2350 state_reset(&state);
2352 state.ptr = state.start;
2354 if (state.charsize == 1) {
2355 status = sre_search(&state, PatternObject_GetCode(self));
2356 } else {
2357 #if defined(HAVE_UNICODE)
2358 status = sre_usearch(&state, PatternObject_GetCode(self));
2359 #endif
2362 if (PyErr_Occurred())
2363 goto error;
2365 if (status <= 0) {
2366 if (status == 0)
2367 break;
2368 pattern_error(status);
2369 goto error;
2372 b = STATE_OFFSET(&state, state.start);
2373 e = STATE_OFFSET(&state, state.ptr);
2375 if (i < b) {
2376 /* get segment before this match */
2377 item = PySequence_GetSlice(string, i, b);
2378 if (!item)
2379 goto error;
2380 status = PyList_Append(list, item);
2381 Py_DECREF(item);
2382 if (status < 0)
2383 goto error;
2385 } else if (i == b && i == e && n > 0)
2386 /* ignore empty match on latest position */
2387 goto next;
2389 if (filter_is_callable) {
2390 /* pass match object through filter */
2391 match = pattern_new_match(self, &state, 1);
2392 if (!match)
2393 goto error;
2394 args = PyTuple_Pack(1, match);
2395 if (!args) {
2396 Py_DECREF(match);
2397 goto error;
2399 item = PyObject_CallObject(filter, args);
2400 Py_DECREF(args);
2401 Py_DECREF(match);
2402 if (!item)
2403 goto error;
2404 } else {
2405 /* filter is literal string */
2406 item = filter;
2407 Py_INCREF(item);
2410 /* add to list */
2411 if (item != Py_None) {
2412 status = PyList_Append(list, item);
2413 Py_DECREF(item);
2414 if (status < 0)
2415 goto error;
2418 i = e;
2419 n = n + 1;
2421 next:
2422 /* move on */
2423 if (state.ptr == state.start)
2424 state.start = (void*) ((char*) state.ptr + state.charsize);
2425 else
2426 state.start = state.ptr;
2430 /* get segment following last match */
2431 if (i < state.endpos) {
2432 item = PySequence_GetSlice(string, i, state.endpos);
2433 if (!item)
2434 goto error;
2435 status = PyList_Append(list, item);
2436 Py_DECREF(item);
2437 if (status < 0)
2438 goto error;
2441 state_fini(&state);
2443 Py_DECREF(filter);
2445 /* convert list to single string (also removes list) */
2446 item = join_list(list, self->pattern);
2448 if (!item)
2449 return NULL;
2451 if (subn)
2452 return Py_BuildValue("Ni", item, n);
2454 return item;
2456 error:
2457 Py_DECREF(list);
2458 state_fini(&state);
2459 Py_DECREF(filter);
2460 return NULL;
2464 static PyObject*
2465 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2467 PyObject* ptemplate;
2468 PyObject* string;
2469 Py_ssize_t count = 0;
2470 static char* kwlist[] = { "repl", "string", "count", NULL };
2471 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2472 &ptemplate, &string, &count))
2473 return NULL;
2475 return pattern_subx(self, ptemplate, string, count, 0);
2478 static PyObject*
2479 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2481 PyObject* ptemplate;
2482 PyObject* string;
2483 Py_ssize_t count = 0;
2484 static char* kwlist[] = { "repl", "string", "count", NULL };
2485 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2486 &ptemplate, &string, &count))
2487 return NULL;
2489 return pattern_subx(self, ptemplate, string, count, 1);
2492 static PyObject*
2493 pattern_copy(PatternObject* self, PyObject *unused)
2495 #ifdef USE_BUILTIN_COPY
2496 PatternObject* copy;
2497 int offset;
2499 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2500 if (!copy)
2501 return NULL;
2503 offset = offsetof(PatternObject, groups);
2505 Py_XINCREF(self->groupindex);
2506 Py_XINCREF(self->indexgroup);
2507 Py_XINCREF(self->pattern);
2509 memcpy((char*) copy + offset, (char*) self + offset,
2510 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2511 copy->weakreflist = NULL;
2513 return (PyObject*) copy;
2514 #else
2515 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2516 return NULL;
2517 #endif
2520 static PyObject*
2521 pattern_deepcopy(PatternObject* self, PyObject* memo)
2523 #ifdef USE_BUILTIN_COPY
2524 PatternObject* copy;
2526 copy = (PatternObject*) pattern_copy(self);
2527 if (!copy)
2528 return NULL;
2530 if (!deepcopy(&copy->groupindex, memo) ||
2531 !deepcopy(&copy->indexgroup, memo) ||
2532 !deepcopy(&copy->pattern, memo)) {
2533 Py_DECREF(copy);
2534 return NULL;
2537 #else
2538 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2539 return NULL;
2540 #endif
2543 PyDoc_STRVAR(pattern_match_doc,
2544 "match(string[, pos[, endpos]]) --> match object or None.\n\
2545 Matches zero or more characters at the beginning of the string");
2547 PyDoc_STRVAR(pattern_search_doc,
2548 "search(string[, pos[, endpos]]) --> match object or None.\n\
2549 Scan through string looking for a match, and return a corresponding\n\
2550 MatchObject instance. Return None if no position in the string matches.");
2552 PyDoc_STRVAR(pattern_split_doc,
2553 "split(string[, maxsplit = 0]) --> list.\n\
2554 Split string by the occurrences of pattern.");
2556 PyDoc_STRVAR(pattern_findall_doc,
2557 "findall(string[, pos[, endpos]]) --> list.\n\
2558 Return a list of all non-overlapping matches of pattern in string.");
2560 PyDoc_STRVAR(pattern_finditer_doc,
2561 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2562 Return an iterator over all non-overlapping matches for the \n\
2563 RE pattern in string. For each match, the iterator returns a\n\
2564 match object.");
2566 PyDoc_STRVAR(pattern_sub_doc,
2567 "sub(repl, string[, count = 0]) --> newstring\n\
2568 Return the string obtained by replacing the leftmost non-overlapping\n\
2569 occurrences of pattern in string by the replacement repl.");
2571 PyDoc_STRVAR(pattern_subn_doc,
2572 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2573 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2574 the leftmost non-overlapping occurrences of pattern with the\n\
2575 replacement repl.");
2577 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2579 static PyMethodDef pattern_methods[] = {
2580 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2581 pattern_match_doc},
2582 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2583 pattern_search_doc},
2584 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2585 pattern_sub_doc},
2586 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2587 pattern_subn_doc},
2588 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2589 pattern_split_doc},
2590 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2591 pattern_findall_doc},
2592 #if PY_VERSION_HEX >= 0x02020000
2593 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2594 pattern_finditer_doc},
2595 #endif
2596 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2597 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2598 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2599 {NULL, NULL}
2602 static PyObject*
2603 pattern_getattr(PatternObject* self, char* name)
2605 PyObject* res;
2607 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
2609 if (res)
2610 return res;
2612 PyErr_Clear();
2614 /* attributes */
2615 if (!strcmp(name, "pattern")) {
2616 Py_INCREF(self->pattern);
2617 return self->pattern;
2620 if (!strcmp(name, "flags"))
2621 return Py_BuildValue("i", self->flags);
2623 if (!strcmp(name, "groups"))
2624 return Py_BuildValue("i", self->groups);
2626 if (!strcmp(name, "groupindex") && self->groupindex) {
2627 Py_INCREF(self->groupindex);
2628 return self->groupindex;
2631 PyErr_SetString(PyExc_AttributeError, name);
2632 return NULL;
2635 statichere PyTypeObject Pattern_Type = {
2636 PyObject_HEAD_INIT(NULL)
2637 0, "_" SRE_MODULE ".SRE_Pattern",
2638 sizeof(PatternObject), sizeof(SRE_CODE),
2639 (destructor)pattern_dealloc, /*tp_dealloc*/
2640 0, /*tp_print*/
2641 (getattrfunc)pattern_getattr, /*tp_getattr*/
2642 0, /* tp_setattr */
2643 0, /* tp_compare */
2644 0, /* tp_repr */
2645 0, /* tp_as_number */
2646 0, /* tp_as_sequence */
2647 0, /* tp_as_mapping */
2648 0, /* tp_hash */
2649 0, /* tp_call */
2650 0, /* tp_str */
2651 0, /* tp_getattro */
2652 0, /* tp_setattro */
2653 0, /* tp_as_buffer */
2654 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
2655 pattern_doc, /* tp_doc */
2656 0, /* tp_traverse */
2657 0, /* tp_clear */
2658 0, /* tp_richcompare */
2659 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2662 static PyObject *
2663 _compile(PyObject* self_, PyObject* args)
2665 /* "compile" pattern descriptor to pattern object */
2667 PatternObject* self;
2668 Py_ssize_t i, n;
2670 PyObject* pattern;
2671 int flags = 0;
2672 PyObject* code;
2673 Py_ssize_t groups = 0;
2674 PyObject* groupindex = NULL;
2675 PyObject* indexgroup = NULL;
2676 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2677 &PyList_Type, &code, &groups,
2678 &groupindex, &indexgroup))
2679 return NULL;
2681 n = PyList_GET_SIZE(code);
2683 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2684 if (!self)
2685 return NULL;
2687 self->codesize = n;
2689 for (i = 0; i < n; i++) {
2690 PyObject *o = PyList_GET_ITEM(code, i);
2691 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2692 : PyLong_AsUnsignedLong(o);
2693 self->code[i] = (SRE_CODE) value;
2694 if ((unsigned long) self->code[i] != value) {
2695 PyErr_SetString(PyExc_OverflowError,
2696 "regular expression code size limit exceeded");
2697 break;
2701 if (PyErr_Occurred()) {
2702 PyObject_DEL(self);
2703 return NULL;
2706 Py_INCREF(pattern);
2707 self->pattern = pattern;
2709 self->flags = flags;
2711 self->groups = groups;
2713 Py_XINCREF(groupindex);
2714 self->groupindex = groupindex;
2716 Py_XINCREF(indexgroup);
2717 self->indexgroup = indexgroup;
2719 self->weakreflist = NULL;
2721 return (PyObject*) self;
2724 /* -------------------------------------------------------------------- */
2725 /* match methods */
2727 static void
2728 match_dealloc(MatchObject* self)
2730 Py_XDECREF(self->regs);
2731 Py_XDECREF(self->string);
2732 Py_DECREF(self->pattern);
2733 PyObject_DEL(self);
2736 static PyObject*
2737 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
2739 if (index < 0 || index >= self->groups) {
2740 /* raise IndexError if we were given a bad group number */
2741 PyErr_SetString(
2742 PyExc_IndexError,
2743 "no such group"
2745 return NULL;
2748 index *= 2;
2750 if (self->string == Py_None || self->mark[index] < 0) {
2751 /* return default value if the string or group is undefined */
2752 Py_INCREF(def);
2753 return def;
2756 return PySequence_GetSlice(
2757 self->string, self->mark[index], self->mark[index+1]
2761 static Py_ssize_t
2762 match_getindex(MatchObject* self, PyObject* index)
2764 Py_ssize_t i;
2766 if (PyInt_Check(index))
2767 return PyInt_AsSsize_t(index);
2769 i = -1;
2771 if (self->pattern->groupindex) {
2772 index = PyObject_GetItem(self->pattern->groupindex, index);
2773 if (index) {
2774 if (PyInt_Check(index) || PyLong_Check(index))
2775 i = PyInt_AsSsize_t(index);
2776 Py_DECREF(index);
2777 } else
2778 PyErr_Clear();
2781 return i;
2784 static PyObject*
2785 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2787 return match_getslice_by_index(self, match_getindex(self, index), def);
2790 static PyObject*
2791 match_expand(MatchObject* self, PyObject* ptemplate)
2793 /* delegate to Python code */
2794 return call(
2795 SRE_PY_MODULE, "_expand",
2796 PyTuple_Pack(3, self->pattern, self, ptemplate)
2800 static PyObject*
2801 match_group(MatchObject* self, PyObject* args)
2803 PyObject* result;
2804 Py_ssize_t i, size;
2806 size = PyTuple_GET_SIZE(args);
2808 switch (size) {
2809 case 0:
2810 result = match_getslice(self, Py_False, Py_None);
2811 break;
2812 case 1:
2813 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2814 break;
2815 default:
2816 /* fetch multiple items */
2817 result = PyTuple_New(size);
2818 if (!result)
2819 return NULL;
2820 for (i = 0; i < size; i++) {
2821 PyObject* item = match_getslice(
2822 self, PyTuple_GET_ITEM(args, i), Py_None
2824 if (!item) {
2825 Py_DECREF(result);
2826 return NULL;
2828 PyTuple_SET_ITEM(result, i, item);
2830 break;
2832 return result;
2835 static PyObject*
2836 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
2838 PyObject* result;
2839 Py_ssize_t index;
2841 PyObject* def = Py_None;
2842 static char* kwlist[] = { "default", NULL };
2843 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
2844 return NULL;
2846 result = PyTuple_New(self->groups-1);
2847 if (!result)
2848 return NULL;
2850 for (index = 1; index < self->groups; index++) {
2851 PyObject* item;
2852 item = match_getslice_by_index(self, index, def);
2853 if (!item) {
2854 Py_DECREF(result);
2855 return NULL;
2857 PyTuple_SET_ITEM(result, index-1, item);
2860 return result;
2863 static PyObject*
2864 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
2866 PyObject* result;
2867 PyObject* keys;
2868 Py_ssize_t index;
2870 PyObject* def = Py_None;
2871 static char* kwlist[] = { "default", NULL };
2872 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
2873 return NULL;
2875 result = PyDict_New();
2876 if (!result || !self->pattern->groupindex)
2877 return result;
2879 keys = PyMapping_Keys(self->pattern->groupindex);
2880 if (!keys)
2881 goto failed;
2883 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
2884 int status;
2885 PyObject* key;
2886 PyObject* value;
2887 key = PyList_GET_ITEM(keys, index);
2888 if (!key)
2889 goto failed;
2890 value = match_getslice(self, key, def);
2891 if (!value) {
2892 Py_DECREF(key);
2893 goto failed;
2895 status = PyDict_SetItem(result, key, value);
2896 Py_DECREF(value);
2897 if (status < 0)
2898 goto failed;
2901 Py_DECREF(keys);
2903 return result;
2905 failed:
2906 Py_XDECREF(keys);
2907 Py_DECREF(result);
2908 return NULL;
2911 static PyObject*
2912 match_start(MatchObject* self, PyObject* args)
2914 Py_ssize_t index;
2916 PyObject* index_ = Py_False; /* zero */
2917 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
2918 return NULL;
2920 index = match_getindex(self, index_);
2922 if (index < 0 || index >= self->groups) {
2923 PyErr_SetString(
2924 PyExc_IndexError,
2925 "no such group"
2927 return NULL;
2930 /* mark is -1 if group is undefined */
2931 return Py_BuildValue("i", self->mark[index*2]);
2934 static PyObject*
2935 match_end(MatchObject* self, PyObject* args)
2937 Py_ssize_t index;
2939 PyObject* index_ = Py_False; /* zero */
2940 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
2941 return NULL;
2943 index = match_getindex(self, index_);
2945 if (index < 0 || index >= self->groups) {
2946 PyErr_SetString(
2947 PyExc_IndexError,
2948 "no such group"
2950 return NULL;
2953 /* mark is -1 if group is undefined */
2954 return Py_BuildValue("i", self->mark[index*2+1]);
2957 LOCAL(PyObject*)
2958 _pair(Py_ssize_t i1, Py_ssize_t i2)
2960 PyObject* pair;
2961 PyObject* item;
2963 pair = PyTuple_New(2);
2964 if (!pair)
2965 return NULL;
2967 item = PyInt_FromSsize_t(i1);
2968 if (!item)
2969 goto error;
2970 PyTuple_SET_ITEM(pair, 0, item);
2972 item = PyInt_FromSsize_t(i2);
2973 if (!item)
2974 goto error;
2975 PyTuple_SET_ITEM(pair, 1, item);
2977 return pair;
2979 error:
2980 Py_DECREF(pair);
2981 return NULL;
2984 static PyObject*
2985 match_span(MatchObject* self, PyObject* args)
2987 Py_ssize_t index;
2989 PyObject* index_ = Py_False; /* zero */
2990 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
2991 return NULL;
2993 index = match_getindex(self, index_);
2995 if (index < 0 || index >= self->groups) {
2996 PyErr_SetString(
2997 PyExc_IndexError,
2998 "no such group"
3000 return NULL;
3003 /* marks are -1 if group is undefined */
3004 return _pair(self->mark[index*2], self->mark[index*2+1]);
3007 static PyObject*
3008 match_regs(MatchObject* self)
3010 PyObject* regs;
3011 PyObject* item;
3012 Py_ssize_t index;
3014 regs = PyTuple_New(self->groups);
3015 if (!regs)
3016 return NULL;
3018 for (index = 0; index < self->groups; index++) {
3019 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3020 if (!item) {
3021 Py_DECREF(regs);
3022 return NULL;
3024 PyTuple_SET_ITEM(regs, index, item);
3027 Py_INCREF(regs);
3028 self->regs = regs;
3030 return regs;
3033 static PyObject*
3034 match_copy(MatchObject* self, PyObject *unused)
3036 #ifdef USE_BUILTIN_COPY
3037 MatchObject* copy;
3038 Py_ssize_t slots, offset;
3040 slots = 2 * (self->pattern->groups+1);
3042 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3043 if (!copy)
3044 return NULL;
3046 /* this value a constant, but any compiler should be able to
3047 figure that out all by itself */
3048 offset = offsetof(MatchObject, string);
3050 Py_XINCREF(self->pattern);
3051 Py_XINCREF(self->string);
3052 Py_XINCREF(self->regs);
3054 memcpy((char*) copy + offset, (char*) self + offset,
3055 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3057 return (PyObject*) copy;
3058 #else
3059 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3060 return NULL;
3061 #endif
3064 static PyObject*
3065 match_deepcopy(MatchObject* self, PyObject* memo)
3067 #ifdef USE_BUILTIN_COPY
3068 MatchObject* copy;
3070 copy = (MatchObject*) match_copy(self);
3071 if (!copy)
3072 return NULL;
3074 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3075 !deepcopy(&copy->string, memo) ||
3076 !deepcopy(&copy->regs, memo)) {
3077 Py_DECREF(copy);
3078 return NULL;
3081 #else
3082 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3083 return NULL;
3084 #endif
3087 static PyMethodDef match_methods[] = {
3088 {"group", (PyCFunction) match_group, METH_VARARGS},
3089 {"start", (PyCFunction) match_start, METH_VARARGS},
3090 {"end", (PyCFunction) match_end, METH_VARARGS},
3091 {"span", (PyCFunction) match_span, METH_VARARGS},
3092 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3093 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3094 {"expand", (PyCFunction) match_expand, METH_O},
3095 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3096 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3097 {NULL, NULL}
3100 static PyObject*
3101 match_getattr(MatchObject* self, char* name)
3103 PyObject* res;
3105 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3106 if (res)
3107 return res;
3109 PyErr_Clear();
3111 if (!strcmp(name, "lastindex")) {
3112 if (self->lastindex >= 0)
3113 return Py_BuildValue("i", self->lastindex);
3114 Py_INCREF(Py_None);
3115 return Py_None;
3118 if (!strcmp(name, "lastgroup")) {
3119 if (self->pattern->indexgroup && self->lastindex >= 0) {
3120 PyObject* result = PySequence_GetItem(
3121 self->pattern->indexgroup, self->lastindex
3123 if (result)
3124 return result;
3125 PyErr_Clear();
3127 Py_INCREF(Py_None);
3128 return Py_None;
3131 if (!strcmp(name, "string")) {
3132 if (self->string) {
3133 Py_INCREF(self->string);
3134 return self->string;
3135 } else {
3136 Py_INCREF(Py_None);
3137 return Py_None;
3141 if (!strcmp(name, "regs")) {
3142 if (self->regs) {
3143 Py_INCREF(self->regs);
3144 return self->regs;
3145 } else
3146 return match_regs(self);
3149 if (!strcmp(name, "re")) {
3150 Py_INCREF(self->pattern);
3151 return (PyObject*) self->pattern;
3154 if (!strcmp(name, "pos"))
3155 return Py_BuildValue("i", self->pos);
3157 if (!strcmp(name, "endpos"))
3158 return Py_BuildValue("i", self->endpos);
3160 PyErr_SetString(PyExc_AttributeError, name);
3161 return NULL;
3164 /* FIXME: implement setattr("string", None) as a special case (to
3165 detach the associated string, if any */
3167 statichere PyTypeObject Match_Type = {
3168 PyObject_HEAD_INIT(NULL)
3169 0, "_" SRE_MODULE ".SRE_Match",
3170 sizeof(MatchObject), sizeof(Py_ssize_t),
3171 (destructor)match_dealloc, /*tp_dealloc*/
3172 0, /*tp_print*/
3173 (getattrfunc)match_getattr /*tp_getattr*/
3176 static PyObject*
3177 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3179 /* create match object (from state object) */
3181 MatchObject* match;
3182 Py_ssize_t i, j;
3183 char* base;
3184 int n;
3186 if (status > 0) {
3188 /* create match object (with room for extra group marks) */
3189 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3190 2*(pattern->groups+1));
3191 if (!match)
3192 return NULL;
3194 Py_INCREF(pattern);
3195 match->pattern = pattern;
3197 Py_INCREF(state->string);
3198 match->string = state->string;
3200 match->regs = NULL;
3201 match->groups = pattern->groups+1;
3203 /* fill in group slices */
3205 base = (char*) state->beginning;
3206 n = state->charsize;
3208 match->mark[0] = ((char*) state->start - base) / n;
3209 match->mark[1] = ((char*) state->ptr - base) / n;
3211 for (i = j = 0; i < pattern->groups; i++, j+=2)
3212 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3213 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3214 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3215 } else
3216 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3218 match->pos = state->pos;
3219 match->endpos = state->endpos;
3221 match->lastindex = state->lastindex;
3223 return (PyObject*) match;
3225 } else if (status == 0) {
3227 /* no match */
3228 Py_INCREF(Py_None);
3229 return Py_None;
3233 /* internal error */
3234 pattern_error(status);
3235 return NULL;
3239 /* -------------------------------------------------------------------- */
3240 /* scanner methods (experimental) */
3242 static void
3243 scanner_dealloc(ScannerObject* self)
3245 state_fini(&self->state);
3246 Py_DECREF(self->pattern);
3247 PyObject_DEL(self);
3250 static PyObject*
3251 scanner_match(ScannerObject* self, PyObject *unused)
3253 SRE_STATE* state = &self->state;
3254 PyObject* match;
3255 int status;
3257 state_reset(state);
3259 state->ptr = state->start;
3261 if (state->charsize == 1) {
3262 status = sre_match(state, PatternObject_GetCode(self->pattern));
3263 } else {
3264 #if defined(HAVE_UNICODE)
3265 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3266 #endif
3268 if (PyErr_Occurred())
3269 return NULL;
3271 match = pattern_new_match((PatternObject*) self->pattern,
3272 state, status);
3274 if (status == 0 || state->ptr == state->start)
3275 state->start = (void*) ((char*) state->ptr + state->charsize);
3276 else
3277 state->start = state->ptr;
3279 return match;
3283 static PyObject*
3284 scanner_search(ScannerObject* self, PyObject *unused)
3286 SRE_STATE* state = &self->state;
3287 PyObject* match;
3288 int status;
3290 state_reset(state);
3292 state->ptr = state->start;
3294 if (state->charsize == 1) {
3295 status = sre_search(state, PatternObject_GetCode(self->pattern));
3296 } else {
3297 #if defined(HAVE_UNICODE)
3298 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3299 #endif
3301 if (PyErr_Occurred())
3302 return NULL;
3304 match = pattern_new_match((PatternObject*) self->pattern,
3305 state, status);
3307 if (status == 0 || state->ptr == state->start)
3308 state->start = (void*) ((char*) state->ptr + state->charsize);
3309 else
3310 state->start = state->ptr;
3312 return match;
3315 static PyMethodDef scanner_methods[] = {
3316 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3317 {"search", (PyCFunction) scanner_search, METH_NOARGS},
3318 {NULL, NULL}
3321 static PyObject*
3322 scanner_getattr(ScannerObject* self, char* name)
3324 PyObject* res;
3326 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3327 if (res)
3328 return res;
3330 PyErr_Clear();
3332 /* attributes */
3333 if (!strcmp(name, "pattern")) {
3334 Py_INCREF(self->pattern);
3335 return self->pattern;
3338 PyErr_SetString(PyExc_AttributeError, name);
3339 return NULL;
3342 statichere PyTypeObject Scanner_Type = {
3343 PyObject_HEAD_INIT(NULL)
3344 0, "_" SRE_MODULE ".SRE_Scanner",
3345 sizeof(ScannerObject), 0,
3346 (destructor)scanner_dealloc, /*tp_dealloc*/
3347 0, /*tp_print*/
3348 (getattrfunc)scanner_getattr, /*tp_getattr*/
3351 static PyObject*
3352 pattern_scanner(PatternObject* pattern, PyObject* args)
3354 /* create search state object */
3356 ScannerObject* self;
3358 PyObject* string;
3359 Py_ssize_t start = 0;
3360 Py_ssize_t end = PY_SSIZE_T_MAX;
3361 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3362 return NULL;
3364 /* create scanner object */
3365 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3366 if (!self)
3367 return NULL;
3369 string = state_init(&self->state, pattern, string, start, end);
3370 if (!string) {
3371 PyObject_DEL(self);
3372 return NULL;
3375 Py_INCREF(pattern);
3376 self->pattern = (PyObject*) pattern;
3378 return (PyObject*) self;
3381 static PyMethodDef _functions[] = {
3382 {"compile", _compile, METH_VARARGS},
3383 {"getcodesize", sre_codesize, METH_NOARGS},
3384 {"getlower", sre_getlower, METH_VARARGS},
3385 {NULL, NULL}
3388 #if PY_VERSION_HEX < 0x02030000
3389 DL_EXPORT(void) init_sre(void)
3390 #else
3391 PyMODINIT_FUNC init_sre(void)
3392 #endif
3394 PyObject* m;
3395 PyObject* d;
3396 PyObject* x;
3398 /* Patch object types */
3399 Pattern_Type.ob_type = Match_Type.ob_type =
3400 Scanner_Type.ob_type = &PyType_Type;
3402 m = Py_InitModule("_" SRE_MODULE, _functions);
3403 if (m == NULL)
3404 return;
3405 d = PyModule_GetDict(m);
3407 x = PyInt_FromLong(SRE_MAGIC);
3408 if (x) {
3409 PyDict_SetItemString(d, "MAGIC", x);
3410 Py_DECREF(x);
3413 x = PyInt_FromLong(sizeof(SRE_CODE));
3414 if (x) {
3415 PyDict_SetItemString(d, "CODESIZE", x);
3416 Py_DECREF(x);
3419 x = PyString_FromString(copyright);
3420 if (x) {
3421 PyDict_SetItemString(d, "copyright", x);
3422 Py_DECREF(x);
3426 #endif /* !defined(SRE_RECURSIVE) */
3428 /* vim:ts=4:sw=4:et