Fix off-by-one error that resulted in missed characters
[pytest.git] / Modules / _sre.c
blob354210064bff49e7f14db71526696c432126b7e5
1 /*
2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
37 #ifndef SRE_RECURSIVE
39 static char copyright[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
42 #define PY_SSIZE_T_CLEAN
44 #include "Python.h"
45 #include "structmember.h" /* offsetof */
47 #include "sre.h"
49 #include <ctype.h>
51 /* name of this module, minus the leading underscore */
52 #if !defined(SRE_MODULE)
53 #define SRE_MODULE "sre"
54 #endif
56 #define SRE_PY_MODULE "re"
58 /* defining this one enables tracing */
59 #undef VERBOSE
61 #if PY_VERSION_HEX >= 0x01060000
62 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
63 /* defining this enables unicode support (default under 1.6a1 and later) */
64 #define HAVE_UNICODE
65 #endif
66 #endif
68 /* -------------------------------------------------------------------- */
69 /* optional features */
71 /* enables fast searching */
72 #define USE_FAST_SEARCH
74 /* enables aggressive inlining (always on for Visual C) */
75 #undef USE_INLINE
77 /* enables copy/deepcopy handling (work in progress) */
78 #undef USE_BUILTIN_COPY
80 #if PY_VERSION_HEX < 0x01060000
81 #define PyObject_DEL(op) PyMem_DEL((op))
82 #endif
84 /* -------------------------------------------------------------------- */
86 #if defined(_MSC_VER)
87 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89 /* fastest possible local call under MSVC */
90 #define LOCAL(type) static __inline type __fastcall
91 #elif defined(USE_INLINE)
92 #define LOCAL(type) static inline type
93 #else
94 #define LOCAL(type) static type
95 #endif
97 /* error codes */
98 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99 #define SRE_ERROR_STATE -2 /* illegal state */
100 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101 #define SRE_ERROR_MEMORY -9 /* out of memory */
103 #if defined(VERBOSE)
104 #define TRACE(v) printf v
105 #else
106 #define TRACE(v)
107 #endif
109 /* -------------------------------------------------------------------- */
110 /* search engine state */
112 /* default character predicates (run sre_chars.py to regenerate tables) */
114 #define SRE_DIGIT_MASK 1
115 #define SRE_SPACE_MASK 2
116 #define SRE_LINEBREAK_MASK 4
117 #define SRE_ALNUM_MASK 8
118 #define SRE_WORD_MASK 16
120 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
122 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
123 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
125 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
126 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
127 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
128 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
131 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
132 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
133 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
134 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
135 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
136 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
137 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
138 120, 121, 122, 123, 124, 125, 126, 127 };
140 #define SRE_IS_DIGIT(ch)\
141 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
142 #define SRE_IS_SPACE(ch)\
143 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
144 #define SRE_IS_LINEBREAK(ch)\
145 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
146 #define SRE_IS_ALNUM(ch)\
147 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
148 #define SRE_IS_WORD(ch)\
149 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
151 static unsigned int sre_lower(unsigned int ch)
153 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
156 /* locale-specific character predicates */
157 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
158 * warnings when c's type supports only numbers < N+1 */
159 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
160 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
161 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
162 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
163 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165 static unsigned int sre_lower_locale(unsigned int ch)
167 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
170 /* unicode-specific character predicates */
172 #if defined(HAVE_UNICODE)
174 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
175 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
176 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
177 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
178 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
180 static unsigned int sre_lower_unicode(unsigned int ch)
182 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
185 #endif
187 LOCAL(int)
188 sre_category(SRE_CODE category, unsigned int ch)
190 switch (category) {
192 case SRE_CATEGORY_DIGIT:
193 return SRE_IS_DIGIT(ch);
194 case SRE_CATEGORY_NOT_DIGIT:
195 return !SRE_IS_DIGIT(ch);
196 case SRE_CATEGORY_SPACE:
197 return SRE_IS_SPACE(ch);
198 case SRE_CATEGORY_NOT_SPACE:
199 return !SRE_IS_SPACE(ch);
200 case SRE_CATEGORY_WORD:
201 return SRE_IS_WORD(ch);
202 case SRE_CATEGORY_NOT_WORD:
203 return !SRE_IS_WORD(ch);
204 case SRE_CATEGORY_LINEBREAK:
205 return SRE_IS_LINEBREAK(ch);
206 case SRE_CATEGORY_NOT_LINEBREAK:
207 return !SRE_IS_LINEBREAK(ch);
209 case SRE_CATEGORY_LOC_WORD:
210 return SRE_LOC_IS_WORD(ch);
211 case SRE_CATEGORY_LOC_NOT_WORD:
212 return !SRE_LOC_IS_WORD(ch);
214 #if defined(HAVE_UNICODE)
215 case SRE_CATEGORY_UNI_DIGIT:
216 return SRE_UNI_IS_DIGIT(ch);
217 case SRE_CATEGORY_UNI_NOT_DIGIT:
218 return !SRE_UNI_IS_DIGIT(ch);
219 case SRE_CATEGORY_UNI_SPACE:
220 return SRE_UNI_IS_SPACE(ch);
221 case SRE_CATEGORY_UNI_NOT_SPACE:
222 return !SRE_UNI_IS_SPACE(ch);
223 case SRE_CATEGORY_UNI_WORD:
224 return SRE_UNI_IS_WORD(ch);
225 case SRE_CATEGORY_UNI_NOT_WORD:
226 return !SRE_UNI_IS_WORD(ch);
227 case SRE_CATEGORY_UNI_LINEBREAK:
228 return SRE_UNI_IS_LINEBREAK(ch);
229 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
230 return !SRE_UNI_IS_LINEBREAK(ch);
231 #else
232 case SRE_CATEGORY_UNI_DIGIT:
233 return SRE_IS_DIGIT(ch);
234 case SRE_CATEGORY_UNI_NOT_DIGIT:
235 return !SRE_IS_DIGIT(ch);
236 case SRE_CATEGORY_UNI_SPACE:
237 return SRE_IS_SPACE(ch);
238 case SRE_CATEGORY_UNI_NOT_SPACE:
239 return !SRE_IS_SPACE(ch);
240 case SRE_CATEGORY_UNI_WORD:
241 return SRE_LOC_IS_WORD(ch);
242 case SRE_CATEGORY_UNI_NOT_WORD:
243 return !SRE_LOC_IS_WORD(ch);
244 case SRE_CATEGORY_UNI_LINEBREAK:
245 return SRE_IS_LINEBREAK(ch);
246 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
247 return !SRE_IS_LINEBREAK(ch);
248 #endif
250 return 0;
253 /* helpers */
255 static void
256 data_stack_dealloc(SRE_STATE* state)
258 if (state->data_stack) {
259 PyMem_FREE(state->data_stack);
260 state->data_stack = NULL;
262 state->data_stack_size = state->data_stack_base = 0;
265 static int
266 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
268 Py_ssize_t minsize, cursize;
269 minsize = state->data_stack_base+size;
270 cursize = state->data_stack_size;
271 if (cursize < minsize) {
272 void* stack;
273 cursize = minsize+minsize/4+1024;
274 TRACE(("allocate/grow stack %d\n", cursize));
275 stack = PyMem_REALLOC(state->data_stack, cursize);
276 if (!stack) {
277 data_stack_dealloc(state);
278 return SRE_ERROR_MEMORY;
280 state->data_stack = (char *)stack;
281 state->data_stack_size = cursize;
283 return 0;
286 /* generate 8-bit version */
288 #define SRE_CHAR unsigned char
289 #define SRE_AT sre_at
290 #define SRE_COUNT sre_count
291 #define SRE_CHARSET sre_charset
292 #define SRE_INFO sre_info
293 #define SRE_MATCH sre_match
294 #define SRE_MATCH_CONTEXT sre_match_context
295 #define SRE_SEARCH sre_search
296 #define SRE_LITERAL_TEMPLATE sre_literal_template
298 #if defined(HAVE_UNICODE)
300 #define SRE_RECURSIVE
301 #include "_sre.c"
302 #undef SRE_RECURSIVE
304 #undef SRE_LITERAL_TEMPLATE
305 #undef SRE_SEARCH
306 #undef SRE_MATCH
307 #undef SRE_MATCH_CONTEXT
308 #undef SRE_INFO
309 #undef SRE_CHARSET
310 #undef SRE_COUNT
311 #undef SRE_AT
312 #undef SRE_CHAR
314 /* generate 16-bit unicode version */
316 #define SRE_CHAR Py_UNICODE
317 #define SRE_AT sre_uat
318 #define SRE_COUNT sre_ucount
319 #define SRE_CHARSET sre_ucharset
320 #define SRE_INFO sre_uinfo
321 #define SRE_MATCH sre_umatch
322 #define SRE_MATCH_CONTEXT sre_umatch_context
323 #define SRE_SEARCH sre_usearch
324 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
325 #endif
327 #endif /* SRE_RECURSIVE */
329 /* -------------------------------------------------------------------- */
330 /* String matching engine */
332 /* the following section is compiled twice, with different character
333 settings */
335 LOCAL(int)
336 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338 /* check if pointer is at given position */
340 Py_ssize_t thisp, thatp;
342 switch (at) {
344 case SRE_AT_BEGINNING:
345 case SRE_AT_BEGINNING_STRING:
346 return ((void*) ptr == state->beginning);
348 case SRE_AT_BEGINNING_LINE:
349 return ((void*) ptr == state->beginning ||
350 SRE_IS_LINEBREAK((int) ptr[-1]));
352 case SRE_AT_END:
353 return (((void*) (ptr+1) == state->end &&
354 SRE_IS_LINEBREAK((int) ptr[0])) ||
355 ((void*) ptr == state->end));
357 case SRE_AT_END_LINE:
358 return ((void*) ptr == state->end ||
359 SRE_IS_LINEBREAK((int) ptr[0]));
361 case SRE_AT_END_STRING:
362 return ((void*) ptr == state->end);
364 case SRE_AT_BOUNDARY:
365 if (state->beginning == state->end)
366 return 0;
367 thatp = ((void*) ptr > state->beginning) ?
368 SRE_IS_WORD((int) ptr[-1]) : 0;
369 thisp = ((void*) ptr < state->end) ?
370 SRE_IS_WORD((int) ptr[0]) : 0;
371 return thisp != thatp;
373 case SRE_AT_NON_BOUNDARY:
374 if (state->beginning == state->end)
375 return 0;
376 thatp = ((void*) ptr > state->beginning) ?
377 SRE_IS_WORD((int) ptr[-1]) : 0;
378 thisp = ((void*) ptr < state->end) ?
379 SRE_IS_WORD((int) ptr[0]) : 0;
380 return thisp == thatp;
382 case SRE_AT_LOC_BOUNDARY:
383 if (state->beginning == state->end)
384 return 0;
385 thatp = ((void*) ptr > state->beginning) ?
386 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
387 thisp = ((void*) ptr < state->end) ?
388 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
389 return thisp != thatp;
391 case SRE_AT_LOC_NON_BOUNDARY:
392 if (state->beginning == state->end)
393 return 0;
394 thatp = ((void*) ptr > state->beginning) ?
395 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
396 thisp = ((void*) ptr < state->end) ?
397 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
398 return thisp == thatp;
400 #if defined(HAVE_UNICODE)
401 case SRE_AT_UNI_BOUNDARY:
402 if (state->beginning == state->end)
403 return 0;
404 thatp = ((void*) ptr > state->beginning) ?
405 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
406 thisp = ((void*) ptr < state->end) ?
407 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
408 return thisp != thatp;
410 case SRE_AT_UNI_NON_BOUNDARY:
411 if (state->beginning == state->end)
412 return 0;
413 thatp = ((void*) ptr > state->beginning) ?
414 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
415 thisp = ((void*) ptr < state->end) ?
416 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
417 return thisp == thatp;
418 #endif
422 return 0;
425 LOCAL(int)
426 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
428 /* check if character is a member of the given set */
430 int ok = 1;
432 for (;;) {
433 switch (*set++) {
435 case SRE_OP_FAILURE:
436 return !ok;
438 case SRE_OP_LITERAL:
439 /* <LITERAL> <code> */
440 if (ch == set[0])
441 return ok;
442 set++;
443 break;
445 case SRE_OP_CATEGORY:
446 /* <CATEGORY> <code> */
447 if (sre_category(set[0], (int) ch))
448 return ok;
449 set += 1;
450 break;
452 case SRE_OP_CHARSET:
453 if (sizeof(SRE_CODE) == 2) {
454 /* <CHARSET> <bitmap> (16 bits per code word) */
455 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
456 return ok;
457 set += 16;
459 else {
460 /* <CHARSET> <bitmap> (32 bits per code word) */
461 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
462 return ok;
463 set += 8;
465 break;
467 case SRE_OP_RANGE:
468 /* <RANGE> <lower> <upper> */
469 if (set[0] <= ch && ch <= set[1])
470 return ok;
471 set += 2;
472 break;
474 case SRE_OP_NEGATE:
475 ok = !ok;
476 break;
478 case SRE_OP_BIGCHARSET:
479 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481 Py_ssize_t count, block;
482 count = *(set++);
484 if (sizeof(SRE_CODE) == 2) {
485 block = ((unsigned char*)set)[ch >> 8];
486 set += 128;
487 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
488 return ok;
489 set += count*16;
491 else {
492 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
493 * warnings when c's type supports only numbers < N+1 */
494 if (!(ch & ~65535))
495 block = ((unsigned char*)set)[ch >> 8];
496 else
497 block = -1;
498 set += 64;
499 if (block >=0 &&
500 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
501 return ok;
502 set += count*8;
504 break;
507 default:
508 /* internal error -- there's not much we can do about it
509 here, so let's just pretend it didn't match... */
510 return 0;
515 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
517 LOCAL(Py_ssize_t)
518 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
520 SRE_CODE chr;
521 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
522 SRE_CHAR* end = (SRE_CHAR *)state->end;
523 Py_ssize_t i;
525 /* adjust end */
526 if (maxcount < end - ptr && maxcount != 65535)
527 end = ptr + maxcount;
529 switch (pattern[0]) {
531 case SRE_OP_IN:
532 /* repeated set */
533 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
534 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
535 ptr++;
536 break;
538 case SRE_OP_ANY:
539 /* repeated dot wildcard. */
540 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
541 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
542 ptr++;
543 break;
545 case SRE_OP_ANY_ALL:
546 /* repeated dot wildcard. skip to the end of the target
547 string, and backtrack from there */
548 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
549 ptr = end;
550 break;
552 case SRE_OP_LITERAL:
553 /* repeated literal */
554 chr = pattern[1];
555 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
556 while (ptr < end && (SRE_CODE) *ptr == chr)
557 ptr++;
558 break;
560 case SRE_OP_LITERAL_IGNORE:
561 /* repeated literal */
562 chr = pattern[1];
563 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
564 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
565 ptr++;
566 break;
568 case SRE_OP_NOT_LITERAL:
569 /* repeated non-literal */
570 chr = pattern[1];
571 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
572 while (ptr < end && (SRE_CODE) *ptr != chr)
573 ptr++;
574 break;
576 case SRE_OP_NOT_LITERAL_IGNORE:
577 /* repeated non-literal */
578 chr = pattern[1];
579 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
580 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
581 ptr++;
582 break;
584 default:
585 /* repeated single character pattern */
586 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
587 while ((SRE_CHAR*) state->ptr < end) {
588 i = SRE_MATCH(state, pattern);
589 if (i < 0)
590 return i;
591 if (!i)
592 break;
594 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
595 (SRE_CHAR*) state->ptr - ptr));
596 return (SRE_CHAR*) state->ptr - ptr;
599 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
600 return ptr - (SRE_CHAR*) state->ptr;
603 #if 0 /* not used in this release */
604 LOCAL(int)
605 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
607 /* check if an SRE_OP_INFO block matches at the current position.
608 returns the number of SRE_CODE objects to skip if successful, 0
609 if no match */
611 SRE_CHAR* end = state->end;
612 SRE_CHAR* ptr = state->ptr;
613 Py_ssize_t i;
615 /* check minimal length */
616 if (pattern[3] && (end - ptr) < pattern[3])
617 return 0;
619 /* check known prefix */
620 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
621 /* <length> <skip> <prefix data> <overlap data> */
622 for (i = 0; i < pattern[5]; i++)
623 if ((SRE_CODE) ptr[i] != pattern[7 + i])
624 return 0;
625 return pattern[0] + 2 * pattern[6];
627 return pattern[0];
629 #endif
631 /* The macros below should be used to protect recursive SRE_MATCH()
632 * calls that *failed* and do *not* return immediately (IOW, those
633 * that will backtrack). Explaining:
635 * - Recursive SRE_MATCH() returned true: that's usually a success
636 * (besides atypical cases like ASSERT_NOT), therefore there's no
637 * reason to restore lastmark;
639 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
640 * is returning to the caller: If the current SRE_MATCH() is the
641 * top function of the recursion, returning false will be a matching
642 * failure, and it doesn't matter where lastmark is pointing to.
643 * If it's *not* the top function, it will be a recursive SRE_MATCH()
644 * failure by itself, and the calling SRE_MATCH() will have to deal
645 * with the failure by the same rules explained here (it will restore
646 * lastmark by itself if necessary);
648 * - Recursive SRE_MATCH() returned false, and will continue the
649 * outside 'for' loop: must be protected when breaking, since the next
650 * OP could potentially depend on lastmark;
652 * - Recursive SRE_MATCH() returned false, and will be called again
653 * inside a local for/while loop: must be protected between each
654 * loop iteration, since the recursive SRE_MATCH() could do anything,
655 * and could potentially depend on lastmark.
657 * For more information, check the discussion at SF patch #712900.
659 #define LASTMARK_SAVE() \
660 do { \
661 ctx->lastmark = state->lastmark; \
662 ctx->lastindex = state->lastindex; \
663 } while (0)
664 #define LASTMARK_RESTORE() \
665 do { \
666 state->lastmark = ctx->lastmark; \
667 state->lastindex = ctx->lastindex; \
668 } while (0)
670 #define RETURN_ERROR(i) do { return i; } while(0)
671 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
672 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
674 #define RETURN_ON_ERROR(i) \
675 do { if (i < 0) RETURN_ERROR(i); } while (0)
676 #define RETURN_ON_SUCCESS(i) \
677 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
678 #define RETURN_ON_FAILURE(i) \
679 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
681 #define SFY(x) #x
683 #define DATA_STACK_ALLOC(state, type, ptr) \
684 do { \
685 alloc_pos = state->data_stack_base; \
686 TRACE(("allocating %s in %d (%d)\n", \
687 SFY(type), alloc_pos, sizeof(type))); \
688 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
689 int j = data_stack_grow(state, sizeof(type)); \
690 if (j < 0) return j; \
691 if (ctx_pos != -1) \
692 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
694 ptr = (type*)(state->data_stack+alloc_pos); \
695 state->data_stack_base += sizeof(type); \
696 } while (0)
698 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
699 do { \
700 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
701 ptr = (type*)(state->data_stack+pos); \
702 } while (0)
704 #define DATA_STACK_PUSH(state, data, size) \
705 do { \
706 TRACE(("copy data in %p to %d (%d)\n", \
707 data, state->data_stack_base, size)); \
708 if (state->data_stack_size < state->data_stack_base+size) { \
709 int j = data_stack_grow(state, size); \
710 if (j < 0) return j; \
711 if (ctx_pos != -1) \
712 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
714 memcpy(state->data_stack+state->data_stack_base, data, size); \
715 state->data_stack_base += size; \
716 } while (0)
718 #define DATA_STACK_POP(state, data, size, discard) \
719 do { \
720 TRACE(("copy data to %p from %d (%d)\n", \
721 data, state->data_stack_base-size, size)); \
722 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
723 if (discard) \
724 state->data_stack_base -= size; \
725 } while (0)
727 #define DATA_STACK_POP_DISCARD(state, size) \
728 do { \
729 TRACE(("discard data from %d (%d)\n", \
730 state->data_stack_base-size, size)); \
731 state->data_stack_base -= size; \
732 } while(0)
734 #define DATA_PUSH(x) \
735 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
736 #define DATA_POP(x) \
737 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
738 #define DATA_POP_DISCARD(x) \
739 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
740 #define DATA_ALLOC(t,p) \
741 DATA_STACK_ALLOC(state, t, p)
742 #define DATA_LOOKUP_AT(t,p,pos) \
743 DATA_STACK_LOOKUP_AT(state,t,p,pos)
745 #define MARK_PUSH(lastmark) \
746 do if (lastmark > 0) { \
747 i = lastmark; /* ctx->lastmark may change if reallocated */ \
748 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
749 } while (0)
750 #define MARK_POP(lastmark) \
751 do if (lastmark > 0) { \
752 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
753 } while (0)
754 #define MARK_POP_KEEP(lastmark) \
755 do if (lastmark > 0) { \
756 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
757 } while (0)
758 #define MARK_POP_DISCARD(lastmark) \
759 do if (lastmark > 0) { \
760 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
761 } while (0)
763 #define JUMP_NONE 0
764 #define JUMP_MAX_UNTIL_1 1
765 #define JUMP_MAX_UNTIL_2 2
766 #define JUMP_MAX_UNTIL_3 3
767 #define JUMP_MIN_UNTIL_1 4
768 #define JUMP_MIN_UNTIL_2 5
769 #define JUMP_MIN_UNTIL_3 6
770 #define JUMP_REPEAT 7
771 #define JUMP_REPEAT_ONE_1 8
772 #define JUMP_REPEAT_ONE_2 9
773 #define JUMP_MIN_REPEAT_ONE 10
774 #define JUMP_BRANCH 11
775 #define JUMP_ASSERT 12
776 #define JUMP_ASSERT_NOT 13
778 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
779 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
780 nextctx->last_ctx_pos = ctx_pos; \
781 nextctx->jump = jumpvalue; \
782 nextctx->pattern = nextpattern; \
783 ctx_pos = alloc_pos; \
784 ctx = nextctx; \
785 goto entrance; \
786 jumplabel: \
787 while (0) /* gcc doesn't like labels at end of scopes */ \
789 typedef struct {
790 Py_ssize_t last_ctx_pos;
791 Py_ssize_t jump;
792 SRE_CHAR* ptr;
793 SRE_CODE* pattern;
794 Py_ssize_t count;
795 Py_ssize_t lastmark;
796 Py_ssize_t lastindex;
797 union {
798 SRE_CODE chr;
799 SRE_REPEAT* rep;
800 } u;
801 } SRE_MATCH_CONTEXT;
803 /* check if string matches the given pattern. returns <0 for
804 error, 0 for failure, and 1 for success */
805 LOCAL(Py_ssize_t)
806 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
808 SRE_CHAR* end = (SRE_CHAR *)state->end;
809 Py_ssize_t alloc_pos, ctx_pos = -1;
810 Py_ssize_t i, ret = 0;
811 Py_ssize_t jump;
813 SRE_MATCH_CONTEXT* ctx;
814 SRE_MATCH_CONTEXT* nextctx;
816 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
818 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
819 ctx->last_ctx_pos = -1;
820 ctx->jump = JUMP_NONE;
821 ctx->pattern = pattern;
822 ctx_pos = alloc_pos;
824 entrance:
826 ctx->ptr = (SRE_CHAR *)state->ptr;
828 if (ctx->pattern[0] == SRE_OP_INFO) {
829 /* optimization info block */
830 /* <INFO> <1=skip> <2=flags> <3=min> ... */
831 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
832 TRACE(("reject (got %d chars, need %d)\n",
833 (end - ctx->ptr), ctx->pattern[3]));
834 RETURN_FAILURE;
836 ctx->pattern += ctx->pattern[1] + 1;
839 for (;;) {
841 switch (*ctx->pattern++) {
843 case SRE_OP_MARK:
844 /* set mark */
845 /* <MARK> <gid> */
846 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
847 ctx->ptr, ctx->pattern[0]));
848 i = ctx->pattern[0];
849 if (i & 1)
850 state->lastindex = i/2 + 1;
851 if (i > state->lastmark) {
852 /* state->lastmark is the highest valid index in the
853 state->mark array. If it is increased by more than 1,
854 the intervening marks must be set to NULL to signal
855 that these marks have not been encountered. */
856 Py_ssize_t j = state->lastmark + 1;
857 while (j < i)
858 state->mark[j++] = NULL;
859 state->lastmark = i;
861 state->mark[i] = ctx->ptr;
862 ctx->pattern++;
863 break;
865 case SRE_OP_LITERAL:
866 /* match literal string */
867 /* <LITERAL> <code> */
868 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
869 ctx->ptr, *ctx->pattern));
870 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
871 RETURN_FAILURE;
872 ctx->pattern++;
873 ctx->ptr++;
874 break;
876 case SRE_OP_NOT_LITERAL:
877 /* match anything that is not literal character */
878 /* <NOT_LITERAL> <code> */
879 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
880 ctx->ptr, *ctx->pattern));
881 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
882 RETURN_FAILURE;
883 ctx->pattern++;
884 ctx->ptr++;
885 break;
887 case SRE_OP_SUCCESS:
888 /* end of pattern */
889 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
890 state->ptr = ctx->ptr;
891 RETURN_SUCCESS;
893 case SRE_OP_AT:
894 /* match at given position */
895 /* <AT> <code> */
896 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
897 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
898 RETURN_FAILURE;
899 ctx->pattern++;
900 break;
902 case SRE_OP_CATEGORY:
903 /* match at given category */
904 /* <CATEGORY> <code> */
905 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
906 ctx->ptr, *ctx->pattern));
907 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
908 RETURN_FAILURE;
909 ctx->pattern++;
910 ctx->ptr++;
911 break;
913 case SRE_OP_ANY:
914 /* match anything (except a newline) */
915 /* <ANY> */
916 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
917 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
918 RETURN_FAILURE;
919 ctx->ptr++;
920 break;
922 case SRE_OP_ANY_ALL:
923 /* match anything */
924 /* <ANY_ALL> */
925 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
926 if (ctx->ptr >= end)
927 RETURN_FAILURE;
928 ctx->ptr++;
929 break;
931 case SRE_OP_IN:
932 /* match set member (or non_member) */
933 /* <IN> <skip> <set> */
934 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
935 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
936 RETURN_FAILURE;
937 ctx->pattern += ctx->pattern[0];
938 ctx->ptr++;
939 break;
941 case SRE_OP_LITERAL_IGNORE:
942 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
943 ctx->pattern, ctx->ptr, ctx->pattern[0]));
944 if (ctx->ptr >= end ||
945 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
946 RETURN_FAILURE;
947 ctx->pattern++;
948 ctx->ptr++;
949 break;
951 case SRE_OP_NOT_LITERAL_IGNORE:
952 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
953 ctx->pattern, ctx->ptr, *ctx->pattern));
954 if (ctx->ptr >= end ||
955 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
956 RETURN_FAILURE;
957 ctx->pattern++;
958 ctx->ptr++;
959 break;
961 case SRE_OP_IN_IGNORE:
962 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
963 if (ctx->ptr >= end
964 || !SRE_CHARSET(ctx->pattern+1,
965 (SRE_CODE)state->lower(*ctx->ptr)))
966 RETURN_FAILURE;
967 ctx->pattern += ctx->pattern[0];
968 ctx->ptr++;
969 break;
971 case SRE_OP_JUMP:
972 case SRE_OP_INFO:
973 /* jump forward */
974 /* <JUMP> <offset> */
975 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
976 ctx->ptr, ctx->pattern[0]));
977 ctx->pattern += ctx->pattern[0];
978 break;
980 case SRE_OP_BRANCH:
981 /* alternation */
982 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
983 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
984 LASTMARK_SAVE();
985 ctx->u.rep = state->repeat;
986 if (ctx->u.rep)
987 MARK_PUSH(ctx->lastmark);
988 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
989 if (ctx->pattern[1] == SRE_OP_LITERAL &&
990 (ctx->ptr >= end ||
991 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
992 continue;
993 if (ctx->pattern[1] == SRE_OP_IN &&
994 (ctx->ptr >= end ||
995 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
996 continue;
997 state->ptr = ctx->ptr;
998 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
999 if (ret) {
1000 if (ctx->u.rep)
1001 MARK_POP_DISCARD(ctx->lastmark);
1002 RETURN_ON_ERROR(ret);
1003 RETURN_SUCCESS;
1005 if (ctx->u.rep)
1006 MARK_POP_KEEP(ctx->lastmark);
1007 LASTMARK_RESTORE();
1009 if (ctx->u.rep)
1010 MARK_POP_DISCARD(ctx->lastmark);
1011 RETURN_FAILURE;
1013 case SRE_OP_REPEAT_ONE:
1014 /* match repeated sequence (maximizing regexp) */
1016 /* this operator only works if the repeated item is
1017 exactly one character wide, and we're not already
1018 collecting backtracking points. for other cases,
1019 use the MAX_REPEAT operator */
1021 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1023 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1024 ctx->pattern[1], ctx->pattern[2]));
1026 if (ctx->ptr + ctx->pattern[1] > end)
1027 RETURN_FAILURE; /* cannot match */
1029 state->ptr = ctx->ptr;
1031 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1032 RETURN_ON_ERROR(ret);
1033 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1034 ctx->count = ret;
1035 ctx->ptr += ctx->count;
1037 /* when we arrive here, count contains the number of
1038 matches, and ctx->ptr points to the tail of the target
1039 string. check if the rest of the pattern matches,
1040 and backtrack if not. */
1042 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1043 RETURN_FAILURE;
1045 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1046 /* tail is empty. we're finished */
1047 state->ptr = ctx->ptr;
1048 RETURN_SUCCESS;
1051 LASTMARK_SAVE();
1053 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1054 /* tail starts with a literal. skip positions where
1055 the rest of the pattern cannot possibly match */
1056 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1057 for (;;) {
1058 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1059 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1060 ctx->ptr--;
1061 ctx->count--;
1063 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1064 break;
1065 state->ptr = ctx->ptr;
1066 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1067 ctx->pattern+ctx->pattern[0]);
1068 if (ret) {
1069 RETURN_ON_ERROR(ret);
1070 RETURN_SUCCESS;
1073 LASTMARK_RESTORE();
1075 ctx->ptr--;
1076 ctx->count--;
1079 } else {
1080 /* general case */
1081 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1082 state->ptr = ctx->ptr;
1083 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1084 ctx->pattern+ctx->pattern[0]);
1085 if (ret) {
1086 RETURN_ON_ERROR(ret);
1087 RETURN_SUCCESS;
1089 ctx->ptr--;
1090 ctx->count--;
1091 LASTMARK_RESTORE();
1094 RETURN_FAILURE;
1096 case SRE_OP_MIN_REPEAT_ONE:
1097 /* match repeated sequence (minimizing regexp) */
1099 /* this operator only works if the repeated item is
1100 exactly one character wide, and we're not already
1101 collecting backtracking points. for other cases,
1102 use the MIN_REPEAT operator */
1104 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1106 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1107 ctx->pattern[1], ctx->pattern[2]));
1109 if (ctx->ptr + ctx->pattern[1] > end)
1110 RETURN_FAILURE; /* cannot match */
1112 state->ptr = ctx->ptr;
1114 if (ctx->pattern[1] == 0)
1115 ctx->count = 0;
1116 else {
1117 /* count using pattern min as the maximum */
1118 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1119 RETURN_ON_ERROR(ret);
1120 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1121 if (ret < (Py_ssize_t) ctx->pattern[1])
1122 /* didn't match minimum number of times */
1123 RETURN_FAILURE;
1124 /* advance past minimum matches of repeat */
1125 ctx->count = ret;
1126 ctx->ptr += ctx->count;
1129 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1130 /* tail is empty. we're finished */
1131 state->ptr = ctx->ptr;
1132 RETURN_SUCCESS;
1134 } else {
1135 /* general case */
1136 LASTMARK_SAVE();
1137 while ((Py_ssize_t)ctx->pattern[2] == 65535
1138 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1139 state->ptr = ctx->ptr;
1140 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1141 ctx->pattern+ctx->pattern[0]);
1142 if (ret) {
1143 RETURN_ON_ERROR(ret);
1144 RETURN_SUCCESS;
1146 state->ptr = ctx->ptr;
1147 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1148 RETURN_ON_ERROR(ret);
1149 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1150 if (ret == 0)
1151 break;
1152 assert(ret == 1);
1153 ctx->ptr++;
1154 ctx->count++;
1155 LASTMARK_RESTORE();
1158 RETURN_FAILURE;
1160 case SRE_OP_REPEAT:
1161 /* create repeat context. all the hard work is done
1162 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1163 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1164 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1165 ctx->pattern[1], ctx->pattern[2]));
1167 /* install new repeat context */
1168 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1169 if (!ctx->u.rep) {
1170 PyErr_NoMemory();
1171 RETURN_FAILURE;
1173 ctx->u.rep->count = -1;
1174 ctx->u.rep->pattern = ctx->pattern;
1175 ctx->u.rep->prev = state->repeat;
1176 ctx->u.rep->last_ptr = NULL;
1177 state->repeat = ctx->u.rep;
1179 state->ptr = ctx->ptr;
1180 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1181 state->repeat = ctx->u.rep->prev;
1182 PyObject_FREE(ctx->u.rep);
1184 if (ret) {
1185 RETURN_ON_ERROR(ret);
1186 RETURN_SUCCESS;
1188 RETURN_FAILURE;
1190 case SRE_OP_MAX_UNTIL:
1191 /* maximizing repeat */
1192 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1194 /* FIXME: we probably need to deal with zero-width
1195 matches in here... */
1197 ctx->u.rep = state->repeat;
1198 if (!ctx->u.rep)
1199 RETURN_ERROR(SRE_ERROR_STATE);
1201 state->ptr = ctx->ptr;
1203 ctx->count = ctx->u.rep->count+1;
1205 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1206 ctx->ptr, ctx->count));
1208 if (ctx->count < ctx->u.rep->pattern[1]) {
1209 /* not enough matches */
1210 ctx->u.rep->count = ctx->count;
1211 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1212 ctx->u.rep->pattern+3);
1213 if (ret) {
1214 RETURN_ON_ERROR(ret);
1215 RETURN_SUCCESS;
1217 ctx->u.rep->count = ctx->count-1;
1218 state->ptr = ctx->ptr;
1219 RETURN_FAILURE;
1222 if ((ctx->count < ctx->u.rep->pattern[2] ||
1223 ctx->u.rep->pattern[2] == 65535) &&
1224 state->ptr != ctx->u.rep->last_ptr) {
1225 /* we may have enough matches, but if we can
1226 match another item, do so */
1227 ctx->u.rep->count = ctx->count;
1228 LASTMARK_SAVE();
1229 MARK_PUSH(ctx->lastmark);
1230 /* zero-width match protection */
1231 DATA_PUSH(&ctx->u.rep->last_ptr);
1232 ctx->u.rep->last_ptr = state->ptr;
1233 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1234 ctx->u.rep->pattern+3);
1235 DATA_POP(&ctx->u.rep->last_ptr);
1236 if (ret) {
1237 MARK_POP_DISCARD(ctx->lastmark);
1238 RETURN_ON_ERROR(ret);
1239 RETURN_SUCCESS;
1241 MARK_POP(ctx->lastmark);
1242 LASTMARK_RESTORE();
1243 ctx->u.rep->count = ctx->count-1;
1244 state->ptr = ctx->ptr;
1247 /* cannot match more repeated items here. make sure the
1248 tail matches */
1249 state->repeat = ctx->u.rep->prev;
1250 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1251 RETURN_ON_SUCCESS(ret);
1252 state->repeat = ctx->u.rep;
1253 state->ptr = ctx->ptr;
1254 RETURN_FAILURE;
1256 case SRE_OP_MIN_UNTIL:
1257 /* minimizing repeat */
1258 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1260 ctx->u.rep = state->repeat;
1261 if (!ctx->u.rep)
1262 RETURN_ERROR(SRE_ERROR_STATE);
1264 state->ptr = ctx->ptr;
1266 ctx->count = ctx->u.rep->count+1;
1268 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1269 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1271 if (ctx->count < ctx->u.rep->pattern[1]) {
1272 /* not enough matches */
1273 ctx->u.rep->count = ctx->count;
1274 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1275 ctx->u.rep->pattern+3);
1276 if (ret) {
1277 RETURN_ON_ERROR(ret);
1278 RETURN_SUCCESS;
1280 ctx->u.rep->count = ctx->count-1;
1281 state->ptr = ctx->ptr;
1282 RETURN_FAILURE;
1285 LASTMARK_SAVE();
1287 /* see if the tail matches */
1288 state->repeat = ctx->u.rep->prev;
1289 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1290 if (ret) {
1291 RETURN_ON_ERROR(ret);
1292 RETURN_SUCCESS;
1295 state->repeat = ctx->u.rep;
1296 state->ptr = ctx->ptr;
1298 LASTMARK_RESTORE();
1300 if (ctx->count >= ctx->u.rep->pattern[2]
1301 && ctx->u.rep->pattern[2] != 65535)
1302 RETURN_FAILURE;
1304 ctx->u.rep->count = ctx->count;
1305 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1306 ctx->u.rep->pattern+3);
1307 if (ret) {
1308 RETURN_ON_ERROR(ret);
1309 RETURN_SUCCESS;
1311 ctx->u.rep->count = ctx->count-1;
1312 state->ptr = ctx->ptr;
1313 RETURN_FAILURE;
1315 case SRE_OP_GROUPREF:
1316 /* match backreference */
1317 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1318 ctx->ptr, ctx->pattern[0]));
1319 i = ctx->pattern[0];
1321 Py_ssize_t groupref = i+i;
1322 if (groupref >= state->lastmark) {
1323 RETURN_FAILURE;
1324 } else {
1325 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1326 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1327 if (!p || !e || e < p)
1328 RETURN_FAILURE;
1329 while (p < e) {
1330 if (ctx->ptr >= end || *ctx->ptr != *p)
1331 RETURN_FAILURE;
1332 p++; ctx->ptr++;
1336 ctx->pattern++;
1337 break;
1339 case SRE_OP_GROUPREF_IGNORE:
1340 /* match backreference */
1341 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1342 ctx->ptr, ctx->pattern[0]));
1343 i = ctx->pattern[0];
1345 Py_ssize_t groupref = i+i;
1346 if (groupref >= state->lastmark) {
1347 RETURN_FAILURE;
1348 } else {
1349 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1350 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1351 if (!p || !e || e < p)
1352 RETURN_FAILURE;
1353 while (p < e) {
1354 if (ctx->ptr >= end ||
1355 state->lower(*ctx->ptr) != state->lower(*p))
1356 RETURN_FAILURE;
1357 p++; ctx->ptr++;
1361 ctx->pattern++;
1362 break;
1364 case SRE_OP_GROUPREF_EXISTS:
1365 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1366 ctx->ptr, ctx->pattern[0]));
1367 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1368 i = ctx->pattern[0];
1370 Py_ssize_t groupref = i+i;
1371 if (groupref >= state->lastmark) {
1372 ctx->pattern += ctx->pattern[1];
1373 break;
1374 } else {
1375 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1376 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1377 if (!p || !e || e < p) {
1378 ctx->pattern += ctx->pattern[1];
1379 break;
1383 ctx->pattern += 2;
1384 break;
1386 case SRE_OP_ASSERT:
1387 /* assert subpattern */
1388 /* <ASSERT> <skip> <back> <pattern> */
1389 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1390 ctx->ptr, ctx->pattern[1]));
1391 state->ptr = ctx->ptr - ctx->pattern[1];
1392 if (state->ptr < state->beginning)
1393 RETURN_FAILURE;
1394 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1395 RETURN_ON_FAILURE(ret);
1396 ctx->pattern += ctx->pattern[0];
1397 break;
1399 case SRE_OP_ASSERT_NOT:
1400 /* assert not subpattern */
1401 /* <ASSERT_NOT> <skip> <back> <pattern> */
1402 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1403 ctx->ptr, ctx->pattern[1]));
1404 state->ptr = ctx->ptr - ctx->pattern[1];
1405 if (state->ptr >= state->beginning) {
1406 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1407 if (ret) {
1408 RETURN_ON_ERROR(ret);
1409 RETURN_FAILURE;
1412 ctx->pattern += ctx->pattern[0];
1413 break;
1415 case SRE_OP_FAILURE:
1416 /* immediate failure */
1417 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1418 RETURN_FAILURE;
1420 default:
1421 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1422 ctx->pattern[-1]));
1423 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1427 exit:
1428 ctx_pos = ctx->last_ctx_pos;
1429 jump = ctx->jump;
1430 DATA_POP_DISCARD(ctx);
1431 if (ctx_pos == -1)
1432 return ret;
1433 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1435 switch (jump) {
1436 case JUMP_MAX_UNTIL_2:
1437 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1438 goto jump_max_until_2;
1439 case JUMP_MAX_UNTIL_3:
1440 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1441 goto jump_max_until_3;
1442 case JUMP_MIN_UNTIL_2:
1443 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1444 goto jump_min_until_2;
1445 case JUMP_MIN_UNTIL_3:
1446 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1447 goto jump_min_until_3;
1448 case JUMP_BRANCH:
1449 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1450 goto jump_branch;
1451 case JUMP_MAX_UNTIL_1:
1452 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1453 goto jump_max_until_1;
1454 case JUMP_MIN_UNTIL_1:
1455 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1456 goto jump_min_until_1;
1457 case JUMP_REPEAT:
1458 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1459 goto jump_repeat;
1460 case JUMP_REPEAT_ONE_1:
1461 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1462 goto jump_repeat_one_1;
1463 case JUMP_REPEAT_ONE_2:
1464 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1465 goto jump_repeat_one_2;
1466 case JUMP_MIN_REPEAT_ONE:
1467 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1468 goto jump_min_repeat_one;
1469 case JUMP_ASSERT:
1470 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1471 goto jump_assert;
1472 case JUMP_ASSERT_NOT:
1473 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1474 goto jump_assert_not;
1475 case JUMP_NONE:
1476 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1477 break;
1480 return ret; /* should never get here */
1483 LOCAL(Py_ssize_t)
1484 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1486 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1487 SRE_CHAR* end = (SRE_CHAR *)state->end;
1488 Py_ssize_t status = 0;
1489 Py_ssize_t prefix_len = 0;
1490 Py_ssize_t prefix_skip = 0;
1491 SRE_CODE* prefix = NULL;
1492 SRE_CODE* charset = NULL;
1493 SRE_CODE* overlap = NULL;
1494 int flags = 0;
1496 if (pattern[0] == SRE_OP_INFO) {
1497 /* optimization info block */
1498 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1500 flags = pattern[2];
1502 if (pattern[3] > 1) {
1503 /* adjust end point (but make sure we leave at least one
1504 character in there, so literal search will work) */
1505 end -= pattern[3]-1;
1506 if (end <= ptr)
1507 end = ptr+1;
1510 if (flags & SRE_INFO_PREFIX) {
1511 /* pattern starts with a known prefix */
1512 /* <length> <skip> <prefix data> <overlap data> */
1513 prefix_len = pattern[5];
1514 prefix_skip = pattern[6];
1515 prefix = pattern + 7;
1516 overlap = prefix + prefix_len - 1;
1517 } else if (flags & SRE_INFO_CHARSET)
1518 /* pattern starts with a character from a known set */
1519 /* <charset> */
1520 charset = pattern + 5;
1522 pattern += 1 + pattern[1];
1525 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1526 TRACE(("charset = %p\n", charset));
1528 #if defined(USE_FAST_SEARCH)
1529 if (prefix_len > 1) {
1530 /* pattern starts with a known prefix. use the overlap
1531 table to skip forward as fast as we possibly can */
1532 Py_ssize_t i = 0;
1533 end = (SRE_CHAR *)state->end;
1534 while (ptr < end) {
1535 for (;;) {
1536 if ((SRE_CODE) ptr[0] != prefix[i]) {
1537 if (!i)
1538 break;
1539 else
1540 i = overlap[i];
1541 } else {
1542 if (++i == prefix_len) {
1543 /* found a potential match */
1544 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1545 state->start = ptr + 1 - prefix_len;
1546 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1547 if (flags & SRE_INFO_LITERAL)
1548 return 1; /* we got all of it */
1549 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1550 if (status != 0)
1551 return status;
1552 /* close but no cigar -- try again */
1553 i = overlap[i];
1555 break;
1558 ptr++;
1560 return 0;
1562 #endif
1564 if (pattern[0] == SRE_OP_LITERAL) {
1565 /* pattern starts with a literal character. this is used
1566 for short prefixes, and if fast search is disabled */
1567 SRE_CODE chr = pattern[1];
1568 end = (SRE_CHAR *)state->end;
1569 for (;;) {
1570 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1571 ptr++;
1572 if (ptr >= end)
1573 return 0;
1574 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1575 state->start = ptr;
1576 state->ptr = ++ptr;
1577 if (flags & SRE_INFO_LITERAL)
1578 return 1; /* we got all of it */
1579 status = SRE_MATCH(state, pattern + 2);
1580 if (status != 0)
1581 break;
1583 } else if (charset) {
1584 /* pattern starts with a character from a known set */
1585 end = (SRE_CHAR *)state->end;
1586 for (;;) {
1587 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1588 ptr++;
1589 if (ptr >= end)
1590 return 0;
1591 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1592 state->start = ptr;
1593 state->ptr = ptr;
1594 status = SRE_MATCH(state, pattern);
1595 if (status != 0)
1596 break;
1597 ptr++;
1599 } else
1600 /* general case */
1601 while (ptr <= end) {
1602 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1603 state->start = state->ptr = ptr++;
1604 status = SRE_MATCH(state, pattern);
1605 if (status != 0)
1606 break;
1609 return status;
1612 LOCAL(int)
1613 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1615 /* check if given string is a literal template (i.e. no escapes) */
1616 while (len-- > 0)
1617 if (*ptr++ == '\\')
1618 return 0;
1619 return 1;
1622 #if !defined(SRE_RECURSIVE)
1624 /* -------------------------------------------------------------------- */
1625 /* factories and destructors */
1627 /* see sre.h for object declarations */
1628 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1629 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1631 static PyObject *
1632 sre_codesize(PyObject* self, PyObject *unused)
1634 return Py_BuildValue("l", sizeof(SRE_CODE));
1637 static PyObject *
1638 sre_getlower(PyObject* self, PyObject* args)
1640 int character, flags;
1641 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1642 return NULL;
1643 if (flags & SRE_FLAG_LOCALE)
1644 return Py_BuildValue("i", sre_lower_locale(character));
1645 if (flags & SRE_FLAG_UNICODE)
1646 #if defined(HAVE_UNICODE)
1647 return Py_BuildValue("i", sre_lower_unicode(character));
1648 #else
1649 return Py_BuildValue("i", sre_lower_locale(character));
1650 #endif
1651 return Py_BuildValue("i", sre_lower(character));
1654 LOCAL(void)
1655 state_reset(SRE_STATE* state)
1657 /* FIXME: dynamic! */
1658 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1660 state->lastmark = -1;
1661 state->lastindex = -1;
1663 state->repeat = NULL;
1665 data_stack_dealloc(state);
1668 static void*
1669 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1671 /* given a python object, return a data pointer, a length (in
1672 characters), and a character size. return NULL if the object
1673 is not a string (or not compatible) */
1675 PyBufferProcs *buffer;
1676 Py_ssize_t size, bytes;
1677 int charsize;
1678 void* ptr;
1680 #if defined(HAVE_UNICODE)
1681 if (PyUnicode_Check(string)) {
1682 /* unicode strings doesn't always support the buffer interface */
1683 ptr = (void*) PyUnicode_AS_DATA(string);
1684 bytes = PyUnicode_GET_DATA_SIZE(string);
1685 size = PyUnicode_GET_SIZE(string);
1686 charsize = sizeof(Py_UNICODE);
1688 } else {
1689 #endif
1691 /* get pointer to string buffer */
1692 buffer = string->ob_type->tp_as_buffer;
1693 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1694 buffer->bf_getsegcount(string, NULL) != 1) {
1695 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1696 return NULL;
1699 /* determine buffer size */
1700 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1701 if (bytes < 0) {
1702 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1703 return NULL;
1706 /* determine character size */
1707 #if PY_VERSION_HEX >= 0x01060000
1708 size = PyObject_Size(string);
1709 #else
1710 size = PyObject_Length(string);
1711 #endif
1713 if (PyString_Check(string) || bytes == size)
1714 charsize = 1;
1715 #if defined(HAVE_UNICODE)
1716 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1717 charsize = sizeof(Py_UNICODE);
1718 #endif
1719 else {
1720 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1721 return NULL;
1724 #if defined(HAVE_UNICODE)
1726 #endif
1728 *p_length = size;
1729 *p_charsize = charsize;
1731 return ptr;
1734 LOCAL(PyObject*)
1735 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1736 Py_ssize_t start, Py_ssize_t end)
1738 /* prepare state object */
1740 Py_ssize_t length;
1741 int charsize;
1742 void* ptr;
1744 memset(state, 0, sizeof(SRE_STATE));
1746 state->lastmark = -1;
1747 state->lastindex = -1;
1749 ptr = getstring(string, &length, &charsize);
1750 if (!ptr)
1751 return NULL;
1753 /* adjust boundaries */
1754 if (start < 0)
1755 start = 0;
1756 else if (start > length)
1757 start = length;
1759 if (end < 0)
1760 end = 0;
1761 else if (end > length)
1762 end = length;
1764 state->charsize = charsize;
1766 state->beginning = ptr;
1768 state->start = (void*) ((char*) ptr + start * state->charsize);
1769 state->end = (void*) ((char*) ptr + end * state->charsize);
1771 Py_INCREF(string);
1772 state->string = string;
1773 state->pos = start;
1774 state->endpos = end;
1776 if (pattern->flags & SRE_FLAG_LOCALE)
1777 state->lower = sre_lower_locale;
1778 else if (pattern->flags & SRE_FLAG_UNICODE)
1779 #if defined(HAVE_UNICODE)
1780 state->lower = sre_lower_unicode;
1781 #else
1782 state->lower = sre_lower_locale;
1783 #endif
1784 else
1785 state->lower = sre_lower;
1787 return string;
1790 LOCAL(void)
1791 state_fini(SRE_STATE* state)
1793 Py_XDECREF(state->string);
1794 data_stack_dealloc(state);
1797 /* calculate offset from start of string */
1798 #define STATE_OFFSET(state, member)\
1799 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1801 LOCAL(PyObject*)
1802 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1804 Py_ssize_t i, j;
1806 index = (index - 1) * 2;
1808 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1809 if (empty)
1810 /* want empty string */
1811 i = j = 0;
1812 else {
1813 Py_INCREF(Py_None);
1814 return Py_None;
1816 } else {
1817 i = STATE_OFFSET(state, state->mark[index]);
1818 j = STATE_OFFSET(state, state->mark[index+1]);
1821 return PySequence_GetSlice(string, i, j);
1824 static void
1825 pattern_error(int status)
1827 switch (status) {
1828 case SRE_ERROR_RECURSION_LIMIT:
1829 PyErr_SetString(
1830 PyExc_RuntimeError,
1831 "maximum recursion limit exceeded"
1833 break;
1834 case SRE_ERROR_MEMORY:
1835 PyErr_NoMemory();
1836 break;
1837 default:
1838 /* other error codes indicate compiler/engine bugs */
1839 PyErr_SetString(
1840 PyExc_RuntimeError,
1841 "internal error in regular expression engine"
1846 static void
1847 pattern_dealloc(PatternObject* self)
1849 if (self->weakreflist != NULL)
1850 PyObject_ClearWeakRefs((PyObject *) self);
1851 Py_XDECREF(self->pattern);
1852 Py_XDECREF(self->groupindex);
1853 Py_XDECREF(self->indexgroup);
1854 PyObject_DEL(self);
1857 static PyObject*
1858 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1860 SRE_STATE state;
1861 int status;
1863 PyObject* string;
1864 Py_ssize_t start = 0;
1865 Py_ssize_t end = PY_SSIZE_T_MAX;
1866 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1867 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1868 &string, &start, &end))
1869 return NULL;
1871 string = state_init(&state, self, string, start, end);
1872 if (!string)
1873 return NULL;
1875 state.ptr = state.start;
1877 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1879 if (state.charsize == 1) {
1880 status = sre_match(&state, PatternObject_GetCode(self));
1881 } else {
1882 #if defined(HAVE_UNICODE)
1883 status = sre_umatch(&state, PatternObject_GetCode(self));
1884 #endif
1887 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1888 if (PyErr_Occurred())
1889 return NULL;
1891 state_fini(&state);
1893 return pattern_new_match(self, &state, status);
1896 static PyObject*
1897 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1899 SRE_STATE state;
1900 int status;
1902 PyObject* string;
1903 Py_ssize_t start = 0;
1904 Py_ssize_t end = PY_SSIZE_T_MAX;
1905 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1906 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1907 &string, &start, &end))
1908 return NULL;
1910 string = state_init(&state, self, string, start, end);
1911 if (!string)
1912 return NULL;
1914 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1916 if (state.charsize == 1) {
1917 status = sre_search(&state, PatternObject_GetCode(self));
1918 } else {
1919 #if defined(HAVE_UNICODE)
1920 status = sre_usearch(&state, PatternObject_GetCode(self));
1921 #endif
1924 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1926 state_fini(&state);
1928 if (PyErr_Occurred())
1929 return NULL;
1931 return pattern_new_match(self, &state, status);
1934 static PyObject*
1935 call(char* module, char* function, PyObject* args)
1937 PyObject* name;
1938 PyObject* mod;
1939 PyObject* func;
1940 PyObject* result;
1942 if (!args)
1943 return NULL;
1944 name = PyString_FromString(module);
1945 if (!name)
1946 return NULL;
1947 mod = PyImport_Import(name);
1948 Py_DECREF(name);
1949 if (!mod)
1950 return NULL;
1951 func = PyObject_GetAttrString(mod, function);
1952 Py_DECREF(mod);
1953 if (!func)
1954 return NULL;
1955 result = PyObject_CallObject(func, args);
1956 Py_DECREF(func);
1957 Py_DECREF(args);
1958 return result;
1961 #ifdef USE_BUILTIN_COPY
1962 static int
1963 deepcopy(PyObject** object, PyObject* memo)
1965 PyObject* copy;
1967 copy = call(
1968 "copy", "deepcopy",
1969 PyTuple_Pack(2, *object, memo)
1971 if (!copy)
1972 return 0;
1974 Py_DECREF(*object);
1975 *object = copy;
1977 return 1; /* success */
1979 #endif
1981 static PyObject*
1982 join_list(PyObject* list, PyObject* string)
1984 /* join list elements */
1986 PyObject* joiner;
1987 #if PY_VERSION_HEX >= 0x01060000
1988 PyObject* function;
1989 PyObject* args;
1990 #endif
1991 PyObject* result;
1993 joiner = PySequence_GetSlice(string, 0, 0);
1994 if (!joiner)
1995 return NULL;
1997 if (PyList_GET_SIZE(list) == 0) {
1998 Py_DECREF(list);
1999 return joiner;
2002 #if PY_VERSION_HEX >= 0x01060000
2003 function = PyObject_GetAttrString(joiner, "join");
2004 if (!function) {
2005 Py_DECREF(joiner);
2006 return NULL;
2008 args = PyTuple_New(1);
2009 if (!args) {
2010 Py_DECREF(function);
2011 Py_DECREF(joiner);
2012 return NULL;
2014 PyTuple_SET_ITEM(args, 0, list);
2015 result = PyObject_CallObject(function, args);
2016 Py_DECREF(args); /* also removes list */
2017 Py_DECREF(function);
2018 #else
2019 result = call(
2020 "string", "join",
2021 PyTuple_Pack(2, list, joiner)
2023 #endif
2024 Py_DECREF(joiner);
2026 return result;
2029 static PyObject*
2030 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2032 SRE_STATE state;
2033 PyObject* list;
2034 int status;
2035 Py_ssize_t i, b, e;
2037 PyObject* string;
2038 Py_ssize_t start = 0;
2039 Py_ssize_t end = PY_SSIZE_T_MAX;
2040 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2041 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2042 &string, &start, &end))
2043 return NULL;
2045 string = state_init(&state, self, string, start, end);
2046 if (!string)
2047 return NULL;
2049 list = PyList_New(0);
2050 if (!list) {
2051 state_fini(&state);
2052 return NULL;
2055 while (state.start <= state.end) {
2057 PyObject* item;
2059 state_reset(&state);
2061 state.ptr = state.start;
2063 if (state.charsize == 1) {
2064 status = sre_search(&state, PatternObject_GetCode(self));
2065 } else {
2066 #if defined(HAVE_UNICODE)
2067 status = sre_usearch(&state, PatternObject_GetCode(self));
2068 #endif
2071 if (PyErr_Occurred())
2072 goto error;
2074 if (status <= 0) {
2075 if (status == 0)
2076 break;
2077 pattern_error(status);
2078 goto error;
2081 /* don't bother to build a match object */
2082 switch (self->groups) {
2083 case 0:
2084 b = STATE_OFFSET(&state, state.start);
2085 e = STATE_OFFSET(&state, state.ptr);
2086 item = PySequence_GetSlice(string, b, e);
2087 if (!item)
2088 goto error;
2089 break;
2090 case 1:
2091 item = state_getslice(&state, 1, string, 1);
2092 if (!item)
2093 goto error;
2094 break;
2095 default:
2096 item = PyTuple_New(self->groups);
2097 if (!item)
2098 goto error;
2099 for (i = 0; i < self->groups; i++) {
2100 PyObject* o = state_getslice(&state, i+1, string, 1);
2101 if (!o) {
2102 Py_DECREF(item);
2103 goto error;
2105 PyTuple_SET_ITEM(item, i, o);
2107 break;
2110 status = PyList_Append(list, item);
2111 Py_DECREF(item);
2112 if (status < 0)
2113 goto error;
2115 if (state.ptr == state.start)
2116 state.start = (void*) ((char*) state.ptr + state.charsize);
2117 else
2118 state.start = state.ptr;
2122 state_fini(&state);
2123 return list;
2125 error:
2126 Py_DECREF(list);
2127 state_fini(&state);
2128 return NULL;
2132 #if PY_VERSION_HEX >= 0x02020000
2133 static PyObject*
2134 pattern_finditer(PatternObject* pattern, PyObject* args)
2136 PyObject* scanner;
2137 PyObject* search;
2138 PyObject* iterator;
2140 scanner = pattern_scanner(pattern, args);
2141 if (!scanner)
2142 return NULL;
2144 search = PyObject_GetAttrString(scanner, "search");
2145 Py_DECREF(scanner);
2146 if (!search)
2147 return NULL;
2149 iterator = PyCallIter_New(search, Py_None);
2150 Py_DECREF(search);
2152 return iterator;
2154 #endif
2156 static PyObject*
2157 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2159 SRE_STATE state;
2160 PyObject* list;
2161 PyObject* item;
2162 int status;
2163 Py_ssize_t n;
2164 Py_ssize_t i;
2165 void* last;
2167 PyObject* string;
2168 Py_ssize_t maxsplit = 0;
2169 static char* kwlist[] = { "source", "maxsplit", NULL };
2170 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2171 &string, &maxsplit))
2172 return NULL;
2174 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2175 if (!string)
2176 return NULL;
2178 list = PyList_New(0);
2179 if (!list) {
2180 state_fini(&state);
2181 return NULL;
2184 n = 0;
2185 last = state.start;
2187 while (!maxsplit || n < maxsplit) {
2189 state_reset(&state);
2191 state.ptr = state.start;
2193 if (state.charsize == 1) {
2194 status = sre_search(&state, PatternObject_GetCode(self));
2195 } else {
2196 #if defined(HAVE_UNICODE)
2197 status = sre_usearch(&state, PatternObject_GetCode(self));
2198 #endif
2201 if (PyErr_Occurred())
2202 goto error;
2204 if (status <= 0) {
2205 if (status == 0)
2206 break;
2207 pattern_error(status);
2208 goto error;
2211 if (state.start == state.ptr) {
2212 if (last == state.end)
2213 break;
2214 /* skip one character */
2215 state.start = (void*) ((char*) state.ptr + state.charsize);
2216 continue;
2219 /* get segment before this match */
2220 item = PySequence_GetSlice(
2221 string, STATE_OFFSET(&state, last),
2222 STATE_OFFSET(&state, state.start)
2224 if (!item)
2225 goto error;
2226 status = PyList_Append(list, item);
2227 Py_DECREF(item);
2228 if (status < 0)
2229 goto error;
2231 /* add groups (if any) */
2232 for (i = 0; i < self->groups; i++) {
2233 item = state_getslice(&state, i+1, string, 0);
2234 if (!item)
2235 goto error;
2236 status = PyList_Append(list, item);
2237 Py_DECREF(item);
2238 if (status < 0)
2239 goto error;
2242 n = n + 1;
2244 last = state.start = state.ptr;
2248 /* get segment following last match (even if empty) */
2249 item = PySequence_GetSlice(
2250 string, STATE_OFFSET(&state, last), state.endpos
2252 if (!item)
2253 goto error;
2254 status = PyList_Append(list, item);
2255 Py_DECREF(item);
2256 if (status < 0)
2257 goto error;
2259 state_fini(&state);
2260 return list;
2262 error:
2263 Py_DECREF(list);
2264 state_fini(&state);
2265 return NULL;
2269 static PyObject*
2270 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2271 Py_ssize_t count, Py_ssize_t subn)
2273 SRE_STATE state;
2274 PyObject* list;
2275 PyObject* item;
2276 PyObject* filter;
2277 PyObject* args;
2278 PyObject* match;
2279 void* ptr;
2280 int status;
2281 Py_ssize_t n;
2282 Py_ssize_t i, b, e;
2283 int bint;
2284 int filter_is_callable;
2286 if (PyCallable_Check(ptemplate)) {
2287 /* sub/subn takes either a function or a template */
2288 filter = ptemplate;
2289 Py_INCREF(filter);
2290 filter_is_callable = 1;
2291 } else {
2292 /* if not callable, check if it's a literal string */
2293 int literal;
2294 ptr = getstring(ptemplate, &n, &bint);
2295 b = bint;
2296 if (ptr) {
2297 if (b == 1) {
2298 literal = sre_literal_template((unsigned char *)ptr, n);
2299 } else {
2300 #if defined(HAVE_UNICODE)
2301 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2302 #endif
2304 } else {
2305 PyErr_Clear();
2306 literal = 0;
2308 if (literal) {
2309 filter = ptemplate;
2310 Py_INCREF(filter);
2311 filter_is_callable = 0;
2312 } else {
2313 /* not a literal; hand it over to the template compiler */
2314 filter = call(
2315 SRE_PY_MODULE, "_subx",
2316 PyTuple_Pack(2, self, ptemplate)
2318 if (!filter)
2319 return NULL;
2320 filter_is_callable = PyCallable_Check(filter);
2324 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2325 if (!string) {
2326 Py_DECREF(filter);
2327 return NULL;
2330 list = PyList_New(0);
2331 if (!list) {
2332 Py_DECREF(filter);
2333 state_fini(&state);
2334 return NULL;
2337 n = i = 0;
2339 while (!count || n < count) {
2341 state_reset(&state);
2343 state.ptr = state.start;
2345 if (state.charsize == 1) {
2346 status = sre_search(&state, PatternObject_GetCode(self));
2347 } else {
2348 #if defined(HAVE_UNICODE)
2349 status = sre_usearch(&state, PatternObject_GetCode(self));
2350 #endif
2353 if (PyErr_Occurred())
2354 goto error;
2356 if (status <= 0) {
2357 if (status == 0)
2358 break;
2359 pattern_error(status);
2360 goto error;
2363 b = STATE_OFFSET(&state, state.start);
2364 e = STATE_OFFSET(&state, state.ptr);
2366 if (i < b) {
2367 /* get segment before this match */
2368 item = PySequence_GetSlice(string, i, b);
2369 if (!item)
2370 goto error;
2371 status = PyList_Append(list, item);
2372 Py_DECREF(item);
2373 if (status < 0)
2374 goto error;
2376 } else if (i == b && i == e && n > 0)
2377 /* ignore empty match on latest position */
2378 goto next;
2380 if (filter_is_callable) {
2381 /* pass match object through filter */
2382 match = pattern_new_match(self, &state, 1);
2383 if (!match)
2384 goto error;
2385 args = PyTuple_Pack(1, match);
2386 if (!args) {
2387 Py_DECREF(match);
2388 goto error;
2390 item = PyObject_CallObject(filter, args);
2391 Py_DECREF(args);
2392 Py_DECREF(match);
2393 if (!item)
2394 goto error;
2395 } else {
2396 /* filter is literal string */
2397 item = filter;
2398 Py_INCREF(item);
2401 /* add to list */
2402 if (item != Py_None) {
2403 status = PyList_Append(list, item);
2404 Py_DECREF(item);
2405 if (status < 0)
2406 goto error;
2409 i = e;
2410 n = n + 1;
2412 next:
2413 /* move on */
2414 if (state.ptr == state.start)
2415 state.start = (void*) ((char*) state.ptr + state.charsize);
2416 else
2417 state.start = state.ptr;
2421 /* get segment following last match */
2422 if (i < state.endpos) {
2423 item = PySequence_GetSlice(string, i, state.endpos);
2424 if (!item)
2425 goto error;
2426 status = PyList_Append(list, item);
2427 Py_DECREF(item);
2428 if (status < 0)
2429 goto error;
2432 state_fini(&state);
2434 Py_DECREF(filter);
2436 /* convert list to single string (also removes list) */
2437 item = join_list(list, string);
2439 if (!item)
2440 return NULL;
2442 if (subn)
2443 return Py_BuildValue("Ni", item, n);
2445 return item;
2447 error:
2448 Py_DECREF(list);
2449 state_fini(&state);
2450 Py_DECREF(filter);
2451 return NULL;
2455 static PyObject*
2456 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2458 PyObject* ptemplate;
2459 PyObject* string;
2460 Py_ssize_t count = 0;
2461 static char* kwlist[] = { "repl", "string", "count", NULL };
2462 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2463 &ptemplate, &string, &count))
2464 return NULL;
2466 return pattern_subx(self, ptemplate, string, count, 0);
2469 static PyObject*
2470 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2472 PyObject* ptemplate;
2473 PyObject* string;
2474 Py_ssize_t count = 0;
2475 static char* kwlist[] = { "repl", "string", "count", NULL };
2476 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2477 &ptemplate, &string, &count))
2478 return NULL;
2480 return pattern_subx(self, ptemplate, string, count, 1);
2483 static PyObject*
2484 pattern_copy(PatternObject* self, PyObject *unused)
2486 #ifdef USE_BUILTIN_COPY
2487 PatternObject* copy;
2488 int offset;
2490 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2491 if (!copy)
2492 return NULL;
2494 offset = offsetof(PatternObject, groups);
2496 Py_XINCREF(self->groupindex);
2497 Py_XINCREF(self->indexgroup);
2498 Py_XINCREF(self->pattern);
2500 memcpy((char*) copy + offset, (char*) self + offset,
2501 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2502 copy->weakreflist = NULL;
2504 return (PyObject*) copy;
2505 #else
2506 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2507 return NULL;
2508 #endif
2511 static PyObject*
2512 pattern_deepcopy(PatternObject* self, PyObject* memo)
2514 #ifdef USE_BUILTIN_COPY
2515 PatternObject* copy;
2517 copy = (PatternObject*) pattern_copy(self);
2518 if (!copy)
2519 return NULL;
2521 if (!deepcopy(&copy->groupindex, memo) ||
2522 !deepcopy(&copy->indexgroup, memo) ||
2523 !deepcopy(&copy->pattern, memo)) {
2524 Py_DECREF(copy);
2525 return NULL;
2528 #else
2529 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2530 return NULL;
2531 #endif
2534 PyDoc_STRVAR(pattern_match_doc,
2535 "match(string[, pos[, endpos]]) --> match object or None.\n\
2536 Matches zero or more characters at the beginning of the string");
2538 PyDoc_STRVAR(pattern_search_doc,
2539 "search(string[, pos[, endpos]]) --> match object or None.\n\
2540 Scan through string looking for a match, and return a corresponding\n\
2541 MatchObject instance. Return None if no position in the string matches.");
2543 PyDoc_STRVAR(pattern_split_doc,
2544 "split(string[, maxsplit = 0]) --> list.\n\
2545 Split string by the occurrences of pattern.");
2547 PyDoc_STRVAR(pattern_findall_doc,
2548 "findall(string[, pos[, endpos]]) --> list.\n\
2549 Return a list of all non-overlapping matches of pattern in string.");
2551 PyDoc_STRVAR(pattern_finditer_doc,
2552 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2553 Return an iterator over all non-overlapping matches for the \n\
2554 RE pattern in string. For each match, the iterator returns a\n\
2555 match object.");
2557 PyDoc_STRVAR(pattern_sub_doc,
2558 "sub(repl, string[, count = 0]) --> newstring\n\
2559 Return the string obtained by replacing the leftmost non-overlapping\n\
2560 occurrences of pattern in string by the replacement repl.");
2562 PyDoc_STRVAR(pattern_subn_doc,
2563 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2564 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2565 the leftmost non-overlapping occurrences of pattern with the\n\
2566 replacement repl.");
2568 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2570 static PyMethodDef pattern_methods[] = {
2571 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2572 pattern_match_doc},
2573 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2574 pattern_search_doc},
2575 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2576 pattern_sub_doc},
2577 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2578 pattern_subn_doc},
2579 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2580 pattern_split_doc},
2581 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2582 pattern_findall_doc},
2583 #if PY_VERSION_HEX >= 0x02020000
2584 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2585 pattern_finditer_doc},
2586 #endif
2587 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2588 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2589 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2590 {NULL, NULL}
2593 static PyObject*
2594 pattern_getattr(PatternObject* self, char* name)
2596 PyObject* res;
2598 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
2600 if (res)
2601 return res;
2603 PyErr_Clear();
2605 /* attributes */
2606 if (!strcmp(name, "pattern")) {
2607 Py_INCREF(self->pattern);
2608 return self->pattern;
2611 if (!strcmp(name, "flags"))
2612 return Py_BuildValue("i", self->flags);
2614 if (!strcmp(name, "groups"))
2615 return Py_BuildValue("i", self->groups);
2617 if (!strcmp(name, "groupindex") && self->groupindex) {
2618 Py_INCREF(self->groupindex);
2619 return self->groupindex;
2622 PyErr_SetString(PyExc_AttributeError, name);
2623 return NULL;
2626 statichere PyTypeObject Pattern_Type = {
2627 PyObject_HEAD_INIT(NULL)
2628 0, "_" SRE_MODULE ".SRE_Pattern",
2629 sizeof(PatternObject), sizeof(SRE_CODE),
2630 (destructor)pattern_dealloc, /*tp_dealloc*/
2631 0, /*tp_print*/
2632 (getattrfunc)pattern_getattr, /*tp_getattr*/
2633 0, /* tp_setattr */
2634 0, /* tp_compare */
2635 0, /* tp_repr */
2636 0, /* tp_as_number */
2637 0, /* tp_as_sequence */
2638 0, /* tp_as_mapping */
2639 0, /* tp_hash */
2640 0, /* tp_call */
2641 0, /* tp_str */
2642 0, /* tp_getattro */
2643 0, /* tp_setattro */
2644 0, /* tp_as_buffer */
2645 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
2646 pattern_doc, /* tp_doc */
2647 0, /* tp_traverse */
2648 0, /* tp_clear */
2649 0, /* tp_richcompare */
2650 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2653 static PyObject *
2654 _compile(PyObject* self_, PyObject* args)
2656 /* "compile" pattern descriptor to pattern object */
2658 PatternObject* self;
2659 Py_ssize_t i, n;
2661 PyObject* pattern;
2662 int flags = 0;
2663 PyObject* code;
2664 Py_ssize_t groups = 0;
2665 PyObject* groupindex = NULL;
2666 PyObject* indexgroup = NULL;
2667 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2668 &PyList_Type, &code, &groups,
2669 &groupindex, &indexgroup))
2670 return NULL;
2672 n = PyList_GET_SIZE(code);
2674 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2675 if (!self)
2676 return NULL;
2678 self->codesize = n;
2680 for (i = 0; i < n; i++) {
2681 PyObject *o = PyList_GET_ITEM(code, i);
2682 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2683 : PyLong_AsUnsignedLong(o);
2684 self->code[i] = (SRE_CODE) value;
2685 if ((unsigned long) self->code[i] != value) {
2686 PyErr_SetString(PyExc_OverflowError,
2687 "regular expression code size limit exceeded");
2688 break;
2692 if (PyErr_Occurred()) {
2693 PyObject_DEL(self);
2694 return NULL;
2697 Py_INCREF(pattern);
2698 self->pattern = pattern;
2700 self->flags = flags;
2702 self->groups = groups;
2704 Py_XINCREF(groupindex);
2705 self->groupindex = groupindex;
2707 Py_XINCREF(indexgroup);
2708 self->indexgroup = indexgroup;
2710 self->weakreflist = NULL;
2712 return (PyObject*) self;
2715 /* -------------------------------------------------------------------- */
2716 /* match methods */
2718 static void
2719 match_dealloc(MatchObject* self)
2721 Py_XDECREF(self->regs);
2722 Py_XDECREF(self->string);
2723 Py_DECREF(self->pattern);
2724 PyObject_DEL(self);
2727 static PyObject*
2728 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
2730 if (index < 0 || index >= self->groups) {
2731 /* raise IndexError if we were given a bad group number */
2732 PyErr_SetString(
2733 PyExc_IndexError,
2734 "no such group"
2736 return NULL;
2739 index *= 2;
2741 if (self->string == Py_None || self->mark[index] < 0) {
2742 /* return default value if the string or group is undefined */
2743 Py_INCREF(def);
2744 return def;
2747 return PySequence_GetSlice(
2748 self->string, self->mark[index], self->mark[index+1]
2752 static Py_ssize_t
2753 match_getindex(MatchObject* self, PyObject* index)
2755 Py_ssize_t i;
2757 if (PyInt_Check(index))
2758 return PyInt_AsSsize_t(index);
2760 i = -1;
2762 if (self->pattern->groupindex) {
2763 index = PyObject_GetItem(self->pattern->groupindex, index);
2764 if (index) {
2765 if (PyInt_Check(index) || PyLong_Check(index))
2766 i = PyInt_AsSsize_t(index);
2767 Py_DECREF(index);
2768 } else
2769 PyErr_Clear();
2772 return i;
2775 static PyObject*
2776 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2778 return match_getslice_by_index(self, match_getindex(self, index), def);
2781 static PyObject*
2782 match_expand(MatchObject* self, PyObject* ptemplate)
2784 /* delegate to Python code */
2785 return call(
2786 SRE_PY_MODULE, "_expand",
2787 PyTuple_Pack(3, self->pattern, self, ptemplate)
2791 static PyObject*
2792 match_group(MatchObject* self, PyObject* args)
2794 PyObject* result;
2795 Py_ssize_t i, size;
2797 size = PyTuple_GET_SIZE(args);
2799 switch (size) {
2800 case 0:
2801 result = match_getslice(self, Py_False, Py_None);
2802 break;
2803 case 1:
2804 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2805 break;
2806 default:
2807 /* fetch multiple items */
2808 result = PyTuple_New(size);
2809 if (!result)
2810 return NULL;
2811 for (i = 0; i < size; i++) {
2812 PyObject* item = match_getslice(
2813 self, PyTuple_GET_ITEM(args, i), Py_None
2815 if (!item) {
2816 Py_DECREF(result);
2817 return NULL;
2819 PyTuple_SET_ITEM(result, i, item);
2821 break;
2823 return result;
2826 static PyObject*
2827 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
2829 PyObject* result;
2830 Py_ssize_t index;
2832 PyObject* def = Py_None;
2833 static char* kwlist[] = { "default", NULL };
2834 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
2835 return NULL;
2837 result = PyTuple_New(self->groups-1);
2838 if (!result)
2839 return NULL;
2841 for (index = 1; index < self->groups; index++) {
2842 PyObject* item;
2843 item = match_getslice_by_index(self, index, def);
2844 if (!item) {
2845 Py_DECREF(result);
2846 return NULL;
2848 PyTuple_SET_ITEM(result, index-1, item);
2851 return result;
2854 static PyObject*
2855 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
2857 PyObject* result;
2858 PyObject* keys;
2859 Py_ssize_t index;
2861 PyObject* def = Py_None;
2862 static char* kwlist[] = { "default", NULL };
2863 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
2864 return NULL;
2866 result = PyDict_New();
2867 if (!result || !self->pattern->groupindex)
2868 return result;
2870 keys = PyMapping_Keys(self->pattern->groupindex);
2871 if (!keys)
2872 goto failed;
2874 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
2875 int status;
2876 PyObject* key;
2877 PyObject* value;
2878 key = PyList_GET_ITEM(keys, index);
2879 if (!key)
2880 goto failed;
2881 value = match_getslice(self, key, def);
2882 if (!value) {
2883 Py_DECREF(key);
2884 goto failed;
2886 status = PyDict_SetItem(result, key, value);
2887 Py_DECREF(value);
2888 if (status < 0)
2889 goto failed;
2892 Py_DECREF(keys);
2894 return result;
2896 failed:
2897 Py_XDECREF(keys);
2898 Py_DECREF(result);
2899 return NULL;
2902 static PyObject*
2903 match_start(MatchObject* self, PyObject* args)
2905 Py_ssize_t index;
2907 PyObject* index_ = Py_False; /* zero */
2908 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
2909 return NULL;
2911 index = match_getindex(self, index_);
2913 if (index < 0 || index >= self->groups) {
2914 PyErr_SetString(
2915 PyExc_IndexError,
2916 "no such group"
2918 return NULL;
2921 /* mark is -1 if group is undefined */
2922 return Py_BuildValue("i", self->mark[index*2]);
2925 static PyObject*
2926 match_end(MatchObject* self, PyObject* args)
2928 Py_ssize_t index;
2930 PyObject* index_ = Py_False; /* zero */
2931 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
2932 return NULL;
2934 index = match_getindex(self, index_);
2936 if (index < 0 || index >= self->groups) {
2937 PyErr_SetString(
2938 PyExc_IndexError,
2939 "no such group"
2941 return NULL;
2944 /* mark is -1 if group is undefined */
2945 return Py_BuildValue("i", self->mark[index*2+1]);
2948 LOCAL(PyObject*)
2949 _pair(Py_ssize_t i1, Py_ssize_t i2)
2951 PyObject* pair;
2952 PyObject* item;
2954 pair = PyTuple_New(2);
2955 if (!pair)
2956 return NULL;
2958 item = PyInt_FromSsize_t(i1);
2959 if (!item)
2960 goto error;
2961 PyTuple_SET_ITEM(pair, 0, item);
2963 item = PyInt_FromSsize_t(i2);
2964 if (!item)
2965 goto error;
2966 PyTuple_SET_ITEM(pair, 1, item);
2968 return pair;
2970 error:
2971 Py_DECREF(pair);
2972 return NULL;
2975 static PyObject*
2976 match_span(MatchObject* self, PyObject* args)
2978 Py_ssize_t index;
2980 PyObject* index_ = Py_False; /* zero */
2981 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
2982 return NULL;
2984 index = match_getindex(self, index_);
2986 if (index < 0 || index >= self->groups) {
2987 PyErr_SetString(
2988 PyExc_IndexError,
2989 "no such group"
2991 return NULL;
2994 /* marks are -1 if group is undefined */
2995 return _pair(self->mark[index*2], self->mark[index*2+1]);
2998 static PyObject*
2999 match_regs(MatchObject* self)
3001 PyObject* regs;
3002 PyObject* item;
3003 Py_ssize_t index;
3005 regs = PyTuple_New(self->groups);
3006 if (!regs)
3007 return NULL;
3009 for (index = 0; index < self->groups; index++) {
3010 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3011 if (!item) {
3012 Py_DECREF(regs);
3013 return NULL;
3015 PyTuple_SET_ITEM(regs, index, item);
3018 Py_INCREF(regs);
3019 self->regs = regs;
3021 return regs;
3024 static PyObject*
3025 match_copy(MatchObject* self, PyObject *unused)
3027 #ifdef USE_BUILTIN_COPY
3028 MatchObject* copy;
3029 Py_ssize_t slots, offset;
3031 slots = 2 * (self->pattern->groups+1);
3033 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3034 if (!copy)
3035 return NULL;
3037 /* this value a constant, but any compiler should be able to
3038 figure that out all by itself */
3039 offset = offsetof(MatchObject, string);
3041 Py_XINCREF(self->pattern);
3042 Py_XINCREF(self->string);
3043 Py_XINCREF(self->regs);
3045 memcpy((char*) copy + offset, (char*) self + offset,
3046 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3048 return (PyObject*) copy;
3049 #else
3050 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3051 return NULL;
3052 #endif
3055 static PyObject*
3056 match_deepcopy(MatchObject* self, PyObject* memo)
3058 #ifdef USE_BUILTIN_COPY
3059 MatchObject* copy;
3061 copy = (MatchObject*) match_copy(self);
3062 if (!copy)
3063 return NULL;
3065 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3066 !deepcopy(&copy->string, memo) ||
3067 !deepcopy(&copy->regs, memo)) {
3068 Py_DECREF(copy);
3069 return NULL;
3072 #else
3073 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3074 return NULL;
3075 #endif
3078 static PyMethodDef match_methods[] = {
3079 {"group", (PyCFunction) match_group, METH_VARARGS},
3080 {"start", (PyCFunction) match_start, METH_VARARGS},
3081 {"end", (PyCFunction) match_end, METH_VARARGS},
3082 {"span", (PyCFunction) match_span, METH_VARARGS},
3083 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3084 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3085 {"expand", (PyCFunction) match_expand, METH_O},
3086 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3087 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3088 {NULL, NULL}
3091 static PyObject*
3092 match_getattr(MatchObject* self, char* name)
3094 PyObject* res;
3096 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3097 if (res)
3098 return res;
3100 PyErr_Clear();
3102 if (!strcmp(name, "lastindex")) {
3103 if (self->lastindex >= 0)
3104 return Py_BuildValue("i", self->lastindex);
3105 Py_INCREF(Py_None);
3106 return Py_None;
3109 if (!strcmp(name, "lastgroup")) {
3110 if (self->pattern->indexgroup && self->lastindex >= 0) {
3111 PyObject* result = PySequence_GetItem(
3112 self->pattern->indexgroup, self->lastindex
3114 if (result)
3115 return result;
3116 PyErr_Clear();
3118 Py_INCREF(Py_None);
3119 return Py_None;
3122 if (!strcmp(name, "string")) {
3123 if (self->string) {
3124 Py_INCREF(self->string);
3125 return self->string;
3126 } else {
3127 Py_INCREF(Py_None);
3128 return Py_None;
3132 if (!strcmp(name, "regs")) {
3133 if (self->regs) {
3134 Py_INCREF(self->regs);
3135 return self->regs;
3136 } else
3137 return match_regs(self);
3140 if (!strcmp(name, "re")) {
3141 Py_INCREF(self->pattern);
3142 return (PyObject*) self->pattern;
3145 if (!strcmp(name, "pos"))
3146 return Py_BuildValue("i", self->pos);
3148 if (!strcmp(name, "endpos"))
3149 return Py_BuildValue("i", self->endpos);
3151 PyErr_SetString(PyExc_AttributeError, name);
3152 return NULL;
3155 /* FIXME: implement setattr("string", None) as a special case (to
3156 detach the associated string, if any */
3158 statichere PyTypeObject Match_Type = {
3159 PyObject_HEAD_INIT(NULL)
3160 0, "_" SRE_MODULE ".SRE_Match",
3161 sizeof(MatchObject), sizeof(Py_ssize_t),
3162 (destructor)match_dealloc, /*tp_dealloc*/
3163 0, /*tp_print*/
3164 (getattrfunc)match_getattr /*tp_getattr*/
3167 static PyObject*
3168 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3170 /* create match object (from state object) */
3172 MatchObject* match;
3173 Py_ssize_t i, j;
3174 char* base;
3175 int n;
3177 if (status > 0) {
3179 /* create match object (with room for extra group marks) */
3180 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3181 2*(pattern->groups+1));
3182 if (!match)
3183 return NULL;
3185 Py_INCREF(pattern);
3186 match->pattern = pattern;
3188 Py_INCREF(state->string);
3189 match->string = state->string;
3191 match->regs = NULL;
3192 match->groups = pattern->groups+1;
3194 /* fill in group slices */
3196 base = (char*) state->beginning;
3197 n = state->charsize;
3199 match->mark[0] = ((char*) state->start - base) / n;
3200 match->mark[1] = ((char*) state->ptr - base) / n;
3202 for (i = j = 0; i < pattern->groups; i++, j+=2)
3203 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3204 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3205 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3206 } else
3207 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3209 match->pos = state->pos;
3210 match->endpos = state->endpos;
3212 match->lastindex = state->lastindex;
3214 return (PyObject*) match;
3216 } else if (status == 0) {
3218 /* no match */
3219 Py_INCREF(Py_None);
3220 return Py_None;
3224 /* internal error */
3225 pattern_error(status);
3226 return NULL;
3230 /* -------------------------------------------------------------------- */
3231 /* scanner methods (experimental) */
3233 static void
3234 scanner_dealloc(ScannerObject* self)
3236 state_fini(&self->state);
3237 Py_DECREF(self->pattern);
3238 PyObject_DEL(self);
3241 static PyObject*
3242 scanner_match(ScannerObject* self, PyObject *unused)
3244 SRE_STATE* state = &self->state;
3245 PyObject* match;
3246 int status;
3248 state_reset(state);
3250 state->ptr = state->start;
3252 if (state->charsize == 1) {
3253 status = sre_match(state, PatternObject_GetCode(self->pattern));
3254 } else {
3255 #if defined(HAVE_UNICODE)
3256 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3257 #endif
3259 if (PyErr_Occurred())
3260 return NULL;
3262 match = pattern_new_match((PatternObject*) self->pattern,
3263 state, status);
3265 if (status == 0 || state->ptr == state->start)
3266 state->start = (void*) ((char*) state->ptr + state->charsize);
3267 else
3268 state->start = state->ptr;
3270 return match;
3274 static PyObject*
3275 scanner_search(ScannerObject* self, PyObject *unused)
3277 SRE_STATE* state = &self->state;
3278 PyObject* match;
3279 int status;
3281 state_reset(state);
3283 state->ptr = state->start;
3285 if (state->charsize == 1) {
3286 status = sre_search(state, PatternObject_GetCode(self->pattern));
3287 } else {
3288 #if defined(HAVE_UNICODE)
3289 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3290 #endif
3292 if (PyErr_Occurred())
3293 return NULL;
3295 match = pattern_new_match((PatternObject*) self->pattern,
3296 state, status);
3298 if (status == 0 || state->ptr == state->start)
3299 state->start = (void*) ((char*) state->ptr + state->charsize);
3300 else
3301 state->start = state->ptr;
3303 return match;
3306 static PyMethodDef scanner_methods[] = {
3307 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3308 {"search", (PyCFunction) scanner_search, METH_NOARGS},
3309 {NULL, NULL}
3312 static PyObject*
3313 scanner_getattr(ScannerObject* self, char* name)
3315 PyObject* res;
3317 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3318 if (res)
3319 return res;
3321 PyErr_Clear();
3323 /* attributes */
3324 if (!strcmp(name, "pattern")) {
3325 Py_INCREF(self->pattern);
3326 return self->pattern;
3329 PyErr_SetString(PyExc_AttributeError, name);
3330 return NULL;
3333 statichere PyTypeObject Scanner_Type = {
3334 PyObject_HEAD_INIT(NULL)
3335 0, "_" SRE_MODULE ".SRE_Scanner",
3336 sizeof(ScannerObject), 0,
3337 (destructor)scanner_dealloc, /*tp_dealloc*/
3338 0, /*tp_print*/
3339 (getattrfunc)scanner_getattr, /*tp_getattr*/
3342 static PyObject*
3343 pattern_scanner(PatternObject* pattern, PyObject* args)
3345 /* create search state object */
3347 ScannerObject* self;
3349 PyObject* string;
3350 Py_ssize_t start = 0;
3351 Py_ssize_t end = PY_SSIZE_T_MAX;
3352 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3353 return NULL;
3355 /* create scanner object */
3356 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3357 if (!self)
3358 return NULL;
3360 string = state_init(&self->state, pattern, string, start, end);
3361 if (!string) {
3362 PyObject_DEL(self);
3363 return NULL;
3366 Py_INCREF(pattern);
3367 self->pattern = (PyObject*) pattern;
3369 return (PyObject*) self;
3372 static PyMethodDef _functions[] = {
3373 {"compile", _compile, METH_VARARGS},
3374 {"getcodesize", sre_codesize, METH_NOARGS},
3375 {"getlower", sre_getlower, METH_VARARGS},
3376 {NULL, NULL}
3379 #if PY_VERSION_HEX < 0x02030000
3380 DL_EXPORT(void) init_sre(void)
3381 #else
3382 PyMODINIT_FUNC init_sre(void)
3383 #endif
3385 PyObject* m;
3386 PyObject* d;
3387 PyObject* x;
3389 /* Patch object types */
3390 Pattern_Type.ob_type = Match_Type.ob_type =
3391 Scanner_Type.ob_type = &PyType_Type;
3393 m = Py_InitModule("_" SRE_MODULE, _functions);
3394 if (m == NULL)
3395 return;
3396 d = PyModule_GetDict(m);
3398 x = PyInt_FromLong(SRE_MAGIC);
3399 if (x) {
3400 PyDict_SetItemString(d, "MAGIC", x);
3401 Py_DECREF(x);
3404 x = PyInt_FromLong(sizeof(SRE_CODE));
3405 if (x) {
3406 PyDict_SetItemString(d, "CODESIZE", x);
3407 Py_DECREF(x);
3410 x = PyString_FromString(copyright);
3411 if (x) {
3412 PyDict_SetItemString(d, "copyright", x);
3413 Py_DECREF(x);
3417 #endif /* !defined(SRE_RECURSIVE) */
3419 /* vim:ts=4:sw=4:et