Revert last checkin, it is better to do make distclean
[python.git] / Modules / _sre.c
blob4d7b4fcc27a004c737e1ce79ddb2a89882e064e8
1 /*
2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
37 #ifndef SRE_RECURSIVE
39 static char copyright[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
42 #include "Python.h"
43 #include "structmember.h" /* offsetof */
45 #include "sre.h"
47 #include <ctype.h>
49 /* name of this module, minus the leading underscore */
50 #if !defined(SRE_MODULE)
51 #define SRE_MODULE "sre"
52 #endif
54 #define SRE_PY_MODULE "re"
56 /* defining this one enables tracing */
57 #undef VERBOSE
59 #if PY_VERSION_HEX >= 0x01060000
60 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
61 /* defining this enables unicode support (default under 1.6a1 and later) */
62 #define HAVE_UNICODE
63 #endif
64 #endif
66 /* -------------------------------------------------------------------- */
67 /* optional features */
69 /* enables fast searching */
70 #define USE_FAST_SEARCH
72 /* enables aggressive inlining (always on for Visual C) */
73 #undef USE_INLINE
75 /* enables copy/deepcopy handling (work in progress) */
76 #undef USE_BUILTIN_COPY
78 #if PY_VERSION_HEX < 0x01060000
79 #define PyObject_DEL(op) PyMem_DEL((op))
80 #endif
82 /* -------------------------------------------------------------------- */
84 #if defined(_MSC_VER)
85 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
86 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
87 /* fastest possible local call under MSVC */
88 #define LOCAL(type) static __inline type __fastcall
89 #elif defined(USE_INLINE)
90 #define LOCAL(type) static inline type
91 #else
92 #define LOCAL(type) static type
93 #endif
95 /* error codes */
96 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
97 #define SRE_ERROR_STATE -2 /* illegal state */
98 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
99 #define SRE_ERROR_MEMORY -9 /* out of memory */
101 #if defined(VERBOSE)
102 #define TRACE(v) printf v
103 #else
104 #define TRACE(v)
105 #endif
107 /* -------------------------------------------------------------------- */
108 /* search engine state */
110 /* default character predicates (run sre_chars.py to regenerate tables) */
112 #define SRE_DIGIT_MASK 1
113 #define SRE_SPACE_MASK 2
114 #define SRE_LINEBREAK_MASK 4
115 #define SRE_ALNUM_MASK 8
116 #define SRE_WORD_MASK 16
118 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
120 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
121 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
123 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
124 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
125 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
126 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
128 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
129 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
130 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
131 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
132 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
133 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
134 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
135 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
136 120, 121, 122, 123, 124, 125, 126, 127 };
138 #define SRE_IS_DIGIT(ch)\
139 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
140 #define SRE_IS_SPACE(ch)\
141 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
142 #define SRE_IS_LINEBREAK(ch)\
143 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
144 #define SRE_IS_ALNUM(ch)\
145 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
146 #define SRE_IS_WORD(ch)\
147 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
149 static unsigned int sre_lower(unsigned int ch)
151 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
154 /* locale-specific character predicates */
155 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
156 * warnings when c's type supports only numbers < N+1 */
157 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
158 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
159 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
160 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
161 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
163 static unsigned int sre_lower_locale(unsigned int ch)
165 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
168 /* unicode-specific character predicates */
170 #if defined(HAVE_UNICODE)
172 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
173 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
174 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
175 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
176 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
178 static unsigned int sre_lower_unicode(unsigned int ch)
180 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
183 #endif
185 LOCAL(int)
186 sre_category(SRE_CODE category, unsigned int ch)
188 switch (category) {
190 case SRE_CATEGORY_DIGIT:
191 return SRE_IS_DIGIT(ch);
192 case SRE_CATEGORY_NOT_DIGIT:
193 return !SRE_IS_DIGIT(ch);
194 case SRE_CATEGORY_SPACE:
195 return SRE_IS_SPACE(ch);
196 case SRE_CATEGORY_NOT_SPACE:
197 return !SRE_IS_SPACE(ch);
198 case SRE_CATEGORY_WORD:
199 return SRE_IS_WORD(ch);
200 case SRE_CATEGORY_NOT_WORD:
201 return !SRE_IS_WORD(ch);
202 case SRE_CATEGORY_LINEBREAK:
203 return SRE_IS_LINEBREAK(ch);
204 case SRE_CATEGORY_NOT_LINEBREAK:
205 return !SRE_IS_LINEBREAK(ch);
207 case SRE_CATEGORY_LOC_WORD:
208 return SRE_LOC_IS_WORD(ch);
209 case SRE_CATEGORY_LOC_NOT_WORD:
210 return !SRE_LOC_IS_WORD(ch);
212 #if defined(HAVE_UNICODE)
213 case SRE_CATEGORY_UNI_DIGIT:
214 return SRE_UNI_IS_DIGIT(ch);
215 case SRE_CATEGORY_UNI_NOT_DIGIT:
216 return !SRE_UNI_IS_DIGIT(ch);
217 case SRE_CATEGORY_UNI_SPACE:
218 return SRE_UNI_IS_SPACE(ch);
219 case SRE_CATEGORY_UNI_NOT_SPACE:
220 return !SRE_UNI_IS_SPACE(ch);
221 case SRE_CATEGORY_UNI_WORD:
222 return SRE_UNI_IS_WORD(ch);
223 case SRE_CATEGORY_UNI_NOT_WORD:
224 return !SRE_UNI_IS_WORD(ch);
225 case SRE_CATEGORY_UNI_LINEBREAK:
226 return SRE_UNI_IS_LINEBREAK(ch);
227 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
228 return !SRE_UNI_IS_LINEBREAK(ch);
229 #else
230 case SRE_CATEGORY_UNI_DIGIT:
231 return SRE_IS_DIGIT(ch);
232 case SRE_CATEGORY_UNI_NOT_DIGIT:
233 return !SRE_IS_DIGIT(ch);
234 case SRE_CATEGORY_UNI_SPACE:
235 return SRE_IS_SPACE(ch);
236 case SRE_CATEGORY_UNI_NOT_SPACE:
237 return !SRE_IS_SPACE(ch);
238 case SRE_CATEGORY_UNI_WORD:
239 return SRE_LOC_IS_WORD(ch);
240 case SRE_CATEGORY_UNI_NOT_WORD:
241 return !SRE_LOC_IS_WORD(ch);
242 case SRE_CATEGORY_UNI_LINEBREAK:
243 return SRE_IS_LINEBREAK(ch);
244 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
245 return !SRE_IS_LINEBREAK(ch);
246 #endif
248 return 0;
251 /* helpers */
253 static void
254 data_stack_dealloc(SRE_STATE* state)
256 if (state->data_stack) {
257 PyMem_FREE(state->data_stack);
258 state->data_stack = NULL;
260 state->data_stack_size = state->data_stack_base = 0;
263 static int
264 data_stack_grow(SRE_STATE* state, int size)
266 int minsize, cursize;
267 minsize = state->data_stack_base+size;
268 cursize = state->data_stack_size;
269 if (cursize < minsize) {
270 void* stack;
271 cursize = minsize+minsize/4+1024;
272 TRACE(("allocate/grow stack %d\n", cursize));
273 stack = PyMem_REALLOC(state->data_stack, cursize);
274 if (!stack) {
275 data_stack_dealloc(state);
276 return SRE_ERROR_MEMORY;
278 state->data_stack = (char *)stack;
279 state->data_stack_size = cursize;
281 return 0;
284 /* generate 8-bit version */
286 #define SRE_CHAR unsigned char
287 #define SRE_AT sre_at
288 #define SRE_COUNT sre_count
289 #define SRE_CHARSET sre_charset
290 #define SRE_INFO sre_info
291 #define SRE_MATCH sre_match
292 #define SRE_MATCH_CONTEXT sre_match_context
293 #define SRE_SEARCH sre_search
294 #define SRE_LITERAL_TEMPLATE sre_literal_template
296 #if defined(HAVE_UNICODE)
298 #define SRE_RECURSIVE
299 #include "_sre.c"
300 #undef SRE_RECURSIVE
302 #undef SRE_LITERAL_TEMPLATE
303 #undef SRE_SEARCH
304 #undef SRE_MATCH
305 #undef SRE_MATCH_CONTEXT
306 #undef SRE_INFO
307 #undef SRE_CHARSET
308 #undef SRE_COUNT
309 #undef SRE_AT
310 #undef SRE_CHAR
312 /* generate 16-bit unicode version */
314 #define SRE_CHAR Py_UNICODE
315 #define SRE_AT sre_uat
316 #define SRE_COUNT sre_ucount
317 #define SRE_CHARSET sre_ucharset
318 #define SRE_INFO sre_uinfo
319 #define SRE_MATCH sre_umatch
320 #define SRE_MATCH_CONTEXT sre_umatch_context
321 #define SRE_SEARCH sre_usearch
322 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
323 #endif
325 #endif /* SRE_RECURSIVE */
327 /* -------------------------------------------------------------------- */
328 /* String matching engine */
330 /* the following section is compiled twice, with different character
331 settings */
333 LOCAL(int)
334 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
336 /* check if pointer is at given position */
338 int thisp, thatp;
340 switch (at) {
342 case SRE_AT_BEGINNING:
343 case SRE_AT_BEGINNING_STRING:
344 return ((void*) ptr == state->beginning);
346 case SRE_AT_BEGINNING_LINE:
347 return ((void*) ptr == state->beginning ||
348 SRE_IS_LINEBREAK((int) ptr[-1]));
350 case SRE_AT_END:
351 return (((void*) (ptr+1) == state->end &&
352 SRE_IS_LINEBREAK((int) ptr[0])) ||
353 ((void*) ptr == state->end));
355 case SRE_AT_END_LINE:
356 return ((void*) ptr == state->end ||
357 SRE_IS_LINEBREAK((int) ptr[0]));
359 case SRE_AT_END_STRING:
360 return ((void*) ptr == state->end);
362 case SRE_AT_BOUNDARY:
363 if (state->beginning == state->end)
364 return 0;
365 thatp = ((void*) ptr > state->beginning) ?
366 SRE_IS_WORD((int) ptr[-1]) : 0;
367 thisp = ((void*) ptr < state->end) ?
368 SRE_IS_WORD((int) ptr[0]) : 0;
369 return thisp != thatp;
371 case SRE_AT_NON_BOUNDARY:
372 if (state->beginning == state->end)
373 return 0;
374 thatp = ((void*) ptr > state->beginning) ?
375 SRE_IS_WORD((int) ptr[-1]) : 0;
376 thisp = ((void*) ptr < state->end) ?
377 SRE_IS_WORD((int) ptr[0]) : 0;
378 return thisp == thatp;
380 case SRE_AT_LOC_BOUNDARY:
381 if (state->beginning == state->end)
382 return 0;
383 thatp = ((void*) ptr > state->beginning) ?
384 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
385 thisp = ((void*) ptr < state->end) ?
386 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
387 return thisp != thatp;
389 case SRE_AT_LOC_NON_BOUNDARY:
390 if (state->beginning == state->end)
391 return 0;
392 thatp = ((void*) ptr > state->beginning) ?
393 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
394 thisp = ((void*) ptr < state->end) ?
395 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
396 return thisp == thatp;
398 #if defined(HAVE_UNICODE)
399 case SRE_AT_UNI_BOUNDARY:
400 if (state->beginning == state->end)
401 return 0;
402 thatp = ((void*) ptr > state->beginning) ?
403 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
404 thisp = ((void*) ptr < state->end) ?
405 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
406 return thisp != thatp;
408 case SRE_AT_UNI_NON_BOUNDARY:
409 if (state->beginning == state->end)
410 return 0;
411 thatp = ((void*) ptr > state->beginning) ?
412 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
413 thisp = ((void*) ptr < state->end) ?
414 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
415 return thisp == thatp;
416 #endif
420 return 0;
423 LOCAL(int)
424 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
426 /* check if character is a member of the given set */
428 int ok = 1;
430 for (;;) {
431 switch (*set++) {
433 case SRE_OP_FAILURE:
434 return !ok;
436 case SRE_OP_LITERAL:
437 /* <LITERAL> <code> */
438 if (ch == set[0])
439 return ok;
440 set++;
441 break;
443 case SRE_OP_CATEGORY:
444 /* <CATEGORY> <code> */
445 if (sre_category(set[0], (int) ch))
446 return ok;
447 set += 1;
448 break;
450 case SRE_OP_CHARSET:
451 if (sizeof(SRE_CODE) == 2) {
452 /* <CHARSET> <bitmap> (16 bits per code word) */
453 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
454 return ok;
455 set += 16;
457 else {
458 /* <CHARSET> <bitmap> (32 bits per code word) */
459 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
460 return ok;
461 set += 8;
463 break;
465 case SRE_OP_RANGE:
466 /* <RANGE> <lower> <upper> */
467 if (set[0] <= ch && ch <= set[1])
468 return ok;
469 set += 2;
470 break;
472 case SRE_OP_NEGATE:
473 ok = !ok;
474 break;
476 case SRE_OP_BIGCHARSET:
477 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
479 int count, block;
480 count = *(set++);
482 if (sizeof(SRE_CODE) == 2) {
483 block = ((unsigned char*)set)[ch >> 8];
484 set += 128;
485 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
486 return ok;
487 set += count*16;
489 else {
490 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
491 * warnings when c's type supports only numbers < N+1 */
492 if (!(ch & ~65535))
493 block = ((unsigned char*)set)[ch >> 8];
494 else
495 block = -1;
496 set += 64;
497 if (block >=0 &&
498 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
499 return ok;
500 set += count*8;
502 break;
505 default:
506 /* internal error -- there's not much we can do about it
507 here, so let's just pretend it didn't match... */
508 return 0;
513 LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
515 LOCAL(int)
516 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount)
518 SRE_CODE chr;
519 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
520 SRE_CHAR* end = (SRE_CHAR *)state->end;
521 int i;
523 /* adjust end */
524 if (maxcount < end - ptr && maxcount != 65535)
525 end = ptr + maxcount;
527 switch (pattern[0]) {
529 case SRE_OP_IN:
530 /* repeated set */
531 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
532 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
533 ptr++;
534 break;
536 case SRE_OP_ANY:
537 /* repeated dot wildcard. */
538 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
539 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
540 ptr++;
541 break;
543 case SRE_OP_ANY_ALL:
544 /* repeated dot wildcard. skip to the end of the target
545 string, and backtrack from there */
546 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
547 ptr = end;
548 break;
550 case SRE_OP_LITERAL:
551 /* repeated literal */
552 chr = pattern[1];
553 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
554 while (ptr < end && (SRE_CODE) *ptr == chr)
555 ptr++;
556 break;
558 case SRE_OP_LITERAL_IGNORE:
559 /* repeated literal */
560 chr = pattern[1];
561 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
562 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
563 ptr++;
564 break;
566 case SRE_OP_NOT_LITERAL:
567 /* repeated non-literal */
568 chr = pattern[1];
569 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
570 while (ptr < end && (SRE_CODE) *ptr != chr)
571 ptr++;
572 break;
574 case SRE_OP_NOT_LITERAL_IGNORE:
575 /* repeated non-literal */
576 chr = pattern[1];
577 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
578 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
579 ptr++;
580 break;
582 default:
583 /* repeated single character pattern */
584 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
585 while ((SRE_CHAR*) state->ptr < end) {
586 i = SRE_MATCH(state, pattern);
587 if (i < 0)
588 return i;
589 if (!i)
590 break;
592 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
593 (SRE_CHAR*) state->ptr - ptr));
594 return (SRE_CHAR*) state->ptr - ptr;
597 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
598 return ptr - (SRE_CHAR*) state->ptr;
601 #if 0 /* not used in this release */
602 LOCAL(int)
603 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
605 /* check if an SRE_OP_INFO block matches at the current position.
606 returns the number of SRE_CODE objects to skip if successful, 0
607 if no match */
609 SRE_CHAR* end = state->end;
610 SRE_CHAR* ptr = state->ptr;
611 int i;
613 /* check minimal length */
614 if (pattern[3] && (end - ptr) < pattern[3])
615 return 0;
617 /* check known prefix */
618 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
619 /* <length> <skip> <prefix data> <overlap data> */
620 for (i = 0; i < pattern[5]; i++)
621 if ((SRE_CODE) ptr[i] != pattern[7 + i])
622 return 0;
623 return pattern[0] + 2 * pattern[6];
625 return pattern[0];
627 #endif
629 /* The macros below should be used to protect recursive SRE_MATCH()
630 * calls that *failed* and do *not* return immediately (IOW, those
631 * that will backtrack). Explaining:
633 * - Recursive SRE_MATCH() returned true: that's usually a success
634 * (besides atypical cases like ASSERT_NOT), therefore there's no
635 * reason to restore lastmark;
637 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
638 * is returning to the caller: If the current SRE_MATCH() is the
639 * top function of the recursion, returning false will be a matching
640 * failure, and it doesn't matter where lastmark is pointing to.
641 * If it's *not* the top function, it will be a recursive SRE_MATCH()
642 * failure by itself, and the calling SRE_MATCH() will have to deal
643 * with the failure by the same rules explained here (it will restore
644 * lastmark by itself if necessary);
646 * - Recursive SRE_MATCH() returned false, and will continue the
647 * outside 'for' loop: must be protected when breaking, since the next
648 * OP could potentially depend on lastmark;
650 * - Recursive SRE_MATCH() returned false, and will be called again
651 * inside a local for/while loop: must be protected between each
652 * loop iteration, since the recursive SRE_MATCH() could do anything,
653 * and could potentially depend on lastmark.
655 * For more information, check the discussion at SF patch #712900.
657 #define LASTMARK_SAVE() \
658 do { \
659 ctx->lastmark = state->lastmark; \
660 ctx->lastindex = state->lastindex; \
661 } while (0)
662 #define LASTMARK_RESTORE() \
663 do { \
664 state->lastmark = ctx->lastmark; \
665 state->lastindex = ctx->lastindex; \
666 } while (0)
668 #define RETURN_ERROR(i) do { return i; } while(0)
669 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
670 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
672 #define RETURN_ON_ERROR(i) \
673 do { if (i < 0) RETURN_ERROR(i); } while (0)
674 #define RETURN_ON_SUCCESS(i) \
675 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
676 #define RETURN_ON_FAILURE(i) \
677 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
679 #define SFY(x) #x
681 #define DATA_STACK_ALLOC(state, type, ptr) \
682 do { \
683 alloc_pos = state->data_stack_base; \
684 TRACE(("allocating %s in %d (%d)\n", \
685 SFY(type), alloc_pos, sizeof(type))); \
686 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
687 int j = data_stack_grow(state, sizeof(type)); \
688 if (j < 0) return j; \
689 if (ctx_pos != -1) \
690 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
692 ptr = (type*)(state->data_stack+alloc_pos); \
693 state->data_stack_base += sizeof(type); \
694 } while (0)
696 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
697 do { \
698 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
699 ptr = (type*)(state->data_stack+pos); \
700 } while (0)
702 #define DATA_STACK_PUSH(state, data, size) \
703 do { \
704 TRACE(("copy data in %p to %d (%d)\n", \
705 data, state->data_stack_base, size)); \
706 if (state->data_stack_size < state->data_stack_base+size) { \
707 int j = data_stack_grow(state, size); \
708 if (j < 0) return j; \
709 if (ctx_pos != -1) \
710 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
712 memcpy(state->data_stack+state->data_stack_base, data, size); \
713 state->data_stack_base += size; \
714 } while (0)
716 #define DATA_STACK_POP(state, data, size, discard) \
717 do { \
718 TRACE(("copy data to %p from %d (%d)\n", \
719 data, state->data_stack_base-size, size)); \
720 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
721 if (discard) \
722 state->data_stack_base -= size; \
723 } while (0)
725 #define DATA_STACK_POP_DISCARD(state, size) \
726 do { \
727 TRACE(("discard data from %d (%d)\n", \
728 state->data_stack_base-size, size)); \
729 state->data_stack_base -= size; \
730 } while(0)
732 #define DATA_PUSH(x) \
733 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
734 #define DATA_POP(x) \
735 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
736 #define DATA_POP_DISCARD(x) \
737 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
738 #define DATA_ALLOC(t,p) \
739 DATA_STACK_ALLOC(state, t, p)
740 #define DATA_LOOKUP_AT(t,p,pos) \
741 DATA_STACK_LOOKUP_AT(state,t,p,pos)
743 #define MARK_PUSH(lastmark) \
744 do if (lastmark > 0) { \
745 i = lastmark; /* ctx->lastmark may change if reallocated */ \
746 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
747 } while (0)
748 #define MARK_POP(lastmark) \
749 do if (lastmark > 0) { \
750 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
751 } while (0)
752 #define MARK_POP_KEEP(lastmark) \
753 do if (lastmark > 0) { \
754 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
755 } while (0)
756 #define MARK_POP_DISCARD(lastmark) \
757 do if (lastmark > 0) { \
758 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
759 } while (0)
761 #define JUMP_NONE 0
762 #define JUMP_MAX_UNTIL_1 1
763 #define JUMP_MAX_UNTIL_2 2
764 #define JUMP_MAX_UNTIL_3 3
765 #define JUMP_MIN_UNTIL_1 4
766 #define JUMP_MIN_UNTIL_2 5
767 #define JUMP_MIN_UNTIL_3 6
768 #define JUMP_REPEAT 7
769 #define JUMP_REPEAT_ONE_1 8
770 #define JUMP_REPEAT_ONE_2 9
771 #define JUMP_MIN_REPEAT_ONE 10
772 #define JUMP_BRANCH 11
773 #define JUMP_ASSERT 12
774 #define JUMP_ASSERT_NOT 13
776 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
777 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
778 nextctx->last_ctx_pos = ctx_pos; \
779 nextctx->jump = jumpvalue; \
780 nextctx->pattern = nextpattern; \
781 ctx_pos = alloc_pos; \
782 ctx = nextctx; \
783 goto entrance; \
784 jumplabel: \
785 while (0) /* gcc doesn't like labels at end of scopes */ \
787 typedef struct {
788 int last_ctx_pos;
789 int jump;
790 SRE_CHAR* ptr;
791 SRE_CODE* pattern;
792 int count;
793 int lastmark;
794 int lastindex;
795 union {
796 SRE_CODE chr;
797 SRE_REPEAT* rep;
798 } u;
799 } SRE_MATCH_CONTEXT;
801 /* check if string matches the given pattern. returns <0 for
802 error, 0 for failure, and 1 for success */
803 LOCAL(int)
804 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
806 SRE_CHAR* end = (SRE_CHAR *)state->end;
807 int alloc_pos, ctx_pos = -1;
808 int i, ret = 0;
809 int jump;
811 SRE_MATCH_CONTEXT* ctx;
812 SRE_MATCH_CONTEXT* nextctx;
814 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
816 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
817 ctx->last_ctx_pos = -1;
818 ctx->jump = JUMP_NONE;
819 ctx->pattern = pattern;
820 ctx_pos = alloc_pos;
822 entrance:
824 ctx->ptr = (SRE_CHAR *)state->ptr;
826 if (ctx->pattern[0] == SRE_OP_INFO) {
827 /* optimization info block */
828 /* <INFO> <1=skip> <2=flags> <3=min> ... */
829 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
830 TRACE(("reject (got %d chars, need %d)\n",
831 (end - ctx->ptr), ctx->pattern[3]));
832 RETURN_FAILURE;
834 ctx->pattern += ctx->pattern[1] + 1;
837 for (;;) {
839 switch (*ctx->pattern++) {
841 case SRE_OP_MARK:
842 /* set mark */
843 /* <MARK> <gid> */
844 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
845 ctx->ptr, ctx->pattern[0]));
846 i = ctx->pattern[0];
847 if (i & 1)
848 state->lastindex = i/2 + 1;
849 if (i > state->lastmark) {
850 /* state->lastmark is the highest valid index in the
851 state->mark array. If it is increased by more than 1,
852 the intervening marks must be set to NULL to signal
853 that these marks have not been encountered. */
854 int j = state->lastmark + 1;
855 while (j < i)
856 state->mark[j++] = NULL;
857 state->lastmark = i;
859 state->mark[i] = ctx->ptr;
860 ctx->pattern++;
861 break;
863 case SRE_OP_LITERAL:
864 /* match literal string */
865 /* <LITERAL> <code> */
866 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
867 ctx->ptr, *ctx->pattern));
868 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
869 RETURN_FAILURE;
870 ctx->pattern++;
871 ctx->ptr++;
872 break;
874 case SRE_OP_NOT_LITERAL:
875 /* match anything that is not literal character */
876 /* <NOT_LITERAL> <code> */
877 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
878 ctx->ptr, *ctx->pattern));
879 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
880 RETURN_FAILURE;
881 ctx->pattern++;
882 ctx->ptr++;
883 break;
885 case SRE_OP_SUCCESS:
886 /* end of pattern */
887 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
888 state->ptr = ctx->ptr;
889 RETURN_SUCCESS;
891 case SRE_OP_AT:
892 /* match at given position */
893 /* <AT> <code> */
894 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
895 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
896 RETURN_FAILURE;
897 ctx->pattern++;
898 break;
900 case SRE_OP_CATEGORY:
901 /* match at given category */
902 /* <CATEGORY> <code> */
903 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
904 ctx->ptr, *ctx->pattern));
905 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
906 RETURN_FAILURE;
907 ctx->pattern++;
908 ctx->ptr++;
909 break;
911 case SRE_OP_ANY:
912 /* match anything (except a newline) */
913 /* <ANY> */
914 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
915 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
916 RETURN_FAILURE;
917 ctx->ptr++;
918 break;
920 case SRE_OP_ANY_ALL:
921 /* match anything */
922 /* <ANY_ALL> */
923 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
924 if (ctx->ptr >= end)
925 RETURN_FAILURE;
926 ctx->ptr++;
927 break;
929 case SRE_OP_IN:
930 /* match set member (or non_member) */
931 /* <IN> <skip> <set> */
932 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
933 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
934 RETURN_FAILURE;
935 ctx->pattern += ctx->pattern[0];
936 ctx->ptr++;
937 break;
939 case SRE_OP_LITERAL_IGNORE:
940 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
941 ctx->pattern, ctx->ptr, ctx->pattern[0]));
942 if (ctx->ptr >= end ||
943 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
944 RETURN_FAILURE;
945 ctx->pattern++;
946 ctx->ptr++;
947 break;
949 case SRE_OP_NOT_LITERAL_IGNORE:
950 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
951 ctx->pattern, ctx->ptr, *ctx->pattern));
952 if (ctx->ptr >= end ||
953 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
954 RETURN_FAILURE;
955 ctx->pattern++;
956 ctx->ptr++;
957 break;
959 case SRE_OP_IN_IGNORE:
960 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
961 if (ctx->ptr >= end
962 || !SRE_CHARSET(ctx->pattern+1,
963 (SRE_CODE)state->lower(*ctx->ptr)))
964 RETURN_FAILURE;
965 ctx->pattern += ctx->pattern[0];
966 ctx->ptr++;
967 break;
969 case SRE_OP_JUMP:
970 case SRE_OP_INFO:
971 /* jump forward */
972 /* <JUMP> <offset> */
973 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
974 ctx->ptr, ctx->pattern[0]));
975 ctx->pattern += ctx->pattern[0];
976 break;
978 case SRE_OP_BRANCH:
979 /* alternation */
980 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
981 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
982 LASTMARK_SAVE();
983 ctx->u.rep = state->repeat;
984 if (ctx->u.rep)
985 MARK_PUSH(ctx->lastmark);
986 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
987 if (ctx->pattern[1] == SRE_OP_LITERAL &&
988 (ctx->ptr >= end ||
989 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
990 continue;
991 if (ctx->pattern[1] == SRE_OP_IN &&
992 (ctx->ptr >= end ||
993 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
994 continue;
995 state->ptr = ctx->ptr;
996 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
997 if (ret) {
998 if (ctx->u.rep)
999 MARK_POP_DISCARD(ctx->lastmark);
1000 RETURN_ON_ERROR(ret);
1001 RETURN_SUCCESS;
1003 if (ctx->u.rep)
1004 MARK_POP_KEEP(ctx->lastmark);
1005 LASTMARK_RESTORE();
1007 if (ctx->u.rep)
1008 MARK_POP_DISCARD(ctx->lastmark);
1009 RETURN_FAILURE;
1011 case SRE_OP_REPEAT_ONE:
1012 /* match repeated sequence (maximizing regexp) */
1014 /* this operator only works if the repeated item is
1015 exactly one character wide, and we're not already
1016 collecting backtracking points. for other cases,
1017 use the MAX_REPEAT operator */
1019 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1021 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1022 ctx->pattern[1], ctx->pattern[2]));
1024 if (ctx->ptr + ctx->pattern[1] > end)
1025 RETURN_FAILURE; /* cannot match */
1027 state->ptr = ctx->ptr;
1029 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1030 RETURN_ON_ERROR(ret);
1031 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1032 ctx->count = ret;
1033 ctx->ptr += ctx->count;
1035 /* when we arrive here, count contains the number of
1036 matches, and ctx->ptr points to the tail of the target
1037 string. check if the rest of the pattern matches,
1038 and backtrack if not. */
1040 if (ctx->count < (int) ctx->pattern[1])
1041 RETURN_FAILURE;
1043 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1044 /* tail is empty. we're finished */
1045 state->ptr = ctx->ptr;
1046 RETURN_SUCCESS;
1049 LASTMARK_SAVE();
1051 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1052 /* tail starts with a literal. skip positions where
1053 the rest of the pattern cannot possibly match */
1054 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1055 for (;;) {
1056 while (ctx->count >= (int) ctx->pattern[1] &&
1057 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1058 ctx->ptr--;
1059 ctx->count--;
1061 if (ctx->count < (int) ctx->pattern[1])
1062 break;
1063 state->ptr = ctx->ptr;
1064 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1065 ctx->pattern+ctx->pattern[0]);
1066 if (ret) {
1067 RETURN_ON_ERROR(ret);
1068 RETURN_SUCCESS;
1071 LASTMARK_RESTORE();
1073 ctx->ptr--;
1074 ctx->count--;
1077 } else {
1078 /* general case */
1079 while (ctx->count >= (int) ctx->pattern[1]) {
1080 state->ptr = ctx->ptr;
1081 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1082 ctx->pattern+ctx->pattern[0]);
1083 if (ret) {
1084 RETURN_ON_ERROR(ret);
1085 RETURN_SUCCESS;
1087 ctx->ptr--;
1088 ctx->count--;
1089 LASTMARK_RESTORE();
1092 RETURN_FAILURE;
1094 case SRE_OP_MIN_REPEAT_ONE:
1095 /* match repeated sequence (minimizing regexp) */
1097 /* this operator only works if the repeated item is
1098 exactly one character wide, and we're not already
1099 collecting backtracking points. for other cases,
1100 use the MIN_REPEAT operator */
1102 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1104 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1105 ctx->pattern[1], ctx->pattern[2]));
1107 if (ctx->ptr + ctx->pattern[1] > end)
1108 RETURN_FAILURE; /* cannot match */
1110 state->ptr = ctx->ptr;
1112 if (ctx->pattern[1] == 0)
1113 ctx->count = 0;
1114 else {
1115 /* count using pattern min as the maximum */
1116 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1117 RETURN_ON_ERROR(ret);
1118 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1119 if (ret < (int) ctx->pattern[1])
1120 /* didn't match minimum number of times */
1121 RETURN_FAILURE;
1122 /* advance past minimum matches of repeat */
1123 ctx->count = ret;
1124 ctx->ptr += ctx->count;
1127 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1128 /* tail is empty. we're finished */
1129 state->ptr = ctx->ptr;
1130 RETURN_SUCCESS;
1132 } else {
1133 /* general case */
1134 LASTMARK_SAVE();
1135 while ((int)ctx->pattern[2] == 65535
1136 || ctx->count <= (int)ctx->pattern[2]) {
1137 state->ptr = ctx->ptr;
1138 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1139 ctx->pattern+ctx->pattern[0]);
1140 if (ret) {
1141 RETURN_ON_ERROR(ret);
1142 RETURN_SUCCESS;
1144 state->ptr = ctx->ptr;
1145 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1146 RETURN_ON_ERROR(ret);
1147 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1148 if (ret == 0)
1149 break;
1150 assert(ret == 1);
1151 ctx->ptr++;
1152 ctx->count++;
1153 LASTMARK_RESTORE();
1156 RETURN_FAILURE;
1158 case SRE_OP_REPEAT:
1159 /* create repeat context. all the hard work is done
1160 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1161 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1162 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1163 ctx->pattern[1], ctx->pattern[2]));
1165 /* install new repeat context */
1166 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1167 ctx->u.rep->count = -1;
1168 ctx->u.rep->pattern = ctx->pattern;
1169 ctx->u.rep->prev = state->repeat;
1170 ctx->u.rep->last_ptr = NULL;
1171 state->repeat = ctx->u.rep;
1173 state->ptr = ctx->ptr;
1174 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1175 state->repeat = ctx->u.rep->prev;
1176 PyObject_FREE(ctx->u.rep);
1178 if (ret) {
1179 RETURN_ON_ERROR(ret);
1180 RETURN_SUCCESS;
1182 RETURN_FAILURE;
1184 case SRE_OP_MAX_UNTIL:
1185 /* maximizing repeat */
1186 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1188 /* FIXME: we probably need to deal with zero-width
1189 matches in here... */
1191 ctx->u.rep = state->repeat;
1192 if (!ctx->u.rep)
1193 RETURN_ERROR(SRE_ERROR_STATE);
1195 state->ptr = ctx->ptr;
1197 ctx->count = ctx->u.rep->count+1;
1199 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1200 ctx->ptr, ctx->count));
1202 if (ctx->count < ctx->u.rep->pattern[1]) {
1203 /* not enough matches */
1204 ctx->u.rep->count = ctx->count;
1205 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1206 ctx->u.rep->pattern+3);
1207 if (ret) {
1208 RETURN_ON_ERROR(ret);
1209 RETURN_SUCCESS;
1211 ctx->u.rep->count = ctx->count-1;
1212 state->ptr = ctx->ptr;
1213 RETURN_FAILURE;
1216 if ((ctx->count < ctx->u.rep->pattern[2] ||
1217 ctx->u.rep->pattern[2] == 65535) &&
1218 state->ptr != ctx->u.rep->last_ptr) {
1219 /* we may have enough matches, but if we can
1220 match another item, do so */
1221 ctx->u.rep->count = ctx->count;
1222 LASTMARK_SAVE();
1223 MARK_PUSH(ctx->lastmark);
1224 /* zero-width match protection */
1225 DATA_PUSH(&ctx->u.rep->last_ptr);
1226 ctx->u.rep->last_ptr = state->ptr;
1227 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1228 ctx->u.rep->pattern+3);
1229 DATA_POP(&ctx->u.rep->last_ptr);
1230 if (ret) {
1231 MARK_POP_DISCARD(ctx->lastmark);
1232 RETURN_ON_ERROR(ret);
1233 RETURN_SUCCESS;
1235 MARK_POP(ctx->lastmark);
1236 LASTMARK_RESTORE();
1237 ctx->u.rep->count = ctx->count-1;
1238 state->ptr = ctx->ptr;
1241 /* cannot match more repeated items here. make sure the
1242 tail matches */
1243 state->repeat = ctx->u.rep->prev;
1244 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1245 RETURN_ON_SUCCESS(ret);
1246 state->repeat = ctx->u.rep;
1247 state->ptr = ctx->ptr;
1248 RETURN_FAILURE;
1250 case SRE_OP_MIN_UNTIL:
1251 /* minimizing repeat */
1252 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1254 ctx->u.rep = state->repeat;
1255 if (!ctx->u.rep)
1256 RETURN_ERROR(SRE_ERROR_STATE);
1258 state->ptr = ctx->ptr;
1260 ctx->count = ctx->u.rep->count+1;
1262 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1263 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1265 if (ctx->count < ctx->u.rep->pattern[1]) {
1266 /* not enough matches */
1267 ctx->u.rep->count = ctx->count;
1268 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1269 ctx->u.rep->pattern+3);
1270 if (ret) {
1271 RETURN_ON_ERROR(ret);
1272 RETURN_SUCCESS;
1274 ctx->u.rep->count = ctx->count-1;
1275 state->ptr = ctx->ptr;
1276 RETURN_FAILURE;
1279 LASTMARK_SAVE();
1281 /* see if the tail matches */
1282 state->repeat = ctx->u.rep->prev;
1283 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1284 if (ret) {
1285 RETURN_ON_ERROR(ret);
1286 RETURN_SUCCESS;
1289 state->repeat = ctx->u.rep;
1290 state->ptr = ctx->ptr;
1292 LASTMARK_RESTORE();
1294 if (ctx->count >= ctx->u.rep->pattern[2]
1295 && ctx->u.rep->pattern[2] != 65535)
1296 RETURN_FAILURE;
1298 ctx->u.rep->count = ctx->count;
1299 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1300 ctx->u.rep->pattern+3);
1301 if (ret) {
1302 RETURN_ON_ERROR(ret);
1303 RETURN_SUCCESS;
1305 ctx->u.rep->count = ctx->count-1;
1306 state->ptr = ctx->ptr;
1307 RETURN_FAILURE;
1309 case SRE_OP_GROUPREF:
1310 /* match backreference */
1311 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1312 ctx->ptr, ctx->pattern[0]));
1313 i = ctx->pattern[0];
1315 int groupref = i+i;
1316 if (groupref >= state->lastmark) {
1317 RETURN_FAILURE;
1318 } else {
1319 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1320 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1321 if (!p || !e || e < p)
1322 RETURN_FAILURE;
1323 while (p < e) {
1324 if (ctx->ptr >= end || *ctx->ptr != *p)
1325 RETURN_FAILURE;
1326 p++; ctx->ptr++;
1330 ctx->pattern++;
1331 break;
1333 case SRE_OP_GROUPREF_IGNORE:
1334 /* match backreference */
1335 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1336 ctx->ptr, ctx->pattern[0]));
1337 i = ctx->pattern[0];
1339 int groupref = i+i;
1340 if (groupref >= state->lastmark) {
1341 RETURN_FAILURE;
1342 } else {
1343 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1344 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1345 if (!p || !e || e < p)
1346 RETURN_FAILURE;
1347 while (p < e) {
1348 if (ctx->ptr >= end ||
1349 state->lower(*ctx->ptr) != state->lower(*p))
1350 RETURN_FAILURE;
1351 p++; ctx->ptr++;
1355 ctx->pattern++;
1356 break;
1358 case SRE_OP_GROUPREF_EXISTS:
1359 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1360 ctx->ptr, ctx->pattern[0]));
1361 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1362 i = ctx->pattern[0];
1364 int groupref = i+i;
1365 if (groupref >= state->lastmark) {
1366 ctx->pattern += ctx->pattern[1];
1367 break;
1368 } else {
1369 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1370 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1371 if (!p || !e || e < p) {
1372 ctx->pattern += ctx->pattern[1];
1373 break;
1377 ctx->pattern += 2;
1378 break;
1380 case SRE_OP_ASSERT:
1381 /* assert subpattern */
1382 /* <ASSERT> <skip> <back> <pattern> */
1383 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1384 ctx->ptr, ctx->pattern[1]));
1385 state->ptr = ctx->ptr - ctx->pattern[1];
1386 if (state->ptr < state->beginning)
1387 RETURN_FAILURE;
1388 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1389 RETURN_ON_FAILURE(ret);
1390 ctx->pattern += ctx->pattern[0];
1391 break;
1393 case SRE_OP_ASSERT_NOT:
1394 /* assert not subpattern */
1395 /* <ASSERT_NOT> <skip> <back> <pattern> */
1396 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1397 ctx->ptr, ctx->pattern[1]));
1398 state->ptr = ctx->ptr - ctx->pattern[1];
1399 if (state->ptr >= state->beginning) {
1400 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1401 if (ret) {
1402 RETURN_ON_ERROR(ret);
1403 RETURN_FAILURE;
1406 ctx->pattern += ctx->pattern[0];
1407 break;
1409 case SRE_OP_FAILURE:
1410 /* immediate failure */
1411 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1412 RETURN_FAILURE;
1414 default:
1415 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1416 ctx->pattern[-1]));
1417 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1421 exit:
1422 ctx_pos = ctx->last_ctx_pos;
1423 jump = ctx->jump;
1424 DATA_POP_DISCARD(ctx);
1425 if (ctx_pos == -1)
1426 return ret;
1427 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1429 switch (jump) {
1430 case JUMP_MAX_UNTIL_2:
1431 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1432 goto jump_max_until_2;
1433 case JUMP_MAX_UNTIL_3:
1434 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1435 goto jump_max_until_3;
1436 case JUMP_MIN_UNTIL_2:
1437 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1438 goto jump_min_until_2;
1439 case JUMP_MIN_UNTIL_3:
1440 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1441 goto jump_min_until_3;
1442 case JUMP_BRANCH:
1443 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1444 goto jump_branch;
1445 case JUMP_MAX_UNTIL_1:
1446 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1447 goto jump_max_until_1;
1448 case JUMP_MIN_UNTIL_1:
1449 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1450 goto jump_min_until_1;
1451 case JUMP_REPEAT:
1452 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1453 goto jump_repeat;
1454 case JUMP_REPEAT_ONE_1:
1455 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1456 goto jump_repeat_one_1;
1457 case JUMP_REPEAT_ONE_2:
1458 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1459 goto jump_repeat_one_2;
1460 case JUMP_MIN_REPEAT_ONE:
1461 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1462 goto jump_min_repeat_one;
1463 case JUMP_ASSERT:
1464 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1465 goto jump_assert;
1466 case JUMP_ASSERT_NOT:
1467 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1468 goto jump_assert_not;
1469 case JUMP_NONE:
1470 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1471 break;
1474 return ret; /* should never get here */
1477 LOCAL(int)
1478 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1480 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1481 SRE_CHAR* end = (SRE_CHAR *)state->end;
1482 int status = 0;
1483 int prefix_len = 0;
1484 int prefix_skip = 0;
1485 SRE_CODE* prefix = NULL;
1486 SRE_CODE* charset = NULL;
1487 SRE_CODE* overlap = NULL;
1488 int flags = 0;
1490 if (pattern[0] == SRE_OP_INFO) {
1491 /* optimization info block */
1492 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1494 flags = pattern[2];
1496 if (pattern[3] > 1) {
1497 /* adjust end point (but make sure we leave at least one
1498 character in there, so literal search will work) */
1499 end -= pattern[3]-1;
1500 if (end <= ptr)
1501 end = ptr+1;
1504 if (flags & SRE_INFO_PREFIX) {
1505 /* pattern starts with a known prefix */
1506 /* <length> <skip> <prefix data> <overlap data> */
1507 prefix_len = pattern[5];
1508 prefix_skip = pattern[6];
1509 prefix = pattern + 7;
1510 overlap = prefix + prefix_len - 1;
1511 } else if (flags & SRE_INFO_CHARSET)
1512 /* pattern starts with a character from a known set */
1513 /* <charset> */
1514 charset = pattern + 5;
1516 pattern += 1 + pattern[1];
1519 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1520 TRACE(("charset = %p\n", charset));
1522 #if defined(USE_FAST_SEARCH)
1523 if (prefix_len > 1) {
1524 /* pattern starts with a known prefix. use the overlap
1525 table to skip forward as fast as we possibly can */
1526 int i = 0;
1527 end = (SRE_CHAR *)state->end;
1528 while (ptr < end) {
1529 for (;;) {
1530 if ((SRE_CODE) ptr[0] != prefix[i]) {
1531 if (!i)
1532 break;
1533 else
1534 i = overlap[i];
1535 } else {
1536 if (++i == prefix_len) {
1537 /* found a potential match */
1538 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1539 state->start = ptr + 1 - prefix_len;
1540 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1541 if (flags & SRE_INFO_LITERAL)
1542 return 1; /* we got all of it */
1543 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1544 if (status != 0)
1545 return status;
1546 /* close but no cigar -- try again */
1547 i = overlap[i];
1549 break;
1552 ptr++;
1554 return 0;
1556 #endif
1558 if (pattern[0] == SRE_OP_LITERAL) {
1559 /* pattern starts with a literal character. this is used
1560 for short prefixes, and if fast search is disabled */
1561 SRE_CODE chr = pattern[1];
1562 end = (SRE_CHAR *)state->end;
1563 for (;;) {
1564 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1565 ptr++;
1566 if (ptr >= end)
1567 return 0;
1568 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1569 state->start = ptr;
1570 state->ptr = ++ptr;
1571 if (flags & SRE_INFO_LITERAL)
1572 return 1; /* we got all of it */
1573 status = SRE_MATCH(state, pattern + 2);
1574 if (status != 0)
1575 break;
1577 } else if (charset) {
1578 /* pattern starts with a character from a known set */
1579 end = (SRE_CHAR *)state->end;
1580 for (;;) {
1581 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1582 ptr++;
1583 if (ptr >= end)
1584 return 0;
1585 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1586 state->start = ptr;
1587 state->ptr = ptr;
1588 status = SRE_MATCH(state, pattern);
1589 if (status != 0)
1590 break;
1591 ptr++;
1593 } else
1594 /* general case */
1595 while (ptr <= end) {
1596 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1597 state->start = state->ptr = ptr++;
1598 status = SRE_MATCH(state, pattern);
1599 if (status != 0)
1600 break;
1603 return status;
1606 LOCAL(int)
1607 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1609 /* check if given string is a literal template (i.e. no escapes) */
1610 while (len-- > 0)
1611 if (*ptr++ == '\\')
1612 return 0;
1613 return 1;
1616 #if !defined(SRE_RECURSIVE)
1618 /* -------------------------------------------------------------------- */
1619 /* factories and destructors */
1621 /* see sre.h for object declarations */
1622 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1623 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1625 static PyObject *
1626 sre_codesize(PyObject* self, PyObject *unused)
1628 return Py_BuildValue("i", sizeof(SRE_CODE));
1631 static PyObject *
1632 sre_getlower(PyObject* self, PyObject* args)
1634 int character, flags;
1635 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1636 return NULL;
1637 if (flags & SRE_FLAG_LOCALE)
1638 return Py_BuildValue("i", sre_lower_locale(character));
1639 if (flags & SRE_FLAG_UNICODE)
1640 #if defined(HAVE_UNICODE)
1641 return Py_BuildValue("i", sre_lower_unicode(character));
1642 #else
1643 return Py_BuildValue("i", sre_lower_locale(character));
1644 #endif
1645 return Py_BuildValue("i", sre_lower(character));
1648 LOCAL(void)
1649 state_reset(SRE_STATE* state)
1651 /* FIXME: dynamic! */
1652 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1654 state->lastmark = -1;
1655 state->lastindex = -1;
1657 state->repeat = NULL;
1659 data_stack_dealloc(state);
1662 static void*
1663 getstring(PyObject* string, int* p_length, int* p_charsize)
1665 /* given a python object, return a data pointer, a length (in
1666 characters), and a character size. return NULL if the object
1667 is not a string (or not compatible) */
1669 PyBufferProcs *buffer;
1670 int size, bytes, charsize;
1671 void* ptr;
1673 #if defined(HAVE_UNICODE)
1674 if (PyUnicode_Check(string)) {
1675 /* unicode strings doesn't always support the buffer interface */
1676 ptr = (void*) PyUnicode_AS_DATA(string);
1677 bytes = PyUnicode_GET_DATA_SIZE(string);
1678 size = PyUnicode_GET_SIZE(string);
1679 charsize = sizeof(Py_UNICODE);
1681 } else {
1682 #endif
1684 /* get pointer to string buffer */
1685 buffer = string->ob_type->tp_as_buffer;
1686 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1687 buffer->bf_getsegcount(string, NULL) != 1) {
1688 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1689 return NULL;
1692 /* determine buffer size */
1693 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1694 if (bytes < 0) {
1695 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1696 return NULL;
1699 /* determine character size */
1700 #if PY_VERSION_HEX >= 0x01060000
1701 size = PyObject_Size(string);
1702 #else
1703 size = PyObject_Length(string);
1704 #endif
1706 if (PyString_Check(string) || bytes == size)
1707 charsize = 1;
1708 #if defined(HAVE_UNICODE)
1709 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
1710 charsize = sizeof(Py_UNICODE);
1711 #endif
1712 else {
1713 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1714 return NULL;
1717 #if defined(HAVE_UNICODE)
1719 #endif
1721 *p_length = size;
1722 *p_charsize = charsize;
1724 return ptr;
1727 LOCAL(PyObject*)
1728 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1729 int start, int end)
1731 /* prepare state object */
1733 int length;
1734 int charsize;
1735 void* ptr;
1737 memset(state, 0, sizeof(SRE_STATE));
1739 state->lastmark = -1;
1740 state->lastindex = -1;
1742 ptr = getstring(string, &length, &charsize);
1743 if (!ptr)
1744 return NULL;
1746 /* adjust boundaries */
1747 if (start < 0)
1748 start = 0;
1749 else if (start > length)
1750 start = length;
1752 if (end < 0)
1753 end = 0;
1754 else if (end > length)
1755 end = length;
1757 state->charsize = charsize;
1759 state->beginning = ptr;
1761 state->start = (void*) ((char*) ptr + start * state->charsize);
1762 state->end = (void*) ((char*) ptr + end * state->charsize);
1764 Py_INCREF(string);
1765 state->string = string;
1766 state->pos = start;
1767 state->endpos = end;
1769 if (pattern->flags & SRE_FLAG_LOCALE)
1770 state->lower = sre_lower_locale;
1771 else if (pattern->flags & SRE_FLAG_UNICODE)
1772 #if defined(HAVE_UNICODE)
1773 state->lower = sre_lower_unicode;
1774 #else
1775 state->lower = sre_lower_locale;
1776 #endif
1777 else
1778 state->lower = sre_lower;
1780 return string;
1783 LOCAL(void)
1784 state_fini(SRE_STATE* state)
1786 Py_XDECREF(state->string);
1787 data_stack_dealloc(state);
1790 /* calculate offset from start of string */
1791 #define STATE_OFFSET(state, member)\
1792 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1794 LOCAL(PyObject*)
1795 state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
1797 int i, j;
1799 index = (index - 1) * 2;
1801 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1802 if (empty)
1803 /* want empty string */
1804 i = j = 0;
1805 else {
1806 Py_INCREF(Py_None);
1807 return Py_None;
1809 } else {
1810 i = STATE_OFFSET(state, state->mark[index]);
1811 j = STATE_OFFSET(state, state->mark[index+1]);
1814 return PySequence_GetSlice(string, i, j);
1817 static void
1818 pattern_error(int status)
1820 switch (status) {
1821 case SRE_ERROR_RECURSION_LIMIT:
1822 PyErr_SetString(
1823 PyExc_RuntimeError,
1824 "maximum recursion limit exceeded"
1826 break;
1827 case SRE_ERROR_MEMORY:
1828 PyErr_NoMemory();
1829 break;
1830 default:
1831 /* other error codes indicate compiler/engine bugs */
1832 PyErr_SetString(
1833 PyExc_RuntimeError,
1834 "internal error in regular expression engine"
1839 static void
1840 pattern_dealloc(PatternObject* self)
1842 if (self->weakreflist != NULL)
1843 PyObject_ClearWeakRefs((PyObject *) self);
1844 Py_XDECREF(self->pattern);
1845 Py_XDECREF(self->groupindex);
1846 Py_XDECREF(self->indexgroup);
1847 PyObject_DEL(self);
1850 static PyObject*
1851 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1853 SRE_STATE state;
1854 int status;
1856 PyObject* string;
1857 int start = 0;
1858 int end = INT_MAX;
1859 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1860 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
1861 &string, &start, &end))
1862 return NULL;
1864 string = state_init(&state, self, string, start, end);
1865 if (!string)
1866 return NULL;
1868 state.ptr = state.start;
1870 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1872 if (state.charsize == 1) {
1873 status = sre_match(&state, PatternObject_GetCode(self));
1874 } else {
1875 #if defined(HAVE_UNICODE)
1876 status = sre_umatch(&state, PatternObject_GetCode(self));
1877 #endif
1880 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1882 state_fini(&state);
1884 return pattern_new_match(self, &state, status);
1887 static PyObject*
1888 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1890 SRE_STATE state;
1891 int status;
1893 PyObject* string;
1894 int start = 0;
1895 int end = INT_MAX;
1896 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1897 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
1898 &string, &start, &end))
1899 return NULL;
1901 string = state_init(&state, self, string, start, end);
1902 if (!string)
1903 return NULL;
1905 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1907 if (state.charsize == 1) {
1908 status = sre_search(&state, PatternObject_GetCode(self));
1909 } else {
1910 #if defined(HAVE_UNICODE)
1911 status = sre_usearch(&state, PatternObject_GetCode(self));
1912 #endif
1915 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1917 state_fini(&state);
1919 return pattern_new_match(self, &state, status);
1922 static PyObject*
1923 call(char* module, char* function, PyObject* args)
1925 PyObject* name;
1926 PyObject* mod;
1927 PyObject* func;
1928 PyObject* result;
1930 if (!args)
1931 return NULL;
1932 name = PyString_FromString(module);
1933 if (!name)
1934 return NULL;
1935 mod = PyImport_Import(name);
1936 Py_DECREF(name);
1937 if (!mod)
1938 return NULL;
1939 func = PyObject_GetAttrString(mod, function);
1940 Py_DECREF(mod);
1941 if (!func)
1942 return NULL;
1943 result = PyObject_CallObject(func, args);
1944 Py_DECREF(func);
1945 Py_DECREF(args);
1946 return result;
1949 #ifdef USE_BUILTIN_COPY
1950 static int
1951 deepcopy(PyObject** object, PyObject* memo)
1953 PyObject* copy;
1955 copy = call(
1956 "copy", "deepcopy",
1957 PyTuple_Pack(2, *object, memo)
1959 if (!copy)
1960 return 0;
1962 Py_DECREF(*object);
1963 *object = copy;
1965 return 1; /* success */
1967 #endif
1969 static PyObject*
1970 join_list(PyObject* list, PyObject* pattern)
1972 /* join list elements */
1974 PyObject* joiner;
1975 #if PY_VERSION_HEX >= 0x01060000
1976 PyObject* function;
1977 PyObject* args;
1978 #endif
1979 PyObject* result;
1981 switch (PyList_GET_SIZE(list)) {
1982 case 0:
1983 Py_DECREF(list);
1984 return PySequence_GetSlice(pattern, 0, 0);
1985 case 1:
1986 result = PyList_GET_ITEM(list, 0);
1987 Py_INCREF(result);
1988 Py_DECREF(list);
1989 return result;
1992 /* two or more elements: slice out a suitable separator from the
1993 first member, and use that to join the entire list */
1995 joiner = PySequence_GetSlice(pattern, 0, 0);
1996 if (!joiner)
1997 return NULL;
1999 #if PY_VERSION_HEX >= 0x01060000
2000 function = PyObject_GetAttrString(joiner, "join");
2001 if (!function) {
2002 Py_DECREF(joiner);
2003 return NULL;
2005 args = PyTuple_New(1);
2006 if (!args) {
2007 Py_DECREF(function);
2008 Py_DECREF(joiner);
2009 return NULL;
2011 PyTuple_SET_ITEM(args, 0, list);
2012 result = PyObject_CallObject(function, args);
2013 Py_DECREF(args); /* also removes list */
2014 Py_DECREF(function);
2015 #else
2016 result = call(
2017 "string", "join",
2018 PyTuple_Pack(2, list, joiner)
2020 #endif
2021 Py_DECREF(joiner);
2023 return result;
2026 static PyObject*
2027 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2029 SRE_STATE state;
2030 PyObject* list;
2031 int status;
2032 int i, b, e;
2034 PyObject* string;
2035 int start = 0;
2036 int end = INT_MAX;
2037 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2038 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
2039 &string, &start, &end))
2040 return NULL;
2042 string = state_init(&state, self, string, start, end);
2043 if (!string)
2044 return NULL;
2046 list = PyList_New(0);
2047 if (!list) {
2048 state_fini(&state);
2049 return NULL;
2052 while (state.start <= state.end) {
2054 PyObject* item;
2056 state_reset(&state);
2058 state.ptr = state.start;
2060 if (state.charsize == 1) {
2061 status = sre_search(&state, PatternObject_GetCode(self));
2062 } else {
2063 #if defined(HAVE_UNICODE)
2064 status = sre_usearch(&state, PatternObject_GetCode(self));
2065 #endif
2068 if (status <= 0) {
2069 if (status == 0)
2070 break;
2071 pattern_error(status);
2072 goto error;
2075 /* don't bother to build a match object */
2076 switch (self->groups) {
2077 case 0:
2078 b = STATE_OFFSET(&state, state.start);
2079 e = STATE_OFFSET(&state, state.ptr);
2080 item = PySequence_GetSlice(string, b, e);
2081 if (!item)
2082 goto error;
2083 break;
2084 case 1:
2085 item = state_getslice(&state, 1, string, 1);
2086 if (!item)
2087 goto error;
2088 break;
2089 default:
2090 item = PyTuple_New(self->groups);
2091 if (!item)
2092 goto error;
2093 for (i = 0; i < self->groups; i++) {
2094 PyObject* o = state_getslice(&state, i+1, string, 1);
2095 if (!o) {
2096 Py_DECREF(item);
2097 goto error;
2099 PyTuple_SET_ITEM(item, i, o);
2101 break;
2104 status = PyList_Append(list, item);
2105 Py_DECREF(item);
2106 if (status < 0)
2107 goto error;
2109 if (state.ptr == state.start)
2110 state.start = (void*) ((char*) state.ptr + state.charsize);
2111 else
2112 state.start = state.ptr;
2116 state_fini(&state);
2117 return list;
2119 error:
2120 Py_DECREF(list);
2121 state_fini(&state);
2122 return NULL;
2126 #if PY_VERSION_HEX >= 0x02020000
2127 static PyObject*
2128 pattern_finditer(PatternObject* pattern, PyObject* args)
2130 PyObject* scanner;
2131 PyObject* search;
2132 PyObject* iterator;
2134 scanner = pattern_scanner(pattern, args);
2135 if (!scanner)
2136 return NULL;
2138 search = PyObject_GetAttrString(scanner, "search");
2139 Py_DECREF(scanner);
2140 if (!search)
2141 return NULL;
2143 iterator = PyCallIter_New(search, Py_None);
2144 Py_DECREF(search);
2146 return iterator;
2148 #endif
2150 static PyObject*
2151 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2153 SRE_STATE state;
2154 PyObject* list;
2155 PyObject* item;
2156 int status;
2157 int n;
2158 int i;
2159 void* last;
2161 PyObject* string;
2162 int maxsplit = 0;
2163 static char* kwlist[] = { "source", "maxsplit", NULL };
2164 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2165 &string, &maxsplit))
2166 return NULL;
2168 string = state_init(&state, self, string, 0, INT_MAX);
2169 if (!string)
2170 return NULL;
2172 list = PyList_New(0);
2173 if (!list) {
2174 state_fini(&state);
2175 return NULL;
2178 n = 0;
2179 last = state.start;
2181 while (!maxsplit || n < maxsplit) {
2183 state_reset(&state);
2185 state.ptr = state.start;
2187 if (state.charsize == 1) {
2188 status = sre_search(&state, PatternObject_GetCode(self));
2189 } else {
2190 #if defined(HAVE_UNICODE)
2191 status = sre_usearch(&state, PatternObject_GetCode(self));
2192 #endif
2195 if (status <= 0) {
2196 if (status == 0)
2197 break;
2198 pattern_error(status);
2199 goto error;
2202 if (state.start == state.ptr) {
2203 if (last == state.end)
2204 break;
2205 /* skip one character */
2206 state.start = (void*) ((char*) state.ptr + state.charsize);
2207 continue;
2210 /* get segment before this match */
2211 item = PySequence_GetSlice(
2212 string, STATE_OFFSET(&state, last),
2213 STATE_OFFSET(&state, state.start)
2215 if (!item)
2216 goto error;
2217 status = PyList_Append(list, item);
2218 Py_DECREF(item);
2219 if (status < 0)
2220 goto error;
2222 /* add groups (if any) */
2223 for (i = 0; i < self->groups; i++) {
2224 item = state_getslice(&state, i+1, string, 0);
2225 if (!item)
2226 goto error;
2227 status = PyList_Append(list, item);
2228 Py_DECREF(item);
2229 if (status < 0)
2230 goto error;
2233 n = n + 1;
2235 last = state.start = state.ptr;
2239 /* get segment following last match (even if empty) */
2240 item = PySequence_GetSlice(
2241 string, STATE_OFFSET(&state, last), state.endpos
2243 if (!item)
2244 goto error;
2245 status = PyList_Append(list, item);
2246 Py_DECREF(item);
2247 if (status < 0)
2248 goto error;
2250 state_fini(&state);
2251 return list;
2253 error:
2254 Py_DECREF(list);
2255 state_fini(&state);
2256 return NULL;
2260 static PyObject*
2261 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2262 int count, int subn)
2264 SRE_STATE state;
2265 PyObject* list;
2266 PyObject* item;
2267 PyObject* filter;
2268 PyObject* args;
2269 PyObject* match;
2270 void* ptr;
2271 int status;
2272 int n;
2273 int i, b, e;
2274 int filter_is_callable;
2276 if (PyCallable_Check(ptemplate)) {
2277 /* sub/subn takes either a function or a template */
2278 filter = ptemplate;
2279 Py_INCREF(filter);
2280 filter_is_callable = 1;
2281 } else {
2282 /* if not callable, check if it's a literal string */
2283 int literal;
2284 ptr = getstring(ptemplate, &n, &b);
2285 if (ptr) {
2286 if (b == 1) {
2287 literal = sre_literal_template((unsigned char *)ptr, n);
2288 } else {
2289 #if defined(HAVE_UNICODE)
2290 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2291 #endif
2293 } else {
2294 PyErr_Clear();
2295 literal = 0;
2297 if (literal) {
2298 filter = ptemplate;
2299 Py_INCREF(filter);
2300 filter_is_callable = 0;
2301 } else {
2302 /* not a literal; hand it over to the template compiler */
2303 filter = call(
2304 SRE_PY_MODULE, "_subx",
2305 PyTuple_Pack(2, self, ptemplate)
2307 if (!filter)
2308 return NULL;
2309 filter_is_callable = PyCallable_Check(filter);
2313 string = state_init(&state, self, string, 0, INT_MAX);
2314 if (!string) {
2315 Py_DECREF(filter);
2316 return NULL;
2319 list = PyList_New(0);
2320 if (!list) {
2321 Py_DECREF(filter);
2322 state_fini(&state);
2323 return NULL;
2326 n = i = 0;
2328 while (!count || n < count) {
2330 state_reset(&state);
2332 state.ptr = state.start;
2334 if (state.charsize == 1) {
2335 status = sre_search(&state, PatternObject_GetCode(self));
2336 } else {
2337 #if defined(HAVE_UNICODE)
2338 status = sre_usearch(&state, PatternObject_GetCode(self));
2339 #endif
2342 if (status <= 0) {
2343 if (status == 0)
2344 break;
2345 pattern_error(status);
2346 goto error;
2349 b = STATE_OFFSET(&state, state.start);
2350 e = STATE_OFFSET(&state, state.ptr);
2352 if (i < b) {
2353 /* get segment before this match */
2354 item = PySequence_GetSlice(string, i, b);
2355 if (!item)
2356 goto error;
2357 status = PyList_Append(list, item);
2358 Py_DECREF(item);
2359 if (status < 0)
2360 goto error;
2362 } else if (i == b && i == e && n > 0)
2363 /* ignore empty match on latest position */
2364 goto next;
2366 if (filter_is_callable) {
2367 /* pass match object through filter */
2368 match = pattern_new_match(self, &state, 1);
2369 if (!match)
2370 goto error;
2371 args = PyTuple_Pack(1, match);
2372 if (!args) {
2373 Py_DECREF(match);
2374 goto error;
2376 item = PyObject_CallObject(filter, args);
2377 Py_DECREF(args);
2378 Py_DECREF(match);
2379 if (!item)
2380 goto error;
2381 } else {
2382 /* filter is literal string */
2383 item = filter;
2384 Py_INCREF(item);
2387 /* add to list */
2388 if (item != Py_None) {
2389 status = PyList_Append(list, item);
2390 Py_DECREF(item);
2391 if (status < 0)
2392 goto error;
2395 i = e;
2396 n = n + 1;
2398 next:
2399 /* move on */
2400 if (state.ptr == state.start)
2401 state.start = (void*) ((char*) state.ptr + state.charsize);
2402 else
2403 state.start = state.ptr;
2407 /* get segment following last match */
2408 if (i < state.endpos) {
2409 item = PySequence_GetSlice(string, i, state.endpos);
2410 if (!item)
2411 goto error;
2412 status = PyList_Append(list, item);
2413 Py_DECREF(item);
2414 if (status < 0)
2415 goto error;
2418 state_fini(&state);
2420 Py_DECREF(filter);
2422 /* convert list to single string (also removes list) */
2423 item = join_list(list, self->pattern);
2425 if (!item)
2426 return NULL;
2428 if (subn)
2429 return Py_BuildValue("Ni", item, n);
2431 return item;
2433 error:
2434 Py_DECREF(list);
2435 state_fini(&state);
2436 Py_DECREF(filter);
2437 return NULL;
2441 static PyObject*
2442 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2444 PyObject* ptemplate;
2445 PyObject* string;
2446 int count = 0;
2447 static char* kwlist[] = { "repl", "string", "count", NULL };
2448 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2449 &ptemplate, &string, &count))
2450 return NULL;
2452 return pattern_subx(self, ptemplate, string, count, 0);
2455 static PyObject*
2456 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2458 PyObject* ptemplate;
2459 PyObject* string;
2460 int count = 0;
2461 static char* kwlist[] = { "repl", "string", "count", NULL };
2462 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2463 &ptemplate, &string, &count))
2464 return NULL;
2466 return pattern_subx(self, ptemplate, string, count, 1);
2469 static PyObject*
2470 pattern_copy(PatternObject* self, PyObject *unused)
2472 #ifdef USE_BUILTIN_COPY
2473 PatternObject* copy;
2474 int offset;
2476 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2477 if (!copy)
2478 return NULL;
2480 offset = offsetof(PatternObject, groups);
2482 Py_XINCREF(self->groupindex);
2483 Py_XINCREF(self->indexgroup);
2484 Py_XINCREF(self->pattern);
2486 memcpy((char*) copy + offset, (char*) self + offset,
2487 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2488 copy->weakreflist = NULL;
2490 return (PyObject*) copy;
2491 #else
2492 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2493 return NULL;
2494 #endif
2497 static PyObject*
2498 pattern_deepcopy(PatternObject* self, PyObject* memo)
2500 #ifdef USE_BUILTIN_COPY
2501 PatternObject* copy;
2503 copy = (PatternObject*) pattern_copy(self);
2504 if (!copy)
2505 return NULL;
2507 if (!deepcopy(&copy->groupindex, memo) ||
2508 !deepcopy(&copy->indexgroup, memo) ||
2509 !deepcopy(&copy->pattern, memo)) {
2510 Py_DECREF(copy);
2511 return NULL;
2514 #else
2515 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2516 return NULL;
2517 #endif
2520 PyDoc_STRVAR(pattern_match_doc,
2521 "match(string[, pos[, endpos]]) --> match object or None.\n\
2522 Matches zero or more characters at the beginning of the string");
2524 PyDoc_STRVAR(pattern_search_doc,
2525 "search(string[, pos[, endpos]]) --> match object or None.\n\
2526 Scan through string looking for a match, and return a corresponding\n\
2527 MatchObject instance. Return None if no position in the string matches.");
2529 PyDoc_STRVAR(pattern_split_doc,
2530 "split(string[, maxsplit = 0]) --> list.\n\
2531 Split string by the occurrences of pattern.");
2533 PyDoc_STRVAR(pattern_findall_doc,
2534 "findall(string[, pos[, endpos]]) --> list.\n\
2535 Return a list of all non-overlapping matches of pattern in string.");
2537 PyDoc_STRVAR(pattern_finditer_doc,
2538 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2539 Return an iterator over all non-overlapping matches for the \n\
2540 RE pattern in string. For each match, the iterator returns a\n\
2541 match object.");
2543 PyDoc_STRVAR(pattern_sub_doc,
2544 "sub(repl, string[, count = 0]) --> newstring\n\
2545 Return the string obtained by replacing the leftmost non-overlapping\n\
2546 occurrences of pattern in string by the replacement repl.");
2548 PyDoc_STRVAR(pattern_subn_doc,
2549 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2550 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2551 the leftmost non-overlapping occurrences of pattern with the\n\
2552 replacement repl.");
2554 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2556 static PyMethodDef pattern_methods[] = {
2557 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2558 pattern_match_doc},
2559 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2560 pattern_search_doc},
2561 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2562 pattern_sub_doc},
2563 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2564 pattern_subn_doc},
2565 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2566 pattern_split_doc},
2567 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2568 pattern_findall_doc},
2569 #if PY_VERSION_HEX >= 0x02020000
2570 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2571 pattern_finditer_doc},
2572 #endif
2573 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2574 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2575 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2576 {NULL, NULL}
2579 static PyObject*
2580 pattern_getattr(PatternObject* self, char* name)
2582 PyObject* res;
2584 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
2586 if (res)
2587 return res;
2589 PyErr_Clear();
2591 /* attributes */
2592 if (!strcmp(name, "pattern")) {
2593 Py_INCREF(self->pattern);
2594 return self->pattern;
2597 if (!strcmp(name, "flags"))
2598 return Py_BuildValue("i", self->flags);
2600 if (!strcmp(name, "groups"))
2601 return Py_BuildValue("i", self->groups);
2603 if (!strcmp(name, "groupindex") && self->groupindex) {
2604 Py_INCREF(self->groupindex);
2605 return self->groupindex;
2608 PyErr_SetString(PyExc_AttributeError, name);
2609 return NULL;
2612 statichere PyTypeObject Pattern_Type = {
2613 PyObject_HEAD_INIT(NULL)
2614 0, "_" SRE_MODULE ".SRE_Pattern",
2615 sizeof(PatternObject), sizeof(SRE_CODE),
2616 (destructor)pattern_dealloc, /*tp_dealloc*/
2617 0, /*tp_print*/
2618 (getattrfunc)pattern_getattr, /*tp_getattr*/
2619 0, /* tp_setattr */
2620 0, /* tp_compare */
2621 0, /* tp_repr */
2622 0, /* tp_as_number */
2623 0, /* tp_as_sequence */
2624 0, /* tp_as_mapping */
2625 0, /* tp_hash */
2626 0, /* tp_call */
2627 0, /* tp_str */
2628 0, /* tp_getattro */
2629 0, /* tp_setattro */
2630 0, /* tp_as_buffer */
2631 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
2632 pattern_doc, /* tp_doc */
2633 0, /* tp_traverse */
2634 0, /* tp_clear */
2635 0, /* tp_richcompare */
2636 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2639 static PyObject *
2640 _compile(PyObject* self_, PyObject* args)
2642 /* "compile" pattern descriptor to pattern object */
2644 PatternObject* self;
2645 int i, n;
2647 PyObject* pattern;
2648 int flags = 0;
2649 PyObject* code;
2650 int groups = 0;
2651 PyObject* groupindex = NULL;
2652 PyObject* indexgroup = NULL;
2653 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
2654 &PyList_Type, &code, &groups,
2655 &groupindex, &indexgroup))
2656 return NULL;
2658 n = PyList_GET_SIZE(code);
2660 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2661 if (!self)
2662 return NULL;
2664 self->codesize = n;
2666 for (i = 0; i < n; i++) {
2667 PyObject *o = PyList_GET_ITEM(code, i);
2668 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2669 : PyLong_AsUnsignedLong(o);
2670 self->code[i] = (SRE_CODE) value;
2671 if ((unsigned long) self->code[i] != value) {
2672 PyErr_SetString(PyExc_OverflowError,
2673 "regular expression code size limit exceeded");
2674 break;
2678 if (PyErr_Occurred()) {
2679 PyObject_DEL(self);
2680 return NULL;
2683 Py_INCREF(pattern);
2684 self->pattern = pattern;
2686 self->flags = flags;
2688 self->groups = groups;
2690 Py_XINCREF(groupindex);
2691 self->groupindex = groupindex;
2693 Py_XINCREF(indexgroup);
2694 self->indexgroup = indexgroup;
2696 self->weakreflist = NULL;
2698 return (PyObject*) self;
2701 /* -------------------------------------------------------------------- */
2702 /* match methods */
2704 static void
2705 match_dealloc(MatchObject* self)
2707 Py_XDECREF(self->regs);
2708 Py_XDECREF(self->string);
2709 Py_DECREF(self->pattern);
2710 PyObject_DEL(self);
2713 static PyObject*
2714 match_getslice_by_index(MatchObject* self, int index, PyObject* def)
2716 if (index < 0 || index >= self->groups) {
2717 /* raise IndexError if we were given a bad group number */
2718 PyErr_SetString(
2719 PyExc_IndexError,
2720 "no such group"
2722 return NULL;
2725 index *= 2;
2727 if (self->string == Py_None || self->mark[index] < 0) {
2728 /* return default value if the string or group is undefined */
2729 Py_INCREF(def);
2730 return def;
2733 return PySequence_GetSlice(
2734 self->string, self->mark[index], self->mark[index+1]
2738 static int
2739 match_getindex(MatchObject* self, PyObject* index)
2741 int i;
2743 if (PyInt_Check(index))
2744 return (int) PyInt_AS_LONG(index);
2746 i = -1;
2748 if (self->pattern->groupindex) {
2749 index = PyObject_GetItem(self->pattern->groupindex, index);
2750 if (index) {
2751 if (PyInt_Check(index))
2752 i = (int) PyInt_AS_LONG(index);
2753 Py_DECREF(index);
2754 } else
2755 PyErr_Clear();
2758 return i;
2761 static PyObject*
2762 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2764 return match_getslice_by_index(self, match_getindex(self, index), def);
2767 static PyObject*
2768 match_expand(MatchObject* self, PyObject* ptemplate)
2770 /* delegate to Python code */
2771 return call(
2772 SRE_PY_MODULE, "_expand",
2773 PyTuple_Pack(3, self->pattern, self, ptemplate)
2777 static PyObject*
2778 match_group(MatchObject* self, PyObject* args)
2780 PyObject* result;
2781 int i, size;
2783 size = PyTuple_GET_SIZE(args);
2785 switch (size) {
2786 case 0:
2787 result = match_getslice(self, Py_False, Py_None);
2788 break;
2789 case 1:
2790 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2791 break;
2792 default:
2793 /* fetch multiple items */
2794 result = PyTuple_New(size);
2795 if (!result)
2796 return NULL;
2797 for (i = 0; i < size; i++) {
2798 PyObject* item = match_getslice(
2799 self, PyTuple_GET_ITEM(args, i), Py_None
2801 if (!item) {
2802 Py_DECREF(result);
2803 return NULL;
2805 PyTuple_SET_ITEM(result, i, item);
2807 break;
2809 return result;
2812 static PyObject*
2813 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
2815 PyObject* result;
2816 int index;
2818 PyObject* def = Py_None;
2819 static char* kwlist[] = { "default", NULL };
2820 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
2821 return NULL;
2823 result = PyTuple_New(self->groups-1);
2824 if (!result)
2825 return NULL;
2827 for (index = 1; index < self->groups; index++) {
2828 PyObject* item;
2829 item = match_getslice_by_index(self, index, def);
2830 if (!item) {
2831 Py_DECREF(result);
2832 return NULL;
2834 PyTuple_SET_ITEM(result, index-1, item);
2837 return result;
2840 static PyObject*
2841 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
2843 PyObject* result;
2844 PyObject* keys;
2845 int index;
2847 PyObject* def = Py_None;
2848 static char* kwlist[] = { "default", NULL };
2849 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
2850 return NULL;
2852 result = PyDict_New();
2853 if (!result || !self->pattern->groupindex)
2854 return result;
2856 keys = PyMapping_Keys(self->pattern->groupindex);
2857 if (!keys)
2858 goto failed;
2860 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
2861 int status;
2862 PyObject* key;
2863 PyObject* value;
2864 key = PyList_GET_ITEM(keys, index);
2865 if (!key)
2866 goto failed;
2867 value = match_getslice(self, key, def);
2868 if (!value) {
2869 Py_DECREF(key);
2870 goto failed;
2872 status = PyDict_SetItem(result, key, value);
2873 Py_DECREF(value);
2874 if (status < 0)
2875 goto failed;
2878 Py_DECREF(keys);
2880 return result;
2882 failed:
2883 Py_XDECREF(keys);
2884 Py_DECREF(result);
2885 return NULL;
2888 static PyObject*
2889 match_start(MatchObject* self, PyObject* args)
2891 int index;
2893 PyObject* index_ = Py_False; /* zero */
2894 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
2895 return NULL;
2897 index = match_getindex(self, index_);
2899 if (index < 0 || index >= self->groups) {
2900 PyErr_SetString(
2901 PyExc_IndexError,
2902 "no such group"
2904 return NULL;
2907 /* mark is -1 if group is undefined */
2908 return Py_BuildValue("i", self->mark[index*2]);
2911 static PyObject*
2912 match_end(MatchObject* self, PyObject* args)
2914 int index;
2916 PyObject* index_ = Py_False; /* zero */
2917 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
2918 return NULL;
2920 index = match_getindex(self, index_);
2922 if (index < 0 || index >= self->groups) {
2923 PyErr_SetString(
2924 PyExc_IndexError,
2925 "no such group"
2927 return NULL;
2930 /* mark is -1 if group is undefined */
2931 return Py_BuildValue("i", self->mark[index*2+1]);
2934 LOCAL(PyObject*)
2935 _pair(int i1, int i2)
2937 PyObject* pair;
2938 PyObject* item;
2940 pair = PyTuple_New(2);
2941 if (!pair)
2942 return NULL;
2944 item = PyInt_FromLong(i1);
2945 if (!item)
2946 goto error;
2947 PyTuple_SET_ITEM(pair, 0, item);
2949 item = PyInt_FromLong(i2);
2950 if (!item)
2951 goto error;
2952 PyTuple_SET_ITEM(pair, 1, item);
2954 return pair;
2956 error:
2957 Py_DECREF(pair);
2958 return NULL;
2961 static PyObject*
2962 match_span(MatchObject* self, PyObject* args)
2964 int index;
2966 PyObject* index_ = Py_False; /* zero */
2967 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
2968 return NULL;
2970 index = match_getindex(self, index_);
2972 if (index < 0 || index >= self->groups) {
2973 PyErr_SetString(
2974 PyExc_IndexError,
2975 "no such group"
2977 return NULL;
2980 /* marks are -1 if group is undefined */
2981 return _pair(self->mark[index*2], self->mark[index*2+1]);
2984 static PyObject*
2985 match_regs(MatchObject* self)
2987 PyObject* regs;
2988 PyObject* item;
2989 int index;
2991 regs = PyTuple_New(self->groups);
2992 if (!regs)
2993 return NULL;
2995 for (index = 0; index < self->groups; index++) {
2996 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2997 if (!item) {
2998 Py_DECREF(regs);
2999 return NULL;
3001 PyTuple_SET_ITEM(regs, index, item);
3004 Py_INCREF(regs);
3005 self->regs = regs;
3007 return regs;
3010 static PyObject*
3011 match_copy(MatchObject* self, PyObject *unused)
3013 #ifdef USE_BUILTIN_COPY
3014 MatchObject* copy;
3015 int slots, offset;
3017 slots = 2 * (self->pattern->groups+1);
3019 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3020 if (!copy)
3021 return NULL;
3023 /* this value a constant, but any compiler should be able to
3024 figure that out all by itself */
3025 offset = offsetof(MatchObject, string);
3027 Py_XINCREF(self->pattern);
3028 Py_XINCREF(self->string);
3029 Py_XINCREF(self->regs);
3031 memcpy((char*) copy + offset, (char*) self + offset,
3032 sizeof(MatchObject) + slots * sizeof(int) - offset);
3034 return (PyObject*) copy;
3035 #else
3036 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3037 return NULL;
3038 #endif
3041 static PyObject*
3042 match_deepcopy(MatchObject* self, PyObject* memo)
3044 #ifdef USE_BUILTIN_COPY
3045 MatchObject* copy;
3047 copy = (MatchObject*) match_copy(self);
3048 if (!copy)
3049 return NULL;
3051 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3052 !deepcopy(&copy->string, memo) ||
3053 !deepcopy(&copy->regs, memo)) {
3054 Py_DECREF(copy);
3055 return NULL;
3058 #else
3059 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3060 return NULL;
3061 #endif
3064 static PyMethodDef match_methods[] = {
3065 {"group", (PyCFunction) match_group, METH_VARARGS},
3066 {"start", (PyCFunction) match_start, METH_VARARGS},
3067 {"end", (PyCFunction) match_end, METH_VARARGS},
3068 {"span", (PyCFunction) match_span, METH_VARARGS},
3069 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3070 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3071 {"expand", (PyCFunction) match_expand, METH_O},
3072 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3073 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3074 {NULL, NULL}
3077 static PyObject*
3078 match_getattr(MatchObject* self, char* name)
3080 PyObject* res;
3082 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3083 if (res)
3084 return res;
3086 PyErr_Clear();
3088 if (!strcmp(name, "lastindex")) {
3089 if (self->lastindex >= 0)
3090 return Py_BuildValue("i", self->lastindex);
3091 Py_INCREF(Py_None);
3092 return Py_None;
3095 if (!strcmp(name, "lastgroup")) {
3096 if (self->pattern->indexgroup && self->lastindex >= 0) {
3097 PyObject* result = PySequence_GetItem(
3098 self->pattern->indexgroup, self->lastindex
3100 if (result)
3101 return result;
3102 PyErr_Clear();
3104 Py_INCREF(Py_None);
3105 return Py_None;
3108 if (!strcmp(name, "string")) {
3109 if (self->string) {
3110 Py_INCREF(self->string);
3111 return self->string;
3112 } else {
3113 Py_INCREF(Py_None);
3114 return Py_None;
3118 if (!strcmp(name, "regs")) {
3119 if (self->regs) {
3120 Py_INCREF(self->regs);
3121 return self->regs;
3122 } else
3123 return match_regs(self);
3126 if (!strcmp(name, "re")) {
3127 Py_INCREF(self->pattern);
3128 return (PyObject*) self->pattern;
3131 if (!strcmp(name, "pos"))
3132 return Py_BuildValue("i", self->pos);
3134 if (!strcmp(name, "endpos"))
3135 return Py_BuildValue("i", self->endpos);
3137 PyErr_SetString(PyExc_AttributeError, name);
3138 return NULL;
3141 /* FIXME: implement setattr("string", None) as a special case (to
3142 detach the associated string, if any */
3144 statichere PyTypeObject Match_Type = {
3145 PyObject_HEAD_INIT(NULL)
3146 0, "_" SRE_MODULE ".SRE_Match",
3147 sizeof(MatchObject), sizeof(int),
3148 (destructor)match_dealloc, /*tp_dealloc*/
3149 0, /*tp_print*/
3150 (getattrfunc)match_getattr /*tp_getattr*/
3153 static PyObject*
3154 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3156 /* create match object (from state object) */
3158 MatchObject* match;
3159 int i, j;
3160 char* base;
3161 int n;
3163 if (status > 0) {
3165 /* create match object (with room for extra group marks) */
3166 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3167 2*(pattern->groups+1));
3168 if (!match)
3169 return NULL;
3171 Py_INCREF(pattern);
3172 match->pattern = pattern;
3174 Py_INCREF(state->string);
3175 match->string = state->string;
3177 match->regs = NULL;
3178 match->groups = pattern->groups+1;
3180 /* fill in group slices */
3182 base = (char*) state->beginning;
3183 n = state->charsize;
3185 match->mark[0] = ((char*) state->start - base) / n;
3186 match->mark[1] = ((char*) state->ptr - base) / n;
3188 for (i = j = 0; i < pattern->groups; i++, j+=2)
3189 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3190 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3191 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3192 } else
3193 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3195 match->pos = state->pos;
3196 match->endpos = state->endpos;
3198 match->lastindex = state->lastindex;
3200 return (PyObject*) match;
3202 } else if (status == 0) {
3204 /* no match */
3205 Py_INCREF(Py_None);
3206 return Py_None;
3210 /* internal error */
3211 pattern_error(status);
3212 return NULL;
3216 /* -------------------------------------------------------------------- */
3217 /* scanner methods (experimental) */
3219 static void
3220 scanner_dealloc(ScannerObject* self)
3222 state_fini(&self->state);
3223 Py_DECREF(self->pattern);
3224 PyObject_DEL(self);
3227 static PyObject*
3228 scanner_match(ScannerObject* self, PyObject *unused)
3230 SRE_STATE* state = &self->state;
3231 PyObject* match;
3232 int status;
3234 state_reset(state);
3236 state->ptr = state->start;
3238 if (state->charsize == 1) {
3239 status = sre_match(state, PatternObject_GetCode(self->pattern));
3240 } else {
3241 #if defined(HAVE_UNICODE)
3242 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3243 #endif
3246 match = pattern_new_match((PatternObject*) self->pattern,
3247 state, status);
3249 if (status == 0 || state->ptr == state->start)
3250 state->start = (void*) ((char*) state->ptr + state->charsize);
3251 else
3252 state->start = state->ptr;
3254 return match;
3258 static PyObject*
3259 scanner_search(ScannerObject* self, PyObject *unused)
3261 SRE_STATE* state = &self->state;
3262 PyObject* match;
3263 int status;
3265 state_reset(state);
3267 state->ptr = state->start;
3269 if (state->charsize == 1) {
3270 status = sre_search(state, PatternObject_GetCode(self->pattern));
3271 } else {
3272 #if defined(HAVE_UNICODE)
3273 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3274 #endif
3277 match = pattern_new_match((PatternObject*) self->pattern,
3278 state, status);
3280 if (status == 0 || state->ptr == state->start)
3281 state->start = (void*) ((char*) state->ptr + state->charsize);
3282 else
3283 state->start = state->ptr;
3285 return match;
3288 static PyMethodDef scanner_methods[] = {
3289 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3290 {"search", (PyCFunction) scanner_search, METH_NOARGS},
3291 {NULL, NULL}
3294 static PyObject*
3295 scanner_getattr(ScannerObject* self, char* name)
3297 PyObject* res;
3299 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3300 if (res)
3301 return res;
3303 PyErr_Clear();
3305 /* attributes */
3306 if (!strcmp(name, "pattern")) {
3307 Py_INCREF(self->pattern);
3308 return self->pattern;
3311 PyErr_SetString(PyExc_AttributeError, name);
3312 return NULL;
3315 statichere PyTypeObject Scanner_Type = {
3316 PyObject_HEAD_INIT(NULL)
3317 0, "_" SRE_MODULE ".SRE_Scanner",
3318 sizeof(ScannerObject), 0,
3319 (destructor)scanner_dealloc, /*tp_dealloc*/
3320 0, /*tp_print*/
3321 (getattrfunc)scanner_getattr, /*tp_getattr*/
3324 static PyObject*
3325 pattern_scanner(PatternObject* pattern, PyObject* args)
3327 /* create search state object */
3329 ScannerObject* self;
3331 PyObject* string;
3332 int start = 0;
3333 int end = INT_MAX;
3334 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
3335 return NULL;
3337 /* create scanner object */
3338 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3339 if (!self)
3340 return NULL;
3342 string = state_init(&self->state, pattern, string, start, end);
3343 if (!string) {
3344 PyObject_DEL(self);
3345 return NULL;
3348 Py_INCREF(pattern);
3349 self->pattern = (PyObject*) pattern;
3351 return (PyObject*) self;
3354 static PyMethodDef _functions[] = {
3355 {"compile", _compile, METH_VARARGS},
3356 {"getcodesize", sre_codesize, METH_NOARGS},
3357 {"getlower", sre_getlower, METH_VARARGS},
3358 {NULL, NULL}
3361 #if PY_VERSION_HEX < 0x02030000
3362 DL_EXPORT(void) init_sre(void)
3363 #else
3364 PyMODINIT_FUNC init_sre(void)
3365 #endif
3367 PyObject* m;
3368 PyObject* d;
3369 PyObject* x;
3371 /* Patch object types */
3372 Pattern_Type.ob_type = Match_Type.ob_type =
3373 Scanner_Type.ob_type = &PyType_Type;
3375 m = Py_InitModule("_" SRE_MODULE, _functions);
3376 if (m == NULL)
3377 return;
3378 d = PyModule_GetDict(m);
3380 x = PyInt_FromLong(SRE_MAGIC);
3381 if (x) {
3382 PyDict_SetItemString(d, "MAGIC", x);
3383 Py_DECREF(x);
3386 x = PyInt_FromLong(sizeof(SRE_CODE));
3387 if (x) {
3388 PyDict_SetItemString(d, "CODESIZE", x);
3389 Py_DECREF(x);
3392 x = PyString_FromString(copyright);
3393 if (x) {
3394 PyDict_SetItemString(d, "copyright", x);
3395 Py_DECREF(x);
3399 #endif /* !defined(SRE_RECURSIVE) */
3401 /* vim:ts=4:sw=4:et