Merged revisions 77838 via svnmerge from
[python/dscho.git] / Modules / _sre.c
blob74d942568f857a3c3b18a12b4accffe462120377
1 /*
2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
37 #ifndef SRE_RECURSIVE
39 static char copyright[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
42 #define PY_SSIZE_T_CLEAN
44 #include "Python.h"
45 #include "structmember.h" /* offsetof */
47 #include "sre.h"
49 #include <ctype.h>
51 /* name of this module, minus the leading underscore */
52 #if !defined(SRE_MODULE)
53 #define SRE_MODULE "sre"
54 #endif
56 #define SRE_PY_MODULE "re"
58 /* defining this one enables tracing */
59 #undef VERBOSE
61 /* defining this enables unicode support (default under 1.6a1 and later) */
62 #define HAVE_UNICODE
64 /* -------------------------------------------------------------------- */
65 /* optional features */
67 /* enables fast searching */
68 #define USE_FAST_SEARCH
70 /* enables aggressive inlining (always on for Visual C) */
71 #undef USE_INLINE
73 /* enables copy/deepcopy handling (work in progress) */
74 #undef USE_BUILTIN_COPY
76 #if PY_VERSION_HEX < 0x01060000
77 #define PyObject_DEL(op) PyMem_DEL((op))
78 #endif
80 /* -------------------------------------------------------------------- */
82 #if defined(_MSC_VER)
83 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
84 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
85 /* fastest possible local call under MSVC */
86 #define LOCAL(type) static __inline type __fastcall
87 #elif defined(USE_INLINE)
88 #define LOCAL(type) static inline type
89 #else
90 #define LOCAL(type) static type
91 #endif
93 /* error codes */
94 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
95 #define SRE_ERROR_STATE -2 /* illegal state */
96 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
97 #define SRE_ERROR_MEMORY -9 /* out of memory */
98 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
100 #if defined(VERBOSE)
101 #define TRACE(v) printf v
102 #else
103 #define TRACE(v)
104 #endif
106 /* -------------------------------------------------------------------- */
107 /* search engine state */
109 /* default character predicates (run sre_chars.py to regenerate tables) */
111 #define SRE_DIGIT_MASK 1
112 #define SRE_SPACE_MASK 2
113 #define SRE_LINEBREAK_MASK 4
114 #define SRE_ALNUM_MASK 8
115 #define SRE_WORD_MASK 16
117 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
119 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
120 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
122 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
123 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
124 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
125 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
127 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
128 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
129 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
130 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
131 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
132 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
133 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
134 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
135 120, 121, 122, 123, 124, 125, 126, 127 };
137 #define SRE_IS_DIGIT(ch)\
138 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
139 #define SRE_IS_SPACE(ch)\
140 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
141 #define SRE_IS_LINEBREAK(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
143 #define SRE_IS_ALNUM(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
145 #define SRE_IS_WORD(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
148 static unsigned int sre_lower(unsigned int ch)
150 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
153 /* locale-specific character predicates */
154 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
155 * warnings when c's type supports only numbers < N+1 */
156 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
157 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
158 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
159 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
160 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
162 static unsigned int sre_lower_locale(unsigned int ch)
164 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
167 /* unicode-specific character predicates */
169 #if defined(HAVE_UNICODE)
171 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
172 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
173 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
174 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
175 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
177 static unsigned int sre_lower_unicode(unsigned int ch)
179 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
182 #endif
184 LOCAL(int)
185 sre_category(SRE_CODE category, unsigned int ch)
187 switch (category) {
189 case SRE_CATEGORY_DIGIT:
190 return SRE_IS_DIGIT(ch);
191 case SRE_CATEGORY_NOT_DIGIT:
192 return !SRE_IS_DIGIT(ch);
193 case SRE_CATEGORY_SPACE:
194 return SRE_IS_SPACE(ch);
195 case SRE_CATEGORY_NOT_SPACE:
196 return !SRE_IS_SPACE(ch);
197 case SRE_CATEGORY_WORD:
198 return SRE_IS_WORD(ch);
199 case SRE_CATEGORY_NOT_WORD:
200 return !SRE_IS_WORD(ch);
201 case SRE_CATEGORY_LINEBREAK:
202 return SRE_IS_LINEBREAK(ch);
203 case SRE_CATEGORY_NOT_LINEBREAK:
204 return !SRE_IS_LINEBREAK(ch);
206 case SRE_CATEGORY_LOC_WORD:
207 return SRE_LOC_IS_WORD(ch);
208 case SRE_CATEGORY_LOC_NOT_WORD:
209 return !SRE_LOC_IS_WORD(ch);
211 #if defined(HAVE_UNICODE)
212 case SRE_CATEGORY_UNI_DIGIT:
213 return SRE_UNI_IS_DIGIT(ch);
214 case SRE_CATEGORY_UNI_NOT_DIGIT:
215 return !SRE_UNI_IS_DIGIT(ch);
216 case SRE_CATEGORY_UNI_SPACE:
217 return SRE_UNI_IS_SPACE(ch);
218 case SRE_CATEGORY_UNI_NOT_SPACE:
219 return !SRE_UNI_IS_SPACE(ch);
220 case SRE_CATEGORY_UNI_WORD:
221 return SRE_UNI_IS_WORD(ch);
222 case SRE_CATEGORY_UNI_NOT_WORD:
223 return !SRE_UNI_IS_WORD(ch);
224 case SRE_CATEGORY_UNI_LINEBREAK:
225 return SRE_UNI_IS_LINEBREAK(ch);
226 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
227 return !SRE_UNI_IS_LINEBREAK(ch);
228 #else
229 case SRE_CATEGORY_UNI_DIGIT:
230 return SRE_IS_DIGIT(ch);
231 case SRE_CATEGORY_UNI_NOT_DIGIT:
232 return !SRE_IS_DIGIT(ch);
233 case SRE_CATEGORY_UNI_SPACE:
234 return SRE_IS_SPACE(ch);
235 case SRE_CATEGORY_UNI_NOT_SPACE:
236 return !SRE_IS_SPACE(ch);
237 case SRE_CATEGORY_UNI_WORD:
238 return SRE_LOC_IS_WORD(ch);
239 case SRE_CATEGORY_UNI_NOT_WORD:
240 return !SRE_LOC_IS_WORD(ch);
241 case SRE_CATEGORY_UNI_LINEBREAK:
242 return SRE_IS_LINEBREAK(ch);
243 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
244 return !SRE_IS_LINEBREAK(ch);
245 #endif
247 return 0;
250 /* helpers */
252 static void
253 data_stack_dealloc(SRE_STATE* state)
255 if (state->data_stack) {
256 PyMem_FREE(state->data_stack);
257 state->data_stack = NULL;
259 state->data_stack_size = state->data_stack_base = 0;
262 static int
263 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
265 Py_ssize_t minsize, cursize;
266 minsize = state->data_stack_base+size;
267 cursize = state->data_stack_size;
268 if (cursize < minsize) {
269 void* stack;
270 cursize = minsize+minsize/4+1024;
271 TRACE(("allocate/grow stack %d\n", cursize));
272 stack = PyMem_REALLOC(state->data_stack, cursize);
273 if (!stack) {
274 data_stack_dealloc(state);
275 return SRE_ERROR_MEMORY;
277 state->data_stack = (char *)stack;
278 state->data_stack_size = cursize;
280 return 0;
283 /* generate 8-bit version */
285 #define SRE_CHAR unsigned char
286 #define SRE_AT sre_at
287 #define SRE_COUNT sre_count
288 #define SRE_CHARSET sre_charset
289 #define SRE_INFO sre_info
290 #define SRE_MATCH sre_match
291 #define SRE_MATCH_CONTEXT sre_match_context
292 #define SRE_SEARCH sre_search
293 #define SRE_LITERAL_TEMPLATE sre_literal_template
295 #if defined(HAVE_UNICODE)
297 #define SRE_RECURSIVE
298 #include "_sre.c"
299 #undef SRE_RECURSIVE
301 #undef SRE_LITERAL_TEMPLATE
302 #undef SRE_SEARCH
303 #undef SRE_MATCH
304 #undef SRE_MATCH_CONTEXT
305 #undef SRE_INFO
306 #undef SRE_CHARSET
307 #undef SRE_COUNT
308 #undef SRE_AT
309 #undef SRE_CHAR
311 /* generate 16-bit unicode version */
313 #define SRE_CHAR Py_UNICODE
314 #define SRE_AT sre_uat
315 #define SRE_COUNT sre_ucount
316 #define SRE_CHARSET sre_ucharset
317 #define SRE_INFO sre_uinfo
318 #define SRE_MATCH sre_umatch
319 #define SRE_MATCH_CONTEXT sre_umatch_context
320 #define SRE_SEARCH sre_usearch
321 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
322 #endif
324 #endif /* SRE_RECURSIVE */
326 /* -------------------------------------------------------------------- */
327 /* String matching engine */
329 /* the following section is compiled twice, with different character
330 settings */
332 LOCAL(int)
333 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
335 /* check if pointer is at given position */
337 Py_ssize_t thisp, thatp;
339 switch (at) {
341 case SRE_AT_BEGINNING:
342 case SRE_AT_BEGINNING_STRING:
343 return ((void*) ptr == state->beginning);
345 case SRE_AT_BEGINNING_LINE:
346 return ((void*) ptr == state->beginning ||
347 SRE_IS_LINEBREAK((int) ptr[-1]));
349 case SRE_AT_END:
350 return (((void*) (ptr+1) == state->end &&
351 SRE_IS_LINEBREAK((int) ptr[0])) ||
352 ((void*) ptr == state->end));
354 case SRE_AT_END_LINE:
355 return ((void*) ptr == state->end ||
356 SRE_IS_LINEBREAK((int) ptr[0]));
358 case SRE_AT_END_STRING:
359 return ((void*) ptr == state->end);
361 case SRE_AT_BOUNDARY:
362 if (state->beginning == state->end)
363 return 0;
364 thatp = ((void*) ptr > state->beginning) ?
365 SRE_IS_WORD((int) ptr[-1]) : 0;
366 thisp = ((void*) ptr < state->end) ?
367 SRE_IS_WORD((int) ptr[0]) : 0;
368 return thisp != thatp;
370 case SRE_AT_NON_BOUNDARY:
371 if (state->beginning == state->end)
372 return 0;
373 thatp = ((void*) ptr > state->beginning) ?
374 SRE_IS_WORD((int) ptr[-1]) : 0;
375 thisp = ((void*) ptr < state->end) ?
376 SRE_IS_WORD((int) ptr[0]) : 0;
377 return thisp == thatp;
379 case SRE_AT_LOC_BOUNDARY:
380 if (state->beginning == state->end)
381 return 0;
382 thatp = ((void*) ptr > state->beginning) ?
383 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
384 thisp = ((void*) ptr < state->end) ?
385 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
386 return thisp != thatp;
388 case SRE_AT_LOC_NON_BOUNDARY:
389 if (state->beginning == state->end)
390 return 0;
391 thatp = ((void*) ptr > state->beginning) ?
392 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
393 thisp = ((void*) ptr < state->end) ?
394 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
395 return thisp == thatp;
397 #if defined(HAVE_UNICODE)
398 case SRE_AT_UNI_BOUNDARY:
399 if (state->beginning == state->end)
400 return 0;
401 thatp = ((void*) ptr > state->beginning) ?
402 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
403 thisp = ((void*) ptr < state->end) ?
404 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
405 return thisp != thatp;
407 case SRE_AT_UNI_NON_BOUNDARY:
408 if (state->beginning == state->end)
409 return 0;
410 thatp = ((void*) ptr > state->beginning) ?
411 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
412 thisp = ((void*) ptr < state->end) ?
413 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
414 return thisp == thatp;
415 #endif
419 return 0;
422 LOCAL(int)
423 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
425 /* check if character is a member of the given set */
427 int ok = 1;
429 for (;;) {
430 switch (*set++) {
432 case SRE_OP_FAILURE:
433 return !ok;
435 case SRE_OP_LITERAL:
436 /* <LITERAL> <code> */
437 if (ch == set[0])
438 return ok;
439 set++;
440 break;
442 case SRE_OP_CATEGORY:
443 /* <CATEGORY> <code> */
444 if (sre_category(set[0], (int) ch))
445 return ok;
446 set += 1;
447 break;
449 case SRE_OP_CHARSET:
450 if (sizeof(SRE_CODE) == 2) {
451 /* <CHARSET> <bitmap> (16 bits per code word) */
452 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
453 return ok;
454 set += 16;
456 else {
457 /* <CHARSET> <bitmap> (32 bits per code word) */
458 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
459 return ok;
460 set += 8;
462 break;
464 case SRE_OP_RANGE:
465 /* <RANGE> <lower> <upper> */
466 if (set[0] <= ch && ch <= set[1])
467 return ok;
468 set += 2;
469 break;
471 case SRE_OP_NEGATE:
472 ok = !ok;
473 break;
475 case SRE_OP_BIGCHARSET:
476 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
478 Py_ssize_t count, block;
479 count = *(set++);
481 if (sizeof(SRE_CODE) == 2) {
482 block = ((unsigned char*)set)[ch >> 8];
483 set += 128;
484 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
485 return ok;
486 set += count*16;
488 else {
489 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
490 * warnings when c's type supports only numbers < N+1 */
491 if (!(ch & ~65535))
492 block = ((unsigned char*)set)[ch >> 8];
493 else
494 block = -1;
495 set += 64;
496 if (block >=0 &&
497 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
498 return ok;
499 set += count*8;
501 break;
504 default:
505 /* internal error -- there's not much we can do about it
506 here, so let's just pretend it didn't match... */
507 return 0;
512 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
514 LOCAL(Py_ssize_t)
515 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
517 SRE_CODE chr;
518 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
519 SRE_CHAR* end = (SRE_CHAR *)state->end;
520 Py_ssize_t i;
522 /* adjust end */
523 if (maxcount < end - ptr && maxcount != 65535)
524 end = ptr + maxcount;
526 switch (pattern[0]) {
528 case SRE_OP_IN:
529 /* repeated set */
530 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
531 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
532 ptr++;
533 break;
535 case SRE_OP_ANY:
536 /* repeated dot wildcard. */
537 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
538 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
539 ptr++;
540 break;
542 case SRE_OP_ANY_ALL:
543 /* repeated dot wildcard. skip to the end of the target
544 string, and backtrack from there */
545 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
546 ptr = end;
547 break;
549 case SRE_OP_LITERAL:
550 /* repeated literal */
551 chr = pattern[1];
552 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
553 while (ptr < end && (SRE_CODE) *ptr == chr)
554 ptr++;
555 break;
557 case SRE_OP_LITERAL_IGNORE:
558 /* repeated literal */
559 chr = pattern[1];
560 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
561 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
562 ptr++;
563 break;
565 case SRE_OP_NOT_LITERAL:
566 /* repeated non-literal */
567 chr = pattern[1];
568 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
569 while (ptr < end && (SRE_CODE) *ptr != chr)
570 ptr++;
571 break;
573 case SRE_OP_NOT_LITERAL_IGNORE:
574 /* repeated non-literal */
575 chr = pattern[1];
576 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
577 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
578 ptr++;
579 break;
581 default:
582 /* repeated single character pattern */
583 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
584 while ((SRE_CHAR*) state->ptr < end) {
585 i = SRE_MATCH(state, pattern);
586 if (i < 0)
587 return i;
588 if (!i)
589 break;
591 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
592 (SRE_CHAR*) state->ptr - ptr));
593 return (SRE_CHAR*) state->ptr - ptr;
596 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
597 return ptr - (SRE_CHAR*) state->ptr;
600 #if 0 /* not used in this release */
601 LOCAL(int)
602 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
604 /* check if an SRE_OP_INFO block matches at the current position.
605 returns the number of SRE_CODE objects to skip if successful, 0
606 if no match */
608 SRE_CHAR* end = state->end;
609 SRE_CHAR* ptr = state->ptr;
610 Py_ssize_t i;
612 /* check minimal length */
613 if (pattern[3] && (end - ptr) < pattern[3])
614 return 0;
616 /* check known prefix */
617 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
618 /* <length> <skip> <prefix data> <overlap data> */
619 for (i = 0; i < pattern[5]; i++)
620 if ((SRE_CODE) ptr[i] != pattern[7 + i])
621 return 0;
622 return pattern[0] + 2 * pattern[6];
624 return pattern[0];
626 #endif
628 /* The macros below should be used to protect recursive SRE_MATCH()
629 * calls that *failed* and do *not* return immediately (IOW, those
630 * that will backtrack). Explaining:
632 * - Recursive SRE_MATCH() returned true: that's usually a success
633 * (besides atypical cases like ASSERT_NOT), therefore there's no
634 * reason to restore lastmark;
636 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
637 * is returning to the caller: If the current SRE_MATCH() is the
638 * top function of the recursion, returning false will be a matching
639 * failure, and it doesn't matter where lastmark is pointing to.
640 * If it's *not* the top function, it will be a recursive SRE_MATCH()
641 * failure by itself, and the calling SRE_MATCH() will have to deal
642 * with the failure by the same rules explained here (it will restore
643 * lastmark by itself if necessary);
645 * - Recursive SRE_MATCH() returned false, and will continue the
646 * outside 'for' loop: must be protected when breaking, since the next
647 * OP could potentially depend on lastmark;
649 * - Recursive SRE_MATCH() returned false, and will be called again
650 * inside a local for/while loop: must be protected between each
651 * loop iteration, since the recursive SRE_MATCH() could do anything,
652 * and could potentially depend on lastmark.
654 * For more information, check the discussion at SF patch #712900.
656 #define LASTMARK_SAVE() \
657 do { \
658 ctx->lastmark = state->lastmark; \
659 ctx->lastindex = state->lastindex; \
660 } while (0)
661 #define LASTMARK_RESTORE() \
662 do { \
663 state->lastmark = ctx->lastmark; \
664 state->lastindex = ctx->lastindex; \
665 } while (0)
667 #define RETURN_ERROR(i) do { return i; } while(0)
668 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
669 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
671 #define RETURN_ON_ERROR(i) \
672 do { if (i < 0) RETURN_ERROR(i); } while (0)
673 #define RETURN_ON_SUCCESS(i) \
674 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
675 #define RETURN_ON_FAILURE(i) \
676 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
678 #define SFY(x) #x
680 #define DATA_STACK_ALLOC(state, type, ptr) \
681 do { \
682 alloc_pos = state->data_stack_base; \
683 TRACE(("allocating %s in %d (%d)\n", \
684 SFY(type), alloc_pos, sizeof(type))); \
685 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
686 int j = data_stack_grow(state, sizeof(type)); \
687 if (j < 0) return j; \
688 if (ctx_pos != -1) \
689 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
691 ptr = (type*)(state->data_stack+alloc_pos); \
692 state->data_stack_base += sizeof(type); \
693 } while (0)
695 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
696 do { \
697 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
698 ptr = (type*)(state->data_stack+pos); \
699 } while (0)
701 #define DATA_STACK_PUSH(state, data, size) \
702 do { \
703 TRACE(("copy data in %p to %d (%d)\n", \
704 data, state->data_stack_base, size)); \
705 if (state->data_stack_size < state->data_stack_base+size) { \
706 int j = data_stack_grow(state, size); \
707 if (j < 0) return j; \
708 if (ctx_pos != -1) \
709 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
711 memcpy(state->data_stack+state->data_stack_base, data, size); \
712 state->data_stack_base += size; \
713 } while (0)
715 #define DATA_STACK_POP(state, data, size, discard) \
716 do { \
717 TRACE(("copy data to %p from %d (%d)\n", \
718 data, state->data_stack_base-size, size)); \
719 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
720 if (discard) \
721 state->data_stack_base -= size; \
722 } while (0)
724 #define DATA_STACK_POP_DISCARD(state, size) \
725 do { \
726 TRACE(("discard data from %d (%d)\n", \
727 state->data_stack_base-size, size)); \
728 state->data_stack_base -= size; \
729 } while(0)
731 #define DATA_PUSH(x) \
732 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
733 #define DATA_POP(x) \
734 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
735 #define DATA_POP_DISCARD(x) \
736 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
737 #define DATA_ALLOC(t,p) \
738 DATA_STACK_ALLOC(state, t, p)
739 #define DATA_LOOKUP_AT(t,p,pos) \
740 DATA_STACK_LOOKUP_AT(state,t,p,pos)
742 #define MARK_PUSH(lastmark) \
743 do if (lastmark > 0) { \
744 i = lastmark; /* ctx->lastmark may change if reallocated */ \
745 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
746 } while (0)
747 #define MARK_POP(lastmark) \
748 do if (lastmark > 0) { \
749 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
750 } while (0)
751 #define MARK_POP_KEEP(lastmark) \
752 do if (lastmark > 0) { \
753 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
754 } while (0)
755 #define MARK_POP_DISCARD(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
758 } while (0)
760 #define JUMP_NONE 0
761 #define JUMP_MAX_UNTIL_1 1
762 #define JUMP_MAX_UNTIL_2 2
763 #define JUMP_MAX_UNTIL_3 3
764 #define JUMP_MIN_UNTIL_1 4
765 #define JUMP_MIN_UNTIL_2 5
766 #define JUMP_MIN_UNTIL_3 6
767 #define JUMP_REPEAT 7
768 #define JUMP_REPEAT_ONE_1 8
769 #define JUMP_REPEAT_ONE_2 9
770 #define JUMP_MIN_REPEAT_ONE 10
771 #define JUMP_BRANCH 11
772 #define JUMP_ASSERT 12
773 #define JUMP_ASSERT_NOT 13
775 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
776 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
777 nextctx->last_ctx_pos = ctx_pos; \
778 nextctx->jump = jumpvalue; \
779 nextctx->pattern = nextpattern; \
780 ctx_pos = alloc_pos; \
781 ctx = nextctx; \
782 goto entrance; \
783 jumplabel: \
784 while (0) /* gcc doesn't like labels at end of scopes */ \
786 typedef struct {
787 Py_ssize_t last_ctx_pos;
788 Py_ssize_t jump;
789 SRE_CHAR* ptr;
790 SRE_CODE* pattern;
791 Py_ssize_t count;
792 Py_ssize_t lastmark;
793 Py_ssize_t lastindex;
794 union {
795 SRE_CODE chr;
796 SRE_REPEAT* rep;
797 } u;
798 } SRE_MATCH_CONTEXT;
800 /* check if string matches the given pattern. returns <0 for
801 error, 0 for failure, and 1 for success */
802 LOCAL(Py_ssize_t)
803 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
805 SRE_CHAR* end = (SRE_CHAR *)state->end;
806 Py_ssize_t alloc_pos, ctx_pos = -1;
807 Py_ssize_t i, ret = 0;
808 Py_ssize_t jump;
809 unsigned int sigcount=0;
811 SRE_MATCH_CONTEXT* ctx;
812 SRE_MATCH_CONTEXT* nextctx;
814 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
816 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
817 ctx->last_ctx_pos = -1;
818 ctx->jump = JUMP_NONE;
819 ctx->pattern = pattern;
820 ctx_pos = alloc_pos;
822 entrance:
824 ctx->ptr = (SRE_CHAR *)state->ptr;
826 if (ctx->pattern[0] == SRE_OP_INFO) {
827 /* optimization info block */
828 /* <INFO> <1=skip> <2=flags> <3=min> ... */
829 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
830 TRACE(("reject (got %d chars, need %d)\n",
831 (end - ctx->ptr), ctx->pattern[3]));
832 RETURN_FAILURE;
834 ctx->pattern += ctx->pattern[1] + 1;
837 for (;;) {
838 ++sigcount;
839 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
840 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
842 switch (*ctx->pattern++) {
844 case SRE_OP_MARK:
845 /* set mark */
846 /* <MARK> <gid> */
847 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
848 ctx->ptr, ctx->pattern[0]));
849 i = ctx->pattern[0];
850 if (i & 1)
851 state->lastindex = i/2 + 1;
852 if (i > state->lastmark) {
853 /* state->lastmark is the highest valid index in the
854 state->mark array. If it is increased by more than 1,
855 the intervening marks must be set to NULL to signal
856 that these marks have not been encountered. */
857 Py_ssize_t j = state->lastmark + 1;
858 while (j < i)
859 state->mark[j++] = NULL;
860 state->lastmark = i;
862 state->mark[i] = ctx->ptr;
863 ctx->pattern++;
864 break;
866 case SRE_OP_LITERAL:
867 /* match literal string */
868 /* <LITERAL> <code> */
869 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
870 ctx->ptr, *ctx->pattern));
871 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
872 RETURN_FAILURE;
873 ctx->pattern++;
874 ctx->ptr++;
875 break;
877 case SRE_OP_NOT_LITERAL:
878 /* match anything that is not literal character */
879 /* <NOT_LITERAL> <code> */
880 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
881 ctx->ptr, *ctx->pattern));
882 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
883 RETURN_FAILURE;
884 ctx->pattern++;
885 ctx->ptr++;
886 break;
888 case SRE_OP_SUCCESS:
889 /* end of pattern */
890 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
891 state->ptr = ctx->ptr;
892 RETURN_SUCCESS;
894 case SRE_OP_AT:
895 /* match at given position */
896 /* <AT> <code> */
897 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
898 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
899 RETURN_FAILURE;
900 ctx->pattern++;
901 break;
903 case SRE_OP_CATEGORY:
904 /* match at given category */
905 /* <CATEGORY> <code> */
906 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
907 ctx->ptr, *ctx->pattern));
908 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
909 RETURN_FAILURE;
910 ctx->pattern++;
911 ctx->ptr++;
912 break;
914 case SRE_OP_ANY:
915 /* match anything (except a newline) */
916 /* <ANY> */
917 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
918 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
919 RETURN_FAILURE;
920 ctx->ptr++;
921 break;
923 case SRE_OP_ANY_ALL:
924 /* match anything */
925 /* <ANY_ALL> */
926 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
927 if (ctx->ptr >= end)
928 RETURN_FAILURE;
929 ctx->ptr++;
930 break;
932 case SRE_OP_IN:
933 /* match set member (or non_member) */
934 /* <IN> <skip> <set> */
935 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
936 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
937 RETURN_FAILURE;
938 ctx->pattern += ctx->pattern[0];
939 ctx->ptr++;
940 break;
942 case SRE_OP_LITERAL_IGNORE:
943 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
944 ctx->pattern, ctx->ptr, ctx->pattern[0]));
945 if (ctx->ptr >= end ||
946 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
947 RETURN_FAILURE;
948 ctx->pattern++;
949 ctx->ptr++;
950 break;
952 case SRE_OP_NOT_LITERAL_IGNORE:
953 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
954 ctx->pattern, ctx->ptr, *ctx->pattern));
955 if (ctx->ptr >= end ||
956 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
957 RETURN_FAILURE;
958 ctx->pattern++;
959 ctx->ptr++;
960 break;
962 case SRE_OP_IN_IGNORE:
963 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
964 if (ctx->ptr >= end
965 || !SRE_CHARSET(ctx->pattern+1,
966 (SRE_CODE)state->lower(*ctx->ptr)))
967 RETURN_FAILURE;
968 ctx->pattern += ctx->pattern[0];
969 ctx->ptr++;
970 break;
972 case SRE_OP_JUMP:
973 case SRE_OP_INFO:
974 /* jump forward */
975 /* <JUMP> <offset> */
976 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
977 ctx->ptr, ctx->pattern[0]));
978 ctx->pattern += ctx->pattern[0];
979 break;
981 case SRE_OP_BRANCH:
982 /* alternation */
983 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
984 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
985 LASTMARK_SAVE();
986 ctx->u.rep = state->repeat;
987 if (ctx->u.rep)
988 MARK_PUSH(ctx->lastmark);
989 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
990 if (ctx->pattern[1] == SRE_OP_LITERAL &&
991 (ctx->ptr >= end ||
992 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
993 continue;
994 if (ctx->pattern[1] == SRE_OP_IN &&
995 (ctx->ptr >= end ||
996 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
997 continue;
998 state->ptr = ctx->ptr;
999 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1000 if (ret) {
1001 if (ctx->u.rep)
1002 MARK_POP_DISCARD(ctx->lastmark);
1003 RETURN_ON_ERROR(ret);
1004 RETURN_SUCCESS;
1006 if (ctx->u.rep)
1007 MARK_POP_KEEP(ctx->lastmark);
1008 LASTMARK_RESTORE();
1010 if (ctx->u.rep)
1011 MARK_POP_DISCARD(ctx->lastmark);
1012 RETURN_FAILURE;
1014 case SRE_OP_REPEAT_ONE:
1015 /* match repeated sequence (maximizing regexp) */
1017 /* this operator only works if the repeated item is
1018 exactly one character wide, and we're not already
1019 collecting backtracking points. for other cases,
1020 use the MAX_REPEAT operator */
1022 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1024 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1025 ctx->pattern[1], ctx->pattern[2]));
1027 if (ctx->ptr + ctx->pattern[1] > end)
1028 RETURN_FAILURE; /* cannot match */
1030 state->ptr = ctx->ptr;
1032 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1033 RETURN_ON_ERROR(ret);
1034 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1035 ctx->count = ret;
1036 ctx->ptr += ctx->count;
1038 /* when we arrive here, count contains the number of
1039 matches, and ctx->ptr points to the tail of the target
1040 string. check if the rest of the pattern matches,
1041 and backtrack if not. */
1043 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1044 RETURN_FAILURE;
1046 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1047 /* tail is empty. we're finished */
1048 state->ptr = ctx->ptr;
1049 RETURN_SUCCESS;
1052 LASTMARK_SAVE();
1054 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1055 /* tail starts with a literal. skip positions where
1056 the rest of the pattern cannot possibly match */
1057 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1058 for (;;) {
1059 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1060 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1061 ctx->ptr--;
1062 ctx->count--;
1064 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1065 break;
1066 state->ptr = ctx->ptr;
1067 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1068 ctx->pattern+ctx->pattern[0]);
1069 if (ret) {
1070 RETURN_ON_ERROR(ret);
1071 RETURN_SUCCESS;
1074 LASTMARK_RESTORE();
1076 ctx->ptr--;
1077 ctx->count--;
1080 } else {
1081 /* general case */
1082 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1083 state->ptr = ctx->ptr;
1084 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1085 ctx->pattern+ctx->pattern[0]);
1086 if (ret) {
1087 RETURN_ON_ERROR(ret);
1088 RETURN_SUCCESS;
1090 ctx->ptr--;
1091 ctx->count--;
1092 LASTMARK_RESTORE();
1095 RETURN_FAILURE;
1097 case SRE_OP_MIN_REPEAT_ONE:
1098 /* match repeated sequence (minimizing regexp) */
1100 /* this operator only works if the repeated item is
1101 exactly one character wide, and we're not already
1102 collecting backtracking points. for other cases,
1103 use the MIN_REPEAT operator */
1105 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1107 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1108 ctx->pattern[1], ctx->pattern[2]));
1110 if (ctx->ptr + ctx->pattern[1] > end)
1111 RETURN_FAILURE; /* cannot match */
1113 state->ptr = ctx->ptr;
1115 if (ctx->pattern[1] == 0)
1116 ctx->count = 0;
1117 else {
1118 /* count using pattern min as the maximum */
1119 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1120 RETURN_ON_ERROR(ret);
1121 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1122 if (ret < (Py_ssize_t) ctx->pattern[1])
1123 /* didn't match minimum number of times */
1124 RETURN_FAILURE;
1125 /* advance past minimum matches of repeat */
1126 ctx->count = ret;
1127 ctx->ptr += ctx->count;
1130 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1131 /* tail is empty. we're finished */
1132 state->ptr = ctx->ptr;
1133 RETURN_SUCCESS;
1135 } else {
1136 /* general case */
1137 LASTMARK_SAVE();
1138 while ((Py_ssize_t)ctx->pattern[2] == 65535
1139 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1140 state->ptr = ctx->ptr;
1141 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1142 ctx->pattern+ctx->pattern[0]);
1143 if (ret) {
1144 RETURN_ON_ERROR(ret);
1145 RETURN_SUCCESS;
1147 state->ptr = ctx->ptr;
1148 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1149 RETURN_ON_ERROR(ret);
1150 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1151 if (ret == 0)
1152 break;
1153 assert(ret == 1);
1154 ctx->ptr++;
1155 ctx->count++;
1156 LASTMARK_RESTORE();
1159 RETURN_FAILURE;
1161 case SRE_OP_REPEAT:
1162 /* create repeat context. all the hard work is done
1163 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1164 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1165 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1166 ctx->pattern[1], ctx->pattern[2]));
1168 /* install new repeat context */
1169 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1170 if (!ctx->u.rep) {
1171 PyErr_NoMemory();
1172 RETURN_FAILURE;
1174 ctx->u.rep->count = -1;
1175 ctx->u.rep->pattern = ctx->pattern;
1176 ctx->u.rep->prev = state->repeat;
1177 ctx->u.rep->last_ptr = NULL;
1178 state->repeat = ctx->u.rep;
1180 state->ptr = ctx->ptr;
1181 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1182 state->repeat = ctx->u.rep->prev;
1183 PyObject_FREE(ctx->u.rep);
1185 if (ret) {
1186 RETURN_ON_ERROR(ret);
1187 RETURN_SUCCESS;
1189 RETURN_FAILURE;
1191 case SRE_OP_MAX_UNTIL:
1192 /* maximizing repeat */
1193 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1195 /* FIXME: we probably need to deal with zero-width
1196 matches in here... */
1198 ctx->u.rep = state->repeat;
1199 if (!ctx->u.rep)
1200 RETURN_ERROR(SRE_ERROR_STATE);
1202 state->ptr = ctx->ptr;
1204 ctx->count = ctx->u.rep->count+1;
1206 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1207 ctx->ptr, ctx->count));
1209 if (ctx->count < ctx->u.rep->pattern[1]) {
1210 /* not enough matches */
1211 ctx->u.rep->count = ctx->count;
1212 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1213 ctx->u.rep->pattern+3);
1214 if (ret) {
1215 RETURN_ON_ERROR(ret);
1216 RETURN_SUCCESS;
1218 ctx->u.rep->count = ctx->count-1;
1219 state->ptr = ctx->ptr;
1220 RETURN_FAILURE;
1223 if ((ctx->count < ctx->u.rep->pattern[2] ||
1224 ctx->u.rep->pattern[2] == 65535) &&
1225 state->ptr != ctx->u.rep->last_ptr) {
1226 /* we may have enough matches, but if we can
1227 match another item, do so */
1228 ctx->u.rep->count = ctx->count;
1229 LASTMARK_SAVE();
1230 MARK_PUSH(ctx->lastmark);
1231 /* zero-width match protection */
1232 DATA_PUSH(&ctx->u.rep->last_ptr);
1233 ctx->u.rep->last_ptr = state->ptr;
1234 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1235 ctx->u.rep->pattern+3);
1236 DATA_POP(&ctx->u.rep->last_ptr);
1237 if (ret) {
1238 MARK_POP_DISCARD(ctx->lastmark);
1239 RETURN_ON_ERROR(ret);
1240 RETURN_SUCCESS;
1242 MARK_POP(ctx->lastmark);
1243 LASTMARK_RESTORE();
1244 ctx->u.rep->count = ctx->count-1;
1245 state->ptr = ctx->ptr;
1248 /* cannot match more repeated items here. make sure the
1249 tail matches */
1250 state->repeat = ctx->u.rep->prev;
1251 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1252 RETURN_ON_SUCCESS(ret);
1253 state->repeat = ctx->u.rep;
1254 state->ptr = ctx->ptr;
1255 RETURN_FAILURE;
1257 case SRE_OP_MIN_UNTIL:
1258 /* minimizing repeat */
1259 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1261 ctx->u.rep = state->repeat;
1262 if (!ctx->u.rep)
1263 RETURN_ERROR(SRE_ERROR_STATE);
1265 state->ptr = ctx->ptr;
1267 ctx->count = ctx->u.rep->count+1;
1269 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1270 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1272 if (ctx->count < ctx->u.rep->pattern[1]) {
1273 /* not enough matches */
1274 ctx->u.rep->count = ctx->count;
1275 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1276 ctx->u.rep->pattern+3);
1277 if (ret) {
1278 RETURN_ON_ERROR(ret);
1279 RETURN_SUCCESS;
1281 ctx->u.rep->count = ctx->count-1;
1282 state->ptr = ctx->ptr;
1283 RETURN_FAILURE;
1286 LASTMARK_SAVE();
1288 /* see if the tail matches */
1289 state->repeat = ctx->u.rep->prev;
1290 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1291 if (ret) {
1292 RETURN_ON_ERROR(ret);
1293 RETURN_SUCCESS;
1296 state->repeat = ctx->u.rep;
1297 state->ptr = ctx->ptr;
1299 LASTMARK_RESTORE();
1301 if (ctx->count >= ctx->u.rep->pattern[2]
1302 && ctx->u.rep->pattern[2] != 65535)
1303 RETURN_FAILURE;
1305 ctx->u.rep->count = ctx->count;
1306 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1307 ctx->u.rep->pattern+3);
1308 if (ret) {
1309 RETURN_ON_ERROR(ret);
1310 RETURN_SUCCESS;
1312 ctx->u.rep->count = ctx->count-1;
1313 state->ptr = ctx->ptr;
1314 RETURN_FAILURE;
1316 case SRE_OP_GROUPREF:
1317 /* match backreference */
1318 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1319 ctx->ptr, ctx->pattern[0]));
1320 i = ctx->pattern[0];
1322 Py_ssize_t groupref = i+i;
1323 if (groupref >= state->lastmark) {
1324 RETURN_FAILURE;
1325 } else {
1326 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1327 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1328 if (!p || !e || e < p)
1329 RETURN_FAILURE;
1330 while (p < e) {
1331 if (ctx->ptr >= end || *ctx->ptr != *p)
1332 RETURN_FAILURE;
1333 p++; ctx->ptr++;
1337 ctx->pattern++;
1338 break;
1340 case SRE_OP_GROUPREF_IGNORE:
1341 /* match backreference */
1342 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1343 ctx->ptr, ctx->pattern[0]));
1344 i = ctx->pattern[0];
1346 Py_ssize_t groupref = i+i;
1347 if (groupref >= state->lastmark) {
1348 RETURN_FAILURE;
1349 } else {
1350 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1351 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1352 if (!p || !e || e < p)
1353 RETURN_FAILURE;
1354 while (p < e) {
1355 if (ctx->ptr >= end ||
1356 state->lower(*ctx->ptr) != state->lower(*p))
1357 RETURN_FAILURE;
1358 p++; ctx->ptr++;
1362 ctx->pattern++;
1363 break;
1365 case SRE_OP_GROUPREF_EXISTS:
1366 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1367 ctx->ptr, ctx->pattern[0]));
1368 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1369 i = ctx->pattern[0];
1371 Py_ssize_t groupref = i+i;
1372 if (groupref >= state->lastmark) {
1373 ctx->pattern += ctx->pattern[1];
1374 break;
1375 } else {
1376 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1377 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1378 if (!p || !e || e < p) {
1379 ctx->pattern += ctx->pattern[1];
1380 break;
1384 ctx->pattern += 2;
1385 break;
1387 case SRE_OP_ASSERT:
1388 /* assert subpattern */
1389 /* <ASSERT> <skip> <back> <pattern> */
1390 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1391 ctx->ptr, ctx->pattern[1]));
1392 state->ptr = ctx->ptr - ctx->pattern[1];
1393 if (state->ptr < state->beginning)
1394 RETURN_FAILURE;
1395 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1396 RETURN_ON_FAILURE(ret);
1397 ctx->pattern += ctx->pattern[0];
1398 break;
1400 case SRE_OP_ASSERT_NOT:
1401 /* assert not subpattern */
1402 /* <ASSERT_NOT> <skip> <back> <pattern> */
1403 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1404 ctx->ptr, ctx->pattern[1]));
1405 state->ptr = ctx->ptr - ctx->pattern[1];
1406 if (state->ptr >= state->beginning) {
1407 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1408 if (ret) {
1409 RETURN_ON_ERROR(ret);
1410 RETURN_FAILURE;
1413 ctx->pattern += ctx->pattern[0];
1414 break;
1416 case SRE_OP_FAILURE:
1417 /* immediate failure */
1418 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1419 RETURN_FAILURE;
1421 default:
1422 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1423 ctx->pattern[-1]));
1424 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1428 exit:
1429 ctx_pos = ctx->last_ctx_pos;
1430 jump = ctx->jump;
1431 DATA_POP_DISCARD(ctx);
1432 if (ctx_pos == -1)
1433 return ret;
1434 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1436 switch (jump) {
1437 case JUMP_MAX_UNTIL_2:
1438 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1439 goto jump_max_until_2;
1440 case JUMP_MAX_UNTIL_3:
1441 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1442 goto jump_max_until_3;
1443 case JUMP_MIN_UNTIL_2:
1444 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1445 goto jump_min_until_2;
1446 case JUMP_MIN_UNTIL_3:
1447 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1448 goto jump_min_until_3;
1449 case JUMP_BRANCH:
1450 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1451 goto jump_branch;
1452 case JUMP_MAX_UNTIL_1:
1453 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1454 goto jump_max_until_1;
1455 case JUMP_MIN_UNTIL_1:
1456 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1457 goto jump_min_until_1;
1458 case JUMP_REPEAT:
1459 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1460 goto jump_repeat;
1461 case JUMP_REPEAT_ONE_1:
1462 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1463 goto jump_repeat_one_1;
1464 case JUMP_REPEAT_ONE_2:
1465 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1466 goto jump_repeat_one_2;
1467 case JUMP_MIN_REPEAT_ONE:
1468 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1469 goto jump_min_repeat_one;
1470 case JUMP_ASSERT:
1471 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1472 goto jump_assert;
1473 case JUMP_ASSERT_NOT:
1474 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1475 goto jump_assert_not;
1476 case JUMP_NONE:
1477 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1478 break;
1481 return ret; /* should never get here */
1484 LOCAL(Py_ssize_t)
1485 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1487 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1488 SRE_CHAR* end = (SRE_CHAR *)state->end;
1489 Py_ssize_t status = 0;
1490 Py_ssize_t prefix_len = 0;
1491 Py_ssize_t prefix_skip = 0;
1492 SRE_CODE* prefix = NULL;
1493 SRE_CODE* charset = NULL;
1494 SRE_CODE* overlap = NULL;
1495 int flags = 0;
1497 if (pattern[0] == SRE_OP_INFO) {
1498 /* optimization info block */
1499 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1501 flags = pattern[2];
1503 if (pattern[3] > 1) {
1504 /* adjust end point (but make sure we leave at least one
1505 character in there, so literal search will work) */
1506 end -= pattern[3]-1;
1507 if (end <= ptr)
1508 end = ptr+1;
1511 if (flags & SRE_INFO_PREFIX) {
1512 /* pattern starts with a known prefix */
1513 /* <length> <skip> <prefix data> <overlap data> */
1514 prefix_len = pattern[5];
1515 prefix_skip = pattern[6];
1516 prefix = pattern + 7;
1517 overlap = prefix + prefix_len - 1;
1518 } else if (flags & SRE_INFO_CHARSET)
1519 /* pattern starts with a character from a known set */
1520 /* <charset> */
1521 charset = pattern + 5;
1523 pattern += 1 + pattern[1];
1526 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1527 TRACE(("charset = %p\n", charset));
1529 #if defined(USE_FAST_SEARCH)
1530 if (prefix_len > 1) {
1531 /* pattern starts with a known prefix. use the overlap
1532 table to skip forward as fast as we possibly can */
1533 Py_ssize_t i = 0;
1534 end = (SRE_CHAR *)state->end;
1535 while (ptr < end) {
1536 for (;;) {
1537 if ((SRE_CODE) ptr[0] != prefix[i]) {
1538 if (!i)
1539 break;
1540 else
1541 i = overlap[i];
1542 } else {
1543 if (++i == prefix_len) {
1544 /* found a potential match */
1545 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1546 state->start = ptr + 1 - prefix_len;
1547 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1548 if (flags & SRE_INFO_LITERAL)
1549 return 1; /* we got all of it */
1550 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1551 if (status != 0)
1552 return status;
1553 /* close but no cigar -- try again */
1554 i = overlap[i];
1556 break;
1559 ptr++;
1561 return 0;
1563 #endif
1565 if (pattern[0] == SRE_OP_LITERAL) {
1566 /* pattern starts with a literal character. this is used
1567 for short prefixes, and if fast search is disabled */
1568 SRE_CODE chr = pattern[1];
1569 end = (SRE_CHAR *)state->end;
1570 for (;;) {
1571 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1572 ptr++;
1573 if (ptr >= end)
1574 return 0;
1575 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1576 state->start = ptr;
1577 state->ptr = ++ptr;
1578 if (flags & SRE_INFO_LITERAL)
1579 return 1; /* we got all of it */
1580 status = SRE_MATCH(state, pattern + 2);
1581 if (status != 0)
1582 break;
1584 } else if (charset) {
1585 /* pattern starts with a character from a known set */
1586 end = (SRE_CHAR *)state->end;
1587 for (;;) {
1588 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1589 ptr++;
1590 if (ptr >= end)
1591 return 0;
1592 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1593 state->start = ptr;
1594 state->ptr = ptr;
1595 status = SRE_MATCH(state, pattern);
1596 if (status != 0)
1597 break;
1598 ptr++;
1600 } else
1601 /* general case */
1602 while (ptr <= end) {
1603 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1604 state->start = state->ptr = ptr++;
1605 status = SRE_MATCH(state, pattern);
1606 if (status != 0)
1607 break;
1610 return status;
1613 LOCAL(int)
1614 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1616 /* check if given string is a literal template (i.e. no escapes) */
1617 while (len-- > 0)
1618 if (*ptr++ == '\\')
1619 return 0;
1620 return 1;
1623 #if !defined(SRE_RECURSIVE)
1625 /* -------------------------------------------------------------------- */
1626 /* factories and destructors */
1628 /* see sre.h for object declarations */
1629 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1630 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1632 static PyObject *
1633 sre_codesize(PyObject* self, PyObject *unused)
1635 return Py_BuildValue("l", sizeof(SRE_CODE));
1638 static PyObject *
1639 sre_getlower(PyObject* self, PyObject* args)
1641 int character, flags;
1642 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1643 return NULL;
1644 if (flags & SRE_FLAG_LOCALE)
1645 return Py_BuildValue("i", sre_lower_locale(character));
1646 if (flags & SRE_FLAG_UNICODE)
1647 #if defined(HAVE_UNICODE)
1648 return Py_BuildValue("i", sre_lower_unicode(character));
1649 #else
1650 return Py_BuildValue("i", sre_lower_locale(character));
1651 #endif
1652 return Py_BuildValue("i", sre_lower(character));
1655 LOCAL(void)
1656 state_reset(SRE_STATE* state)
1658 /* FIXME: dynamic! */
1659 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1661 state->lastmark = -1;
1662 state->lastindex = -1;
1664 state->repeat = NULL;
1666 data_stack_dealloc(state);
1669 static void*
1670 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1672 /* given a python object, return a data pointer, a length (in
1673 characters), and a character size. return NULL if the object
1674 is not a string (or not compatible) */
1676 PyBufferProcs *buffer;
1677 Py_ssize_t size, bytes;
1678 int charsize;
1679 void* ptr;
1680 Py_buffer view;
1682 /* Unicode objects do not support the buffer API. So, get the data
1683 directly instead. */
1684 if (PyUnicode_Check(string)) {
1685 ptr = (void *)PyUnicode_AS_DATA(string);
1686 *p_length = PyUnicode_GET_SIZE(string);
1687 *p_charsize = sizeof(Py_UNICODE);
1688 return ptr;
1691 /* get pointer to string buffer */
1692 view.len = -1;
1693 buffer = Py_TYPE(string)->tp_as_buffer;
1694 if (!buffer || !buffer->bf_getbuffer ||
1695 (*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
1696 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1697 return NULL;
1700 /* determine buffer size */
1701 bytes = view.len;
1702 ptr = view.buf;
1704 /* Release the buffer immediately --- possibly dangerous
1705 but doing something else would require some re-factoring
1707 PyBuffer_Release(&view);
1709 if (bytes < 0) {
1710 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1711 return NULL;
1714 /* determine character size */
1715 size = PyObject_Size(string);
1717 if (PyBytes_Check(string) || bytes == size)
1718 charsize = 1;
1719 #if defined(HAVE_UNICODE)
1720 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1721 charsize = sizeof(Py_UNICODE);
1722 #endif
1723 else {
1724 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1725 return NULL;
1728 *p_length = size;
1729 *p_charsize = charsize;
1731 if (ptr == NULL) {
1732 PyErr_SetString(PyExc_ValueError,
1733 "Buffer is NULL");
1735 return ptr;
1738 LOCAL(PyObject*)
1739 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1740 Py_ssize_t start, Py_ssize_t end)
1742 /* prepare state object */
1744 Py_ssize_t length;
1745 int charsize;
1746 void* ptr;
1748 memset(state, 0, sizeof(SRE_STATE));
1750 state->lastmark = -1;
1751 state->lastindex = -1;
1753 ptr = getstring(string, &length, &charsize);
1754 if (!ptr)
1755 return NULL;
1757 if (charsize == 1 && pattern->charsize > 1) {
1758 PyErr_SetString(PyExc_TypeError,
1759 "can't use a string pattern on a bytes-like object");
1760 return NULL;
1762 if (charsize > 1 && pattern->charsize == 1) {
1763 PyErr_SetString(PyExc_TypeError,
1764 "can't use a bytes pattern on a string-like object");
1765 return NULL;
1768 /* adjust boundaries */
1769 if (start < 0)
1770 start = 0;
1771 else if (start > length)
1772 start = length;
1774 if (end < 0)
1775 end = 0;
1776 else if (end > length)
1777 end = length;
1779 state->charsize = charsize;
1781 state->beginning = ptr;
1783 state->start = (void*) ((char*) ptr + start * state->charsize);
1784 state->end = (void*) ((char*) ptr + end * state->charsize);
1786 Py_INCREF(string);
1787 state->string = string;
1788 state->pos = start;
1789 state->endpos = end;
1791 if (pattern->flags & SRE_FLAG_LOCALE)
1792 state->lower = sre_lower_locale;
1793 else if (pattern->flags & SRE_FLAG_UNICODE)
1794 #if defined(HAVE_UNICODE)
1795 state->lower = sre_lower_unicode;
1796 #else
1797 state->lower = sre_lower_locale;
1798 #endif
1799 else
1800 state->lower = sre_lower;
1802 return string;
1805 LOCAL(void)
1806 state_fini(SRE_STATE* state)
1808 Py_XDECREF(state->string);
1809 data_stack_dealloc(state);
1812 /* calculate offset from start of string */
1813 #define STATE_OFFSET(state, member)\
1814 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1816 LOCAL(PyObject*)
1817 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1819 Py_ssize_t i, j;
1821 index = (index - 1) * 2;
1823 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1824 if (empty)
1825 /* want empty string */
1826 i = j = 0;
1827 else {
1828 Py_INCREF(Py_None);
1829 return Py_None;
1831 } else {
1832 i = STATE_OFFSET(state, state->mark[index]);
1833 j = STATE_OFFSET(state, state->mark[index+1]);
1836 return PySequence_GetSlice(string, i, j);
1839 static void
1840 pattern_error(int status)
1842 switch (status) {
1843 case SRE_ERROR_RECURSION_LIMIT:
1844 PyErr_SetString(
1845 PyExc_RuntimeError,
1846 "maximum recursion limit exceeded"
1848 break;
1849 case SRE_ERROR_MEMORY:
1850 PyErr_NoMemory();
1851 break;
1852 case SRE_ERROR_INTERRUPTED:
1853 /* An exception has already been raised, so let it fly */
1854 break;
1855 default:
1856 /* other error codes indicate compiler/engine bugs */
1857 PyErr_SetString(
1858 PyExc_RuntimeError,
1859 "internal error in regular expression engine"
1864 static void
1865 pattern_dealloc(PatternObject* self)
1867 if (self->weakreflist != NULL)
1868 PyObject_ClearWeakRefs((PyObject *) self);
1869 Py_XDECREF(self->pattern);
1870 Py_XDECREF(self->groupindex);
1871 Py_XDECREF(self->indexgroup);
1872 PyObject_DEL(self);
1875 static PyObject*
1876 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1878 SRE_STATE state;
1879 int status;
1881 PyObject* string;
1882 Py_ssize_t start = 0;
1883 Py_ssize_t end = PY_SSIZE_T_MAX;
1884 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1885 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1886 &string, &start, &end))
1887 return NULL;
1889 string = state_init(&state, self, string, start, end);
1890 if (!string)
1891 return NULL;
1893 state.ptr = state.start;
1895 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1897 if (state.charsize == 1) {
1898 status = sre_match(&state, PatternObject_GetCode(self));
1899 } else {
1900 #if defined(HAVE_UNICODE)
1901 status = sre_umatch(&state, PatternObject_GetCode(self));
1902 #endif
1905 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1906 if (PyErr_Occurred())
1907 return NULL;
1909 state_fini(&state);
1911 return pattern_new_match(self, &state, status);
1914 static PyObject*
1915 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1917 SRE_STATE state;
1918 int status;
1920 PyObject* string;
1921 Py_ssize_t start = 0;
1922 Py_ssize_t end = PY_SSIZE_T_MAX;
1923 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1924 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1925 &string, &start, &end))
1926 return NULL;
1928 string = state_init(&state, self, string, start, end);
1929 if (!string)
1930 return NULL;
1932 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1934 if (state.charsize == 1) {
1935 status = sre_search(&state, PatternObject_GetCode(self));
1936 } else {
1937 #if defined(HAVE_UNICODE)
1938 status = sre_usearch(&state, PatternObject_GetCode(self));
1939 #endif
1942 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1944 state_fini(&state);
1946 if (PyErr_Occurred())
1947 return NULL;
1949 return pattern_new_match(self, &state, status);
1952 static PyObject*
1953 call(char* module, char* function, PyObject* args)
1955 PyObject* name;
1956 PyObject* mod;
1957 PyObject* func;
1958 PyObject* result;
1960 if (!args)
1961 return NULL;
1962 name = PyUnicode_FromString(module);
1963 if (!name)
1964 return NULL;
1965 mod = PyImport_Import(name);
1966 Py_DECREF(name);
1967 if (!mod)
1968 return NULL;
1969 func = PyObject_GetAttrString(mod, function);
1970 Py_DECREF(mod);
1971 if (!func)
1972 return NULL;
1973 result = PyObject_CallObject(func, args);
1974 Py_DECREF(func);
1975 Py_DECREF(args);
1976 return result;
1979 #ifdef USE_BUILTIN_COPY
1980 static int
1981 deepcopy(PyObject** object, PyObject* memo)
1983 PyObject* copy;
1985 copy = call(
1986 "copy", "deepcopy",
1987 PyTuple_Pack(2, *object, memo)
1989 if (!copy)
1990 return 0;
1992 Py_DECREF(*object);
1993 *object = copy;
1995 return 1; /* success */
1997 #endif
1999 static PyObject*
2000 join_list(PyObject* list, PyObject* string)
2002 /* join list elements */
2004 PyObject* joiner;
2005 #if PY_VERSION_HEX >= 0x01060000
2006 PyObject* function;
2007 PyObject* args;
2008 #endif
2009 PyObject* result;
2011 joiner = PySequence_GetSlice(string, 0, 0);
2012 if (!joiner)
2013 return NULL;
2015 if (PyList_GET_SIZE(list) == 0) {
2016 Py_DECREF(list);
2017 return joiner;
2020 #if PY_VERSION_HEX >= 0x01060000
2021 function = PyObject_GetAttrString(joiner, "join");
2022 if (!function) {
2023 Py_DECREF(joiner);
2024 return NULL;
2026 args = PyTuple_New(1);
2027 if (!args) {
2028 Py_DECREF(function);
2029 Py_DECREF(joiner);
2030 return NULL;
2032 PyTuple_SET_ITEM(args, 0, list);
2033 result = PyObject_CallObject(function, args);
2034 Py_DECREF(args); /* also removes list */
2035 Py_DECREF(function);
2036 #else
2037 result = call(
2038 "string", "join",
2039 PyTuple_Pack(2, list, joiner)
2041 #endif
2042 Py_DECREF(joiner);
2044 return result;
2047 static PyObject*
2048 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2050 SRE_STATE state;
2051 PyObject* list;
2052 int status;
2053 Py_ssize_t i, b, e;
2055 PyObject* string;
2056 Py_ssize_t start = 0;
2057 Py_ssize_t end = PY_SSIZE_T_MAX;
2058 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2059 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2060 &string, &start, &end))
2061 return NULL;
2063 string = state_init(&state, self, string, start, end);
2064 if (!string)
2065 return NULL;
2067 list = PyList_New(0);
2068 if (!list) {
2069 state_fini(&state);
2070 return NULL;
2073 while (state.start <= state.end) {
2075 PyObject* item;
2077 state_reset(&state);
2079 state.ptr = state.start;
2081 if (state.charsize == 1) {
2082 status = sre_search(&state, PatternObject_GetCode(self));
2083 } else {
2084 #if defined(HAVE_UNICODE)
2085 status = sre_usearch(&state, PatternObject_GetCode(self));
2086 #endif
2089 if (PyErr_Occurred())
2090 goto error;
2092 if (status <= 0) {
2093 if (status == 0)
2094 break;
2095 pattern_error(status);
2096 goto error;
2099 /* don't bother to build a match object */
2100 switch (self->groups) {
2101 case 0:
2102 b = STATE_OFFSET(&state, state.start);
2103 e = STATE_OFFSET(&state, state.ptr);
2104 item = PySequence_GetSlice(string, b, e);
2105 if (!item)
2106 goto error;
2107 break;
2108 case 1:
2109 item = state_getslice(&state, 1, string, 1);
2110 if (!item)
2111 goto error;
2112 break;
2113 default:
2114 item = PyTuple_New(self->groups);
2115 if (!item)
2116 goto error;
2117 for (i = 0; i < self->groups; i++) {
2118 PyObject* o = state_getslice(&state, i+1, string, 1);
2119 if (!o) {
2120 Py_DECREF(item);
2121 goto error;
2123 PyTuple_SET_ITEM(item, i, o);
2125 break;
2128 status = PyList_Append(list, item);
2129 Py_DECREF(item);
2130 if (status < 0)
2131 goto error;
2133 if (state.ptr == state.start)
2134 state.start = (void*) ((char*) state.ptr + state.charsize);
2135 else
2136 state.start = state.ptr;
2140 state_fini(&state);
2141 return list;
2143 error:
2144 Py_DECREF(list);
2145 state_fini(&state);
2146 return NULL;
2150 #if PY_VERSION_HEX >= 0x02020000
2151 static PyObject*
2152 pattern_finditer(PatternObject* pattern, PyObject* args)
2154 PyObject* scanner;
2155 PyObject* search;
2156 PyObject* iterator;
2158 scanner = pattern_scanner(pattern, args);
2159 if (!scanner)
2160 return NULL;
2162 search = PyObject_GetAttrString(scanner, "search");
2163 Py_DECREF(scanner);
2164 if (!search)
2165 return NULL;
2167 iterator = PyCallIter_New(search, Py_None);
2168 Py_DECREF(search);
2170 return iterator;
2172 #endif
2174 static PyObject*
2175 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2177 SRE_STATE state;
2178 PyObject* list;
2179 PyObject* item;
2180 int status;
2181 Py_ssize_t n;
2182 Py_ssize_t i;
2183 void* last;
2185 PyObject* string;
2186 Py_ssize_t maxsplit = 0;
2187 static char* kwlist[] = { "source", "maxsplit", NULL };
2188 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2189 &string, &maxsplit))
2190 return NULL;
2192 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2193 if (!string)
2194 return NULL;
2196 list = PyList_New(0);
2197 if (!list) {
2198 state_fini(&state);
2199 return NULL;
2202 n = 0;
2203 last = state.start;
2205 while (!maxsplit || n < maxsplit) {
2207 state_reset(&state);
2209 state.ptr = state.start;
2211 if (state.charsize == 1) {
2212 status = sre_search(&state, PatternObject_GetCode(self));
2213 } else {
2214 #if defined(HAVE_UNICODE)
2215 status = sre_usearch(&state, PatternObject_GetCode(self));
2216 #endif
2219 if (PyErr_Occurred())
2220 goto error;
2222 if (status <= 0) {
2223 if (status == 0)
2224 break;
2225 pattern_error(status);
2226 goto error;
2229 if (state.start == state.ptr) {
2230 if (last == state.end)
2231 break;
2232 /* skip one character */
2233 state.start = (void*) ((char*) state.ptr + state.charsize);
2234 continue;
2237 /* get segment before this match */
2238 item = PySequence_GetSlice(
2239 string, STATE_OFFSET(&state, last),
2240 STATE_OFFSET(&state, state.start)
2242 if (!item)
2243 goto error;
2244 status = PyList_Append(list, item);
2245 Py_DECREF(item);
2246 if (status < 0)
2247 goto error;
2249 /* add groups (if any) */
2250 for (i = 0; i < self->groups; i++) {
2251 item = state_getslice(&state, i+1, string, 0);
2252 if (!item)
2253 goto error;
2254 status = PyList_Append(list, item);
2255 Py_DECREF(item);
2256 if (status < 0)
2257 goto error;
2260 n = n + 1;
2262 last = state.start = state.ptr;
2266 /* get segment following last match (even if empty) */
2267 item = PySequence_GetSlice(
2268 string, STATE_OFFSET(&state, last), state.endpos
2270 if (!item)
2271 goto error;
2272 status = PyList_Append(list, item);
2273 Py_DECREF(item);
2274 if (status < 0)
2275 goto error;
2277 state_fini(&state);
2278 return list;
2280 error:
2281 Py_DECREF(list);
2282 state_fini(&state);
2283 return NULL;
2287 static PyObject*
2288 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2289 Py_ssize_t count, Py_ssize_t subn)
2291 SRE_STATE state;
2292 PyObject* list;
2293 PyObject* item;
2294 PyObject* filter;
2295 PyObject* args;
2296 PyObject* match;
2297 void* ptr;
2298 int status;
2299 Py_ssize_t n;
2300 Py_ssize_t i, b, e;
2301 int bint;
2302 int filter_is_callable;
2304 if (PyCallable_Check(ptemplate)) {
2305 /* sub/subn takes either a function or a template */
2306 filter = ptemplate;
2307 Py_INCREF(filter);
2308 filter_is_callable = 1;
2309 } else {
2310 /* if not callable, check if it's a literal string */
2311 int literal;
2312 ptr = getstring(ptemplate, &n, &bint);
2313 b = bint;
2314 if (ptr) {
2315 if (b == 1) {
2316 literal = sre_literal_template((unsigned char *)ptr, n);
2317 } else {
2318 #if defined(HAVE_UNICODE)
2319 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2320 #endif
2322 } else {
2323 PyErr_Clear();
2324 literal = 0;
2326 if (literal) {
2327 filter = ptemplate;
2328 Py_INCREF(filter);
2329 filter_is_callable = 0;
2330 } else {
2331 /* not a literal; hand it over to the template compiler */
2332 filter = call(
2333 SRE_PY_MODULE, "_subx",
2334 PyTuple_Pack(2, self, ptemplate)
2336 if (!filter)
2337 return NULL;
2338 filter_is_callable = PyCallable_Check(filter);
2342 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2343 if (!string) {
2344 Py_DECREF(filter);
2345 return NULL;
2348 list = PyList_New(0);
2349 if (!list) {
2350 Py_DECREF(filter);
2351 state_fini(&state);
2352 return NULL;
2355 n = i = 0;
2357 while (!count || n < count) {
2359 state_reset(&state);
2361 state.ptr = state.start;
2363 if (state.charsize == 1) {
2364 status = sre_search(&state, PatternObject_GetCode(self));
2365 } else {
2366 #if defined(HAVE_UNICODE)
2367 status = sre_usearch(&state, PatternObject_GetCode(self));
2368 #endif
2371 if (PyErr_Occurred())
2372 goto error;
2374 if (status <= 0) {
2375 if (status == 0)
2376 break;
2377 pattern_error(status);
2378 goto error;
2381 b = STATE_OFFSET(&state, state.start);
2382 e = STATE_OFFSET(&state, state.ptr);
2384 if (i < b) {
2385 /* get segment before this match */
2386 item = PySequence_GetSlice(string, i, b);
2387 if (!item)
2388 goto error;
2389 status = PyList_Append(list, item);
2390 Py_DECREF(item);
2391 if (status < 0)
2392 goto error;
2394 } else if (i == b && i == e && n > 0)
2395 /* ignore empty match on latest position */
2396 goto next;
2398 if (filter_is_callable) {
2399 /* pass match object through filter */
2400 match = pattern_new_match(self, &state, 1);
2401 if (!match)
2402 goto error;
2403 args = PyTuple_Pack(1, match);
2404 if (!args) {
2405 Py_DECREF(match);
2406 goto error;
2408 item = PyObject_CallObject(filter, args);
2409 Py_DECREF(args);
2410 Py_DECREF(match);
2411 if (!item)
2412 goto error;
2413 } else {
2414 /* filter is literal string */
2415 item = filter;
2416 Py_INCREF(item);
2419 /* add to list */
2420 if (item != Py_None) {
2421 status = PyList_Append(list, item);
2422 Py_DECREF(item);
2423 if (status < 0)
2424 goto error;
2427 i = e;
2428 n = n + 1;
2430 next:
2431 /* move on */
2432 if (state.ptr == state.start)
2433 state.start = (void*) ((char*) state.ptr + state.charsize);
2434 else
2435 state.start = state.ptr;
2439 /* get segment following last match */
2440 if (i < state.endpos) {
2441 item = PySequence_GetSlice(string, i, state.endpos);
2442 if (!item)
2443 goto error;
2444 status = PyList_Append(list, item);
2445 Py_DECREF(item);
2446 if (status < 0)
2447 goto error;
2450 state_fini(&state);
2452 Py_DECREF(filter);
2454 /* convert list to single string (also removes list) */
2455 item = join_list(list, string);
2457 if (!item)
2458 return NULL;
2460 if (subn)
2461 return Py_BuildValue("Ni", item, n);
2463 return item;
2465 error:
2466 Py_DECREF(list);
2467 state_fini(&state);
2468 Py_DECREF(filter);
2469 return NULL;
2473 static PyObject*
2474 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2476 PyObject* ptemplate;
2477 PyObject* string;
2478 Py_ssize_t count = 0;
2479 static char* kwlist[] = { "repl", "string", "count", NULL };
2480 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2481 &ptemplate, &string, &count))
2482 return NULL;
2484 return pattern_subx(self, ptemplate, string, count, 0);
2487 static PyObject*
2488 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2490 PyObject* ptemplate;
2491 PyObject* string;
2492 Py_ssize_t count = 0;
2493 static char* kwlist[] = { "repl", "string", "count", NULL };
2494 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2495 &ptemplate, &string, &count))
2496 return NULL;
2498 return pattern_subx(self, ptemplate, string, count, 1);
2501 static PyObject*
2502 pattern_copy(PatternObject* self, PyObject *unused)
2504 #ifdef USE_BUILTIN_COPY
2505 PatternObject* copy;
2506 int offset;
2508 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2509 if (!copy)
2510 return NULL;
2512 offset = offsetof(PatternObject, groups);
2514 Py_XINCREF(self->groupindex);
2515 Py_XINCREF(self->indexgroup);
2516 Py_XINCREF(self->pattern);
2518 memcpy((char*) copy + offset, (char*) self + offset,
2519 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2520 copy->weakreflist = NULL;
2522 return (PyObject*) copy;
2523 #else
2524 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2525 return NULL;
2526 #endif
2529 static PyObject*
2530 pattern_deepcopy(PatternObject* self, PyObject* memo)
2532 #ifdef USE_BUILTIN_COPY
2533 PatternObject* copy;
2535 copy = (PatternObject*) pattern_copy(self);
2536 if (!copy)
2537 return NULL;
2539 if (!deepcopy(&copy->groupindex, memo) ||
2540 !deepcopy(&copy->indexgroup, memo) ||
2541 !deepcopy(&copy->pattern, memo)) {
2542 Py_DECREF(copy);
2543 return NULL;
2546 #else
2547 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2548 return NULL;
2549 #endif
2552 PyDoc_STRVAR(pattern_match_doc,
2553 "match(string[, pos[, endpos]]) --> match object or None.\n\
2554 Matches zero or more characters at the beginning of the string");
2556 PyDoc_STRVAR(pattern_search_doc,
2557 "search(string[, pos[, endpos]]) --> match object or None.\n\
2558 Scan through string looking for a match, and return a corresponding\n\
2559 MatchObject instance. Return None if no position in the string matches.");
2561 PyDoc_STRVAR(pattern_split_doc,
2562 "split(string[, maxsplit = 0]) --> list.\n\
2563 Split string by the occurrences of pattern.");
2565 PyDoc_STRVAR(pattern_findall_doc,
2566 "findall(string[, pos[, endpos]]) --> list.\n\
2567 Return a list of all non-overlapping matches of pattern in string.");
2569 PyDoc_STRVAR(pattern_finditer_doc,
2570 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2571 Return an iterator over all non-overlapping matches for the \n\
2572 RE pattern in string. For each match, the iterator returns a\n\
2573 match object.");
2575 PyDoc_STRVAR(pattern_sub_doc,
2576 "sub(repl, string[, count = 0]) --> newstring\n\
2577 Return the string obtained by replacing the leftmost non-overlapping\n\
2578 occurrences of pattern in string by the replacement repl.");
2580 PyDoc_STRVAR(pattern_subn_doc,
2581 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2582 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2583 the leftmost non-overlapping occurrences of pattern with the\n\
2584 replacement repl.");
2586 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2588 static PyMethodDef pattern_methods[] = {
2589 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2590 pattern_match_doc},
2591 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2592 pattern_search_doc},
2593 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2594 pattern_sub_doc},
2595 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2596 pattern_subn_doc},
2597 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2598 pattern_split_doc},
2599 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2600 pattern_findall_doc},
2601 #if PY_VERSION_HEX >= 0x02020000
2602 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2603 pattern_finditer_doc},
2604 #endif
2605 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2606 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2607 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2608 {NULL, NULL}
2611 #define PAT_OFF(x) offsetof(PatternObject, x)
2612 static PyMemberDef pattern_members[] = {
2613 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2614 {"flags", T_INT, PAT_OFF(flags), READONLY},
2615 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2616 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2617 {NULL} /* Sentinel */
2620 static PyTypeObject Pattern_Type = {
2621 PyVarObject_HEAD_INIT(NULL, 0)
2622 "_" SRE_MODULE ".SRE_Pattern",
2623 sizeof(PatternObject), sizeof(SRE_CODE),
2624 (destructor)pattern_dealloc, /* tp_dealloc */
2625 0, /* tp_print */
2626 0, /* tp_getattr */
2627 0, /* tp_setattr */
2628 0, /* tp_reserved */
2629 0, /* tp_repr */
2630 0, /* tp_as_number */
2631 0, /* tp_as_sequence */
2632 0, /* tp_as_mapping */
2633 0, /* tp_hash */
2634 0, /* tp_call */
2635 0, /* tp_str */
2636 0, /* tp_getattro */
2637 0, /* tp_setattro */
2638 0, /* tp_as_buffer */
2639 Py_TPFLAGS_DEFAULT, /* tp_flags */
2640 pattern_doc, /* tp_doc */
2641 0, /* tp_traverse */
2642 0, /* tp_clear */
2643 0, /* tp_richcompare */
2644 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2645 0, /* tp_iter */
2646 0, /* tp_iternext */
2647 pattern_methods, /* tp_methods */
2648 pattern_members, /* tp_members */
2651 static int _validate(PatternObject *self); /* Forward */
2653 static PyObject *
2654 _compile(PyObject* self_, PyObject* args)
2656 /* "compile" pattern descriptor to pattern object */
2658 PatternObject* self;
2659 Py_ssize_t i, n;
2661 PyObject* pattern;
2662 int flags = 0;
2663 PyObject* code;
2664 Py_ssize_t groups = 0;
2665 PyObject* groupindex = NULL;
2666 PyObject* indexgroup = NULL;
2667 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2668 &PyList_Type, &code, &groups,
2669 &groupindex, &indexgroup))
2670 return NULL;
2672 n = PyList_GET_SIZE(code);
2673 /* coverity[ampersand_in_size] */
2674 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2675 if (!self)
2676 return NULL;
2677 self->weakreflist = NULL;
2678 self->pattern = NULL;
2679 self->groupindex = NULL;
2680 self->indexgroup = NULL;
2682 self->codesize = n;
2684 for (i = 0; i < n; i++) {
2685 PyObject *o = PyList_GET_ITEM(code, i);
2686 unsigned long value = PyLong_AsUnsignedLong(o);
2687 self->code[i] = (SRE_CODE) value;
2688 if ((unsigned long) self->code[i] != value) {
2689 PyErr_SetString(PyExc_OverflowError,
2690 "regular expression code size limit exceeded");
2691 break;
2695 if (PyErr_Occurred()) {
2696 Py_DECREF(self);
2697 return NULL;
2700 if (pattern == Py_None)
2701 self->charsize = -1;
2702 else {
2703 Py_ssize_t p_length;
2704 if (!getstring(pattern, &p_length, &self->charsize)) {
2705 PyObject_DEL(self);
2706 return NULL;
2710 Py_INCREF(pattern);
2711 self->pattern = pattern;
2713 self->flags = flags;
2715 self->groups = groups;
2717 Py_XINCREF(groupindex);
2718 self->groupindex = groupindex;
2720 Py_XINCREF(indexgroup);
2721 self->indexgroup = indexgroup;
2723 self->weakreflist = NULL;
2725 if (!_validate(self)) {
2726 Py_DECREF(self);
2727 return NULL;
2730 return (PyObject*) self;
2733 /* -------------------------------------------------------------------- */
2734 /* Code validation */
2736 /* To learn more about this code, have a look at the _compile() function in
2737 Lib/sre_compile.py. The validation functions below checks the code array
2738 for conformance with the code patterns generated there.
2740 The nice thing about the generated code is that it is position-independent:
2741 all jumps are relative jumps forward. Also, jumps don't cross each other:
2742 the target of a later jump is always earlier than the target of an earlier
2743 jump. IOW, this is okay:
2745 J---------J-------T--------T
2746 \ \_____/ /
2747 \______________________/
2749 but this is not:
2751 J---------J-------T--------T
2752 \_________\_____/ /
2753 \____________/
2755 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2756 bytes wide (the latter if Python is compiled for "wide" unicode support).
2759 /* Defining this one enables tracing of the validator */
2760 #undef VVERBOSE
2762 /* Trace macro for the validator */
2763 #if defined(VVERBOSE)
2764 #define VTRACE(v) printf v
2765 #else
2766 #define VTRACE(v)
2767 #endif
2769 /* Report failure */
2770 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2772 /* Extract opcode, argument, or skip count from code array */
2773 #define GET_OP \
2774 do { \
2775 VTRACE(("%p: ", code)); \
2776 if (code >= end) FAIL; \
2777 op = *code++; \
2778 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2779 } while (0)
2780 #define GET_ARG \
2781 do { \
2782 VTRACE(("%p= ", code)); \
2783 if (code >= end) FAIL; \
2784 arg = *code++; \
2785 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2786 } while (0)
2787 #define GET_SKIP_ADJ(adj) \
2788 do { \
2789 VTRACE(("%p= ", code)); \
2790 if (code >= end) FAIL; \
2791 skip = *code; \
2792 VTRACE(("%lu (skip to %p)\n", \
2793 (unsigned long)skip, code+skip)); \
2794 if (code+skip-adj < code || code+skip-adj > end)\
2795 FAIL; \
2796 code++; \
2797 } while (0)
2798 #define GET_SKIP GET_SKIP_ADJ(0)
2800 static int
2801 _validate_charset(SRE_CODE *code, SRE_CODE *end)
2803 /* Some variables are manipulated by the macros above */
2804 SRE_CODE op;
2805 SRE_CODE arg;
2806 SRE_CODE offset;
2807 int i;
2809 while (code < end) {
2810 GET_OP;
2811 switch (op) {
2813 case SRE_OP_NEGATE:
2814 break;
2816 case SRE_OP_LITERAL:
2817 GET_ARG;
2818 break;
2820 case SRE_OP_RANGE:
2821 GET_ARG;
2822 GET_ARG;
2823 break;
2825 case SRE_OP_CHARSET:
2826 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2827 if (code+offset < code || code+offset > end)
2828 FAIL;
2829 code += offset;
2830 break;
2832 case SRE_OP_BIGCHARSET:
2833 GET_ARG; /* Number of blocks */
2834 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2835 if (code+offset < code || code+offset > end)
2836 FAIL;
2837 /* Make sure that each byte points to a valid block */
2838 for (i = 0; i < 256; i++) {
2839 if (((unsigned char *)code)[i] >= arg)
2840 FAIL;
2842 code += offset;
2843 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2844 if (code+offset < code || code+offset > end)
2845 FAIL;
2846 code += offset;
2847 break;
2849 case SRE_OP_CATEGORY:
2850 GET_ARG;
2851 switch (arg) {
2852 case SRE_CATEGORY_DIGIT:
2853 case SRE_CATEGORY_NOT_DIGIT:
2854 case SRE_CATEGORY_SPACE:
2855 case SRE_CATEGORY_NOT_SPACE:
2856 case SRE_CATEGORY_WORD:
2857 case SRE_CATEGORY_NOT_WORD:
2858 case SRE_CATEGORY_LINEBREAK:
2859 case SRE_CATEGORY_NOT_LINEBREAK:
2860 case SRE_CATEGORY_LOC_WORD:
2861 case SRE_CATEGORY_LOC_NOT_WORD:
2862 case SRE_CATEGORY_UNI_DIGIT:
2863 case SRE_CATEGORY_UNI_NOT_DIGIT:
2864 case SRE_CATEGORY_UNI_SPACE:
2865 case SRE_CATEGORY_UNI_NOT_SPACE:
2866 case SRE_CATEGORY_UNI_WORD:
2867 case SRE_CATEGORY_UNI_NOT_WORD:
2868 case SRE_CATEGORY_UNI_LINEBREAK:
2869 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2870 break;
2871 default:
2872 FAIL;
2874 break;
2876 default:
2877 FAIL;
2882 return 1;
2885 static int
2886 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2888 /* Some variables are manipulated by the macros above */
2889 SRE_CODE op;
2890 SRE_CODE arg;
2891 SRE_CODE skip;
2893 VTRACE(("code=%p, end=%p\n", code, end));
2895 if (code > end)
2896 FAIL;
2898 while (code < end) {
2899 GET_OP;
2900 switch (op) {
2902 case SRE_OP_MARK:
2903 /* We don't check whether marks are properly nested; the
2904 sre_match() code is robust even if they don't, and the worst
2905 you can get is nonsensical match results. */
2906 GET_ARG;
2907 if (arg > 2*groups+1) {
2908 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2909 FAIL;
2911 break;
2913 case SRE_OP_LITERAL:
2914 case SRE_OP_NOT_LITERAL:
2915 case SRE_OP_LITERAL_IGNORE:
2916 case SRE_OP_NOT_LITERAL_IGNORE:
2917 GET_ARG;
2918 /* The arg is just a character, nothing to check */
2919 break;
2921 case SRE_OP_SUCCESS:
2922 case SRE_OP_FAILURE:
2923 /* Nothing to check; these normally end the matching process */
2924 break;
2926 case SRE_OP_AT:
2927 GET_ARG;
2928 switch (arg) {
2929 case SRE_AT_BEGINNING:
2930 case SRE_AT_BEGINNING_STRING:
2931 case SRE_AT_BEGINNING_LINE:
2932 case SRE_AT_END:
2933 case SRE_AT_END_LINE:
2934 case SRE_AT_END_STRING:
2935 case SRE_AT_BOUNDARY:
2936 case SRE_AT_NON_BOUNDARY:
2937 case SRE_AT_LOC_BOUNDARY:
2938 case SRE_AT_LOC_NON_BOUNDARY:
2939 case SRE_AT_UNI_BOUNDARY:
2940 case SRE_AT_UNI_NON_BOUNDARY:
2941 break;
2942 default:
2943 FAIL;
2945 break;
2947 case SRE_OP_ANY:
2948 case SRE_OP_ANY_ALL:
2949 /* These have no operands */
2950 break;
2952 case SRE_OP_IN:
2953 case SRE_OP_IN_IGNORE:
2954 GET_SKIP;
2955 /* Stop 1 before the end; we check the FAILURE below */
2956 if (!_validate_charset(code, code+skip-2))
2957 FAIL;
2958 if (code[skip-2] != SRE_OP_FAILURE)
2959 FAIL;
2960 code += skip-1;
2961 break;
2963 case SRE_OP_INFO:
2965 /* A minimal info field is
2966 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2967 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2968 more follows. */
2969 SRE_CODE flags, min, max, i;
2970 SRE_CODE *newcode;
2971 GET_SKIP;
2972 newcode = code+skip-1;
2973 GET_ARG; flags = arg;
2974 GET_ARG; min = arg;
2975 GET_ARG; max = arg;
2976 /* Check that only valid flags are present */
2977 if ((flags & ~(SRE_INFO_PREFIX |
2978 SRE_INFO_LITERAL |
2979 SRE_INFO_CHARSET)) != 0)
2980 FAIL;
2981 /* PREFIX and CHARSET are mutually exclusive */
2982 if ((flags & SRE_INFO_PREFIX) &&
2983 (flags & SRE_INFO_CHARSET))
2984 FAIL;
2985 /* LITERAL implies PREFIX */
2986 if ((flags & SRE_INFO_LITERAL) &&
2987 !(flags & SRE_INFO_PREFIX))
2988 FAIL;
2989 /* Validate the prefix */
2990 if (flags & SRE_INFO_PREFIX) {
2991 SRE_CODE prefix_len, prefix_skip;
2992 GET_ARG; prefix_len = arg;
2993 GET_ARG; prefix_skip = arg;
2994 /* Here comes the prefix string */
2995 if (code+prefix_len < code || code+prefix_len > newcode)
2996 FAIL;
2997 code += prefix_len;
2998 /* And here comes the overlap table */
2999 if (code+prefix_len < code || code+prefix_len > newcode)
3000 FAIL;
3001 /* Each overlap value should be < prefix_len */
3002 for (i = 0; i < prefix_len; i++) {
3003 if (code[i] >= prefix_len)
3004 FAIL;
3006 code += prefix_len;
3008 /* Validate the charset */
3009 if (flags & SRE_INFO_CHARSET) {
3010 if (!_validate_charset(code, newcode-1))
3011 FAIL;
3012 if (newcode[-1] != SRE_OP_FAILURE)
3013 FAIL;
3014 code = newcode;
3016 else if (code != newcode) {
3017 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3018 FAIL;
3021 break;
3023 case SRE_OP_BRANCH:
3025 SRE_CODE *target = NULL;
3026 for (;;) {
3027 GET_SKIP;
3028 if (skip == 0)
3029 break;
3030 /* Stop 2 before the end; we check the JUMP below */
3031 if (!_validate_inner(code, code+skip-3, groups))
3032 FAIL;
3033 code += skip-3;
3034 /* Check that it ends with a JUMP, and that each JUMP
3035 has the same target */
3036 GET_OP;
3037 if (op != SRE_OP_JUMP)
3038 FAIL;
3039 GET_SKIP;
3040 if (target == NULL)
3041 target = code+skip-1;
3042 else if (code+skip-1 != target)
3043 FAIL;
3046 break;
3048 case SRE_OP_REPEAT_ONE:
3049 case SRE_OP_MIN_REPEAT_ONE:
3051 SRE_CODE min, max;
3052 GET_SKIP;
3053 GET_ARG; min = arg;
3054 GET_ARG; max = arg;
3055 if (min > max)
3056 FAIL;
3057 #ifdef Py_UNICODE_WIDE
3058 if (max > 65535)
3059 FAIL;
3060 #endif
3061 if (!_validate_inner(code, code+skip-4, groups))
3062 FAIL;
3063 code += skip-4;
3064 GET_OP;
3065 if (op != SRE_OP_SUCCESS)
3066 FAIL;
3068 break;
3070 case SRE_OP_REPEAT:
3072 SRE_CODE min, max;
3073 GET_SKIP;
3074 GET_ARG; min = arg;
3075 GET_ARG; max = arg;
3076 if (min > max)
3077 FAIL;
3078 #ifdef Py_UNICODE_WIDE
3079 if (max > 65535)
3080 FAIL;
3081 #endif
3082 if (!_validate_inner(code, code+skip-3, groups))
3083 FAIL;
3084 code += skip-3;
3085 GET_OP;
3086 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3087 FAIL;
3089 break;
3091 case SRE_OP_GROUPREF:
3092 case SRE_OP_GROUPREF_IGNORE:
3093 GET_ARG;
3094 if (arg >= groups)
3095 FAIL;
3096 break;
3098 case SRE_OP_GROUPREF_EXISTS:
3099 /* The regex syntax for this is: '(?(group)then|else)', where
3100 'group' is either an integer group number or a group name,
3101 'then' and 'else' are sub-regexes, and 'else' is optional. */
3102 GET_ARG;
3103 if (arg >= groups)
3104 FAIL;
3105 GET_SKIP_ADJ(1);
3106 code--; /* The skip is relative to the first arg! */
3107 /* There are two possibilities here: if there is both a 'then'
3108 part and an 'else' part, the generated code looks like:
3110 GROUPREF_EXISTS
3111 <group>
3112 <skipyes>
3113 ...then part...
3114 JUMP
3115 <skipno>
3116 (<skipyes> jumps here)
3117 ...else part...
3118 (<skipno> jumps here)
3120 If there is only a 'then' part, it looks like:
3122 GROUPREF_EXISTS
3123 <group>
3124 <skip>
3125 ...then part...
3126 (<skip> jumps here)
3128 There is no direct way to decide which it is, and we don't want
3129 to allow arbitrary jumps anywhere in the code; so we just look
3130 for a JUMP opcode preceding our skip target.
3132 if (skip >= 3 && code+skip-3 >= code &&
3133 code[skip-3] == SRE_OP_JUMP)
3135 VTRACE(("both then and else parts present\n"));
3136 if (!_validate_inner(code+1, code+skip-3, groups))
3137 FAIL;
3138 code += skip-2; /* Position after JUMP, at <skipno> */
3139 GET_SKIP;
3140 if (!_validate_inner(code, code+skip-1, groups))
3141 FAIL;
3142 code += skip-1;
3144 else {
3145 VTRACE(("only a then part present\n"));
3146 if (!_validate_inner(code+1, code+skip-1, groups))
3147 FAIL;
3148 code += skip-1;
3150 break;
3152 case SRE_OP_ASSERT:
3153 case SRE_OP_ASSERT_NOT:
3154 GET_SKIP;
3155 GET_ARG; /* 0 for lookahead, width for lookbehind */
3156 code--; /* Back up over arg to simplify math below */
3157 if (arg & 0x80000000)
3158 FAIL; /* Width too large */
3159 /* Stop 1 before the end; we check the SUCCESS below */
3160 if (!_validate_inner(code+1, code+skip-2, groups))
3161 FAIL;
3162 code += skip-2;
3163 GET_OP;
3164 if (op != SRE_OP_SUCCESS)
3165 FAIL;
3166 break;
3168 default:
3169 FAIL;
3174 VTRACE(("okay\n"));
3175 return 1;
3178 static int
3179 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3181 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3182 FAIL;
3183 if (groups == 0) /* fix for simplejson */
3184 groups = 100; /* 100 groups should always be safe */
3185 return _validate_inner(code, end-1, groups);
3188 static int
3189 _validate(PatternObject *self)
3191 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3193 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3194 return 0;
3196 else
3197 VTRACE(("Success!\n"));
3198 return 1;
3201 /* -------------------------------------------------------------------- */
3202 /* match methods */
3204 static void
3205 match_dealloc(MatchObject* self)
3207 Py_XDECREF(self->regs);
3208 Py_XDECREF(self->string);
3209 Py_DECREF(self->pattern);
3210 PyObject_DEL(self);
3213 static PyObject*
3214 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3216 if (index < 0 || index >= self->groups) {
3217 /* raise IndexError if we were given a bad group number */
3218 PyErr_SetString(
3219 PyExc_IndexError,
3220 "no such group"
3222 return NULL;
3225 index *= 2;
3227 if (self->string == Py_None || self->mark[index] < 0) {
3228 /* return default value if the string or group is undefined */
3229 Py_INCREF(def);
3230 return def;
3233 return PySequence_GetSlice(
3234 self->string, self->mark[index], self->mark[index+1]
3238 static Py_ssize_t
3239 match_getindex(MatchObject* self, PyObject* index)
3241 Py_ssize_t i;
3243 if (index == NULL)
3244 /* Default value */
3245 return 0;
3247 if (PyLong_Check(index))
3248 return PyLong_AsSsize_t(index);
3250 i = -1;
3252 if (self->pattern->groupindex) {
3253 index = PyObject_GetItem(self->pattern->groupindex, index);
3254 if (index) {
3255 if (PyLong_Check(index))
3256 i = PyLong_AsSsize_t(index);
3257 Py_DECREF(index);
3258 } else
3259 PyErr_Clear();
3262 return i;
3265 static PyObject*
3266 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3268 return match_getslice_by_index(self, match_getindex(self, index), def);
3271 static PyObject*
3272 match_expand(MatchObject* self, PyObject* ptemplate)
3274 /* delegate to Python code */
3275 return call(
3276 SRE_PY_MODULE, "_expand",
3277 PyTuple_Pack(3, self->pattern, self, ptemplate)
3281 static PyObject*
3282 match_group(MatchObject* self, PyObject* args)
3284 PyObject* result;
3285 Py_ssize_t i, size;
3287 size = PyTuple_GET_SIZE(args);
3289 switch (size) {
3290 case 0:
3291 result = match_getslice(self, Py_False, Py_None);
3292 break;
3293 case 1:
3294 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3295 break;
3296 default:
3297 /* fetch multiple items */
3298 result = PyTuple_New(size);
3299 if (!result)
3300 return NULL;
3301 for (i = 0; i < size; i++) {
3302 PyObject* item = match_getslice(
3303 self, PyTuple_GET_ITEM(args, i), Py_None
3305 if (!item) {
3306 Py_DECREF(result);
3307 return NULL;
3309 PyTuple_SET_ITEM(result, i, item);
3311 break;
3313 return result;
3316 static PyObject*
3317 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3319 PyObject* result;
3320 Py_ssize_t index;
3322 PyObject* def = Py_None;
3323 static char* kwlist[] = { "default", NULL };
3324 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3325 return NULL;
3327 result = PyTuple_New(self->groups-1);
3328 if (!result)
3329 return NULL;
3331 for (index = 1; index < self->groups; index++) {
3332 PyObject* item;
3333 item = match_getslice_by_index(self, index, def);
3334 if (!item) {
3335 Py_DECREF(result);
3336 return NULL;
3338 PyTuple_SET_ITEM(result, index-1, item);
3341 return result;
3344 static PyObject*
3345 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3347 PyObject* result;
3348 PyObject* keys;
3349 Py_ssize_t index;
3351 PyObject* def = Py_None;
3352 static char* kwlist[] = { "default", NULL };
3353 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3354 return NULL;
3356 result = PyDict_New();
3357 if (!result || !self->pattern->groupindex)
3358 return result;
3360 keys = PyMapping_Keys(self->pattern->groupindex);
3361 if (!keys)
3362 goto failed;
3364 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3365 int status;
3366 PyObject* key;
3367 PyObject* value;
3368 key = PyList_GET_ITEM(keys, index);
3369 if (!key)
3370 goto failed;
3371 value = match_getslice(self, key, def);
3372 if (!value) {
3373 Py_DECREF(key);
3374 goto failed;
3376 status = PyDict_SetItem(result, key, value);
3377 Py_DECREF(value);
3378 if (status < 0)
3379 goto failed;
3382 Py_DECREF(keys);
3384 return result;
3386 failed:
3387 Py_XDECREF(keys);
3388 Py_DECREF(result);
3389 return NULL;
3392 static PyObject*
3393 match_start(MatchObject* self, PyObject* args)
3395 Py_ssize_t index;
3397 PyObject* index_ = NULL;
3398 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3399 return NULL;
3401 index = match_getindex(self, index_);
3403 if (index < 0 || index >= self->groups) {
3404 PyErr_SetString(
3405 PyExc_IndexError,
3406 "no such group"
3408 return NULL;
3411 /* mark is -1 if group is undefined */
3412 return Py_BuildValue("i", self->mark[index*2]);
3415 static PyObject*
3416 match_end(MatchObject* self, PyObject* args)
3418 Py_ssize_t index;
3420 PyObject* index_ = NULL;
3421 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3422 return NULL;
3424 index = match_getindex(self, index_);
3426 if (index < 0 || index >= self->groups) {
3427 PyErr_SetString(
3428 PyExc_IndexError,
3429 "no such group"
3431 return NULL;
3434 /* mark is -1 if group is undefined */
3435 return Py_BuildValue("i", self->mark[index*2+1]);
3438 LOCAL(PyObject*)
3439 _pair(Py_ssize_t i1, Py_ssize_t i2)
3441 PyObject* pair;
3442 PyObject* item;
3444 pair = PyTuple_New(2);
3445 if (!pair)
3446 return NULL;
3448 item = PyLong_FromSsize_t(i1);
3449 if (!item)
3450 goto error;
3451 PyTuple_SET_ITEM(pair, 0, item);
3453 item = PyLong_FromSsize_t(i2);
3454 if (!item)
3455 goto error;
3456 PyTuple_SET_ITEM(pair, 1, item);
3458 return pair;
3460 error:
3461 Py_DECREF(pair);
3462 return NULL;
3465 static PyObject*
3466 match_span(MatchObject* self, PyObject* args)
3468 Py_ssize_t index;
3470 PyObject* index_ = NULL;
3471 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3472 return NULL;
3474 index = match_getindex(self, index_);
3476 if (index < 0 || index >= self->groups) {
3477 PyErr_SetString(
3478 PyExc_IndexError,
3479 "no such group"
3481 return NULL;
3484 /* marks are -1 if group is undefined */
3485 return _pair(self->mark[index*2], self->mark[index*2+1]);
3488 static PyObject*
3489 match_regs(MatchObject* self)
3491 PyObject* regs;
3492 PyObject* item;
3493 Py_ssize_t index;
3495 regs = PyTuple_New(self->groups);
3496 if (!regs)
3497 return NULL;
3499 for (index = 0; index < self->groups; index++) {
3500 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3501 if (!item) {
3502 Py_DECREF(regs);
3503 return NULL;
3505 PyTuple_SET_ITEM(regs, index, item);
3508 Py_INCREF(regs);
3509 self->regs = regs;
3511 return regs;
3514 static PyObject*
3515 match_copy(MatchObject* self, PyObject *unused)
3517 #ifdef USE_BUILTIN_COPY
3518 MatchObject* copy;
3519 Py_ssize_t slots, offset;
3521 slots = 2 * (self->pattern->groups+1);
3523 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3524 if (!copy)
3525 return NULL;
3527 /* this value a constant, but any compiler should be able to
3528 figure that out all by itself */
3529 offset = offsetof(MatchObject, string);
3531 Py_XINCREF(self->pattern);
3532 Py_XINCREF(self->string);
3533 Py_XINCREF(self->regs);
3535 memcpy((char*) copy + offset, (char*) self + offset,
3536 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3538 return (PyObject*) copy;
3539 #else
3540 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3541 return NULL;
3542 #endif
3545 static PyObject*
3546 match_deepcopy(MatchObject* self, PyObject* memo)
3548 #ifdef USE_BUILTIN_COPY
3549 MatchObject* copy;
3551 copy = (MatchObject*) match_copy(self);
3552 if (!copy)
3553 return NULL;
3555 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3556 !deepcopy(&copy->string, memo) ||
3557 !deepcopy(&copy->regs, memo)) {
3558 Py_DECREF(copy);
3559 return NULL;
3562 #else
3563 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3564 return NULL;
3565 #endif
3568 static PyMethodDef match_methods[] = {
3569 {"group", (PyCFunction) match_group, METH_VARARGS},
3570 {"start", (PyCFunction) match_start, METH_VARARGS},
3571 {"end", (PyCFunction) match_end, METH_VARARGS},
3572 {"span", (PyCFunction) match_span, METH_VARARGS},
3573 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3574 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3575 {"expand", (PyCFunction) match_expand, METH_O},
3576 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3577 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3578 {NULL, NULL}
3581 static PyObject *
3582 match_lastindex_get(MatchObject *self)
3584 if (self->lastindex >= 0)
3585 return Py_BuildValue("i", self->lastindex);
3586 Py_INCREF(Py_None);
3587 return Py_None;
3590 static PyObject *
3591 match_lastgroup_get(MatchObject *self)
3593 if (self->pattern->indexgroup && self->lastindex >= 0) {
3594 PyObject* result = PySequence_GetItem(
3595 self->pattern->indexgroup, self->lastindex
3597 if (result)
3598 return result;
3599 PyErr_Clear();
3601 Py_INCREF(Py_None);
3602 return Py_None;
3605 static PyObject *
3606 match_regs_get(MatchObject *self)
3608 if (self->regs) {
3609 Py_INCREF(self->regs);
3610 return self->regs;
3611 } else
3612 return match_regs(self);
3615 static PyGetSetDef match_getset[] = {
3616 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3617 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3618 {"regs", (getter)match_regs_get, (setter)NULL},
3619 {NULL}
3622 #define MATCH_OFF(x) offsetof(MatchObject, x)
3623 static PyMemberDef match_members[] = {
3624 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3625 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3626 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3627 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3628 {NULL}
3631 /* FIXME: implement setattr("string", None) as a special case (to
3632 detach the associated string, if any */
3634 static PyTypeObject Match_Type = {
3635 PyVarObject_HEAD_INIT(NULL,0)
3636 "_" SRE_MODULE ".SRE_Match",
3637 sizeof(MatchObject), sizeof(Py_ssize_t),
3638 (destructor)match_dealloc, /* tp_dealloc */
3639 0, /* tp_print */
3640 0, /* tp_getattr */
3641 0, /* tp_setattr */
3642 0, /* tp_reserved */
3643 0, /* tp_repr */
3644 0, /* tp_as_number */
3645 0, /* tp_as_sequence */
3646 0, /* tp_as_mapping */
3647 0, /* tp_hash */
3648 0, /* tp_call */
3649 0, /* tp_str */
3650 0, /* tp_getattro */
3651 0, /* tp_setattro */
3652 0, /* tp_as_buffer */
3653 Py_TPFLAGS_DEFAULT, /* tp_flags */
3654 0, /* tp_doc */
3655 0, /* tp_traverse */
3656 0, /* tp_clear */
3657 0, /* tp_richcompare */
3658 0, /* tp_weaklistoffset */
3659 0, /* tp_iter */
3660 0, /* tp_iternext */
3661 match_methods, /* tp_methods */
3662 match_members, /* tp_members */
3663 match_getset, /* tp_getset */
3666 static PyObject*
3667 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3669 /* create match object (from state object) */
3671 MatchObject* match;
3672 Py_ssize_t i, j;
3673 char* base;
3674 int n;
3676 if (status > 0) {
3678 /* create match object (with room for extra group marks) */
3679 /* coverity[ampersand_in_size] */
3680 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3681 2*(pattern->groups+1));
3682 if (!match)
3683 return NULL;
3685 Py_INCREF(pattern);
3686 match->pattern = pattern;
3688 Py_INCREF(state->string);
3689 match->string = state->string;
3691 match->regs = NULL;
3692 match->groups = pattern->groups+1;
3694 /* fill in group slices */
3696 base = (char*) state->beginning;
3697 n = state->charsize;
3699 match->mark[0] = ((char*) state->start - base) / n;
3700 match->mark[1] = ((char*) state->ptr - base) / n;
3702 for (i = j = 0; i < pattern->groups; i++, j+=2)
3703 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3704 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3705 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3706 } else
3707 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3709 match->pos = state->pos;
3710 match->endpos = state->endpos;
3712 match->lastindex = state->lastindex;
3714 return (PyObject*) match;
3716 } else if (status == 0) {
3718 /* no match */
3719 Py_INCREF(Py_None);
3720 return Py_None;
3724 /* internal error */
3725 pattern_error(status);
3726 return NULL;
3730 /* -------------------------------------------------------------------- */
3731 /* scanner methods (experimental) */
3733 static void
3734 scanner_dealloc(ScannerObject* self)
3736 state_fini(&self->state);
3737 Py_XDECREF(self->pattern);
3738 PyObject_DEL(self);
3741 static PyObject*
3742 scanner_match(ScannerObject* self, PyObject *unused)
3744 SRE_STATE* state = &self->state;
3745 PyObject* match;
3746 int status;
3748 state_reset(state);
3750 state->ptr = state->start;
3752 if (state->charsize == 1) {
3753 status = sre_match(state, PatternObject_GetCode(self->pattern));
3754 } else {
3755 #if defined(HAVE_UNICODE)
3756 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3757 #endif
3759 if (PyErr_Occurred())
3760 return NULL;
3762 match = pattern_new_match((PatternObject*) self->pattern,
3763 state, status);
3765 if (status == 0 || state->ptr == state->start)
3766 state->start = (void*) ((char*) state->ptr + state->charsize);
3767 else
3768 state->start = state->ptr;
3770 return match;
3774 static PyObject*
3775 scanner_search(ScannerObject* self, PyObject *unused)
3777 SRE_STATE* state = &self->state;
3778 PyObject* match;
3779 int status;
3781 state_reset(state);
3783 state->ptr = state->start;
3785 if (state->charsize == 1) {
3786 status = sre_search(state, PatternObject_GetCode(self->pattern));
3787 } else {
3788 #if defined(HAVE_UNICODE)
3789 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3790 #endif
3792 if (PyErr_Occurred())
3793 return NULL;
3795 match = pattern_new_match((PatternObject*) self->pattern,
3796 state, status);
3798 if (status == 0 || state->ptr == state->start)
3799 state->start = (void*) ((char*) state->ptr + state->charsize);
3800 else
3801 state->start = state->ptr;
3803 return match;
3806 static PyMethodDef scanner_methods[] = {
3807 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3808 {"search", (PyCFunction) scanner_search, METH_NOARGS},
3809 {NULL, NULL}
3812 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3813 static PyMemberDef scanner_members[] = {
3814 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3815 {NULL} /* Sentinel */
3818 static PyTypeObject Scanner_Type = {
3819 PyVarObject_HEAD_INIT(NULL, 0)
3820 "_" SRE_MODULE ".SRE_Scanner",
3821 sizeof(ScannerObject), 0,
3822 (destructor)scanner_dealloc,/* tp_dealloc */
3823 0, /* tp_print */
3824 0, /* tp_getattr */
3825 0, /* tp_setattr */
3826 0, /* tp_reserved */
3827 0, /* tp_repr */
3828 0, /* tp_as_number */
3829 0, /* tp_as_sequence */
3830 0, /* tp_as_mapping */
3831 0, /* tp_hash */
3832 0, /* tp_call */
3833 0, /* tp_str */
3834 0, /* tp_getattro */
3835 0, /* tp_setattro */
3836 0, /* tp_as_buffer */
3837 Py_TPFLAGS_DEFAULT, /* tp_flags */
3838 0, /* tp_doc */
3839 0, /* tp_traverse */
3840 0, /* tp_clear */
3841 0, /* tp_richcompare */
3842 0, /* tp_weaklistoffset */
3843 0, /* tp_iter */
3844 0, /* tp_iternext */
3845 scanner_methods, /* tp_methods */
3846 scanner_members, /* tp_members */
3847 0, /* tp_getset */
3850 static PyObject*
3851 pattern_scanner(PatternObject* pattern, PyObject* args)
3853 /* create search state object */
3855 ScannerObject* self;
3857 PyObject* string;
3858 Py_ssize_t start = 0;
3859 Py_ssize_t end = PY_SSIZE_T_MAX;
3860 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3861 return NULL;
3863 /* create scanner object */
3864 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3865 if (!self)
3866 return NULL;
3867 self->pattern = NULL;
3869 string = state_init(&self->state, pattern, string, start, end);
3870 if (!string) {
3871 Py_DECREF(self);
3872 return NULL;
3875 Py_INCREF(pattern);
3876 self->pattern = (PyObject*) pattern;
3878 return (PyObject*) self;
3881 static PyMethodDef _functions[] = {
3882 {"compile", _compile, METH_VARARGS},
3883 {"getcodesize", sre_codesize, METH_NOARGS},
3884 {"getlower", sre_getlower, METH_VARARGS},
3885 {NULL, NULL}
3888 static struct PyModuleDef sremodule = {
3889 PyModuleDef_HEAD_INIT,
3890 "_" SRE_MODULE,
3891 NULL,
3893 _functions,
3894 NULL,
3895 NULL,
3896 NULL,
3897 NULL
3900 PyMODINIT_FUNC PyInit__sre(void)
3902 PyObject* m;
3903 PyObject* d;
3904 PyObject* x;
3906 /* Initialize object types */
3907 if (PyType_Ready(&Pattern_Type) < 0)
3908 return NULL;
3909 if (PyType_Ready(&Match_Type) < 0)
3910 return NULL;
3911 if (PyType_Ready(&Scanner_Type) < 0)
3912 return NULL;
3914 m = PyModule_Create(&sremodule);
3915 if (m == NULL)
3916 return NULL;
3917 d = PyModule_GetDict(m);
3919 x = PyLong_FromLong(SRE_MAGIC);
3920 if (x) {
3921 PyDict_SetItemString(d, "MAGIC", x);
3922 Py_DECREF(x);
3925 x = PyLong_FromLong(sizeof(SRE_CODE));
3926 if (x) {
3927 PyDict_SetItemString(d, "CODESIZE", x);
3928 Py_DECREF(x);
3931 x = PyUnicode_FromString(copyright);
3932 if (x) {
3933 PyDict_SetItemString(d, "copyright", x);
3934 Py_DECREF(x);
3936 return m;
3939 #endif /* !defined(SRE_RECURSIVE) */
3941 /* vim:ts=4:sw=4:et